13275 bhyve needs richer INIT/SIPI support
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>


  92 #include "vioapic.h"
  93 #include "vlapic.h"
  94 #include "vpmtmr.h"
  95 #include "vrtc.h"
  96 #include "vmm_stat.h"
  97 #include "vmm_lapic.h"
  98 
  99 #include "io/ppt.h"
 100 #include "io/iommu.h"
 101 
 102 struct vlapic;
 103 
 104 /*
 105  * Initialization:
 106  * (a) allocated when vcpu is created
 107  * (i) initialized when vcpu is created and when it is reinitialized
 108  * (o) initialized the first time the vcpu is created
 109  * (x) initialized before use
 110  */
 111 struct vcpu {
 112         struct mtx      mtx;            /* (o) protects 'state' and 'hostcpu' */


 113         enum vcpu_state state;          /* (o) vcpu state */
 114 #ifndef __FreeBSD__
 115         kcondvar_t      vcpu_cv;        /* (o) cpu waiter cv */
 116         kcondvar_t      state_cv;       /* (o) IDLE-transition cv */
 117 #endif /* __FreeBSD__ */
 118         int             hostcpu;        /* (o) vcpu's current host cpu */
 119 #ifndef __FreeBSD__
 120         int             lastloccpu;     /* (o) last host cpu localized to */
 121 #endif
 122         uint_t          runblock;       /* (i) block vcpu from run state */
 123         int             reqidle;        /* (i) request vcpu to idle */
 124         struct vlapic   *vlapic;        /* (i) APIC device model */
 125         enum x2apic_state x2apic_state; /* (i) APIC mode */
 126         uint64_t        exitintinfo;    /* (i) events pending at VM exit */
 127         int             nmi_pending;    /* (i) NMI pending */
 128         int             extint_pending; /* (i) INTR pending */
 129         int     exception_pending;      /* (i) exception pending */
 130         int     exc_vector;             /* (x) exception collateral */
 131         int     exc_errcode_valid;
 132         uint32_t exc_errcode;

 133         struct savefpu  *guestfpu;      /* (a,i) guest fpu state */
 134         uint64_t        guest_xcr0;     /* (i) guest %xcr0 register */
 135         void            *stats;         /* (a,i) statistics */
 136         struct vm_exit  exitinfo;       /* (x) exit reason and collateral */
 137         uint64_t        nextrip;        /* (x) next instruction to execute */
 138         struct vie      *vie_ctx;       /* (x) instruction emulation context */
 139 #ifndef __FreeBSD__
 140         uint64_t        tsc_offset;     /* (x) offset from host TSC */
 141 #endif
 142 };
 143 
 144 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 145 #define vcpu_lock_init(v)       mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 146 #define vcpu_lock(v)            mtx_lock_spin(&((v)->mtx))
 147 #define vcpu_unlock(v)          mtx_unlock_spin(&((v)->mtx))
 148 #define vcpu_assert_locked(v)   mtx_assert(&((v)->mtx), MA_OWNED)
 149 
 150 struct mem_seg {
 151         size_t  len;
 152         bool    sysmem;


 183         struct vatpit   *vatpit;                /* (i) virtual atpit */
 184         struct vpmtmr   *vpmtmr;                /* (i) virtual ACPI PM timer */
 185         struct vrtc     *vrtc;                  /* (o) virtual RTC */
 186         volatile cpuset_t active_cpus;          /* (i) active vcpus */
 187         volatile cpuset_t debug_cpus;           /* (i) vcpus stopped for dbg */
 188         int             suspend;                /* (i) stop VM execution */
 189         volatile cpuset_t suspended_cpus;       /* (i) suspended vcpus */
 190         volatile cpuset_t halted_cpus;          /* (x) cpus in a hard halt */
 191         struct mem_map  mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
 192         struct mem_seg  mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
 193         struct vmspace  *vmspace;               /* (o) guest's address space */
 194         char            name[VM_MAX_NAMELEN];   /* (o) virtual machine name */
 195         struct vcpu     vcpu[VM_MAXCPU];        /* (i) guest vcpus */
 196         /* The following describe the vm cpu topology */
 197         uint16_t        sockets;                /* (o) num of sockets */
 198         uint16_t        cores;                  /* (o) num of cores/socket */
 199         uint16_t        threads;                /* (o) num of threads/core */
 200         uint16_t        maxcpus;                /* (o) max pluggable cpus */
 201 
 202         struct ioport_config ioports;           /* (o) ioport handling */
 203 
 204         bool            sipi_req;               /* (i) SIPI requested */
 205         int             sipi_req_vcpu;          /* (i) SIPI destination */
 206         uint64_t        sipi_req_rip;           /* (i) SIPI start %rip */
 207 
 208         /* Miscellaneous VM-wide statistics and counters */
 209         struct vm_wide_stats {
 210                 uint64_t sipi_supersede;
 211         } stats;
 212 };
 213 
 214 static int vmm_initialized;
 215 
 216 
 217 static void
 218 nullop_panic(void)
 219 {
 220         panic("null vmm operation call");
 221 }
 222 
 223 /* Do not allow use of an un-set `ops` to do anything but panic */
 224 static struct vmm_ops vmm_ops_null = {
 225         .init           = (vmm_init_func_t)nullop_panic,
 226         .cleanup        = (vmm_cleanup_func_t)nullop_panic,
 227         .resume         = (vmm_resume_func_t)nullop_panic,
 228         .vminit         = (vmi_init_func_t)nullop_panic,
 229         .vmrun          = (vmi_run_func_t)nullop_panic,
 230         .vmcleanup      = (vmi_cleanup_func_t)nullop_panic,
 231         .vmgetreg       = (vmi_get_register_t)nullop_panic,
 232         .vmsetreg       = (vmi_set_register_t)nullop_panic,
 233         .vmgetdesc      = (vmi_get_desc_t)nullop_panic,
 234         .vmsetdesc      = (vmi_set_desc_t)nullop_panic,
 235         .vmgetcap       = (vmi_get_cap_t)nullop_panic,
 236         .vmsetcap       = (vmi_set_cap_t)nullop_panic,
 237         .vmspace_alloc  = (vmi_vmspace_alloc)nullop_panic,
 238         .vmspace_free   = (vmi_vmspace_free)nullop_panic,
 239         .vlapic_init    = (vmi_vlapic_init)nullop_panic,
 240         .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
 241         .vmsavectx      = (vmi_savectx)nullop_panic,
 242         .vmrestorectx   = (vmi_restorectx)nullop_panic,
 243 };
 244 
 245 static struct vmm_ops *ops = &vmm_ops_null;
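
The vmm_ops_null table above guards against calls through an `ops` pointer that has not yet been bound to a real backend (VT-x or SVM): every slot points at a stub that panics immediately, rather than leaving the pointer NULL and risking a wild jump. A minimal stand-alone sketch of the same guard pattern, using illustrative names and signatures (not the kernel's), is:

#include <stdio.h>
#include <stdlib.h>

/*
 * Hypothetical backend interface; the real vmm_ops has many more slots
 * and different signatures.
 */
typedef int (*backend_init_t)(int);
typedef void (*backend_cleanup_t)(void);

struct backend_ops {
	backend_init_t		init;
	backend_cleanup_t	cleanup;
};

/* Each stub matches its slot's signature and fails loudly when called. */
static int
null_init(int arg)
{
	(void) arg;
	fprintf(stderr, "backend init called before registration\n");
	abort();
}

static void
null_cleanup(void)
{
	fprintf(stderr, "backend cleanup called before registration\n");
	abort();
}

static struct backend_ops backend_ops_null = {
	.init		= null_init,
	.cleanup	= null_cleanup,
};

/*
 * All calls go through this pointer; it is only repointed once a real
 * backend registers itself.
 */
static struct backend_ops *ops = &backend_ops_null;

int
main(void)
{
	return ((*ops->init)(0));	/* aborts: no backend registered */
}
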
 246 
 247 #define VMM_INIT(num)                   ((*ops->init)(num))
 248 #define VMM_CLEANUP()                   ((*ops->cleanup)())
 249 #define VMM_RESUME()                    ((*ops->resume)())
 250 
 251 #define VMINIT(vm, pmap)                ((*ops->vminit)(vm, pmap))
 252 #define VMRUN(vmi, vcpu, rip, pmap, evinfo) \
 253         ((*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo))
 254 #define VMCLEANUP(vmi)                  ((*ops->vmcleanup)(vmi))
 255 #define VMSPACE_ALLOC(min, max)         ((*ops->vmspace_alloc)(min, max))
 256 #define VMSPACE_FREE(vmspace)           ((*ops->vmspace_free)(vmspace))
 257 
 258 #define VMGETREG(vmi, vcpu, num, rv)    ((*ops->vmgetreg)(vmi, vcpu, num, rv))
 259 #define VMSETREG(vmi, vcpu, num, val)   ((*ops->vmsetreg)(vmi, vcpu, num, val))
 260 #define VMGETDESC(vmi, vcpu, num, dsc)  ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
 261 #define VMSETDESC(vmi, vcpu, num, dsc)  ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
 262 #define VMGETCAP(vmi, vcpu, num, rv)    ((*ops->vmgetcap)(vmi, vcpu, num, rv))
 263 #define VMSETCAP(vmi, vcpu, num, val)   ((*ops->vmsetcap)(vmi, vcpu, num, val))
 264 #define VLAPIC_INIT(vmi, vcpu)          ((*ops->vlapic_init)(vmi, vcpu))
 265 #define VLAPIC_CLEANUP(vmi, vlapic)     ((*ops->vlapic_cleanup)(vmi, vlapic))
 266 
 267 #define fpu_start_emulating()   load_cr0(rcr0() | CR0_TS)
 268 #define fpu_stop_emulating()    clts()
 269 
 270 SDT_PROVIDER_DEFINE(vmm);
 271 
 272 static MALLOC_DEFINE(M_VM, "vm", "vm");
 273 


 275 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
 276 
 277 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 278     NULL);
 279 
 280 /*
 281  * Halt the guest if all vcpus are executing a HLT instruction with
 282  * interrupts disabled.
 283  */
 284 static int halt_detection_enabled = 1;
 285 
 286 /* IPI vector used for vcpu notifications */
 287 static int vmm_ipinum;
 288 
 289 /* Trap into hypervisor on all guest exceptions and reflect them back */
 290 static int trace_guest_exceptions;
 291 
 292 static void vm_free_memmap(struct vm *vm, int ident);
 293 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 294 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);


 295 
 296 #ifndef __FreeBSD__
 297 static void vm_clear_memseg(struct vm *, int);
 298 
 299 /* Flags for vtc_status */
 300 #define VTCS_FPU_RESTORED       1 /* guest FPU restored, host FPU saved */
 301 #define VTCS_FPU_CTX_CRITICAL   2 /* in ctx where FPU restore cannot be lazy */
 302 
 303 typedef struct vm_thread_ctx {
 304         struct vm       *vtc_vm;
 305         int             vtc_vcpuid;
 306         uint_t          vtc_status;
 307 } vm_thread_ctx_t;
 308 #endif /* __FreeBSD__ */
 309 
 310 #ifdef KTR
 311 static const char *
 312 vcpu_state2str(enum vcpu_state state)
 313 {
 314 


 353 
 354         if (create) {
 355 #ifdef __FreeBSD__
 356                 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
 357                     "initialized", vcpu_id));
 358 #endif
 359                 vcpu_lock_init(vcpu);
 360                 vcpu->state = VCPU_IDLE;
 361                 vcpu->hostcpu = NOCPU;
 362 #ifndef __FreeBSD__
 363                 vcpu->lastloccpu = NOCPU;
 364 #endif
 365                 vcpu->guestfpu = fpu_save_area_alloc();
 366                 vcpu->stats = vmm_stat_alloc();
 367                 vcpu->vie_ctx = vie_alloc();
 368         } else {
 369                 vie_reset(vcpu->vie_ctx);
 370                 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
 371         }
 372 

 373         vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 374         vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
 375         vcpu->runblock = 0;
 376         vcpu->reqidle = 0;
 377         vcpu->exitintinfo = 0;
 378         vcpu->nmi_pending = 0;
 379         vcpu->extint_pending = 0;
 380         vcpu->exception_pending = 0;
 381         vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
 382         fpu_save_area_reset(vcpu->guestfpu);
 383         vmm_stat_init(vcpu->stats);
 384 }
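
The init function above realizes the (a)/(o) vs (i) annotation scheme from the struct vcpu comment: allocations, lock setup, and other one-time work happen only on the create path, while (i) fields are reset on every initialization and reinitialization. A minimal stand-alone illustration of that split, using a hypothetical struct rather than the kernel's vcpu:

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical object with "allocate once" and "reset every init" parts. */
struct widget {
	char	*buf;		/* (a) allocated on create only */
	int	counter;	/* (i) reset on create and reinit */
};

static void
widget_init(struct widget *w, bool create)
{
	if (create) {
		/* (a)/(o): one-time setup */
		w->buf = malloc(64);
	} else {
		/* reinit path: wipe transient state only */
		memset(w->buf, 0, 64);
	}
	/* (i): reset on both paths */
	w->counter = 0;
}

int
main(void)
{
	struct widget w;

	widget_init(&w, true);		/* create: allocate */
	widget_init(&w, false);		/* reinit: reset only */
	free(w.buf);
	return (0);
}
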
 385 
 386 int
 387 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
 388 {
 389 
 390         return (trace_guest_exceptions);
 391 }
 392 
 393 struct vm_exit *
 394 vm_exitinfo(struct vm *vm, int cpuid)
 395 {


1216                 return (true);
1217         default:
1218                 return (false);
1219         }
1220 }
1221 
1222 int
1223 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1224 {
1225 
1226         if (vcpu < 0 || vcpu >= vm->maxcpus)
1227                 return (EINVAL);
1228 
1229         if (!is_segment_register(reg) && !is_descriptor_table(reg))
1230                 return (EINVAL);
1231 
1232         return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1233 }
1234 
1235 int
1236 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1237 {
1238         if (vcpu < 0 || vcpu >= vm->maxcpus)
1239                 return (EINVAL);
1240 
1241         if (!is_segment_register(reg) && !is_descriptor_table(reg))
1242                 return (EINVAL);
1243 
1244         return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1245 }
1246 
1247 static void
1248 restore_guest_fpustate(struct vcpu *vcpu)
1249 {
1250 
1251         /* flush host state to the pcb */
1252         fpuexit(curthread);
1253 
1254         /* restore guest FPU state */
1255         fpu_stop_emulating();
1256         fpurestore(vcpu->guestfpu);
1257 
1258         /* restore guest XCR0 if XSAVE is enabled in the host */
1259         if (rcr4() & CR4_XSAVE)
1260                 load_xcr(0, vcpu->guest_xcr0);
1261 
1262         /*
1263          * The FPU is now "dirty" with the guest's state so turn on emulation
1264          * to trap any access to the FPU by the host.
1265          */
1266         fpu_start_emulating();


1337         /*
1338          * The following state transitions are allowed:
1339          * IDLE -> FROZEN -> IDLE
1340          * FROZEN -> RUNNING -> FROZEN
1341          * FROZEN -> SLEEPING -> FROZEN
1342          */
1343         switch (vcpu->state) {
1344         case VCPU_IDLE:
1345         case VCPU_RUNNING:
1346         case VCPU_SLEEPING:
1347                 error = (newstate != VCPU_FROZEN);
1348                 break;
1349         case VCPU_FROZEN:
1350                 error = (newstate == VCPU_FROZEN);
1351                 break;
1352         default:
1353                 error = 1;
1354                 break;
1355         }
1356 
1357         if (newstate == VCPU_RUNNING) {
1358                 while (vcpu->runblock != 0) {
1359 #ifdef __FreeBSD__
1360                         msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
1361 #else
1362                         cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1363 #endif
1364                 }
1365         }
1366 
1367         if (error)
1368                 return (EBUSY);
1369 
1370         VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1371             vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1372 
1373         vcpu->state = newstate;
1374         if (newstate == VCPU_RUNNING)
1375                 vcpu->hostcpu = curcpu;
1376         else
1377                 vcpu->hostcpu = NOCPU;
1378 
1379         if (newstate == VCPU_IDLE ||
1380             (newstate == VCPU_FROZEN && vcpu->runblock != 0)) {
1381 #ifdef __FreeBSD__
1382                 wakeup(&vcpu->state);
1383 #else
1384                 cv_broadcast(&vcpu->state_cv);
1385 #endif
1386         }
1387 
1388         return (0);
1389 }
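
The switch at the top of this function encodes the allowed transitions listed in the comment: from IDLE, RUNNING, or SLEEPING the only legal next state is FROZEN, and from FROZEN any state other than FROZEN is legal. A stand-alone sketch of that predicate, with a hypothetical mirror of the enum for experimentation outside the kernel:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical mirror of the vcpu_state values used above. */
enum vcpu_state {
	VCPU_IDLE,
	VCPU_FROZEN,
	VCPU_RUNNING,
	VCPU_SLEEPING
};

/*
 * Return true when cur -> next is one of:
 *   IDLE -> FROZEN -> IDLE
 *   FROZEN -> RUNNING -> FROZEN
 *   FROZEN -> SLEEPING -> FROZEN
 */
static bool
transition_ok(enum vcpu_state cur, enum vcpu_state next)
{
	switch (cur) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		return (next == VCPU_FROZEN);
	case VCPU_FROZEN:
		return (next != VCPU_FROZEN);
	default:
		return (false);
	}
}

int
main(void)
{
	printf("IDLE->RUNNING allowed? %d\n",
	    transition_ok(VCPU_IDLE, VCPU_RUNNING));	/* 0 */
	printf("FROZEN->RUNNING allowed? %d\n",
	    transition_ok(VCPU_FROZEN, VCPU_RUNNING));	/* 1 */
	return (0);
}
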
1390 
1391 static void
1392 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1393 {
1394         int error;
1395 
1396         if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1397                 panic("Error %d setting state to %d\n", error, newstate);
1398 }
1399 
1400 static void
1401 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1402 {
1403         int error;
1404 
1405         if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1406                 panic("Error %d setting state to %d", error, newstate);
1407 }
1408 
1409 /*
1410  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1411  */
1412 static int
1413 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1414 {
1415         struct vcpu *vcpu;
1416 #ifdef __FreeBSD__
1417         const char *wmesg;
1418 #else
1419         const char *wmesg __unused;
1420 #endif
1421         int t, vcpu_halted, vm_halted;

1422 
1423         KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1424 
1425         vcpu = &vm->vcpu[vcpuid];
1426         vcpu_halted = 0;
1427         vm_halted = 0;
1428 
1429         vcpu_lock(vcpu);
1430         while (1) {
1431                 /*
1432                  * Do a final check for pending NMI or interrupts before
1433                  * really putting this thread to sleep. Also check for
1434                  * software events that would cause this vcpu to wakeup.
1435                  *
1436                  * These interrupts/events could have happened after the
1437                  * vcpu returned from VMRUN() and before it acquired the
1438                  * vcpu lock above.
1439                  */
1440                 if (vm->suspend || vcpu->reqidle)
1441                         break;
1442                 if (vm_nmi_pending(vm, vcpuid))
1443                         break;


1444                 if (!intr_disabled) {
1445                         if (vm_extint_pending(vm, vcpuid) ||
1446                             vlapic_pending_intr(vcpu->vlapic, NULL)) {
1447                                 break;
1448                         }
1449                 }
1450 
1451                 /* Don't go to sleep if the vcpu thread needs to yield */
1452                 if (vcpu_should_yield(vm, vcpuid))
1453                         break;

1454 
1455                 if (vcpu_debugged(vm, vcpuid))
1456                         break;
1457 
1458                 /*
1459                  * Some Linux guests implement "halt" by having all vcpus
1460                  * execute HLT with interrupts disabled. 'halted_cpus' keeps
1461                  * track of the vcpus that have entered this state. When all
1462                  * vcpus enter the halted state the virtual machine is halted.
1463                  */
1464                 if (intr_disabled) {
1465                         wmesg = "vmhalt";
1466                         VCPU_CTR0(vm, vcpuid, "Halted");
1467                         if (!vcpu_halted && halt_detection_enabled) {
1468                                 vcpu_halted = 1;
1469                                 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1470                         }
1471                         if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1472                                 vm_halted = 1;
1473                                 break;
1474                         }
1475                 } else {
1476                         wmesg = "vmidle";
1477                 }
1478 
1479                 t = ticks;
1480                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1481 #ifdef __FreeBSD__
1482                 /*
1483                  * XXX msleep_spin() cannot be interrupted by signals so
1484                  * wake up periodically to check pending signals.
1485                  */
1486                 msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
1487 #else
1488                 /*
1489                  * Fortunately, cv_wait_sig can be interrupted by signals, so
1490                  * there is no need to periodically wake up.
1491                  */
1492                 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1493 #endif
1494                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1495                 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1496         }
1497 
1498         if (vcpu_halted)
1499                 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1500 
1501         vcpu_unlock(vcpu);
1502 
1503         if (vm_halted)
1504                 vm_suspend(vm, VM_SUSPEND_HALT);
1505 
1506         return (0);
1507 }
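
vm_handle_hlt() also implements the whole-VM halt heuristic described in its comment: a vcpu executing HLT with interrupts disabled is added to halted_cpus, and once halted_cpus covers every CPU in active_cpus the VM is suspended with VM_SUSPEND_HALT. A small user-space sketch of that set bookkeeping, using a plain bitmask in place of cpuset_t (illustrative only, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Bitmask stand-in for cpuset_t; bit N set means vcpu N is a member. */
typedef uint64_t cpumask_t;

/*
 * Record that 'vcpuid' halted with interrupts off; return true when the
 * whole VM should be considered halted (every active vcpu is halted).
 */
static bool
note_halted(cpumask_t active, cpumask_t *halted, int vcpuid)
{
	*halted |= (UINT64_C(1) << vcpuid);
	return ((*halted & active) == active);
}

int
main(void)
{
	cpumask_t active = 0x3;		/* vcpus 0 and 1 are active */
	cpumask_t halted = 0;

	printf("vm halted after vcpu0? %d\n", note_halted(active, &halted, 0));
	printf("vm halted after vcpu1? %d\n", note_halted(active, &halted, 1));
	return (0);
}
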
1508 
1509 static int
1510 vm_handle_paging(struct vm *vm, int vcpuid)
1511 {
1512         int rv, ftype;
1513         struct vm_map *map;
1514         struct vcpu *vcpu;
1515         struct vm_exit *vme;
1516 
1517         vcpu = &vm->vcpu[vcpuid];
1518         vme = &vcpu->exitinfo;
1519 
1520         KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1521             __func__, vme->inst_length));
1522 
1523         ftype = vme->u.paging.fault_type;
1524         KASSERT(ftype == VM_PROT_READ ||
1525             ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1526             ("vm_handle_paging: invalid fault_type %d", ftype));


1815                 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1816                         vcpu_notify_event(vm, i);
1817                 }
1818         }
1819 
1820         return (-1);
1821 }
1822 
1823 static int
1824 vm_handle_reqidle(struct vm *vm, int vcpuid)
1825 {
1826         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1827 
1828         vcpu_lock(vcpu);
1829         KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1830         vcpu->reqidle = 0;
1831         vcpu_unlock(vcpu);
1832         return (-1);
1833 }
1834 
1835 #ifndef __FreeBSD__
1836 static int
1837 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1838 {
1839         struct vcpu *cpu = &vm->vcpu[vcpuid];
1840         const uint32_t code = vme->u.msr.code;
1841         const uint64_t val = vme->u.msr.wval;
1842 
1843         switch (code) {
1844         case MSR_TSC:
1845                 cpu->tsc_offset = val - rdtsc();
1846                 return (0);
1847         }
1848 
1849         return (-1);
1850 }
1851 #endif /* __FreeBSD__ */
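
The WRMSR handler treats a guest write to MSR_TSC as a request to make the guest-visible TSC equal to the written value, so it stores the difference against the current host TSC; the guest value can then be reconstructed as host TSC plus that offset (which is what vcpu_tsc_offset() exposes further down). A quick arithmetic sketch of that bookkeeping, with a hypothetical helper standing in for the kernel code:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Offset such that guest_tsc == host_tsc + tsc_offset. */
static uint64_t tsc_offset;

/* Guest wrote 'wval' to MSR_TSC while the host TSC read 'host_tsc'. */
static void
wrmsr_tsc(uint64_t wval, uint64_t host_tsc)
{
	/* Unsigned wrap-around gives the right answer even if wval < host_tsc. */
	tsc_offset = wval - host_tsc;
}

/* What the guest would observe for a given host TSC reading. */
static uint64_t
guest_tsc(uint64_t host_tsc)
{
	return (host_tsc + tsc_offset);
}

int
main(void)
{
	wrmsr_tsc(0, 123456789);	/* guest zeroes its TSC */
	printf("guest TSC now: %" PRIu64 "\n", guest_tsc(123456789));
	printf("guest TSC 1000 host ticks later: %" PRIu64 "\n",
	    guest_tsc(123456789 + 1000));
	return (0);
}
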
1852 
1853 void
1854 vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip)
1855 {
1856         if (vm->sipi_req) {
1857                 /* This should never occur if userspace is doing its job. */
1858                 vm->stats.sipi_supersede++;
1859         }
1860         vm->sipi_req = true;
1861         vm->sipi_req_vcpu = req_vcpuid;
1862         vm->sipi_req_rip = req_rip;
1863 }
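
vm_req_spinup_ap() latches at most one pending SIPI request in VM-wide state (sipi_req, sipi_req_vcpu, sipi_req_rip); if userspace has not yet consumed the previous request via the SPINUP_AP exit raised in vm_loop_checks(), the new request overwrites it and sipi_supersede counts the loss. This single-slot design is the limitation the per-vcpu run_state in the new version addresses. A sketch of the latch and its consumer, using stand-alone illustrative types rather than the kernel's:

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Single-slot SIPI latch mirroring the VM-wide fields above. */
struct sipi_latch {
	bool		req;		/* a request is pending */
	int		vcpu;		/* destination vcpu */
	uint64_t	rip;		/* start %rip for that vcpu */
	uint64_t	superseded;	/* requests lost to overwrites */
};

static void
sipi_post(struct sipi_latch *l, int vcpu, uint64_t rip)
{
	if (l->req)
		l->superseded++;	/* previous request never consumed */
	l->req = true;
	l->vcpu = vcpu;
	l->rip = rip;
}

static bool
sipi_consume(struct sipi_latch *l, int *vcpu, uint64_t *rip)
{
	if (!l->req)
		return (false);
	*vcpu = l->vcpu;
	*rip = l->rip;
	l->req = false;
	return (true);
}

int
main(void)
{
	struct sipi_latch l = { 0 };
	int vcpu;
	uint64_t rip;

	sipi_post(&l, 1, 0x9f000);
	sipi_post(&l, 2, 0x9f000);	/* silently replaces vcpu 1's request */
	if (sipi_consume(&l, &vcpu, &rip)) {
		printf("spin up vcpu %d at %#" PRIx64 " (%" PRIu64
		    " superseded)\n", vcpu, rip, l.superseded);
	}
	return (0);
}
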
1864 
1865 int
1866 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1867 {
1868         int i;
1869 
1870         if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1871                 return (EINVAL);
1872 
1873         if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1874                 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1875                     vm->suspend, how);
1876                 return (EALREADY);
1877         }
1878 
1879         VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1880 
1881         /*
1882          * Notify all active vcpus that they are now suspended.
1883          */
1884         for (i = 0; i < vm->maxcpus; i++) {
1885                 if (CPU_ISSET(i, &vm->active_cpus))
1886                         vcpu_notify_event(vm, i);
1887         }
1888 
1889         return (0);
1890 }
1891 
1892 void
1893 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1894 {
1895         struct vm_exit *vmexit;
1896 
1897         KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1898             ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1899 
1900         vmexit = vm_exitinfo(vm, vcpuid);
1901         vmexit->rip = rip;
1902         vmexit->inst_length = 0;
1903         vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1904         vmexit->u.suspended.how = vm->suspend;
1905 }
1906 
1907 void
1908 vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
1909 {
1910         struct vm_exit *vmexit;
1911 
1912         vmexit = vm_exitinfo(vm, vcpuid);
1913         vmexit->rip = rip;
1914         vmexit->inst_length = 0;
1915         vmexit->exitcode = VM_EXITCODE_DEBUG;
1916 }
1917 
1918 void
1919 vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip)
1920 {
1921         struct vm_exit *vmexit;
1922 
1923         vmexit = vm_exitinfo(vm, vcpuid);
1924         vmexit->rip = rip;
1925         vmexit->inst_length = 0;
1926         vmexit->exitcode = VM_EXITCODE_RUNBLOCK;
1927         vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1);
1928 }
1929 
1930 void
1931 vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
1932 {
1933         struct vm_exit *vmexit;
1934 
1935         vmexit = vm_exitinfo(vm, vcpuid);
1936         vmexit->rip = rip;
1937         vmexit->inst_length = 0;
1938         vmexit->exitcode = VM_EXITCODE_REQIDLE;
1939         vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
1940 }
1941 
1942 void
1943 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
1944 {
1945         struct vm_exit *vmexit;
1946 
1947         vmexit = vm_exitinfo(vm, vcpuid);
1948         vmexit->rip = rip;
1949         vmexit->inst_length = 0;
1950         vmexit->exitcode = VM_EXITCODE_BOGUS;
1951         vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
1952 }
1953 
1954 #ifndef __FreeBSD__
1955 /*
1956  * Some vmm resources, such as the lapic, may have CPU-specific resources
1957  * allocated to them which would benefit from migration onto the host CPU which
1958  * is processing the vcpu state.
1959  */
1960 static void
1961 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
1962 {
1963         /*
1964          * Localizing cyclic resources requires acquisition of cpu_lock, and
1965          * doing so with kpreempt disabled is a recipe for deadlock disaster.
1966          */
1967         VERIFY(curthread->t_preempt == 0);
1968 
1969         /*
1970          * Do not bother with localization if this vCPU is about to return to
1971          * the host CPU it was last localized to.
1972          */
1973         if (vcpu->lastloccpu == curcpu)


 2055 #endif /* __FreeBSD__ */
2056 
2057 static int
2058 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2059     struct vm_exit *vme)
2060 {
2061         struct vcpu *vcpu;
2062         struct vie *vie;
2063         int err;
2064 
2065         vcpu = &vm->vcpu[vcpuid];
2066         vie = vcpu->vie_ctx;
2067         err = 0;
2068 
2069         switch (entry->cmd) {
2070         case VEC_DEFAULT:
2071                 return (0);
2072         case VEC_DISCARD_INSTR:
2073                 vie_reset(vie);
2074                 return (0);
2075         case VEC_COMPLETE_MMIO:
2076                 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2077                 if (err == 0) {
2078                         err = vie_emulate_mmio(vie, vm, vcpuid);
2079                         if (err == 0) {
2080                                 vie_advance_pc(vie, &vcpu->nextrip);
2081                         } else if (err < 0) {
2082                                 vie_exitinfo(vie, vme);
2083                         } else if (err == EAGAIN) {
2084                                 /*
2085                                  * Clear the instruction emulation state in
2086                                  * order to re-enter VM context and continue
2087                                  * this 'rep <instruction>'
2088                                  */
2089                                 vie_reset(vie);
2090                                 err = 0;
2091                         }
2092                 }
2093                 break;
2094         case VEC_COMPLETE_INOUT:
2095                 err = vie_fulfill_inout(vie, &entry->u.inout);
2096                 if (err == 0) {
2097                         err = vie_emulate_inout(vie, vm, vcpuid);
2098                         if (err == 0) {
2099                                 vie_advance_pc(vie, &vcpu->nextrip);
2100                         } else if (err < 0) {
2101                                 vie_exitinfo(vie, vme);
2102                         } else if (err == EAGAIN) {
2103                                 /*
2104                                  * Clear the instruction emulation state in
2105                                  * order to re-enter VM context and continue
2106                                  * this 'rep ins/outs'
2107                                  */
2108                                 vie_reset(vie);
2109                                 err = 0;
2110                         }
2111                 }
2112                 break;
2113         default:
2114                 return (EINVAL);
2115         }
2116         return (err);
2117 }
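
The COMPLETE_MMIO and COMPLETE_INOUT cases share a three-way result convention: 0 means emulation finished and %rip advances, a negative value means another userspace exit is needed (the pending state is surfaced via vie_exitinfo()), and EAGAIN means one iteration of a 'rep' instruction completed, so the emulation state is cleared and the guest re-enters to execute the next iteration. A condensed sketch of that flow, with a hypothetical emulate() stand-in rather than the kernel vie_* functions:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Result convention (assumed stand-in for the vie_emulate_* return values):
 *   0       emulation finished, advance %rip and re-enter the guest
 *   EAGAIN  a 'rep' iteration completed, reset state and re-enter
 *   < 0     more userspace help is needed, surface an exit
 */
static int
complete_emulation(int (*emulate)(void), uint64_t *nextrip, uint64_t len)
{
	int err = emulate();

	if (err == 0) {
		*nextrip += len;	/* instruction fully handled */
	} else if (err == EAGAIN) {
		/* reset emulation state; guest re-executes the rep insn */
		err = 0;
	} else if (err < 0) {
		/* leave state pending; caller reports an exit to userspace */
	}
	return (err);
}

static int emulate_done(void) { return (0); }

int
main(void)
{
	uint64_t rip = 0x1000;

	(void) complete_emulation(emulate_done, &rip, 3);
	printf("next rip: %#lx\n", (unsigned long)rip);
	return (0);
}
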
2118 
2119 static int
2120 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2121 {
2122         struct vie *vie;
2123 
2124         vie = vm->vcpu[vcpuid].vie_ctx;
2125 
2126         if (vie_pending(vie)) {
2127                 /*
2128                  * Userspace has not fulfilled the pending needs of the
2129                  * instruction emulation, so bail back out.
2130                  */
2131                 vie_exitinfo(vie, vme);
2132                 return (-1);
2133         }
2134 
2135         if (vcpuid == 0 && vm->sipi_req) {
2136                 /* The boot vCPU has sent a SIPI to one of the other CPUs */
2137                 vme->exitcode = VM_EXITCODE_SPINUP_AP;
2138                 vme->u.spinup_ap.vcpu = vm->sipi_req_vcpu;
2139                 vme->u.spinup_ap.rip = vm->sipi_req_rip;
2140 
2141                 vm->sipi_req = false;
2142                 vm->sipi_req_vcpu = 0;
2143                 vm->sipi_req_rip = 0;
2144                 return (-1);
2145         }
2146 
2147         return (0);
2148 }
2149 
2150 int
2151 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2152 {
2153         struct vm_eventinfo evinfo;
2154         int error;
2155         struct vcpu *vcpu;
2156 #ifdef  __FreeBSD__
2157         struct pcb *pcb;
2158 #endif
2159         uint64_t tscval;
2160         struct vm_exit *vme;
2161         bool intr_disabled;
2162         pmap_t pmap;
2163 #ifndef __FreeBSD__
2164         vm_thread_ctx_t vtc;
2165         int affinity_type = CPU_CURRENT;
2166 #endif
2167 
2168         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2169                 return (EINVAL);
2170 
2171         if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2172                 return (EINVAL);
2173 
2174         if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2175                 return (EINVAL);
2176 
2177         pmap = vmspace_pmap(vm->vmspace);
2178         vcpu = &vm->vcpu[vcpuid];
2179         vme = &vcpu->exitinfo;
2180         evinfo.rptr = &vcpu->runblock;
2181         evinfo.sptr = &vm->suspend;
2182         evinfo.iptr = &vcpu->reqidle;
2183 
2184 #ifndef __FreeBSD__
2185         vtc.vtc_vm = vm;
2186         vtc.vtc_vcpuid = vcpuid;
2187         vtc.vtc_status = 0;
2188 
2189         installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2190             NULL, vmm_freectx);
2191 #endif
2192 
2193         error = vm_entry_actions(vm, vcpuid, entry, vme);
2194         if (error != 0) {
2195                 goto exit;
2196         }
2197 
2198 restart:
2199         error = vm_loop_checks(vm, vcpuid, vme);
2200         if (error != 0) {
2201                 goto exit;
2202         }


2225 
2226 #ifdef  __FreeBSD__
2227         pcb = PCPU_GET(curpcb);
2228         set_pcb_flags(pcb, PCB_FULL_IRET);
2229 #else
2230         /* Force a trip through update_sregs to reload %fs/%gs and friends */
2231         PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2232 #endif
2233 
2234 #ifdef  __FreeBSD__
2235         restore_guest_fpustate(vcpu);
2236 #else
2237         if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2238                 restore_guest_fpustate(vcpu);
2239                 vtc.vtc_status |= VTCS_FPU_RESTORED;
2240         }
2241         vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2242 #endif
2243 
2244         vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2245         error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
2246         vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2247 
2248 #ifdef  __FreeBSD__
2249         save_guest_fpustate(vcpu);
2250 #else
2251         vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2252 #endif
2253 
2254 #ifndef __FreeBSD__
2255         /*
2256          * Once clear of the delicate contexts comprising the VM_RUN handler,
2257          * thread CPU affinity can be loosened while other processing occurs.
2258          */
2259         thread_affinity_clear(curthread);
2260 #endif
2261 
2262         vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
2263 
2264         critical_exit();
2265 
2266         if (error != 0) {
2267                 /* Communicate out any error from VMRUN() above */
2268                 goto exit;
2269         }
2270 
2271         vcpu->nextrip = vme->rip + vme->inst_length;
2272         switch (vme->exitcode) {
2273         case VM_EXITCODE_REQIDLE:
2274                 error = vm_handle_reqidle(vm, vcpuid);
2275                 break;



2276         case VM_EXITCODE_SUSPENDED:
2277                 error = vm_handle_suspend(vm, vcpuid);
2278                 break;
2279         case VM_EXITCODE_IOAPIC_EOI:
2280                 vioapic_process_eoi(vm, vcpuid,
2281                     vme->u.ioapic_eoi.vector);
2282                 break;
2283         case VM_EXITCODE_RUNBLOCK:
2284                 break;
2285         case VM_EXITCODE_HLT:
2286                 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2287                 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2288                 break;
2289         case VM_EXITCODE_PAGING:
2290                 error = vm_handle_paging(vm, vcpuid);
2291                 break;
2292         case VM_EXITCODE_MMIO_EMUL:
2293                 error = vm_handle_mmio_emul(vm, vcpuid);
2294                 break;
2295         case VM_EXITCODE_INOUT:
2296                 error = vm_handle_inout(vm, vcpuid, vme);
2297                 break;
2298         case VM_EXITCODE_MONITOR:
2299         case VM_EXITCODE_MWAIT:
2300         case VM_EXITCODE_VMINSN:
2301                 vm_inject_ud(vm, vcpuid);
2302                 break;
2303 #ifndef __FreeBSD__
2304         case VM_EXITCODE_WRMSR:


2775 }
2776 
2777 void
2778 vm_extint_clear(struct vm *vm, int vcpuid)
2779 {
2780         struct vcpu *vcpu;
2781 
2782         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 2783                 panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
2784 
2785         vcpu = &vm->vcpu[vcpuid];
2786 
2787         if (vcpu->extint_pending == 0)
2788                 panic("vm_extint_clear: inconsistent extint_pending state");
2789 
2790         vcpu->extint_pending = 0;
2791         vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2792 }
2793 
2794 int
2795 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2796 {
2797         if (vcpu < 0 || vcpu >= vm->maxcpus)
2798                 return (EINVAL);
2799 
2800         if (type < 0 || type >= VM_CAP_MAX)
2801                 return (EINVAL);
2802 
2803         return (VMGETCAP(vm->cookie, vcpu, type, retval));
2804 }
2805 
2806 int
2807 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
2808 {
2809         if (vcpu < 0 || vcpu >= vm->maxcpus)
2810                 return (EINVAL);
2811 
2812         if (type < 0 || type >= VM_CAP_MAX)
2813                 return (EINVAL);
2814 


2877         }
2878         return (found);
2879 }
2880 #endif
2881 
2882 void *
2883 vm_iommu_domain(struct vm *vm)
2884 {
2885 
2886         return (vm->iommu);
2887 }
2888 
2889 int
2890 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
2891     bool from_idle)
2892 {
2893         int error;
2894         struct vcpu *vcpu;
2895 
2896         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 2897                 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
2898 
2899         vcpu = &vm->vcpu[vcpuid];
2900 
2901         vcpu_lock(vcpu);
2902         error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
2903         vcpu_unlock(vcpu);
2904 
2905         return (error);
2906 }
2907 
2908 enum vcpu_state
2909 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
2910 {
2911         struct vcpu *vcpu;
2912         enum vcpu_state state;
2913 
2914         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 2915                 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
2916 
2917         vcpu = &vm->vcpu[vcpuid];
2918 
2919         vcpu_lock(vcpu);
2920         state = vcpu->state;
2921         if (hostcpu != NULL)
2922                 *hostcpu = vcpu->hostcpu;
2923         vcpu_unlock(vcpu);
2924 
2925         return (state);
2926 }
2927 
2928 void
2929 vcpu_block_run(struct vm *vm, int vcpuid)
2930 {
2931         struct vcpu *vcpu;
2932 
2933         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2934                 panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
2935 
2936         vcpu = &vm->vcpu[vcpuid];
2937 
2938         vcpu_lock(vcpu);
2939         vcpu->runblock++;
2940         if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) {
2941                 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2942         }
2943         while (vcpu->state == VCPU_RUNNING) {
2944 #ifdef __FreeBSD__
2945                 msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
2946 #else
2947                 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
2948 #endif
2949         }
2950         vcpu_unlock(vcpu);
2951 }
2952 
2953 void
2954 vcpu_unblock_run(struct vm *vm, int vcpuid)
2955 {
2956         struct vcpu *vcpu;
2957 
2958         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 2959                 panic("vcpu_unblock_run: invalid vcpuid %d", vcpuid);
2960 
2961         vcpu = &vm->vcpu[vcpuid];
2962 
2963         vcpu_lock(vcpu);
2964         KASSERT(vcpu->runblock != 0, ("expected non-zero runblock"));
2965         vcpu->runblock--;
2966         if (vcpu->runblock == 0) {
2967 #ifdef __FreeBSD__
2968                 wakeup(&vcpu->state);
2969 #else
2970                 cv_broadcast(&vcpu->state_cv);
2971 #endif
2972         }
2973         vcpu_unlock(vcpu);
2974 }
2975 
2976 #ifndef __FreeBSD__
2977 uint64_t
2978 vcpu_tsc_offset(struct vm *vm, int vcpuid)
2979 {
2980         return (vm->vcpu[vcpuid].tsc_offset);
2981 }
2982 #endif /* __FreeBSD__ */
2983 
2984 int
2985 vm_activate_cpu(struct vm *vm, int vcpuid)
2986 {
2987 
2988         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2989                 return (EINVAL);
2990 
2991         if (CPU_ISSET(vcpuid, &vm->active_cpus))
2992                 return (EBUSY);
2993 
2994         VCPU_CTR0(vm, vcpuid, "activated");
2995         CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);


3021 }
3022 
3023 int
3024 vm_resume_cpu(struct vm *vm, int vcpuid)
3025 {
3026 
3027         if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3028                 return (EINVAL);
3029 
3030         if (vcpuid == -1) {
3031                 CPU_ZERO(&vm->debug_cpus);
3032         } else {
3033                 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3034                         return (EINVAL);
3035 
3036                 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3037         }
3038         return (0);
3039 }
3040 
3041 int
3042 vcpu_debugged(struct vm *vm, int vcpuid)

3043 {



3044 
3045         return (CPU_ISSET(vcpuid, &vm->debug_cpus));
3046 }
3047 
3048 cpuset_t
3049 vm_active_cpus(struct vm *vm)
3050 {
3051 
3052         return (vm->active_cpus);
3053 }
3054 
3055 cpuset_t
3056 vm_debug_cpus(struct vm *vm)
3057 {
3058 
3059         return (vm->debug_cpus);
3060 }
3061 
3062 cpuset_t
3063 vm_suspended_cpus(struct vm *vm)
3064 {
3065 
3066         return (vm->suspended_cpus);
3067 }




  92 #include "vioapic.h"
  93 #include "vlapic.h"
  94 #include "vpmtmr.h"
  95 #include "vrtc.h"
  96 #include "vmm_stat.h"
  97 #include "vmm_lapic.h"
  98 
  99 #include "io/ppt.h"
 100 #include "io/iommu.h"
 101 
 102 struct vlapic;
 103 
 104 /*
 105  * Initialization:
 106  * (a) allocated when vcpu is created
 107  * (i) initialized when vcpu is created and when it is reinitialized
 108  * (o) initialized the first time the vcpu is created
 109  * (x) initialized before use
 110  */
 111 struct vcpu {
 112         /* (o) protects state, run_state, hostcpu, sipi_vector */
 113         struct mtx      mtx;
 114 
 115         enum vcpu_state state;          /* (o) vcpu state */
 116         enum vcpu_run_state run_state;  /* (i) vcpu init/sipi/run state */
 117         kcondvar_t      vcpu_cv;        /* (o) cpu waiter cv */
 118         kcondvar_t      state_cv;       /* (o) IDLE-transition cv */

 119         int             hostcpu;        /* (o) vcpu's current host cpu */

 120         int             lastloccpu;     /* (o) last host cpu localized to */


 121         int             reqidle;        /* (i) request vcpu to idle */
 122         struct vlapic   *vlapic;        /* (i) APIC device model */
 123         enum x2apic_state x2apic_state; /* (i) APIC mode */
 124         uint64_t        exitintinfo;    /* (i) events pending at VM exit */
 125         int             nmi_pending;    /* (i) NMI pending */
 126         int             extint_pending; /* (i) INTR pending */
 127         int     exception_pending;      /* (i) exception pending */
 128         int     exc_vector;             /* (x) exception collateral */
 129         int     exc_errcode_valid;
 130         uint32_t exc_errcode;
 131         uint8_t         sipi_vector;    /* (i) SIPI vector */
 132         struct savefpu  *guestfpu;      /* (a,i) guest fpu state */
 133         uint64_t        guest_xcr0;     /* (i) guest %xcr0 register */
 134         void            *stats;         /* (a,i) statistics */
 135         struct vm_exit  exitinfo;       /* (x) exit reason and collateral */
 136         uint64_t        nextrip;        /* (x) next instruction to execute */
 137         struct vie      *vie_ctx;       /* (x) instruction emulation context */
 138 #ifndef __FreeBSD__
 139         uint64_t        tsc_offset;     /* (x) offset from host TSC */
 140 #endif
 141 };
 142 
 143 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 144 #define vcpu_lock_init(v)       mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 145 #define vcpu_lock(v)            mtx_lock_spin(&((v)->mtx))
 146 #define vcpu_unlock(v)          mtx_unlock_spin(&((v)->mtx))
 147 #define vcpu_assert_locked(v)   mtx_assert(&((v)->mtx), MA_OWNED)
 148 
 149 struct mem_seg {
 150         size_t  len;
 151         bool    sysmem;


 182         struct vatpit   *vatpit;                /* (i) virtual atpit */
 183         struct vpmtmr   *vpmtmr;                /* (i) virtual ACPI PM timer */
 184         struct vrtc     *vrtc;                  /* (o) virtual RTC */
 185         volatile cpuset_t active_cpus;          /* (i) active vcpus */
 186         volatile cpuset_t debug_cpus;           /* (i) vcpus stopped for dbg */
 187         int             suspend;                /* (i) stop VM execution */
 188         volatile cpuset_t suspended_cpus;       /* (i) suspended vcpus */
 189         volatile cpuset_t halted_cpus;          /* (x) cpus in a hard halt */
 190         struct mem_map  mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
 191         struct mem_seg  mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
 192         struct vmspace  *vmspace;               /* (o) guest's address space */
 193         char            name[VM_MAX_NAMELEN];   /* (o) virtual machine name */
 194         struct vcpu     vcpu[VM_MAXCPU];        /* (i) guest vcpus */
 195         /* The following describe the vm cpu topology */
 196         uint16_t        sockets;                /* (o) num of sockets */
 197         uint16_t        cores;                  /* (o) num of cores/socket */
 198         uint16_t        threads;                /* (o) num of threads/core */
 199         uint16_t        maxcpus;                /* (o) max pluggable cpus */
 200 
 201         struct ioport_config ioports;           /* (o) ioport handling */
 202 };
 203 
 204 static int vmm_initialized;
 205 
 206 
 207 static void
 208 nullop_panic(void)
 209 {
 210         panic("null vmm operation call");
 211 }
 212 
 213 /* Do not allow use of an un-set `ops` to do anything but panic */
 214 static struct vmm_ops vmm_ops_null = {
 215         .init           = (vmm_init_func_t)nullop_panic,
 216         .cleanup        = (vmm_cleanup_func_t)nullop_panic,
 217         .resume         = (vmm_resume_func_t)nullop_panic,
 218         .vminit         = (vmi_init_func_t)nullop_panic,
 219         .vmrun          = (vmi_run_func_t)nullop_panic,
 220         .vmcleanup      = (vmi_cleanup_func_t)nullop_panic,
 221         .vmgetreg       = (vmi_get_register_t)nullop_panic,
 222         .vmsetreg       = (vmi_set_register_t)nullop_panic,
 223         .vmgetdesc      = (vmi_get_desc_t)nullop_panic,
 224         .vmsetdesc      = (vmi_set_desc_t)nullop_panic,
 225         .vmgetcap       = (vmi_get_cap_t)nullop_panic,
 226         .vmsetcap       = (vmi_set_cap_t)nullop_panic,
 227         .vmspace_alloc  = (vmi_vmspace_alloc)nullop_panic,
 228         .vmspace_free   = (vmi_vmspace_free)nullop_panic,
 229         .vlapic_init    = (vmi_vlapic_init)nullop_panic,
 230         .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
 231         .vmsavectx      = (vmi_savectx)nullop_panic,
 232         .vmrestorectx   = (vmi_restorectx)nullop_panic,
 233 };
 234 
 235 static struct vmm_ops *ops = &vmm_ops_null;
 236 
 237 #define VMM_INIT(num)                   ((*ops->init)(num))
 238 #define VMM_CLEANUP()                   ((*ops->cleanup)())
 239 #define VMM_RESUME()                    ((*ops->resume)())
 240 
 241 #define VMINIT(vm, pmap)                ((*ops->vminit)(vm, pmap))
 242 #define VMRUN(vmi, vcpu, rip, pmap) \
 243         ((*ops->vmrun)(vmi, vcpu, rip, pmap))
 244 #define VMCLEANUP(vmi)                  ((*ops->vmcleanup)(vmi))
 245 #define VMSPACE_ALLOC(min, max)         ((*ops->vmspace_alloc)(min, max))
 246 #define VMSPACE_FREE(vmspace)           ((*ops->vmspace_free)(vmspace))
 247 
 248 #define VMGETREG(vmi, vcpu, num, rv)    ((*ops->vmgetreg)(vmi, vcpu, num, rv))
 249 #define VMSETREG(vmi, vcpu, num, val)   ((*ops->vmsetreg)(vmi, vcpu, num, val))
 250 #define VMGETDESC(vmi, vcpu, num, dsc)  ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
 251 #define VMSETDESC(vmi, vcpu, num, dsc)  ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
 252 #define VMGETCAP(vmi, vcpu, num, rv)    ((*ops->vmgetcap)(vmi, vcpu, num, rv))
 253 #define VMSETCAP(vmi, vcpu, num, val)   ((*ops->vmsetcap)(vmi, vcpu, num, val))
 254 #define VLAPIC_INIT(vmi, vcpu)          ((*ops->vlapic_init)(vmi, vcpu))
 255 #define VLAPIC_CLEANUP(vmi, vlapic)     ((*ops->vlapic_cleanup)(vmi, vlapic))
 256 
 257 #define fpu_start_emulating()   load_cr0(rcr0() | CR0_TS)
 258 #define fpu_stop_emulating()    clts()
 259 
 260 SDT_PROVIDER_DEFINE(vmm);
 261 
 262 static MALLOC_DEFINE(M_VM, "vm", "vm");
 263 


 265 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
 266 
 267 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 268     NULL);
 269 
 270 /*
 271  * Halt the guest if all vcpus are executing a HLT instruction with
 272  * interrupts disabled.
 273  */
 274 static int halt_detection_enabled = 1;
 275 
 276 /* IPI vector used for vcpu notifications */
 277 static int vmm_ipinum;
 278 
 279 /* Trap into hypervisor on all guest exceptions and reflect them back */
 280 static int trace_guest_exceptions;
 281 
 282 static void vm_free_memmap(struct vm *vm, int ident);
 283 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 284 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
 285 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
 286 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
 287 
 288 #ifndef __FreeBSD__
 289 static void vm_clear_memseg(struct vm *, int);
 290 
 291 /* Flags for vtc_status */
 292 #define VTCS_FPU_RESTORED       1 /* guest FPU restored, host FPU saved */
 293 #define VTCS_FPU_CTX_CRITICAL   2 /* in ctx where FPU restore cannot be lazy */
 294 
 295 typedef struct vm_thread_ctx {
 296         struct vm       *vtc_vm;
 297         int             vtc_vcpuid;
 298         uint_t          vtc_status;
 299 } vm_thread_ctx_t;
 300 #endif /* __FreeBSD__ */
 301 
 302 #ifdef KTR
 303 static const char *
 304 vcpu_state2str(enum vcpu_state state)
 305 {
 306 


 345 
 346         if (create) {
 347 #ifdef __FreeBSD__
 348                 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
 349                     "initialized", vcpu_id));
 350 #endif
 351                 vcpu_lock_init(vcpu);
 352                 vcpu->state = VCPU_IDLE;
 353                 vcpu->hostcpu = NOCPU;
 354 #ifndef __FreeBSD__
 355                 vcpu->lastloccpu = NOCPU;
 356 #endif
 357                 vcpu->guestfpu = fpu_save_area_alloc();
 358                 vcpu->stats = vmm_stat_alloc();
 359                 vcpu->vie_ctx = vie_alloc();
 360         } else {
 361                 vie_reset(vcpu->vie_ctx);
 362                 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
 363         }
 364 
 365         vcpu->run_state = VRS_HALT;
 366         vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 367         vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);

 368         vcpu->reqidle = 0;
 369         vcpu->exitintinfo = 0;
 370         vcpu->nmi_pending = 0;
 371         vcpu->extint_pending = 0;
 372         vcpu->exception_pending = 0;
 373         vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
 374         fpu_save_area_reset(vcpu->guestfpu);
 375         vmm_stat_init(vcpu->stats);
 376 }
 377 
 378 int
 379 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
 380 {
 381 
 382         return (trace_guest_exceptions);
 383 }
 384 
 385 struct vm_exit *
 386 vm_exitinfo(struct vm *vm, int cpuid)
 387 {


1208                 return (true);
1209         default:
1210                 return (false);
1211         }
1212 }
1213 
1214 int
1215 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1216 {
1217 
1218         if (vcpu < 0 || vcpu >= vm->maxcpus)
1219                 return (EINVAL);
1220 
1221         if (!is_segment_register(reg) && !is_descriptor_table(reg))
1222                 return (EINVAL);
1223 
1224         return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1225 }
1226 
1227 int
1228 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1229 {
1230         if (vcpu < 0 || vcpu >= vm->maxcpus)
1231                 return (EINVAL);
1232 
1233         if (!is_segment_register(reg) && !is_descriptor_table(reg))
1234                 return (EINVAL);
1235 
1236         return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1237 }
1238 
1239 int
1240 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1241 {
1242         struct vcpu *vcpu;
1243 
1244         if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1245                 return (EINVAL);
1246         }
1247 
1248         vcpu = &vm->vcpu[vcpuid];
1249 
1250         vcpu_lock(vcpu);
1251         *state = vcpu->run_state;
1252         *sipi_vec = vcpu->sipi_vector;
1253         vcpu_unlock(vcpu);
1254 
1255         return (0);
1256 }
1257 
1258 int
1259 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1260 {
1261         struct vcpu *vcpu;
1262 
1263         if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1264                 return (EINVAL);
1265         }
1266         if (!VRS_IS_VALID(state)) {
1267                 return (EINVAL);
1268         }
1269 
1270         vcpu = &vm->vcpu[vcpuid];
1271 
1272         vcpu_lock(vcpu);
1273         vcpu->run_state = state;
1274         vcpu->sipi_vector = sipi_vec;
1275         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1276         vcpu_unlock(vcpu);
1277 
1278         return (0);
1279 }
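
The new get/set run-state interface exposes a per-vcpu run_state word plus a SIPI vector, replacing the old VM-wide sipi_req latch. The VRS_* values (VRS_HALT, VRS_INIT, VRS_RUN, and the pending VRS_PEND_INIT/VRS_PEND_SIPI bits) are defined elsewhere in this change; the constants below are illustrative assumptions, not the committed definitions, but they show roughly what a validity check like VRS_IS_VALID() has to enforce:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the VRS_* run-state values. */
#define	VRS_HALT	0x0	/* waiting for INIT */
#define	VRS_INIT	0x1	/* INIT received, waiting for SIPI */
#define	VRS_RUN		0x2	/* executing guest code */
#define	VRS_PEND_INIT	0x4	/* INIT latched, not yet processed */
#define	VRS_PEND_SIPI	0x8	/* SIPI latched, not yet processed */

/* Approximation of what VRS_IS_VALID() must enforce. */
static bool
run_state_valid(uint32_t state)
{
	const uint32_t allowed =
	    VRS_INIT | VRS_RUN | VRS_PEND_INIT | VRS_PEND_SIPI;

	/* No unknown bits. */
	if ((state & ~allowed) != 0)
		return (false);
	/* A running vcpu must also carry the INIT bit (see the handler). */
	if ((state & VRS_RUN) != 0 && (state & VRS_INIT) == 0)
		return (false);
	return (true);
}

int
main(void)
{
	printf("HALT valid? %d\n", run_state_valid(VRS_HALT));
	printf("RUN alone valid? %d\n", run_state_valid(VRS_RUN));
	printf("INIT|RUN valid? %d\n", run_state_valid(VRS_INIT | VRS_RUN));
	return (0);
}
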
1280 
1281 
1282 static void
1283 restore_guest_fpustate(struct vcpu *vcpu)
1284 {
1285 
1286         /* flush host state to the pcb */
1287         fpuexit(curthread);
1288 
1289         /* restore guest FPU state */
1290         fpu_stop_emulating();
1291         fpurestore(vcpu->guestfpu);
1292 
1293         /* restore guest XCR0 if XSAVE is enabled in the host */
1294         if (rcr4() & CR4_XSAVE)
1295                 load_xcr(0, vcpu->guest_xcr0);
1296 
1297         /*
1298          * The FPU is now "dirty" with the guest's state so turn on emulation
1299          * to trap any access to the FPU by the host.
1300          */
1301         fpu_start_emulating();


1372         /*
1373          * The following state transitions are allowed:
1374          * IDLE -> FROZEN -> IDLE
1375          * FROZEN -> RUNNING -> FROZEN
1376          * FROZEN -> SLEEPING -> FROZEN
1377          */
1378         switch (vcpu->state) {
1379         case VCPU_IDLE:
1380         case VCPU_RUNNING:
1381         case VCPU_SLEEPING:
1382                 error = (newstate != VCPU_FROZEN);
1383                 break;
1384         case VCPU_FROZEN:
1385                 error = (newstate == VCPU_FROZEN);
1386                 break;
1387         default:
1388                 error = 1;
1389                 break;
1390         }
1391 
1392         if (error)
1393                 return (EBUSY);
1394 
1395         VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1396             vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1397 
1398         vcpu->state = newstate;
1399         if (newstate == VCPU_RUNNING)
1400                 vcpu->hostcpu = curcpu;
1401         else
1402                 vcpu->hostcpu = NOCPU;
1403 
1404         if (newstate == VCPU_IDLE) {

1405 #ifdef __FreeBSD__
1406                 wakeup(&vcpu->state);
1407 #else
1408                 cv_broadcast(&vcpu->state_cv);
1409 #endif
1410         }
1411 
1412         return (0);
1413 }
1414 
1415 static void
1416 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1417 {
1418         int error;
1419 
1420         if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1421                 panic("Error %d setting state to %d\n", error, newstate);
1422 }
1423 
1424 static void
1425 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1426 {
1427         int error;
1428 
1429         if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1430                 panic("Error %d setting state to %d", error, newstate);
1431 }
1432 
1433 /*
1434  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1435  */
1436 static int
1437 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1438 {
1439         struct vcpu *vcpu;
1440         int t, vcpu_halted, vm_halted;
1441         bool userspace_exit = false;
1442 
1443         KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1444 
1445         vcpu = &vm->vcpu[vcpuid];
1446         vcpu_halted = 0;
1447         vm_halted = 0;
1448 
1449         vcpu_lock(vcpu);
1450         while (1) {
1451                 /*
1452                  * Do a final check for pending interrupts (including NMI and
1453                  * INIT) before putting this thread to sleep.
1454                  */


1455                 if (vm_nmi_pending(vm, vcpuid))
1456                         break;
1457                 if (vcpu_run_state_pending(vm, vcpuid))
1458                         break;
1459                 if (!intr_disabled) {
1460                         if (vm_extint_pending(vm, vcpuid) ||
1461                             vlapic_pending_intr(vcpu->vlapic, NULL)) {
1462                                 break;
1463                         }
1464                 }
1465 
1466                 /*
1467                  * Also check for software events which would cause a wake-up.
1468                  * This will set the appropriate exitcode directly, rather than
1469                  * requiring a trip through VM_RUN().
1470                  */
1471                 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1472                         userspace_exit = true;
1473                         break;
1474                 }
1475 



1476                 /*
1477                  * Some Linux guests implement "halt" by having all vcpus
1478                  * execute HLT with interrupts disabled. 'halted_cpus' keeps
1479                  * track of the vcpus that have entered this state. When all
1480                  * vcpus enter the halted state the virtual machine is halted.
1481                  */
1482                 if (intr_disabled) {


1483                         if (!vcpu_halted && halt_detection_enabled) {
1484                                 vcpu_halted = 1;
1485                                 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1486                         }
1487                         if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1488                                 vm_halted = 1;
1489                                 break;
1490                         }


1491                 }
1492 
1493                 t = ticks;
1494                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1495                 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);

1496                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1497                 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1498         }
1499 
1500         if (vcpu_halted)
1501                 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1502 
1503         vcpu_unlock(vcpu);
1504 
1505         if (vm_halted)
1506                 vm_suspend(vm, VM_SUSPEND_HALT);
1507 
1508         return (userspace_exit ? -1 : 0);
1509 }
1510 
1511 static int
1512 vm_handle_paging(struct vm *vm, int vcpuid)
1513 {
1514         int rv, ftype;
1515         struct vm_map *map;
1516         struct vcpu *vcpu;
1517         struct vm_exit *vme;
1518 
1519         vcpu = &vm->vcpu[vcpuid];
1520         vme = &vcpu->exitinfo;
1521 
1522         KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1523             __func__, vme->inst_length));
1524 
1525         ftype = vme->u.paging.fault_type;
1526         KASSERT(ftype == VM_PROT_READ ||
1527             ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1528             ("vm_handle_paging: invalid fault_type %d", ftype));
1817                 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1818                         vcpu_notify_event(vm, i);
1819                 }
1820         }
1821 
1822         return (-1);
1823 }
1824 
1825 static int
1826 vm_handle_reqidle(struct vm *vm, int vcpuid)
1827 {
1828         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1829 
1830         vcpu_lock(vcpu);
1831         KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1832         vcpu->reqidle = 0;
1833         vcpu_unlock(vcpu);
1834         return (-1);
1835 }
1836 
1837 static int
1838 vm_handle_run_state(struct vm *vm, int vcpuid)
1839 {
1840         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1841         bool handled = false;
1842 
1843         vcpu_lock(vcpu);
1844         while (1) {
1845                 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1846                         vcpu_unlock(vcpu);
1847                         VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1848                         vcpu_lock(vcpu);
1849 
1850                         vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1851                         vcpu->run_state |= VRS_INIT;
1852                 }
1853 
1854                 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1855                     (VRS_INIT | VRS_PEND_SIPI)) {
1856                         const uint8_t vector = vcpu->sipi_vector;
1857 
1858                         vcpu_unlock(vcpu);
1859                         VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1860                         vcpu_lock(vcpu);
1861 
1862                         vcpu->run_state &= ~VRS_PEND_SIPI;
1863                         vcpu->run_state |= VRS_RUN;
1864                 }
1865 
1866                 /*
1867                  * If the vCPU is now in the running state, there is no need to
1868                  * wait for anything prior to re-entry.
1869                  */
1870                 if ((vcpu->run_state & VRS_RUN) != 0) {
1871                         handled = true;
1872                         break;
1873                 }
1874 
1875                 /*
1876                  * Also check for software events which would cause a wake-up.
1877                  * This will set the appropriate exitcode directly, rather than
1878                  * requiring a trip through VM_RUN().
1879                  */
1880                 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1881                         break;
1882                 }
1883 
1884                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1885                 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1886                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1887         }
1888         vcpu_unlock(vcpu);
1889 
1890         return (handled ? 0 : -1);
1891 }
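
The loop above implements two run-state transitions: a pending INIT resets the
vcpu's architectural state and parks it in VRS_INIT (waiting for SIPI), and a
SIPI received while parked there vectors the vcpu and moves it to VRS_RUN.  A
standalone sketch of just that flag arithmetic, with illustrative flag values
(the real VRS_* definitions live in the vmm headers):

	#define	VRS_INIT	(1 << 0)	/* INIT received, awaiting SIPI */
	#define	VRS_RUN		(1 << 1)	/* vcpu eligible to run */
	#define	VRS_PEND_INIT	(1 << 2)	/* INIT pending delivery */
	#define	VRS_PEND_SIPI	(1 << 3)	/* SIPI pending delivery */

	static unsigned int
	run_state_step(unsigned int run_state)
	{
		if (run_state & VRS_PEND_INIT) {
			/* vcpu_arch_reset(vm, vcpuid, true) happens here */
			run_state &= ~(VRS_RUN | VRS_PEND_INIT);
			run_state |= VRS_INIT;
		}
		if ((run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
		    (VRS_INIT | VRS_PEND_SIPI)) {
			/* vcpu_vector_sipi(vm, vcpuid, vector) happens here */
			run_state &= ~VRS_PEND_SIPI;
			run_state |= VRS_RUN;
		}
		return (run_state);
	}

	int
	main(void)
	{
		unsigned int rs = VRS_PEND_INIT;

		rs = run_state_step(rs);			/* -> VRS_INIT */
		rs = run_state_step(rs | VRS_PEND_SIPI);	/* -> VRS_INIT|VRS_RUN */
		return ((rs & VRS_RUN) ? 0 : 1);
	}
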
1892 
1893 #ifndef __FreeBSD__
1894 static int
1895 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1896 {
1897         struct vcpu *cpu = &vm->vcpu[vcpuid];
1898         const uint32_t code = vme->u.msr.code;
1899         const uint64_t val = vme->u.msr.wval;
1900 
1901         switch (code) {
1902         case MSR_TSC:
1903                 cpu->tsc_offset = val - rdtsc();
1904                 return (0);
1905         }
1906 
1907         return (-1);
1908 }
1909 #endif /* __FreeBSD__ */
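
A guest write to MSR_TSC is honored by recording how far the requested value is
from the host TSC at that moment; the saved offset can then be added back
wherever guest-visible TSC values are produced.  A user-level model of that
bookkeeping (a sketch of the arithmetic only, not the in-kernel mechanism;
x86-only since it issues RDTSC directly):

	#include <stdint.h>
	#include <stdio.h>

	static inline uint64_t
	rdtsc(void)
	{
		uint32_t lo, hi;

		__asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
		return (((uint64_t)hi << 32) | lo);
	}

	static uint64_t tsc_offset;

	/* Guest WRMSR to MSR_TSC: remember the distance from the host TSC. */
	static void
	guest_wrmsr_tsc(uint64_t wval)
	{
		tsc_offset = wval - rdtsc();
	}

	/* What the guest would then observe from RDTSC. */
	static uint64_t
	guest_rdtsc(void)
	{
		return (rdtsc() + tsc_offset);
	}

	int
	main(void)
	{
		guest_wrmsr_tsc(0);	/* guest asks to see TSC == 0 right now */
		printf("guest tsc shortly after: %llu\n",
		    (unsigned long long)guest_rdtsc());
		return (0);
	}
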
1910 
1911 int
1912 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1913 {
1914         int i;
1915 
1916         if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1917                 return (EINVAL);
1918 
1919         if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1920                 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1921                     vm->suspend, how);
1922                 return (EALREADY);
1923         }
1924 
1925         VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1926 
1927         /*
1928          * Notify all active vcpus that they are now suspended.
1929          */
1930         for (i = 0; i < vm->maxcpus; i++) {
1931                 if (CPU_ISSET(i, &vm->active_cpus))
1932                         vcpu_notify_event(vm, i);
1933         }
1934 
1935         return (0);
1936 }
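
vm_suspend() lets only the first request win: the suspend reason is installed
with a compare-and-swap against VM_SUSPEND_NONE, and every active vcpu is then
nudged so it exits and observes the suspension.  A user-level model of that
"first request wins" step, using a C11 atomic in place of atomic_cmpset_int():

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic int vm_suspend_how;	/* 0 == VM_SUSPEND_NONE */

	static int
	request_suspend(int how)
	{
		int expected = 0;

		if (!atomic_compare_exchange_strong(&vm_suspend_how,
		    &expected, how))
			return (-1);	/* EALREADY: a request is in flight */
		/* the kernel would now notify every active vcpu */
		return (0);
	}

	int
	main(void)
	{
		printf("%d\n", request_suspend(2));	/* 0: accepted */
		printf("%d\n", request_suspend(3));	/* -1: already pending */
		return (0);
	}
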
1937 
1938 void
1939 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1940 {
1941         struct vm_exit *vmexit;
1942 
1943         vmexit = vm_exitinfo(vm, vcpuid);
1944         vmexit->rip = rip;
1945         vmexit->inst_length = 0;
1946         vmexit->exitcode = VM_EXITCODE_RUN_STATE;
1947         vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
1948 }
1949 
1950 
1951 #ifndef __FreeBSD__
1952 /*
1953  * Some vmm resources, such as the lapic, may have CPU-specific allocations
1954  * which would benefit from being migrated onto the host CPU that is
1955  * processing the vcpu state.
1956  */
1957 static void
1958 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
1959 {
1960         /*
1961          * Localizing cyclic resources requires acquisition of cpu_lock, and
1962          * doing so with kpreempt disabled is a recipe for deadlock disaster.
1963          */
1964         VERIFY(curthread->t_preempt == 0);
1965 
1966         /*
1967          * Do not bother with localization if this vCPU is about to return to
1968          * the host CPU it was last localized to.
1969          */
1970         if (vcpu->lastloccpu == curcpu)
2052 #endif /* __FreeBSD__ */
2053 
2054 static int
2055 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2056     struct vm_exit *vme)
2057 {
2058         struct vcpu *vcpu;
2059         struct vie *vie;
2060         int err;
2061 
2062         vcpu = &vm->vcpu[vcpuid];
2063         vie = vcpu->vie_ctx;
2064         err = 0;
2065 
2066         switch (entry->cmd) {
2067         case VEC_DEFAULT:
2068                 return (0);
2069         case VEC_DISCARD_INSTR:
2070                 vie_reset(vie);
2071                 return (0);
2072         case VEC_FULFILL_MMIO:
2073                 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2074                 if (err == 0) {
2075                         err = vie_emulate_mmio(vie, vm, vcpuid);
2076                         if (err == 0) {
2077                                 vie_advance_pc(vie, &vcpu->nextrip);
2078                         } else if (err < 0) {
2079                                 vie_exitinfo(vie, vme);
2080                         } else if (err == EAGAIN) {
2081                                 /*
2082                                  * Clear the instruction emulation state in
2083                                  * order to re-enter VM context and continue
2084                                  * this 'rep <instruction>'
2085                                  */
2086                                 vie_reset(vie);
2087                                 err = 0;
2088                         }
2089                 }
2090                 break;
2091         case VEC_FULFILL_INOUT:
2092                 err = vie_fulfill_inout(vie, &entry->u.inout);
2093                 if (err == 0) {
2094                         err = vie_emulate_inout(vie, vm, vcpuid);
2095                         if (err == 0) {
2096                                 vie_advance_pc(vie, &vcpu->nextrip);
2097                         } else if (err < 0) {
2098                                 vie_exitinfo(vie, vme);
2099                         } else if (err == EAGAIN) {
2100                                 /*
2101                                  * Clear the instruction emulation state in
2102                                  * order to re-enter VM context and continue
2103                                  * this 'rep ins/outs'
2104                                  */
2105                                 vie_reset(vie);
2106                                 err = 0;
2107                         }
2108                 }
2109                 break;
2110         default:
2111                 return (EINVAL);
2112         }
2113         return (err);
2114 }
2115 
2116 static int
2117 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2118 {
2119         struct vie *vie;
2120 
2121         vie = vm->vcpu[vcpuid].vie_ctx;
2122 
2123         if (vie_pending(vie)) {
2124                 /*
2125                  * Userspace has not fulfilled the pending needs of the
2126                  * instruction emulation, so bail back out.
2127                  */
2128                 vie_exitinfo(vie, vme);
2129                 return (-1);
2130         }
2131 
2132         return (0);
2133 }
2134 
2135 int
2136 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2137 {
2138         int error;
2139         struct vcpu *vcpu;
2140 #ifdef  __FreeBSD__
2141         struct pcb *pcb;
2142 #endif
2143         uint64_t tscval;
2144         struct vm_exit *vme;
2145         bool intr_disabled;
2146         pmap_t pmap;
2147 #ifndef __FreeBSD__
2148         vm_thread_ctx_t vtc;
2149         int affinity_type = CPU_CURRENT;
2150 #endif
2151 
2152         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2153                 return (EINVAL);
2154 
2155         if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2156                 return (EINVAL);
2157 
2158         if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2159                 return (EINVAL);
2160 
2161         pmap = vmspace_pmap(vm->vmspace);
2162         vcpu = &vm->vcpu[vcpuid];
2163         vme = &vcpu->exitinfo;
2164 
2165 #ifndef __FreeBSD__
2166         vtc.vtc_vm = vm;
2167         vtc.vtc_vcpuid = vcpuid;
2168         vtc.vtc_status = 0;
2169 
2170         installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2171             NULL, vmm_freectx);
2172 #endif
2173 
2174         error = vm_entry_actions(vm, vcpuid, entry, vme);
2175         if (error != 0) {
2176                 goto exit;
2177         }
2178 
2179 restart:
2180         error = vm_loop_checks(vm, vcpuid, vme);
2181         if (error != 0) {
2182                 goto exit;
2183         }
2206 
2207 #ifdef  __FreeBSD__
2208         pcb = PCPU_GET(curpcb);
2209         set_pcb_flags(pcb, PCB_FULL_IRET);
2210 #else
2211         /* Force a trip through update_sregs to reload %fs/%gs and friends */
2212         PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2213 #endif
2214 
2215 #ifdef  __FreeBSD__
2216         restore_guest_fpustate(vcpu);
2217 #else
2218         if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2219                 restore_guest_fpustate(vcpu);
2220                 vtc.vtc_status |= VTCS_FPU_RESTORED;
2221         }
2222         vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2223 #endif
2224 
2225         vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2226         error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2227         vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2228 
2229 #ifdef  __FreeBSD__
2230         save_guest_fpustate(vcpu);
2231 #else
2232         vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2233 #endif
2234 
2235 #ifndef __FreeBSD__
2236         /*
2237          * Once clear of the delicate contexts comprising the VM_RUN handler,
2238          * thread CPU affinity can be loosened while other processing occurs.
2239          */
2240         thread_affinity_clear(curthread);
2241 #endif
2242 
2243         vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
2244 
2245         critical_exit();
2246 
2247         if (error != 0) {
2248                 /* Communicate out any error from VMRUN() above */
2249                 goto exit;
2250         }
2251 
2252         vcpu->nextrip = vme->rip + vme->inst_length;
2253         switch (vme->exitcode) {
2254         case VM_EXITCODE_REQIDLE:
2255                 error = vm_handle_reqidle(vm, vcpuid);
2256                 break;
2257         case VM_EXITCODE_RUN_STATE:
2258                 error = vm_handle_run_state(vm, vcpuid);
2259                 break;
2260         case VM_EXITCODE_SUSPENDED:
2261                 error = vm_handle_suspend(vm, vcpuid);
2262                 break;
2263         case VM_EXITCODE_IOAPIC_EOI:
2264                 vioapic_process_eoi(vm, vcpuid,
2265                     vme->u.ioapic_eoi.vector);
2266                 break;
2267         case VM_EXITCODE_HLT:
2268                 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2269                 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2270                 break;
2271         case VM_EXITCODE_PAGING:
2272                 error = vm_handle_paging(vm, vcpuid);
2273                 break;
2274         case VM_EXITCODE_MMIO_EMUL:
2275                 error = vm_handle_mmio_emul(vm, vcpuid);
2276                 break;
2277         case VM_EXITCODE_INOUT:
2278                 error = vm_handle_inout(vm, vcpuid, vme);
2279                 break;
2280         case VM_EXITCODE_MONITOR:
2281         case VM_EXITCODE_MWAIT:
2282         case VM_EXITCODE_VMINSN:
2283                 vm_inject_ud(vm, vcpuid);
2284                 break;
2285 #ifndef __FreeBSD__
2286         case VM_EXITCODE_WRMSR:
2757 }
2758 
2759 void
2760 vm_extint_clear(struct vm *vm, int vcpuid)
2761 {
2762         struct vcpu *vcpu;
2763 
2764         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2765                 panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
2766 
2767         vcpu = &vm->vcpu[vcpuid];
2768 
2769         if (vcpu->extint_pending == 0)
2770                 panic("vm_extint_clear: inconsistent extint_pending state");
2771 
2772         vcpu->extint_pending = 0;
2773         vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2774 }
2775 
2776 int
2777 vm_inject_init(struct vm *vm, int vcpuid)
2778 {
2779         struct vcpu *vcpu;
2780 
2781         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2782                 return (EINVAL);
2783 
2784         vcpu = &vm->vcpu[vcpuid];
2785         vcpu_lock(vcpu);
2786         vcpu->run_state |= VRS_PEND_INIT;
2787         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2788         vcpu_unlock(vcpu);
2789         return (0);
2790 }
2791 
2792 int
2793 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2794 {
2795         struct vcpu *vcpu;
2796 
2797         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2798                 return (EINVAL);
2799 
2800         vcpu = &vm->vcpu[vcpuid];
2801         vcpu_lock(vcpu);
2802         vcpu->run_state |= VRS_PEND_SIPI;
2803         vcpu->sipi_vector = vector;
2804         /* SIPI is only actionable if the CPU is waiting in INIT state */
2805         if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2806                 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2807         }
2808         vcpu_unlock(vcpu);
2809         return (0);
2810 }
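
Together, vm_inject_init() and vm_inject_sipi() provide the kernel half of the
classic INIT-SIPI-SIPI startup sequence: INIT parks the target vcpu in the
wait-for-SIPI state, and a SIPI is only acted upon while the vcpu is parked
there, so the customary second SIPI is effectively absorbed once the AP is
already running.  A small model of that "actionable" check (flag values are
illustrative):

	#include <stdio.h>

	#define	VRS_INIT	(1 << 0)
	#define	VRS_RUN		(1 << 1)

	/* Mirrors the run_state test in vm_inject_sipi() above. */
	static int
	sipi_actionable(unsigned int run_state)
	{
		return ((run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT);
	}

	int
	main(void)
	{
		printf("%d\n", sipi_actionable(VRS_INIT));		/* 1 */
		printf("%d\n", sipi_actionable(VRS_INIT | VRS_RUN));	/* 0 */
		return (0);
	}
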
2811 
2812 bool
2813 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2814 {
2815         struct vcpu *vcpu;
2816 
2817         ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2818         vcpu = &vm->vcpu[vcpuid];
2819 
2820         /* Of interest: vCPU not in running state or with pending INIT */
2821         return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2822 }
2823 
2824 int
2825 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2826 {
2827         struct seg_desc desc;
2828         const enum vm_reg_name clear_regs[] = {
2829                 VM_REG_GUEST_CR2,
2830                 VM_REG_GUEST_CR3,
2831                 VM_REG_GUEST_CR4,
2832                 VM_REG_GUEST_RAX,
2833                 VM_REG_GUEST_RBX,
2834                 VM_REG_GUEST_RCX,
2835                 VM_REG_GUEST_RSI,
2836                 VM_REG_GUEST_RDI,
2837                 VM_REG_GUEST_RBP,
2838                 VM_REG_GUEST_RSP,
2839                 VM_REG_GUEST_R8,
2840                 VM_REG_GUEST_R9,
2841                 VM_REG_GUEST_R10,
2842                 VM_REG_GUEST_R11,
2843                 VM_REG_GUEST_R12,
2844                 VM_REG_GUEST_R13,
2845                 VM_REG_GUEST_R14,
2846                 VM_REG_GUEST_R15,
2847                 VM_REG_GUEST_DR0,
2848                 VM_REG_GUEST_DR1,
2849                 VM_REG_GUEST_DR2,
2850                 VM_REG_GUEST_DR3,
2851                 VM_REG_GUEST_EFER,
2852         };
2853         const enum vm_reg_name data_segs[] = {
2854                 VM_REG_GUEST_SS,
2855                 VM_REG_GUEST_DS,
2856                 VM_REG_GUEST_ES,
2857                 VM_REG_GUEST_FS,
2858                 VM_REG_GUEST_GS,
2859         };
2860         struct vcpu *vcpu = &vm->vcpu[vcpuid];
2861 
2862         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2863                 return (EINVAL);
2864 
2865         for (uint_t i = 0; i < nitems(clear_regs); i++) {
2866                 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2867         }
2868 
2869         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2870         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2871         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2872 
2873         /*
2874          * The prescribed contents of %rdx differ slightly between the Intel and
2875          * AMD architectural definitions.  The former expects the Extended Model
2876          * in bits 16-19, where the latter expects all of the Family, Model,
2877          * and Stepping to be there.  Common boot ROMs appear to disregard
2878          * this anyway, so we stick with a compromise value similar to what is
2879          * spelled out in the Intel SDM.
2880          */
2881         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2882 
2883         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2884         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2885 
2886         /* CS: Present, R/W, Accessed */
2887         desc.access = 0x0093;
2888         desc.base = 0xffff0000;
2889         desc.limit = 0xffff;
2890         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2891         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2892 
2893         /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2894         desc.access = 0x0093;
2895         desc.base = 0;
2896         desc.limit = 0xffff;
2897         for (uint_t i = 0; i < nitems(data_segs); i++) {
2898                 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2899                 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2900         }
2901 
2902         /* GDTR, IDTR */
2903         desc.base = 0;
2904         desc.limit = 0xffff;
2905         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2906         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2907 
2908         /* LDTR: Present, LDT */
2909         desc.access = 0x0082;
2910         desc.base = 0;
2911         desc.limit = 0xffff;
2912         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2913         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2914 
2915         /* TR: Present, 32-bit TSS */
2916         desc.access = 0x008b;
2917         desc.base = 0;
2918         desc.limit = 0xffff;
2919         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2920         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2921 
2922         vlapic_reset(vm_lapic(vm, vcpuid));
2923 
2924         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2925 
2926         vcpu->exitintinfo = 0;
2927         vcpu->exception_pending = 0;
2928         vcpu->nmi_pending = 0;
2929         vcpu->extint_pending = 0;
2930 
2931         /*
2932          * A CPU reset caused by power-on or system reset clears more state than
2933          * one which is triggered from an INIT IPI.
2934          */
2935         if (!init_only) {
2936                 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2937                 fpu_save_area_reset(vcpu->guestfpu);
2938 
2939                 /* XXX: clear MSRs and other pieces */
2940         }
2941 
2942         return (0);
2943 }
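
The segment base and %rip programmed above land the first instruction fetch at
the architectural reset vector just below 4 GiB, where guest firmware is
mapped.  A quick check of that arithmetic:

	#include <inttypes.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t cs_base = 0xffff0000;	/* from vcpu_arch_reset() */
		uint64_t rip = 0xfff0;

		/* prints 0xfffffff0 */
		printf("reset vector = 0x%" PRIx64 "\n", cs_base + rip);
		return (0);
	}
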
2944 
2945 static int
2946 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2947 {
2948         struct seg_desc desc;
2949 
2950         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2951                 return (EINVAL);
2952 
2953         /* CS: Present, R/W, Accessed */
2954         desc.access = 0x0093;
2955         desc.base = (uint64_t)vector << 12;
2956         desc.limit = 0xffff;
2957         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2958         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2959             (uint64_t)vector << 8));
2960 
2961         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2962 
2963         return (0);
2964 }
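
The SIPI vector selects a 4 KiB-aligned real-mode start address: the CS base
becomes vector << 12 and the CS selector vector << 8, with %rip zeroed, so an
AP begins executing at (vector * 4096).  A worked example (the vector value is
arbitrary for illustration):

	#include <inttypes.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint8_t vector = 0x9a;
		uint64_t cs_base = (uint64_t)vector << 12;	 /* 0x9a000 */
		unsigned int cs_sel = (unsigned int)vector << 8; /* 0x9a00 */
		uint64_t rip = 0;

		printf("CS=%#x base=0x%" PRIx64 " start=0x%" PRIx64 "\n",
		    cs_sel, cs_base, cs_base + rip);
		return (0);
	}
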
2965 
2966 int
2967 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2968 {
2969         if (vcpu < 0 || vcpu >= vm->maxcpus)
2970                 return (EINVAL);
2971 
2972         if (type < 0 || type >= VM_CAP_MAX)
2973                 return (EINVAL);
2974 
2975         return (VMGETCAP(vm->cookie, vcpu, type, retval));
2976 }
2977 
2978 int
2979 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
2980 {
2981         if (vcpu < 0 || vcpu >= vm->maxcpus)
2982                 return (EINVAL);
2983 
2984         if (type < 0 || type >= VM_CAP_MAX)
2985                 return (EINVAL);
2986 
3049         }
3050         return (found);
3051 }
3052 #endif
3053 
3054 void *
3055 vm_iommu_domain(struct vm *vm)
3056 {
3057 
3058         return (vm->iommu);
3059 }
3060 
3061 int
3062 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3063     bool from_idle)
3064 {
3065         int error;
3066         struct vcpu *vcpu;
3067 
3068         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3069                 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3070 
3071         vcpu = &vm->vcpu[vcpuid];
3072 
3073         vcpu_lock(vcpu);
3074         error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3075         vcpu_unlock(vcpu);
3076 
3077         return (error);
3078 }
3079 
3080 enum vcpu_state
3081 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3082 {
3083         struct vcpu *vcpu;
3084         enum vcpu_state state;
3085 
3086         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3087                 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3088 
3089         vcpu = &vm->vcpu[vcpuid];
3090 
3091         vcpu_lock(vcpu);
3092         state = vcpu->state;
3093         if (hostcpu != NULL)
3094                 *hostcpu = vcpu->hostcpu;
3095         vcpu_unlock(vcpu);
3096 
3097         return (state);
3098 }
3099 
3100 #ifndef __FreeBSD__
3101 uint64_t
3102 vcpu_tsc_offset(struct vm *vm, int vcpuid)
3103 {
3104         return (vm->vcpu[vcpuid].tsc_offset);
3105 }
3106 #endif /* __FreeBSD__ */
3107 
3108 int
3109 vm_activate_cpu(struct vm *vm, int vcpuid)
3110 {
3111 
3112         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3113                 return (EINVAL);
3114 
3115         if (CPU_ISSET(vcpuid, &vm->active_cpus))
3116                 return (EBUSY);
3117 
3118         VCPU_CTR0(vm, vcpuid, "activated");
3119         CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3145 }
3146 
3147 int
3148 vm_resume_cpu(struct vm *vm, int vcpuid)
3149 {
3150 
3151         if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3152                 return (EINVAL);
3153 
3154         if (vcpuid == -1) {
3155                 CPU_ZERO(&vm->debug_cpus);
3156         } else {
3157                 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3158                         return (EINVAL);
3159 
3160                 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3161         }
3162         return (0);
3163 }
3164 
3165 static bool
3166 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3167     uint64_t entry_rip)
3168 {
3169         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3170         struct vm_exit *vme = &vcpu->exitinfo;
3171         bool bail = false;
3172 
3173         ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3174 
3175         if (vm->suspend) {
3176                 if (on_entry) {
3177                         VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3178                             vm->suspend < VM_SUSPEND_LAST);
3179 
3180                         vme->exitcode = VM_EXITCODE_SUSPENDED;
3181                         vme->u.suspended.how = vm->suspend;
3182                 } else {
3183                         /*
3184                          * Handling VM suspend is complicated, so if that
3185                          * condition is detected outside of VM-entry itself,
3186                          * just emit a BOGUS exitcode so we take a lap to pick
3187                          * up the event during an entry and are directed into
3188                          * the vm_handle_suspend() logic.
3189                          */
3190                         vme->exitcode = VM_EXITCODE_BOGUS;
3191                 }
3192                 bail = true;
3193         }
3194         if (vcpu->reqidle) {
3195                 vme->exitcode = VM_EXITCODE_REQIDLE;
3196                 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3197 
3198                 if (!on_entry) {
3199                         /*
3200                          * A reqidle request detected outside of VM-entry can be
3201                          * handled directly by clearing the request (and taking
3202                          * a lap to userspace).
3203                          */
3204                         vcpu_assert_locked(vcpu);
3205                         vcpu->reqidle = 0;
3206                 }
3207                 bail = true;
3208         }
3209         if (vcpu_should_yield(vm, vcpuid)) {
3210                 vme->exitcode = VM_EXITCODE_BOGUS;
3211                 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3212                 bail = true;
3213         }
3214         if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3215                 vme->exitcode = VM_EXITCODE_DEBUG;
3216                 bail = true;
3217         }
3218 
3219         if (bail) {
3220                 if (on_entry) {
3221                         /*
3222                          * If bailing out during VM-entry, the current %rip must
3223                          * be recorded in the exitinfo.
3224                          */
3225                         vme->rip = entry_rip;
3226                 }
3227                 vme->inst_length = 0;
3228         }
3229         return (bail);
3230 }
3231 
3232 static bool
3233 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3234 {
3235         /*
3236          * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3237          * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3238          * structure, and we would only modify the exitcode.
3239          */
3240         return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3241 }
3242 
3243 bool
3244 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3245 {
3246         /*
3247          * Bail-out checks done as part of VM entry require an updated %rip to
3248          * populate the vm_exit struct if any of the conditions of interest are
3249          * matched in the check.
3250          */
3251         return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3252 }
3253 
3254 cpuset_t
3255 vm_active_cpus(struct vm *vm)
3256 {
3257 
3258         return (vm->active_cpus);
3259 }
3260 
3261 cpuset_t
3262 vm_debug_cpus(struct vm *vm)
3263 {
3264 
3265         return (vm->debug_cpus);
3266 }
3267 
3268 cpuset_t
3269 vm_suspended_cpus(struct vm *vm)
3270 {
3271 
3272         return (vm->suspended_cpus);
3273 }