92 #include "vioapic.h"
93 #include "vlapic.h"
94 #include "vpmtmr.h"
95 #include "vrtc.h"
96 #include "vmm_stat.h"
97 #include "vmm_lapic.h"
98
99 #include "io/ppt.h"
100 #include "io/iommu.h"
101
102 struct vlapic;
103
104 /*
105 * Initialization:
106 * (a) allocated when vcpu is created
107 * (i) initialized when vcpu is created and when it is reinitialized
108 * (o) initialized the first time the vcpu is created
109 * (x) initialized before use
110 */
111 struct vcpu {
112 struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */
113 enum vcpu_state state; /* (o) vcpu state */
114 #ifndef __FreeBSD__
115 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */
116 kcondvar_t state_cv; /* (o) IDLE-transition cv */
117 #endif /* __FreeBSD__ */
118 int hostcpu; /* (o) vcpu's current host cpu */
119 #ifndef __FreeBSD__
120 int lastloccpu; /* (o) last host cpu localized to */
121 #endif
122 uint_t runblock; /* (i) block vcpu from run state */
123 int reqidle; /* (i) request vcpu to idle */
124 struct vlapic *vlapic; /* (i) APIC device model */
125 enum x2apic_state x2apic_state; /* (i) APIC mode */
126 uint64_t exitintinfo; /* (i) events pending at VM exit */
127 int nmi_pending; /* (i) NMI pending */
128 int extint_pending; /* (i) INTR pending */
129 int exception_pending; /* (i) exception pending */
130 int exc_vector; /* (x) exception collateral */
131 int exc_errcode_valid;
132 uint32_t exc_errcode;
133 struct savefpu *guestfpu; /* (a,i) guest fpu state */
134 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
135 void *stats; /* (a,i) statistics */
136 struct vm_exit exitinfo; /* (x) exit reason and collateral */
137 uint64_t nextrip; /* (x) next instruction to execute */
138 struct vie *vie_ctx; /* (x) instruction emulation context */
139 #ifndef __FreeBSD__
140 uint64_t tsc_offset; /* (x) offset from host TSC */
141 #endif
142 };
143
144 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
145 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
146 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
147 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
148 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
149
150 struct mem_seg {
151 size_t len;
152 bool sysmem;
183 struct vatpit *vatpit; /* (i) virtual atpit */
184 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
185 struct vrtc *vrtc; /* (o) virtual RTC */
186 volatile cpuset_t active_cpus; /* (i) active vcpus */
187 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */
188 int suspend; /* (i) stop VM execution */
189 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
190 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
191 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
192 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
193 struct vmspace *vmspace; /* (o) guest's address space */
194 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
195 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
196 /* The following describe the vm cpu topology */
197 uint16_t sockets; /* (o) num of sockets */
198 uint16_t cores; /* (o) num of cores/socket */
199 uint16_t threads; /* (o) num of threads/core */
200 uint16_t maxcpus; /* (o) max pluggable cpus */
201
202 struct ioport_config ioports; /* (o) ioport handling */
203
204 bool sipi_req; /* (i) SIPI requested */
205 int sipi_req_vcpu; /* (i) SIPI destination */
206 uint64_t sipi_req_rip; /* (i) SIPI start %rip */
207
208 /* Miscellaneous VM-wide statistics and counters */
209 struct vm_wide_stats {
210 uint64_t sipi_supersede;
211 } stats;
212 };
213
214 static int vmm_initialized;
215
216
217 static void
218 nullop_panic(void)
219 {
220 panic("null vmm operation call");
221 }
222
223 /* Do not allow use of an un-set `ops` to do anything but panic */
224 static struct vmm_ops vmm_ops_null = {
225 .init = (vmm_init_func_t)nullop_panic,
226 .cleanup = (vmm_cleanup_func_t)nullop_panic,
227 .resume = (vmm_resume_func_t)nullop_panic,
228 .vminit = (vmi_init_func_t)nullop_panic,
229 .vmrun = (vmi_run_func_t)nullop_panic,
230 .vmcleanup = (vmi_cleanup_func_t)nullop_panic,
231 .vmgetreg = (vmi_get_register_t)nullop_panic,
232 .vmsetreg = (vmi_set_register_t)nullop_panic,
233 .vmgetdesc = (vmi_get_desc_t)nullop_panic,
234 .vmsetdesc = (vmi_set_desc_t)nullop_panic,
235 .vmgetcap = (vmi_get_cap_t)nullop_panic,
236 .vmsetcap = (vmi_set_cap_t)nullop_panic,
237 .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic,
238 .vmspace_free = (vmi_vmspace_free)nullop_panic,
239 .vlapic_init = (vmi_vlapic_init)nullop_panic,
240 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
241 .vmsavectx = (vmi_savectx)nullop_panic,
242 .vmrestorectx = (vmi_restorectx)nullop_panic,
243 };
244
245 static struct vmm_ops *ops = &vmm_ops_null;
246
247 #define VMM_INIT(num) ((*ops->init)(num))
248 #define VMM_CLEANUP() ((*ops->cleanup)())
249 #define VMM_RESUME() ((*ops->resume)())
250
251 #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
252 #define VMRUN(vmi, vcpu, rip, pmap, evinfo) \
253 ((*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo))
254 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
255 #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
256 #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
257
258 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
259 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
260 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
261 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
262 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv))
263 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val))
264 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu))
265 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic))
266
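/*
 * fpu_stop_emulating() (clts) clears CR0.TS so guest FPU state can be loaded
 * into the hardware; fpu_start_emulating() sets CR0.TS afterwards so that any
 * host FPU/SIMD use traps with #NM instead of clobbering the guest's state.
 */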
267 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
268 #define fpu_stop_emulating() clts()
269
270 SDT_PROVIDER_DEFINE(vmm);
271
272 static MALLOC_DEFINE(M_VM, "vm", "vm");
273
275 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
276
277 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
278 NULL);
279
280 /*
281 * Halt the guest if all vcpus are executing a HLT instruction with
282 * interrupts disabled.
283 */
284 static int halt_detection_enabled = 1;
285
286 /* IPI vector used for vcpu notifications */
287 static int vmm_ipinum;
288
289 /* Trap into hypervisor on all guest exceptions and reflect them back */
290 static int trace_guest_exceptions;
291
292 static void vm_free_memmap(struct vm *vm, int ident);
293 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
294 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
295
296 #ifndef __FreeBSD__
297 static void vm_clear_memseg(struct vm *, int);
298
299 /* Flags for vtc_status */
300 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */
301 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */
302
303 typedef struct vm_thread_ctx {
304 struct vm *vtc_vm;
305 int vtc_vcpuid;
306 uint_t vtc_status;
307 } vm_thread_ctx_t;
308 #endif /* __FreeBSD__ */
309
310 #ifdef KTR
311 static const char *
312 vcpu_state2str(enum vcpu_state state)
313 {
314
353
354 if (create) {
355 #ifdef __FreeBSD__
356 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
357 "initialized", vcpu_id));
358 #endif
359 vcpu_lock_init(vcpu);
360 vcpu->state = VCPU_IDLE;
361 vcpu->hostcpu = NOCPU;
362 #ifndef __FreeBSD__
363 vcpu->lastloccpu = NOCPU;
364 #endif
365 vcpu->guestfpu = fpu_save_area_alloc();
366 vcpu->stats = vmm_stat_alloc();
367 vcpu->vie_ctx = vie_alloc();
368 } else {
369 vie_reset(vcpu->vie_ctx);
370 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
371 }
372
373 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
374 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
375 vcpu->runblock = 0;
376 vcpu->reqidle = 0;
377 vcpu->exitintinfo = 0;
378 vcpu->nmi_pending = 0;
379 vcpu->extint_pending = 0;
380 vcpu->exception_pending = 0;
381 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
382 fpu_save_area_reset(vcpu->guestfpu);
383 vmm_stat_init(vcpu->stats);
384 }
385
386 int
387 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
388 {
389
390 return (trace_guest_exceptions);
391 }
392
393 struct vm_exit *
394 vm_exitinfo(struct vm *vm, int cpuid)
395 {
1216 return (true);
1217 default:
1218 return (false);
1219 }
1220 }
1221
1222 int
1223 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1224 {
1225
1226 if (vcpu < 0 || vcpu >= vm->maxcpus)
1227 return (EINVAL);
1228
1229 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1230 return (EINVAL);
1231
1232 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1233 }
1234
1235 int
1236 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1237 {
1238 if (vcpu < 0 || vcpu >= vm->maxcpus)
1239 return (EINVAL);
1240
1241 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1242 return (EINVAL);
1243
1244 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1245 }
1246
1247 static void
1248 restore_guest_fpustate(struct vcpu *vcpu)
1249 {
1250
1251 /* flush host state to the pcb */
1252 fpuexit(curthread);
1253
1254 /* restore guest FPU state */
1255 fpu_stop_emulating();
1256 fpurestore(vcpu->guestfpu);
1257
1258 /* restore guest XCR0 if XSAVE is enabled in the host */
1259 if (rcr4() & CR4_XSAVE)
1260 load_xcr(0, vcpu->guest_xcr0);
1261
1262 /*
1263 * The FPU is now "dirty" with the guest's state so turn on emulation
1264 * to trap any access to the FPU by the host.
1265 */
1266 fpu_start_emulating();
1337 /*
1338 * The following state transitions are allowed:
1339 * IDLE -> FROZEN -> IDLE
1340 * FROZEN -> RUNNING -> FROZEN
1341 * FROZEN -> SLEEPING -> FROZEN
1342 */
1343 switch (vcpu->state) {
1344 case VCPU_IDLE:
1345 case VCPU_RUNNING:
1346 case VCPU_SLEEPING:
1347 error = (newstate != VCPU_FROZEN);
1348 break;
1349 case VCPU_FROZEN:
1350 error = (newstate == VCPU_FROZEN);
1351 break;
1352 default:
1353 error = 1;
1354 break;
1355 }
1356
1357 if (newstate == VCPU_RUNNING) {
1358 while (vcpu->runblock != 0) {
1359 #ifdef __FreeBSD__
1360 msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
1361 #else
1362 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1363 #endif
1364 }
1365 }
1366
1367 if (error)
1368 return (EBUSY);
1369
1370 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1371 vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1372
1373 vcpu->state = newstate;
1374 if (newstate == VCPU_RUNNING)
1375 vcpu->hostcpu = curcpu;
1376 else
1377 vcpu->hostcpu = NOCPU;
1378
1379 if (newstate == VCPU_IDLE ||
1380 (newstate == VCPU_FROZEN && vcpu->runblock != 0)) {
1381 #ifdef __FreeBSD__
1382 wakeup(&vcpu->state);
1383 #else
1384 cv_broadcast(&vcpu->state_cv);
1385 #endif
1386 }
1387
1388 return (0);
1389 }
1390
1391 static void
1392 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1393 {
1394 int error;
1395
1396 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1397 panic("Error %d setting state to %d", error, newstate);
1398 }
1399
1400 static void
1401 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1402 {
1403 int error;
1404
1405 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1406 panic("Error %d setting state to %d", error, newstate);
1407 }
1408
1409 /*
1410 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1411 */
1412 static int
1413 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1414 {
1415 struct vcpu *vcpu;
1416 #ifdef __FreeBSD__
1417 const char *wmesg;
1418 #else
1419 const char *wmesg __unused;
1420 #endif
1421 int t, vcpu_halted, vm_halted;
1422
1423 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1424
1425 vcpu = &vm->vcpu[vcpuid];
1426 vcpu_halted = 0;
1427 vm_halted = 0;
1428
1429 vcpu_lock(vcpu);
1430 while (1) {
1431 /*
1432 * Do a final check for pending NMI or interrupts before
1433 * really putting this thread to sleep. Also check for
1434 * software events that would cause this vcpu to wake up.
1435 *
1436 * These interrupts/events could have happened after the
1437 * vcpu returned from VMRUN() and before it acquired the
1438 * vcpu lock above.
1439 */
1440 if (vm->suspend || vcpu->reqidle)
1441 break;
1442 if (vm_nmi_pending(vm, vcpuid))
1443 break;
1444 if (!intr_disabled) {
1445 if (vm_extint_pending(vm, vcpuid) ||
1446 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1447 break;
1448 }
1449 }
1450
1451 /* Don't go to sleep if the vcpu thread needs to yield */
1452 if (vcpu_should_yield(vm, vcpuid))
1453 break;
1454
1455 if (vcpu_debugged(vm, vcpuid))
1456 break;
1457
1458 /*
1459 * Some Linux guests implement "halt" by having all vcpus
1460 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1461 * track of the vcpus that have entered this state. When all
1462 * vcpus enter the halted state the virtual machine is halted.
1463 */
1464 if (intr_disabled) {
1465 wmesg = "vmhalt";
1466 VCPU_CTR0(vm, vcpuid, "Halted");
1467 if (!vcpu_halted && halt_detection_enabled) {
1468 vcpu_halted = 1;
1469 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1470 }
1471 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1472 vm_halted = 1;
1473 break;
1474 }
1475 } else {
1476 wmesg = "vmidle";
1477 }
1478
1479 t = ticks;
1480 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1481 #ifdef __FreeBSD__
1482 /*
1483 * XXX msleep_spin() cannot be interrupted by signals so
1484 * wake up periodically to check pending signals.
1485 */
1486 msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
1487 #else
1488 /*
1489 * Fortunately, cv_wait_sig can be interrupted by signals, so
1490 * there is no need to periodically wake up.
1491 */
1492 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1493 #endif
1494 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1495 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1496 }
1497
1498 if (vcpu_halted)
1499 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1500
1501 vcpu_unlock(vcpu);
1502
1503 if (vm_halted)
1504 vm_suspend(vm, VM_SUSPEND_HALT);
1505
1506 return (0);
1507 }
1508
1509 static int
1510 vm_handle_paging(struct vm *vm, int vcpuid)
1511 {
1512 int rv, ftype;
1513 struct vm_map *map;
1514 struct vcpu *vcpu;
1515 struct vm_exit *vme;
1516
1517 vcpu = &vm->vcpu[vcpuid];
1518 vme = &vcpu->exitinfo;
1519
1520 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1521 __func__, vme->inst_length));
1522
1523 ftype = vme->u.paging.fault_type;
1524 KASSERT(ftype == VM_PROT_READ ||
1525 ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1526 ("vm_handle_paging: invalid fault_type %d", ftype));
1815 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1816 vcpu_notify_event(vm, i);
1817 }
1818 }
1819
1820 return (-1);
1821 }
1822
1823 static int
1824 vm_handle_reqidle(struct vm *vm, int vcpuid)
1825 {
1826 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1827
1828 vcpu_lock(vcpu);
1829 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1830 vcpu->reqidle = 0;
1831 vcpu_unlock(vcpu);
1832 return (-1);
1833 }
1834
1835 #ifndef __FreeBSD__
1836 static int
1837 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1838 {
1839 struct vcpu *cpu = &vm->vcpu[vcpuid];
1840 const uint32_t code = vme->u.msr.code;
1841 const uint64_t val = vme->u.msr.wval;
1842
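/*
 * A guest write to MSR_TSC is modeled by recording an offset from the
 * current host TSC; the backend is expected to apply this offset (see
 * vcpu_tsc_offset()) so later guest TSC reads observe the written value.
 */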
1843 switch (code) {
1844 case MSR_TSC:
1845 cpu->tsc_offset = val - rdtsc();
1846 return (0);
1847 }
1848
1849 return (-1);
1850 }
1851 #endif /* __FreeBSD__ */
1852
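/*
 * Record a request to start an AP at the given %rip.  The pending request is
 * consumed by vm_loop_checks() on the boot vcpu, which turns it into a
 * VM_EXITCODE_SPINUP_AP exit so userspace can start the target vcpu.
 */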
1853 void
1854 vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip)
1855 {
1856 if (vm->sipi_req) {
1857 /* This should never occur if userspace is doing its job. */
1858 vm->stats.sipi_supersede++;
1859 }
1860 vm->sipi_req = true;
1861 vm->sipi_req_vcpu = req_vcpuid;
1862 vm->sipi_req_rip = req_rip;
1863 }
1864
1865 int
1866 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1867 {
1868 int i;
1869
1870 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1871 return (EINVAL);
1872
1873 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1874 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1875 vm->suspend, how);
1876 return (EALREADY);
1877 }
1878
1879 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1880
1881 /*
1882 * Notify all active vcpus that they are now suspended.
1883 */
1884 for (i = 0; i < vm->maxcpus; i++) {
1885 if (CPU_ISSET(i, &vm->active_cpus))
1886 vcpu_notify_event(vm, i);
1887 }
1888
1889 return (0);
1890 }
1891
1892 void
1893 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1894 {
1895 struct vm_exit *vmexit;
1896
1897 KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1898 ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1899
1900 vmexit = vm_exitinfo(vm, vcpuid);
1901 vmexit->rip = rip;
1902 vmexit->inst_length = 0;
1903 vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1904 vmexit->u.suspended.how = vm->suspend;
1905 }
1906
1907 void
1908 vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
1909 {
1910 struct vm_exit *vmexit;
1911
1912 vmexit = vm_exitinfo(vm, vcpuid);
1913 vmexit->rip = rip;
1914 vmexit->inst_length = 0;
1915 vmexit->exitcode = VM_EXITCODE_DEBUG;
1916 }
1917
1918 void
1919 vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip)
1920 {
1921 struct vm_exit *vmexit;
1922
1923 vmexit = vm_exitinfo(vm, vcpuid);
1924 vmexit->rip = rip;
1925 vmexit->inst_length = 0;
1926 vmexit->exitcode = VM_EXITCODE_RUNBLOCK;
1927 vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1);
1928 }
1929
1930 void
1931 vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
1932 {
1933 struct vm_exit *vmexit;
1934
1935 vmexit = vm_exitinfo(vm, vcpuid);
1936 vmexit->rip = rip;
1937 vmexit->inst_length = 0;
1938 vmexit->exitcode = VM_EXITCODE_REQIDLE;
1939 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
1940 }
1941
1942 void
1943 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
1944 {
1945 struct vm_exit *vmexit;
1946
1947 vmexit = vm_exitinfo(vm, vcpuid);
1948 vmexit->rip = rip;
1949 vmexit->inst_length = 0;
1950 vmexit->exitcode = VM_EXITCODE_BOGUS;
1951 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
1952 }
1953
1954 #ifndef __FreeBSD__
1955 /*
1956 * Some vmm resources, such as the lapic, may have CPU-specific resources
1957 * allocated to them that would benefit from migration onto the host CPU that
1958 * is processing the vcpu state.
1959 */
1960 static void
1961 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
1962 {
1963 /*
1964 * Localizing cyclic resources requires acquisition of cpu_lock, and
1965 * doing so with kpreempt disabled is a recipe for deadlock disaster.
1966 */
1967 VERIFY(curthread->t_preempt == 0);
1968
1969 /*
1970 * Do not bother with localization if this vCPU is about to return to
1971 * the host CPU it was last localized to.
1972 */
1973 if (vcpu->lastloccpu == curcpu)
2055 #endif /* __FreeBSD */
2056
2057 static int
2058 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2059 struct vm_exit *vme)
2060 {
2061 struct vcpu *vcpu;
2062 struct vie *vie;
2063 int err;
2064
2065 vcpu = &vm->vcpu[vcpuid];
2066 vie = vcpu->vie_ctx;
2067 err = 0;
2068
2069 switch (entry->cmd) {
2070 case VEC_DEFAULT:
2071 return (0);
2072 case VEC_DISCARD_INSTR:
2073 vie_reset(vie);
2074 return (0);
2075 case VEC_COMPLETE_MMIO:
2076 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2077 if (err == 0) {
2078 err = vie_emulate_mmio(vie, vm, vcpuid);
2079 if (err == 0) {
2080 vie_advance_pc(vie, &vcpu->nextrip);
2081 } else if (err < 0) {
2082 vie_exitinfo(vie, vme);
2083 } else if (err == EAGAIN) {
2084 /*
2085 * Clear the instruction emulation state in
2086 * order to re-enter VM context and continue
2087 * this 'rep <instruction>'
2088 */
2089 vie_reset(vie);
2090 err = 0;
2091 }
2092 }
2093 break;
2094 case VEC_COMPLETE_INOUT:
2095 err = vie_fulfill_inout(vie, &entry->u.inout);
2096 if (err == 0) {
2097 err = vie_emulate_inout(vie, vm, vcpuid);
2098 if (err == 0) {
2099 vie_advance_pc(vie, &vcpu->nextrip);
2100 } else if (err < 0) {
2101 vie_exitinfo(vie, vme);
2102 } else if (err == EAGAIN) {
2103 /*
2104 * Clear the instruction emulation state in
2105 * order to re-enter VM context and continue
2106 * this 'rep ins/outs'
2107 */
2108 vie_reset(vie);
2109 err = 0;
2110 }
2111 }
2112 break;
2113 default:
2114 return (EINVAL);
2115 }
2116 return (err);
2117 }
2118
2119 static int
2120 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2121 {
2122 struct vie *vie;
2123
2124 vie = vm->vcpu[vcpuid].vie_ctx;
2125
2126 if (vie_pending(vie)) {
2127 /*
2128 * Userspace has not fulfilled the pending needs of the
2129 * instruction emulation, so bail back out.
2130 */
2131 vie_exitinfo(vie, vme);
2132 return (-1);
2133 }
2134
2135 if (vcpuid == 0 && vm->sipi_req) {
2136 /* The boot vCPU has sent a SIPI to one of the other CPUs */
2137 vme->exitcode = VM_EXITCODE_SPINUP_AP;
2138 vme->u.spinup_ap.vcpu = vm->sipi_req_vcpu;
2139 vme->u.spinup_ap.rip = vm->sipi_req_rip;
2140
2141 vm->sipi_req = false;
2142 vm->sipi_req_vcpu = 0;
2143 vm->sipi_req_rip = 0;
2144 return (-1);
2145 }
2146
2147 return (0);
2148 }
2149
2150 int
2151 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2152 {
2153 struct vm_eventinfo evinfo;
2154 int error;
2155 struct vcpu *vcpu;
2156 #ifdef __FreeBSD__
2157 struct pcb *pcb;
2158 #endif
2159 uint64_t tscval;
2160 struct vm_exit *vme;
2161 bool intr_disabled;
2162 pmap_t pmap;
2163 #ifndef __FreeBSD__
2164 vm_thread_ctx_t vtc;
2165 int affinity_type = CPU_CURRENT;
2166 #endif
2167
2168 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2169 return (EINVAL);
2170
2171 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2172 return (EINVAL);
2173
2174 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2175 return (EINVAL);
2176
2177 pmap = vmspace_pmap(vm->vmspace);
2178 vcpu = &vm->vcpu[vcpuid];
2179 vme = &vcpu->exitinfo;
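/*
 * The eventinfo pointers are passed to VMRUN() so the backend can poll for
 * pending runblock, suspend, and reqidle requests and break out of guest
 * execution to let this loop service them.
 */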
2180 evinfo.rptr = &vcpu->runblock;
2181 evinfo.sptr = &vm->suspend;
2182 evinfo.iptr = &vcpu->reqidle;
2183
2184 #ifndef __FreeBSD__
2185 vtc.vtc_vm = vm;
2186 vtc.vtc_vcpuid = vcpuid;
2187 vtc.vtc_status = 0;
2188
2189 installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2190 NULL, vmm_freectx);
2191 #endif
2192
2193 error = vm_entry_actions(vm, vcpuid, entry, vme);
2194 if (error != 0) {
2195 goto exit;
2196 }
2197
2198 restart:
2199 error = vm_loop_checks(vm, vcpuid, vme);
2200 if (error != 0) {
2201 goto exit;
2202 }
2225
2226 #ifdef __FreeBSD__
2227 pcb = PCPU_GET(curpcb);
2228 set_pcb_flags(pcb, PCB_FULL_IRET);
2229 #else
2230 /* Force a trip through update_sregs to reload %fs/%gs and friends */
2231 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2232 #endif
2233
2234 #ifdef __FreeBSD__
2235 restore_guest_fpustate(vcpu);
2236 #else
2237 if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2238 restore_guest_fpustate(vcpu);
2239 vtc.vtc_status |= VTCS_FPU_RESTORED;
2240 }
2241 vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2242 #endif
2243
2244 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2245 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
2246 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2247
2248 #ifdef __FreeBSD__
2249 save_guest_fpustate(vcpu);
2250 #else
2251 vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2252 #endif
2253
2254 #ifndef __FreeBSD__
2255 /*
2256 * Once clear of the delicate contexts comprising the VM_RUN handler,
2257 * thread CPU affinity can be loosened while other processing occurs.
2258 */
2259 thread_affinity_clear(curthread);
2260 #endif
2261
2262 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
2263
2264 critical_exit();
2265
2266 if (error != 0) {
2267 /* Communicate out any error from VMRUN() above */
2268 goto exit;
2269 }
2270
2271 vcpu->nextrip = vme->rip + vme->inst_length;
2272 switch (vme->exitcode) {
2273 case VM_EXITCODE_REQIDLE:
2274 error = vm_handle_reqidle(vm, vcpuid);
2275 break;
2276 case VM_EXITCODE_SUSPENDED:
2277 error = vm_handle_suspend(vm, vcpuid);
2278 break;
2279 case VM_EXITCODE_IOAPIC_EOI:
2280 vioapic_process_eoi(vm, vcpuid,
2281 vme->u.ioapic_eoi.vector);
2282 break;
2283 case VM_EXITCODE_RUNBLOCK:
2284 break;
2285 case VM_EXITCODE_HLT:
2286 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2287 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2288 break;
2289 case VM_EXITCODE_PAGING:
2290 error = vm_handle_paging(vm, vcpuid);
2291 break;
2292 case VM_EXITCODE_MMIO_EMUL:
2293 error = vm_handle_mmio_emul(vm, vcpuid);
2294 break;
2295 case VM_EXITCODE_INOUT:
2296 error = vm_handle_inout(vm, vcpuid, vme);
2297 break;
2298 case VM_EXITCODE_MONITOR:
2299 case VM_EXITCODE_MWAIT:
2300 case VM_EXITCODE_VMINSN:
2301 vm_inject_ud(vm, vcpuid);
2302 break;
2303 #ifndef __FreeBSD__
2304 case VM_EXITCODE_WRMSR:
2775 }
2776
2777 void
2778 vm_extint_clear(struct vm *vm, int vcpuid)
2779 {
2780 struct vcpu *vcpu;
2781
2782 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2783 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2784
2785 vcpu = &vm->vcpu[vcpuid];
2786
2787 if (vcpu->extint_pending == 0)
2788 panic("vm_extint_clear: inconsistent extint_pending state");
2789
2790 vcpu->extint_pending = 0;
2791 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2792 }
2793
2794 int
2795 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2796 {
2797 if (vcpu < 0 || vcpu >= vm->maxcpus)
2798 return (EINVAL);
2799
2800 if (type < 0 || type >= VM_CAP_MAX)
2801 return (EINVAL);
2802
2803 return (VMGETCAP(vm->cookie, vcpu, type, retval));
2804 }
2805
2806 int
2807 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
2808 {
2809 if (vcpu < 0 || vcpu >= vm->maxcpus)
2810 return (EINVAL);
2811
2812 if (type < 0 || type >= VM_CAP_MAX)
2813 return (EINVAL);
2814
2877 }
2878 return (found);
2879 }
2880 #endif
2881
2882 void *
2883 vm_iommu_domain(struct vm *vm)
2884 {
2885
2886 return (vm->iommu);
2887 }
2888
2889 int
2890 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
2891 bool from_idle)
2892 {
2893 int error;
2894 struct vcpu *vcpu;
2895
2896 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2897 panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
2898
2899 vcpu = &vm->vcpu[vcpuid];
2900
2901 vcpu_lock(vcpu);
2902 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
2903 vcpu_unlock(vcpu);
2904
2905 return (error);
2906 }
2907
2908 enum vcpu_state
2909 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
2910 {
2911 struct vcpu *vcpu;
2912 enum vcpu_state state;
2913
2914 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2915 panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
2916
2917 vcpu = &vm->vcpu[vcpuid];
2918
2919 vcpu_lock(vcpu);
2920 state = vcpu->state;
2921 if (hostcpu != NULL)
2922 *hostcpu = vcpu->hostcpu;
2923 vcpu_unlock(vcpu);
2924
2925 return (state);
2926 }
2927
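/*
 * Raise the vcpu's runblock count, forcing it out of guest context if it is
 * currently running.  While the count is non-zero, vcpu_set_state_locked()
 * holds off transitions back into the RUNNING state until vcpu_unblock_run()
 * drops the count to zero.
 */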
2928 void
2929 vcpu_block_run(struct vm *vm, int vcpuid)
2930 {
2931 struct vcpu *vcpu;
2932
2933 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2934 panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
2935
2936 vcpu = &vm->vcpu[vcpuid];
2937
2938 vcpu_lock(vcpu);
2939 vcpu->runblock++;
2940 if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) {
2941 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2942 }
2943 while (vcpu->state == VCPU_RUNNING) {
2944 #ifdef __FreeBSD__
2945 msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
2946 #else
2947 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
2948 #endif
2949 }
2950 vcpu_unlock(vcpu);
2951 }
2952
2953 void
2954 vcpu_unblock_run(struct vm *vm, int vcpuid)
2955 {
2956 struct vcpu *vcpu;
2957
2958 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2959 panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
2960
2961 vcpu = &vm->vcpu[vcpuid];
2962
2963 vcpu_lock(vcpu);
2964 KASSERT(vcpu->runblock != 0, ("expected non-zero runblock"));
2965 vcpu->runblock--;
2966 if (vcpu->runblock == 0) {
2967 #ifdef __FreeBSD__
2968 wakeup(&vcpu->state);
2969 #else
2970 cv_broadcast(&vcpu->state_cv);
2971 #endif
2972 }
2973 vcpu_unlock(vcpu);
2974 }
2975
2976 #ifndef __FreeBSD__
2977 uint64_t
2978 vcpu_tsc_offset(struct vm *vm, int vcpuid)
2979 {
2980 return (vm->vcpu[vcpuid].tsc_offset);
2981 }
2982 #endif /* __FreeBSD__ */
2983
2984 int
2985 vm_activate_cpu(struct vm *vm, int vcpuid)
2986 {
2987
2988 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2989 return (EINVAL);
2990
2991 if (CPU_ISSET(vcpuid, &vm->active_cpus))
2992 return (EBUSY);
2993
2994 VCPU_CTR0(vm, vcpuid, "activated");
2995 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3021 }
3022
3023 int
3024 vm_resume_cpu(struct vm *vm, int vcpuid)
3025 {
3026
3027 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3028 return (EINVAL);
3029
3030 if (vcpuid == -1) {
3031 CPU_ZERO(&vm->debug_cpus);
3032 } else {
3033 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3034 return (EINVAL);
3035
3036 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3037 }
3038 return (0);
3039 }
3040
3041 int
3042 vcpu_debugged(struct vm *vm, int vcpuid)
3043 {
3044
3045 return (CPU_ISSET(vcpuid, &vm->debug_cpus));
3046 }
3047
3048 cpuset_t
3049 vm_active_cpus(struct vm *vm)
3050 {
3051
3052 return (vm->active_cpus);
3053 }
3054
3055 cpuset_t
3056 vm_debug_cpus(struct vm *vm)
3057 {
3058
3059 return (vm->debug_cpus);
3060 }
3061
3062 cpuset_t
3063 vm_suspended_cpus(struct vm *vm)
3064 {
3065
3066 return (vm->suspended_cpus);
3067 }
92 #include "vioapic.h"
93 #include "vlapic.h"
94 #include "vpmtmr.h"
95 #include "vrtc.h"
96 #include "vmm_stat.h"
97 #include "vmm_lapic.h"
98
99 #include "io/ppt.h"
100 #include "io/iommu.h"
101
102 struct vlapic;
103
104 /*
105 * Initialization:
106 * (a) allocated when vcpu is created
107 * (i) initialized when vcpu is created and when it is reinitialized
108 * (o) initialized the first time the vcpu is created
109 * (x) initialized before use
110 */
111 struct vcpu {
112 /* (o) protects state, run_state, hostcpu, sipi_vector */
113 struct mtx mtx;
114
115 enum vcpu_state state; /* (o) vcpu state */
116 enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */
117 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */
118 kcondvar_t state_cv; /* (o) IDLE-transition cv */
119 int hostcpu; /* (o) vcpu's current host cpu */
120 int lastloccpu; /* (o) last host cpu localized to */
121 int reqidle; /* (i) request vcpu to idle */
122 struct vlapic *vlapic; /* (i) APIC device model */
123 enum x2apic_state x2apic_state; /* (i) APIC mode */
124 uint64_t exitintinfo; /* (i) events pending at VM exit */
125 int nmi_pending; /* (i) NMI pending */
126 int extint_pending; /* (i) INTR pending */
127 int exception_pending; /* (i) exception pending */
128 int exc_vector; /* (x) exception collateral */
129 int exc_errcode_valid;
130 uint32_t exc_errcode;
131 uint8_t sipi_vector; /* (i) SIPI vector */
132 struct savefpu *guestfpu; /* (a,i) guest fpu state */
133 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
134 void *stats; /* (a,i) statistics */
135 struct vm_exit exitinfo; /* (x) exit reason and collateral */
136 uint64_t nextrip; /* (x) next instruction to execute */
137 struct vie *vie_ctx; /* (x) instruction emulation context */
138 #ifndef __FreeBSD__
139 uint64_t tsc_offset; /* (x) offset from host TSC */
140 #endif
141 };
142
143 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
144 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
145 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
146 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
147 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
148
149 struct mem_seg {
150 size_t len;
151 bool sysmem;
182 struct vatpit *vatpit; /* (i) virtual atpit */
183 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
184 struct vrtc *vrtc; /* (o) virtual RTC */
185 volatile cpuset_t active_cpus; /* (i) active vcpus */
186 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */
187 int suspend; /* (i) stop VM execution */
188 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
189 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
190 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
191 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
192 struct vmspace *vmspace; /* (o) guest's address space */
193 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
194 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
195 /* The following describe the vm cpu topology */
196 uint16_t sockets; /* (o) num of sockets */
197 uint16_t cores; /* (o) num of cores/socket */
198 uint16_t threads; /* (o) num of threads/core */
199 uint16_t maxcpus; /* (o) max pluggable cpus */
200
201 struct ioport_config ioports; /* (o) ioport handling */
202 };
203
204 static int vmm_initialized;
205
206
207 static void
208 nullop_panic(void)
209 {
210 panic("null vmm operation call");
211 }
212
213 /* Do not allow use of an un-set `ops` to do anything but panic */
214 static struct vmm_ops vmm_ops_null = {
215 .init = (vmm_init_func_t)nullop_panic,
216 .cleanup = (vmm_cleanup_func_t)nullop_panic,
217 .resume = (vmm_resume_func_t)nullop_panic,
218 .vminit = (vmi_init_func_t)nullop_panic,
219 .vmrun = (vmi_run_func_t)nullop_panic,
220 .vmcleanup = (vmi_cleanup_func_t)nullop_panic,
221 .vmgetreg = (vmi_get_register_t)nullop_panic,
222 .vmsetreg = (vmi_set_register_t)nullop_panic,
223 .vmgetdesc = (vmi_get_desc_t)nullop_panic,
224 .vmsetdesc = (vmi_set_desc_t)nullop_panic,
225 .vmgetcap = (vmi_get_cap_t)nullop_panic,
226 .vmsetcap = (vmi_set_cap_t)nullop_panic,
227 .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic,
228 .vmspace_free = (vmi_vmspace_free)nullop_panic,
229 .vlapic_init = (vmi_vlapic_init)nullop_panic,
230 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
231 .vmsavectx = (vmi_savectx)nullop_panic,
232 .vmrestorectx = (vmi_restorectx)nullop_panic,
233 };
234
235 static struct vmm_ops *ops = &vmm_ops_null;
236
237 #define VMM_INIT(num) ((*ops->init)(num))
238 #define VMM_CLEANUP() ((*ops->cleanup)())
239 #define VMM_RESUME() ((*ops->resume)())
240
241 #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
242 #define VMRUN(vmi, vcpu, rip, pmap) \
243 ((*ops->vmrun)(vmi, vcpu, rip, pmap))
244 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
245 #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
246 #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
247
248 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
249 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
250 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
251 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
252 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv))
253 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val))
254 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu))
255 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic))
256
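/*
 * fpu_stop_emulating() (clts) clears CR0.TS so guest FPU state can be loaded
 * into the hardware; fpu_start_emulating() sets CR0.TS afterwards so that any
 * host FPU/SIMD use traps with #NM instead of clobbering the guest's state.
 */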
257 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
258 #define fpu_stop_emulating() clts()
259
260 SDT_PROVIDER_DEFINE(vmm);
261
262 static MALLOC_DEFINE(M_VM, "vm", "vm");
263
265 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
266
267 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
268 NULL);
269
270 /*
271 * Halt the guest if all vcpus are executing a HLT instruction with
272 * interrupts disabled.
273 */
274 static int halt_detection_enabled = 1;
275
276 /* IPI vector used for vcpu notifications */
277 static int vmm_ipinum;
278
279 /* Trap into hypervisor on all guest exceptions and reflect them back */
280 static int trace_guest_exceptions;
281
282 static void vm_free_memmap(struct vm *vm, int ident);
283 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
284 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
285 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
286 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
287
288 #ifndef __FreeBSD__
289 static void vm_clear_memseg(struct vm *, int);
290
291 /* Flags for vtc_status */
292 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */
293 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */
294
295 typedef struct vm_thread_ctx {
296 struct vm *vtc_vm;
297 int vtc_vcpuid;
298 uint_t vtc_status;
299 } vm_thread_ctx_t;
300 #endif /* __FreeBSD__ */
301
302 #ifdef KTR
303 static const char *
304 vcpu_state2str(enum vcpu_state state)
305 {
306
345
346 if (create) {
347 #ifdef __FreeBSD__
348 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
349 "initialized", vcpu_id));
350 #endif
351 vcpu_lock_init(vcpu);
352 vcpu->state = VCPU_IDLE;
353 vcpu->hostcpu = NOCPU;
354 #ifndef __FreeBSD__
355 vcpu->lastloccpu = NOCPU;
356 #endif
357 vcpu->guestfpu = fpu_save_area_alloc();
358 vcpu->stats = vmm_stat_alloc();
359 vcpu->vie_ctx = vie_alloc();
360 } else {
361 vie_reset(vcpu->vie_ctx);
362 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
363 }
364
365 vcpu->run_state = VRS_HALT;
366 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
367 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
368 vcpu->reqidle = 0;
369 vcpu->exitintinfo = 0;
370 vcpu->nmi_pending = 0;
371 vcpu->extint_pending = 0;
372 vcpu->exception_pending = 0;
373 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
374 fpu_save_area_reset(vcpu->guestfpu);
375 vmm_stat_init(vcpu->stats);
376 }
377
378 int
379 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
380 {
381
382 return (trace_guest_exceptions);
383 }
384
385 struct vm_exit *
386 vm_exitinfo(struct vm *vm, int cpuid)
387 {
1208 return (true);
1209 default:
1210 return (false);
1211 }
1212 }
1213
1214 int
1215 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1216 {
1217
1218 if (vcpu < 0 || vcpu >= vm->maxcpus)
1219 return (EINVAL);
1220
1221 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1222 return (EINVAL);
1223
1224 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1225 }
1226
1227 int
1228 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1229 {
1230 if (vcpu < 0 || vcpu >= vm->maxcpus)
1231 return (EINVAL);
1232
1233 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1234 return (EINVAL);
1235
1236 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1237 }
1238
1239 int
1240 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1241 {
1242 struct vcpu *vcpu;
1243
1244 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1245 return (EINVAL);
1246 }
1247
1248 vcpu = &vm->vcpu[vcpuid];
1249
1250 vcpu_lock(vcpu);
1251 *state = vcpu->run_state;
1252 *sipi_vec = vcpu->sipi_vector;
1253 vcpu_unlock(vcpu);
1254
1255 return (0);
1256 }
1257
1258 int
1259 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1260 {
1261 struct vcpu *vcpu;
1262
1263 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1264 return (EINVAL);
1265 }
1266 if (!VRS_IS_VALID(state)) {
1267 return (EINVAL);
1268 }
1269
1270 vcpu = &vm->vcpu[vcpuid];
1271
1272 vcpu_lock(vcpu);
1273 vcpu->run_state = state;
1274 vcpu->sipi_vector = sipi_vec;
1275 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1276 vcpu_unlock(vcpu);
1277
1278 return (0);
1279 }
1280
1281
1282 static void
1283 restore_guest_fpustate(struct vcpu *vcpu)
1284 {
1285
1286 /* flush host state to the pcb */
1287 fpuexit(curthread);
1288
1289 /* restore guest FPU state */
1290 fpu_stop_emulating();
1291 fpurestore(vcpu->guestfpu);
1292
1293 /* restore guest XCR0 if XSAVE is enabled in the host */
1294 if (rcr4() & CR4_XSAVE)
1295 load_xcr(0, vcpu->guest_xcr0);
1296
1297 /*
1298 * The FPU is now "dirty" with the guest's state so turn on emulation
1299 * to trap any access to the FPU by the host.
1300 */
1301 fpu_start_emulating();
1372 /*
1373 * The following state transitions are allowed:
1374 * IDLE -> FROZEN -> IDLE
1375 * FROZEN -> RUNNING -> FROZEN
1376 * FROZEN -> SLEEPING -> FROZEN
1377 */
1378 switch (vcpu->state) {
1379 case VCPU_IDLE:
1380 case VCPU_RUNNING:
1381 case VCPU_SLEEPING:
1382 error = (newstate != VCPU_FROZEN);
1383 break;
1384 case VCPU_FROZEN:
1385 error = (newstate == VCPU_FROZEN);
1386 break;
1387 default:
1388 error = 1;
1389 break;
1390 }
1391
1392 if (error)
1393 return (EBUSY);
1394
1395 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1396 vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1397
1398 vcpu->state = newstate;
1399 if (newstate == VCPU_RUNNING)
1400 vcpu->hostcpu = curcpu;
1401 else
1402 vcpu->hostcpu = NOCPU;
1403
1404 if (newstate == VCPU_IDLE) {
1405 #ifdef __FreeBSD__
1406 wakeup(&vcpu->state);
1407 #else
1408 cv_broadcast(&vcpu->state_cv);
1409 #endif
1410 }
1411
1412 return (0);
1413 }
1414
1415 static void
1416 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1417 {
1418 int error;
1419
1420 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1421 panic("Error %d setting state to %d", error, newstate);
1422 }
1423
1424 static void
1425 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1426 {
1427 int error;
1428
1429 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1430 panic("Error %d setting state to %d", error, newstate);
1431 }
1432
1433 /*
1434 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1435 */
1436 static int
1437 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1438 {
1439 struct vcpu *vcpu;
1440 int t, vcpu_halted, vm_halted;
1441 bool userspace_exit = false;
1442
1443 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1444
1445 vcpu = &vm->vcpu[vcpuid];
1446 vcpu_halted = 0;
1447 vm_halted = 0;
1448
1449 vcpu_lock(vcpu);
1450 while (1) {
1451 /*
1452 * Do a final check for pending interrupts (including NMI and
1453 * INIT) before putting this thread to sleep.
1454 */
1455 if (vm_nmi_pending(vm, vcpuid))
1456 break;
1457 if (vcpu_run_state_pending(vm, vcpuid))
1458 break;
1459 if (!intr_disabled) {
1460 if (vm_extint_pending(vm, vcpuid) ||
1461 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1462 break;
1463 }
1464 }
1465
1466 /*
1467 * Also check for software events which would cause a wake-up.
1468 * This will set the appropriate exitcode directly, rather than
1469 * requiring a trip through VM_RUN().
1470 */
1471 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1472 userspace_exit = true;
1473 break;
1474 }
1475
1476 /*
1477 * Some Linux guests implement "halt" by having all vcpus
1478 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1479 * track of the vcpus that have entered this state. When all
1480 * vcpus enter the halted state the virtual machine is halted.
1481 */
1482 if (intr_disabled) {
1483 if (!vcpu_halted && halt_detection_enabled) {
1484 vcpu_halted = 1;
1485 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1486 }
1487 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1488 vm_halted = 1;
1489 break;
1490 }
1491 }
1492
1493 t = ticks;
1494 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1495 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1496 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1497 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1498 }
1499
1500 if (vcpu_halted)
1501 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1502
1503 vcpu_unlock(vcpu);
1504
1505 if (vm_halted)
1506 vm_suspend(vm, VM_SUSPEND_HALT);
1507
1508 return (userspace_exit ? -1 : 0);
1509 }
1510
1511 static int
1512 vm_handle_paging(struct vm *vm, int vcpuid)
1513 {
1514 int rv, ftype;
1515 struct vm_map *map;
1516 struct vcpu *vcpu;
1517 struct vm_exit *vme;
1518
1519 vcpu = &vm->vcpu[vcpuid];
1520 vme = &vcpu->exitinfo;
1521
1522 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1523 __func__, vme->inst_length));
1524
1525 ftype = vme->u.paging.fault_type;
1526 KASSERT(ftype == VM_PROT_READ ||
1527 ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1528 ("vm_handle_paging: invalid fault_type %d", ftype));
1817 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1818 vcpu_notify_event(vm, i);
1819 }
1820 }
1821
1822 return (-1);
1823 }
1824
1825 static int
1826 vm_handle_reqidle(struct vm *vm, int vcpuid)
1827 {
1828 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1829
1830 vcpu_lock(vcpu);
1831 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1832 vcpu->reqidle = 0;
1833 vcpu_unlock(vcpu);
1834 return (-1);
1835 }
1836
1837 static int
1838 vm_handle_run_state(struct vm *vm, int vcpuid)
1839 {
1840 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1841 bool handled = false;
1842
1843 vcpu_lock(vcpu);
1844 while (1) {
1845 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1846 vcpu_unlock(vcpu);
1847 VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1848 vcpu_lock(vcpu);
1849
1850 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1851 vcpu->run_state |= VRS_INIT;
1852 }
1853
1854 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1855 (VRS_INIT | VRS_PEND_SIPI)) {
1856 const uint8_t vector = vcpu->sipi_vector;
1857
1858 vcpu_unlock(vcpu);
1859 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1860 vcpu_lock(vcpu);
1861
1862 vcpu->run_state &= ~VRS_PEND_SIPI;
1863 vcpu->run_state |= VRS_RUN;
1864 }
1865
1866 /*
1867 * If the vCPU is now in the running state, there is no need to
1868 * wait for anything prior to re-entry.
1869 */
1870 if ((vcpu->run_state & VRS_RUN) != 0) {
1871 handled = true;
1872 break;
1873 }
1874
1875 /*
1876 * Also check for software events which would cause a wake-up.
1877 * This will set the appropriate exitcode directly, rather than
1878 * requiring a trip through VM_RUN().
1879 */
1880 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1881 break;
1882 }
1883
1884 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1885 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1886 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1887 }
1888 vcpu_unlock(vcpu);
1889
1890 return (handled ? 0 : -1);
1891 }
1892
1893 #ifndef __FreeBSD__
1894 static int
1895 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1896 {
1897 struct vcpu *cpu = &vm->vcpu[vcpuid];
1898 const uint32_t code = vme->u.msr.code;
1899 const uint64_t val = vme->u.msr.wval;
1900
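/*
 * A guest write to MSR_TSC is modeled by recording an offset from the
 * current host TSC; the backend is expected to apply this offset (see
 * vcpu_tsc_offset()) so later guest TSC reads observe the written value.
 */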
1901 switch (code) {
1902 case MSR_TSC:
1903 cpu->tsc_offset = val - rdtsc();
1904 return (0);
1905 }
1906
1907 return (-1);
1908 }
1909 #endif /* __FreeBSD__ */
1910
1911 int
1912 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1913 {
1914 int i;
1915
1916 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1917 return (EINVAL);
1918
1919 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1920 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1921 vm->suspend, how);
1922 return (EALREADY);
1923 }
1924
1925 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1926
1927 /*
1928 * Notify all active vcpus that they are now suspended.
1929 */
1930 for (i = 0; i < vm->maxcpus; i++) {
1931 if (CPU_ISSET(i, &vm->active_cpus))
1932 vcpu_notify_event(vm, i);
1933 }
1934
1935 return (0);
1936 }
1937
1938 void
1939 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1940 {
1941 struct vm_exit *vmexit;
1942
1943 vmexit = vm_exitinfo(vm, vcpuid);
1944 vmexit->rip = rip;
1945 vmexit->inst_length = 0;
1946 vmexit->exitcode = VM_EXITCODE_RUN_STATE;
1947 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
1948 }
1949
1950
1951 #ifndef __FreeBSD__
1952 /*
1953 * Some vmm resources, such as the lapic, may have CPU-specific resources
1954 * allocated to them that would benefit from migration onto the host CPU that
1955 * is processing the vcpu state.
1956 */
1957 static void
1958 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
1959 {
1960 /*
1961 * Localizing cyclic resources requires acquisition of cpu_lock, and
1962 * doing so with kpreempt disabled is a recipe for deadlock disaster.
1963 */
1964 VERIFY(curthread->t_preempt == 0);
1965
1966 /*
1967 * Do not bother with localization if this vCPU is about to return to
1968 * the host CPU it was last localized to.
1969 */
1970 if (vcpu->lastloccpu == curcpu)
2052 #endif /* __FreeBSD */
2053
2054 static int
2055 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2056 struct vm_exit *vme)
2057 {
2058 struct vcpu *vcpu;
2059 struct vie *vie;
2060 int err;
2061
2062 vcpu = &vm->vcpu[vcpuid];
2063 vie = vcpu->vie_ctx;
2064 err = 0;
2065
2066 switch (entry->cmd) {
2067 case VEC_DEFAULT:
2068 return (0);
2069 case VEC_DISCARD_INSTR:
2070 vie_reset(vie);
2071 return (0);
2072 case VEC_FULFILL_MMIO:
2073 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2074 if (err == 0) {
2075 err = vie_emulate_mmio(vie, vm, vcpuid);
2076 if (err == 0) {
2077 vie_advance_pc(vie, &vcpu->nextrip);
2078 } else if (err < 0) {
2079 vie_exitinfo(vie, vme);
2080 } else if (err == EAGAIN) {
2081 /*
2082 * Clear the instruction emulation state in
2083 * order to re-enter VM context and continue
2084 * this 'rep <instruction>'
2085 */
2086 vie_reset(vie);
2087 err = 0;
2088 }
2089 }
2090 break;
2091 case VEC_FULFILL_INOUT:
2092 err = vie_fulfill_inout(vie, &entry->u.inout);
2093 if (err == 0) {
2094 err = vie_emulate_inout(vie, vm, vcpuid);
2095 if (err == 0) {
2096 vie_advance_pc(vie, &vcpu->nextrip);
2097 } else if (err < 0) {
2098 vie_exitinfo(vie, vme);
2099 } else if (err == EAGAIN) {
2100 /*
2101 * Clear the instruction emulation state in
2102 * order to re-enter VM context and continue
2103 * this 'rep ins/outs'
2104 */
2105 vie_reset(vie);
2106 err = 0;
2107 }
2108 }
2109 break;
2110 default:
2111 return (EINVAL);
2112 }
2113 return (err);
2114 }
2115
2116 static int
2117 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2118 {
2119 struct vie *vie;
2120
2121 vie = vm->vcpu[vcpuid].vie_ctx;
2122
2123 if (vie_pending(vie)) {
2124 /*
2125 * Userspace has not fulfilled the pending needs of the
2126 * instruction emulation, so bail back out.
2127 */
2128 vie_exitinfo(vie, vme);
2129 return (-1);
2130 }
2131
2132 return (0);
2133 }
2134
2135 int
2136 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2137 {
2138 int error;
2139 struct vcpu *vcpu;
2140 #ifdef __FreeBSD__
2141 struct pcb *pcb;
2142 #endif
2143 uint64_t tscval;
2144 struct vm_exit *vme;
2145 bool intr_disabled;
2146 pmap_t pmap;
2147 #ifndef __FreeBSD__
2148 vm_thread_ctx_t vtc;
2149 int affinity_type = CPU_CURRENT;
2150 #endif
2151
2152 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2153 return (EINVAL);
2154
2155 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2156 return (EINVAL);
2157
2158 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2159 return (EINVAL);
2160
2161 pmap = vmspace_pmap(vm->vmspace);
2162 vcpu = &vm->vcpu[vcpuid];
2163 vme = &vcpu->exitinfo;
2164
2165 #ifndef __FreeBSD__
2166 vtc.vtc_vm = vm;
2167 vtc.vtc_vcpuid = vcpuid;
2168 vtc.vtc_status = 0;
2169
2170 installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2171 NULL, vmm_freectx);
2172 #endif
2173
2174 error = vm_entry_actions(vm, vcpuid, entry, vme);
2175 if (error != 0) {
2176 goto exit;
2177 }
2178
2179 restart:
2180 error = vm_loop_checks(vm, vcpuid, vme);
2181 if (error != 0) {
2182 goto exit;
2183 }
2206
2207 #ifdef __FreeBSD__
2208 pcb = PCPU_GET(curpcb);
2209 set_pcb_flags(pcb, PCB_FULL_IRET);
2210 #else
2211 /* Force a trip through update_sregs to reload %fs/%gs and friends */
2212 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2213 #endif
2214
2215 #ifdef __FreeBSD__
2216 restore_guest_fpustate(vcpu);
2217 #else
2218 if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2219 restore_guest_fpustate(vcpu);
2220 vtc.vtc_status |= VTCS_FPU_RESTORED;
2221 }
2222 vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2223 #endif
2224
2225 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2226 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2227 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2228
2229 #ifdef __FreeBSD__
2230 save_guest_fpustate(vcpu);
2231 #else
2232 vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2233 #endif
2234
2235 #ifndef __FreeBSD__
2236 /*
2237 * Once clear of the delicate contexts comprising the VM_RUN handler,
2238 * thread CPU affinity can be loosened while other processing occurs.
2239 */
2240 thread_affinity_clear(curthread);
2241 #endif
2242
2243 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
2244
2245 critical_exit();
2246
2247 if (error != 0) {
2248 /* Communicate out any error from VMRUN() above */
2249 goto exit;
2250 }
2251
2252 vcpu->nextrip = vme->rip + vme->inst_length;
2253 switch (vme->exitcode) {
2254 case VM_EXITCODE_REQIDLE:
2255 error = vm_handle_reqidle(vm, vcpuid);
2256 break;
2257 case VM_EXITCODE_RUN_STATE:
2258 error = vm_handle_run_state(vm, vcpuid);
2259 break;
2260 case VM_EXITCODE_SUSPENDED:
2261 error = vm_handle_suspend(vm, vcpuid);
2262 break;
2263 case VM_EXITCODE_IOAPIC_EOI:
2264 vioapic_process_eoi(vm, vcpuid,
2265 vme->u.ioapic_eoi.vector);
2266 break;
2267 case VM_EXITCODE_HLT:
2268 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2269 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2270 break;
2271 case VM_EXITCODE_PAGING:
2272 error = vm_handle_paging(vm, vcpuid);
2273 break;
2274 case VM_EXITCODE_MMIO_EMUL:
2275 error = vm_handle_mmio_emul(vm, vcpuid);
2276 break;
2277 case VM_EXITCODE_INOUT:
2278 error = vm_handle_inout(vm, vcpuid, vme);
2279 break;
2280 case VM_EXITCODE_MONITOR:
2281 case VM_EXITCODE_MWAIT:
2282 case VM_EXITCODE_VMINSN:
2283 vm_inject_ud(vm, vcpuid);
2284 break;
2285 #ifndef __FreeBSD__
2286 case VM_EXITCODE_WRMSR:
2757 }
2758
2759 void
2760 vm_extint_clear(struct vm *vm, int vcpuid)
2761 {
2762 struct vcpu *vcpu;
2763
2764 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2765 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2766
2767 vcpu = &vm->vcpu[vcpuid];
2768
2769 if (vcpu->extint_pending == 0)
2770 panic("vm_extint_clear: inconsistent extint_pending state");
2771
2772 vcpu->extint_pending = 0;
2773 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2774 }
2775
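/*
 * Latch a pending INIT in the vcpu's run_state and kick the vcpu so the
 * request is handled by vm_handle_run_state() on its next pass through
 * VM_RUN.
 */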
2776 int
2777 vm_inject_init(struct vm *vm, int vcpuid)
2778 {
2779 struct vcpu *vcpu;
2780
2781 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2782 return (EINVAL);
2783
2784 vcpu = &vm->vcpu[vcpuid];
2785 vcpu_lock(vcpu);
2786 vcpu->run_state |= VRS_PEND_INIT;
2787 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2788 vcpu_unlock(vcpu);
2789 return (0);
2790 }
2791
2792 int
2793 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2794 {
2795 struct vcpu *vcpu;
2796
2797 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2798 return (EINVAL);
2799
2800 vcpu = &vm->vcpu[vcpuid];
2801 vcpu_lock(vcpu);
2802 vcpu->run_state |= VRS_PEND_SIPI;
2803 vcpu->sipi_vector = vector;
2804 /* SIPI is only actionable if the CPU is waiting in INIT state */
2805 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2806 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2807 }
2808 vcpu_unlock(vcpu);
2809 return (0);
2810 }
2811
2812 bool
2813 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2814 {
2815 struct vcpu *vcpu;
2816
2817 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2818 vcpu = &vm->vcpu[vcpuid];
2819
2820 /* Of interest: vCPU not in running state or with pending INIT */
2821 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2822 }
2823
2824 int
2825 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2826 {
2827 struct seg_desc desc;
2828 const enum vm_reg_name clear_regs[] = {
2829 VM_REG_GUEST_CR2,
2830 VM_REG_GUEST_CR3,
2831 VM_REG_GUEST_CR4,
2832 VM_REG_GUEST_RAX,
2833 VM_REG_GUEST_RBX,
2834 VM_REG_GUEST_RCX,
2835 VM_REG_GUEST_RSI,
2836 VM_REG_GUEST_RDI,
2837 VM_REG_GUEST_RBP,
2838 VM_REG_GUEST_RSP,
2839 VM_REG_GUEST_R8,
2840 VM_REG_GUEST_R9,
2841 VM_REG_GUEST_R10,
2842 VM_REG_GUEST_R11,
2843 VM_REG_GUEST_R12,
2844 VM_REG_GUEST_R13,
2845 VM_REG_GUEST_R14,
2846 VM_REG_GUEST_R15,
2847 VM_REG_GUEST_DR0,
2848 VM_REG_GUEST_DR1,
2849 VM_REG_GUEST_DR2,
2850 VM_REG_GUEST_DR3,
2851 VM_REG_GUEST_EFER,
2852 };
2853 const enum vm_reg_name data_segs[] = {
2854 VM_REG_GUEST_SS,
2855 VM_REG_GUEST_DS,
2856 VM_REG_GUEST_ES,
2857 VM_REG_GUEST_FS,
2858 VM_REG_GUEST_GS,
2859 };
2860 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2861
2862 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2863 return (EINVAL);
2864
2865 for (uint_t i = 0; i < nitems(clear_regs); i++) {
2866 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2867 }
2868
2869 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2870 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2871 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2872
2873 /*
2874 * The prescribed contents of %rdx differ slightly between the Intel and
2875 * AMD architectural definitions. The former expects the Extended Model
2876 * in bits 16-19 where the latter expects all the Family, Model, and
2877 * Stepping to be there. Common boot ROMs appear to disregard this
2878 * anyway, so we stick with a compromise value similar to what is
2879 * spelled out in the Intel SDM.
2880 */
2881 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2882
2883 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2884 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2885
2886 /* CS: Present, R/W, Accessed */
2887 desc.access = 0x0093;
2888 desc.base = 0xffff0000;
2889 desc.limit = 0xffff;
2890 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2891 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2892
2893 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2894 desc.access = 0x0093;
2895 desc.base = 0;
2896 desc.limit = 0xffff;
2897 for (uint_t i = 0; i < nitems(data_segs); i++) {
2898 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2899 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2900 }
2901
2902 /* GDTR, IDTR */
2903 desc.base = 0;
2904 desc.limit = 0xffff;
2905 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2906 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2907
2908 /* LDTR: Present, LDT */
2909 desc.access = 0x0082;
2910 desc.base = 0;
2911 desc.limit = 0xffff;
2912 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2913 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2914
2915 /* TR: Present, 32-bit TSS */
2916 desc.access = 0x008b;
2917 desc.base = 0;
2918 desc.limit = 0xffff;
2919 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2920 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2921
2922 vlapic_reset(vm_lapic(vm, vcpuid));
2923
2924 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2925
2926 vcpu->exitintinfo = 0;
2927 vcpu->exception_pending = 0;
2928 vcpu->nmi_pending = 0;
2929 vcpu->extint_pending = 0;
2930
2931 /*
2932 * A CPU reset caused by power-on or system reset clears more state than
2933 * one which is triggered by an INIT IPI.
2934 */
2935 if (!init_only) {
2936 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2937 fpu_save_area_reset(vcpu->guestfpu);
2938
2939 /* XXX: clear MSRs and other pieces */
2940 }
2941
2942 return (0);
2943 }
2944
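/*
 * Vector a vCPU according to a received SIPI: %cs is loaded with a real-mode
 * segment of (vector << 8) and a base of (vector << 12), and %rip is cleared,
 * so execution resumes at guest-physical address vector * 4KB.
 */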
2945 static int
2946 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2947 {
2948 struct seg_desc desc;
2949
2950 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2951 return (EINVAL);
2952
2953 /* CS: Present, R/W, Accessed */
2954 desc.access = 0x0093;
2955 desc.base = (uint64_t)vector << 12;
2956 desc.limit = 0xffff;
2957 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2958 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2959 (uint64_t)vector << 8));
2960
2961 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2962
2963 return (0);
2964 }
2965
2966 int
2967 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2968 {
2969 if (vcpu < 0 || vcpu >= vm->maxcpus)
2970 return (EINVAL);
2971
2972 if (type < 0 || type >= VM_CAP_MAX)
2973 return (EINVAL);
2974
2975 return (VMGETCAP(vm->cookie, vcpu, type, retval));
2976 }
2977
2978 int
2979 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
2980 {
2981 if (vcpu < 0 || vcpu >= vm->maxcpus)
2982 return (EINVAL);
2983
2984 if (type < 0 || type >= VM_CAP_MAX)
2985 return (EINVAL);
2986
3049 }
3050 return (found);
3051 }
3052 #endif
3053
3054 void *
3055 vm_iommu_domain(struct vm *vm)
3056 {
3057
3058 return (vm->iommu);
3059 }
3060
3061 int
3062 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3063 bool from_idle)
3064 {
3065 int error;
3066 struct vcpu *vcpu;
3067
3068 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3069 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3070
3071 vcpu = &vm->vcpu[vcpuid];
3072
3073 vcpu_lock(vcpu);
3074 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3075 vcpu_unlock(vcpu);
3076
3077 return (error);
3078 }
3079
3080 enum vcpu_state
3081 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3082 {
3083 struct vcpu *vcpu;
3084 enum vcpu_state state;
3085
3086 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3087 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3088
3089 vcpu = &vm->vcpu[vcpuid];
3090
3091 vcpu_lock(vcpu);
3092 state = vcpu->state;
3093 if (hostcpu != NULL)
3094 *hostcpu = vcpu->hostcpu;
3095 vcpu_unlock(vcpu);
3096
3097 return (state);
3098 }
3099
3100 #ifndef __FreeBSD__
3101 uint64_t
3102 vcpu_tsc_offset(struct vm *vm, int vcpuid)
3103 {
3104 return (vm->vcpu[vcpuid].tsc_offset);
3105 }
3106 #endif /* __FreeBSD__ */
3107
3108 int
3109 vm_activate_cpu(struct vm *vm, int vcpuid)
3110 {
3111
3112 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3113 return (EINVAL);
3114
3115 if (CPU_ISSET(vcpuid, &vm->active_cpus))
3116 return (EBUSY);
3117
3118 VCPU_CTR0(vm, vcpuid, "activated");
3119 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3145 }
3146
3147 int
3148 vm_resume_cpu(struct vm *vm, int vcpuid)
3149 {
3150
3151 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3152 return (EINVAL);
3153
3154 if (vcpuid == -1) {
3155 CPU_ZERO(&vm->debug_cpus);
3156 } else {
3157 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3158 return (EINVAL);
3159
3160 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3161 }
3162 return (0);
3163 }
3164
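/*
 * Check for conditions which require a vCPU to bail out of guest context:
 * VM suspend, a reqidle request, a host AST, or the vCPU being held for
 * debug.  If one is found, the exit information is populated accordingly and
 * 'true' is returned.
 */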
3165 static bool
3166 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3167 uint64_t entry_rip)
3168 {
3169 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3170 struct vm_exit *vme = &vcpu->exitinfo;
3171 bool bail = false;
3172
3173 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3174
3175 if (vm->suspend) {
3176 if (on_entry) {
3177 VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3178 vm->suspend < VM_SUSPEND_LAST);
3179
3180 vme->exitcode = VM_EXITCODE_SUSPENDED;
3181 vme->u.suspended.how = vm->suspend;
3182 } else {
3183 /*
3184 * Handling VM suspend is complicated, so if that
3185 * condition is detected outside of VM-entry itself,
3186 * just emit a BOGUS exitcode so we take a lap to pick
3187 * up the event during an entry and are directed into
3188 * the vm_handle_suspend() logic.
3189 */
3190 vme->exitcode = VM_EXITCODE_BOGUS;
3191 }
3192 bail = true;
3193 }
3194 if (vcpu->reqidle) {
3195 vme->exitcode = VM_EXITCODE_REQIDLE;
3196 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3197
3198 if (!on_entry) {
3199 /*
3200 * A reqidle request detected outside of VM-entry can be
3201 * handled directly by clearing the request (and taking
3202 * a lap to userspace).
3203 */
3204 vcpu_assert_locked(vcpu);
3205 vcpu->reqidle = 0;
3206 }
3207 bail = true;
3208 }
3209 if (vcpu_should_yield(vm, vcpuid)) {
3210 vme->exitcode = VM_EXITCODE_BOGUS;
3211 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3212 bail = true;
3213 }
3214 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3215 vme->exitcode = VM_EXITCODE_DEBUG;
3216 bail = true;
3217 }
3218
3219 if (bail) {
3220 if (on_entry) {
3221 /*
3222 * If bailing out during VM-entry, the current %rip must
3223 * be recorded in the exitinfo.
3224 */
3225 vme->rip = entry_rip;
3226 }
3227 vme->inst_length = 0;
3228 }
3229 return (bail);
3230 }
3231
3232 static bool
3233 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3234 {
3235 /*
3236 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3237 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3238 * structure, so only the exitcode is modified here.
3239 */
3240 return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3241 }
3242
3243 bool
3244 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3245 {
3246 /*
3247 * Bail-out checks done as part of VM entry require an updated %rip to
3248 * populate the vm_exit struct if any of the conditions of interest are
3249 * matched in the check.
3250 */
3251 return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3252 }
3253
3254 cpuset_t
3255 vm_active_cpus(struct vm *vm)
3256 {
3257
3258 return (vm->active_cpus);
3259 }
3260
3261 cpuset_t
3262 vm_debug_cpus(struct vm *vm)
3263 {
3264
3265 return (vm->debug_cpus);
3266 }
3267
3268 cpuset_t
3269 vm_suspended_cpus(struct vm *vm)
3270 {
3271
3272 return (vm->suspended_cpus);
3273 }
|