1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12
13 /*
14 * Copyright 2015 Pluribus Networks Inc.
15 * Copyright 2019 Joyent, Inc.
16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17 * Copyright 2021 Oxide Computer Company
18 */
19
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48
49 #include <vm/seg_dev.h>
50
51 #include "io/ppt.h"
52 #include "io/vatpic.h"
53 #include "io/vioapic.h"
54 #include "io/vrtc.h"
55 #include "io/vhpet.h"
56 #include "io/vpmtmr.h"
57 #include "vmm_lapic.h"
58 #include "vmm_stat.h"
59 #include "vmm_util.h"
60
61 /*
62 * Locking details:
63 *
64 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
65 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
66 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
67 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
68 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
69 */
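
/*
 * For illustration only (a sketch, not an actual helper in this driver): a
 * path which needed both the driver-wide and the per-instance state would
 * take the locks in the documented order:
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	// ... examine or update vmmdev_* and vmm_* state ...
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 */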
70
71 static kmutex_t vmmdev_mtx;
72 static dev_info_t *vmmdev_dip;
73 static hma_reg_t *vmmdev_hma_reg;
74 static uint_t vmmdev_hma_ref;
75 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
76
77 static kmutex_t vmm_mtx;
78 static list_t vmm_list;
79 static list_t vmm_destroy_list;
80 static id_space_t *vmm_minors;
81 static void *vmm_statep;
82
83 static const char *vmmdev_hvm_name = "bhyve";
84
85 /* For sdev plugin (/dev) */
86 #define VMM_SDEV_ROOT "/dev/vmm"
87
88 /* From uts/i86pc/io/vmm/intel/vmx.c */
89 extern int vmx_x86_supported(const char **);
90
91 /* Holds and hooks from drivers external to vmm */
92 struct vmm_hold {
93 list_node_t vmh_node;
94 vmm_softc_t *vmh_sc;
95 boolean_t vmh_release_req;
96 uint_t vmh_ioport_hook_cnt;
97 };
98
99 struct vmm_lease {
100 list_node_t vml_node;
101 struct vm *vml_vm;
102 boolean_t vml_expired;
103 boolean_t (*vml_expire_func)(void *);
104 void *vml_expire_arg;
105 list_node_t vml_expire_node;
106 struct vmm_hold *vml_hold;
107 };
108
109 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
110 static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);
111 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
112 static void vmm_kstat_init(vmm_softc_t *);
113 static void vmm_kstat_fini(vmm_softc_t *);
114
115 static int
116 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
117 {
118 int error;
119 bool sysmem;
120
121 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
122 NULL);
123 if (error || mseg->len == 0)
124 return (error);
125
126 if (!sysmem) {
127 vmm_devmem_entry_t *de;
128 list_t *dl = &sc->vmm_devmem_list;
129
130 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
131 if (de->vde_segid == mseg->segid) {
132 break;
133 }
134 }
135 if (de != NULL) {
136 (void) strlcpy(mseg->name, de->vde_name,
137 sizeof (mseg->name));
138 }
139 } else {
140 bzero(mseg->name, sizeof (mseg->name));
141 }
142
143 return (error);
144 }
145
146 /*
147 * The 'devmem' hack:
148 *
149 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
150 * in the vm which appear with their own name related to the vm under /dev.
151 * Since this would be a hassle from an sdev perspective and would require a
152 * new cdev interface (or complicate the existing one), we choose to implement
153 * this in a different manner. When 'devmem' mappings are created, an
154 * identifying off_t is communicated back out to userspace. That off_t,
155 * residing above the normal guest memory space, can be used to mmap the
156 * 'devmem' mapping from the already-open vm device.
157 */
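
/*
 * A rough sketch of that scheme from a consumer's perspective (illustrative
 * userspace code; 'fd', 'segid', and 'seg_len' stand in for values the
 * consumer already has, where 'fd' is the open vm device and 'segid' names a
 * previously created devmem segment):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(fd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		// vdo.offset resides at or above VM_DEVMEM_START, so an
 *		// mmap() of the vm device at that offset maps the devmem
 *		// segment itself rather than guest-physical memory.
 *		void *p = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, fd, vdo.offset);
 *	}
 */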
158
159 static int
160 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
161 {
162 off_t map_offset;
163 vmm_devmem_entry_t *entry;
164
165 if (list_is_empty(&sc->vmm_devmem_list)) {
166 map_offset = VM_DEVMEM_START;
167 } else {
168 entry = list_tail(&sc->vmm_devmem_list);
169 map_offset = entry->vde_off + entry->vde_len;
170 if (map_offset < entry->vde_off) {
171 /* Do not tolerate overflow */
172 return (ERANGE);
173 }
174 /*
175 * XXXJOY: We could choose to search the list for duplicate
176 * names and toss an error. Since we're using the offset
177 * method for now, it does not make much of a difference.
178 */
179 }
180
181 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
182 entry->vde_segid = mseg->segid;
183 entry->vde_len = mseg->len;
184 entry->vde_off = map_offset;
185 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
186 list_insert_tail(&sc->vmm_devmem_list, entry);
187
188 return (0);
189 }
190
191 static boolean_t
192 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
193 off_t *map_offp)
194 {
195 list_t *dl = &sc->vmm_devmem_list;
196 vmm_devmem_entry_t *de = NULL;
197 const off_t map_end = off + len;
198
199 VERIFY(off >= VM_DEVMEM_START);
200
201 if (map_end < off) {
202 /* No match on overflow */
203 return (B_FALSE);
204 }
205
206 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
207 const off_t item_end = de->vde_off + de->vde_len;
208
209 if (de->vde_off <= off && item_end >= map_end) {
210 *segidp = de->vde_segid;
211 *map_offp = off - de->vde_off;
212 return (B_TRUE);
213 }
214 }
215 return (B_FALSE);
216 }
217
218 static void
219 vmmdev_devmem_purge(vmm_softc_t *sc)
220 {
221 vmm_devmem_entry_t *entry;
222
223 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
224 kmem_free(entry, sizeof (*entry));
225 }
226 }
227
228 static int
229 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
230 {
231 int error;
232 bool sysmem = true;
233
234 if (VM_MEMSEG_NAME(mseg)) {
235 sysmem = false;
236 }
237 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
238
239 if (error == 0 && VM_MEMSEG_NAME(mseg)) {
240 /*
241 * Rather than create a whole fresh device from which userspace
242 * can mmap this segment, instead make it available at an
243 * offset above where the main guest memory resides.
244 */
245 error = vmmdev_devmem_create(sc, mseg, mseg->name);
246 if (error != 0) {
247 vm_free_memseg(sc->vmm_vm, mseg->segid);
248 }
249 }
250 return (error);
251 }
252
253 /*
254 * Resource Locking and Exclusion
255 *
256 * Much of bhyve depends on key portions of VM state, such as the guest memory
257 * map, to remain unchanged while the guest is running. As ported from
258 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
259 * access to the instance vCPUs. Threads acting on a single vCPU, like those
260 * performing the work of actually running the guest in VMX/SVM, would lock
261 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
262 * state, all of the vCPUs would be first locked, ensuring that the
263 * operation(s) could complete without any other threads stumbling into
264 * intermediate states.
265 *
266 * This approach is largely effective for bhyve. Common operations, such as
267 * running the vCPUs, steer clear of lock contention. The model begins to
268 * break down for operations which do not occur in the context of a specific
269 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
270 * thread in the bhyve process. In order to properly protect those vCPU-less
271 * operations from encountering invalid states, additional locking is required.
272 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
273 * It does mean that class of operations will be serialized on locking the
274 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
275 * undue contention on the VM_MAXCPU-1 vCPU.
276 *
277 * In order to address the shortcomings of this model, the concept of a
278 * read/write lock has been added to bhyve. Operations which change
279 * fundamental aspects of a VM (such as the memory map) must acquire the write
280 * lock, which also implies locking all of the vCPUs and waiting for all read
281 * lock holders to release. While it increases the cost and waiting time for
282 * those few operations, it allows most hot-path operations on the VM (which
283 * depend on its configuration remaining stable) to occur with minimal locking.
284 *
285 * Consumers of the Driver API (see below) are a special case when it comes to
286 * this locking, since they may hold a read lock via the drv_lease mechanism
287 * for an extended period of time. Rather than forcing those consumers to
288 * continuously poll for a write lock attempt, the lease system forces them to
289 * provide a release callback to trigger their clean-up (and potential later
290 * reacquisition) of the read lock.
291 */
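
/*
 * As a sketch of that lease contract (hypothetical consumer code): the
 * callback passed to vmm_drv_lease_sign() is invoked when a writer needs the
 * VM.  Returning B_TRUE allows the lease to be broken (and freed)
 * synchronously on the writer's behalf; returning B_FALSE obligates the
 * consumer to call vmm_drv_lease_break() itself promptly, since the writer
 * blocks until the associated read lock is released.
 *
 *	static boolean_t
 *	example_lease_expired(void *arg)
 *	{
 *		example_state_t *state = arg;	// hypothetical consumer state
 *
 *		state->need_release = B_TRUE;	// note it; re-lease later
 *		return (B_FALSE);	// consumer will break the lease itself
 *	}
 */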
292
293 static void
294 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
295 {
296 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
297
298 /*
299 * Since this state transition uses from_idle=true, it should not
300 * fail, but rather block until it succeeds.
301 */
302 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
303 }
304
305 static void
306 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
307 {
308 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
309
310 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
311 vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
312 }
313
314 static void
315 vmm_read_lock(vmm_softc_t *sc)
316 {
317 rw_enter(&sc->vmm_rwlock, RW_READER);
318 }
319
320 static void
321 vmm_read_unlock(vmm_softc_t *sc)
322 {
323 rw_exit(&sc->vmm_rwlock);
324 }
325
326 static void
327 vmm_write_lock(vmm_softc_t *sc)
328 {
329 int maxcpus;
330
331 /* First lock all the vCPUs */
332 maxcpus = vm_get_maxcpus(sc->vmm_vm);
333 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
334 vcpu_lock_one(sc, vcpu);
335 }
336
337 mutex_enter(&sc->vmm_lease_lock);
338 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
339 sc->vmm_lease_blocker++;
340 if (sc->vmm_lease_blocker == 1) {
341 list_t *list = &sc->vmm_lease_list;
342 vmm_lease_t *lease = list_head(list);
343
344 while (lease != NULL) {
345 boolean_t sync_break = B_FALSE;
346
347 if (!lease->vml_expired) {
348 void *arg = lease->vml_expire_arg;
349 lease->vml_expired = B_TRUE;
350 sync_break = lease->vml_expire_func(arg);
351 }
352
353 if (sync_break) {
354 vmm_lease_t *next;
355
356 /*
357 * These leases which are synchronously broken
358 * result in vmm_read_unlock() calls from a
359 * different thread than the corresponding
360 * vmm_read_lock(). This is acceptable, given
361 * that the rwlock underpinning the whole
362 * mechanism tolerates the behavior. This
363 * flexibility is _only_ afforded to VM read
364 * lock (RW_READER) holders.
365 */
366 next = list_next(list, lease);
367 vmm_lease_break_locked(sc, lease);
368 lease = next;
369 } else {
370 lease = list_next(list, lease);
371 }
372 }
373 }
374 mutex_exit(&sc->vmm_lease_lock);
375
376 rw_enter(&sc->vmm_rwlock, RW_WRITER);
377 /*
378 * For now, the 'maxcpus' value for an instance is fixed at the
379 * compile-time constant of VM_MAXCPU at creation. If this changes in
380 * the future, allowing for dynamic vCPU resource sizing, acquisition
381 * of the write lock will need to be wary of such changes.
382 */
383 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
384 }
385
386 static void
387 vmm_write_unlock(vmm_softc_t *sc)
388 {
389 int maxcpus;
390
391 mutex_enter(&sc->vmm_lease_lock);
392 VERIFY3U(sc->vmm_lease_blocker, !=, 0);
393 sc->vmm_lease_blocker--;
394 if (sc->vmm_lease_blocker == 0) {
395 cv_broadcast(&sc->vmm_lease_cv);
396 }
397 mutex_exit(&sc->vmm_lease_lock);
398
399 /*
400 * The VM write lock _must_ be released from the same thread it was
401 * acquired in, unlike the read lock.
402 */
403 VERIFY(rw_write_held(&sc->vmm_rwlock));
404 rw_exit(&sc->vmm_rwlock);
405
406 /* Unlock all the vCPUs */
407 maxcpus = vm_get_maxcpus(sc->vmm_vm);
408 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
409 vcpu_unlock_one(sc, vcpu);
410 }
411 }
412
413 static int
414 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
415 cred_t *credp, int *rvalp)
416 {
417 int error = 0, vcpu = -1;
418 void *datap = (void *)arg;
419 enum vm_lock_type {
420 LOCK_NONE = 0,
421 LOCK_VCPU,
422 LOCK_READ_HOLD,
423 LOCK_WRITE_HOLD
424 } lock_type = LOCK_NONE;
425
426 /* Acquire any exclusion resources needed for the operation. */
427 switch (cmd) {
428 case VM_RUN:
429 case VM_GET_REGISTER:
430 case VM_SET_REGISTER:
431 case VM_GET_SEGMENT_DESCRIPTOR:
432 case VM_SET_SEGMENT_DESCRIPTOR:
433 case VM_GET_REGISTER_SET:
434 case VM_SET_REGISTER_SET:
435 case VM_INJECT_EXCEPTION:
436 case VM_GET_CAPABILITY:
437 case VM_SET_CAPABILITY:
438 case VM_PPTDEV_MSI:
439 case VM_PPTDEV_MSIX:
440 case VM_SET_X2APIC_STATE:
441 case VM_GLA2GPA:
442 case VM_GLA2GPA_NOFAULT:
443 case VM_ACTIVATE_CPU:
444 case VM_SET_INTINFO:
445 case VM_GET_INTINFO:
446 case VM_RESTART_INSTRUCTION:
447 case VM_SET_KERNEMU_DEV:
448 case VM_GET_KERNEMU_DEV:
449 case VM_RESET_CPU:
450 case VM_GET_RUN_STATE:
451 case VM_SET_RUN_STATE:
452 /*
453 * Copy in the ID of the vCPU chosen for this operation.
454 * Since a nefarious caller could update their struct between
455 * this locking and when the rest of the ioctl data is copied
456 * in, it is _critical_ that this local 'vcpu' variable be used
457 * rather than the in-struct one when performing the ioctl.
458 */
459 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
460 return (EFAULT);
461 }
462 if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
463 return (EINVAL);
464 }
465 vcpu_lock_one(sc, vcpu);
466 lock_type = LOCK_VCPU;
467 break;
468
469 case VM_REINIT:
470 case VM_BIND_PPTDEV:
471 case VM_UNBIND_PPTDEV:
472 case VM_MAP_PPTDEV_MMIO:
473 case VM_UNMAP_PPTDEV_MMIO:
474 case VM_ALLOC_MEMSEG:
475 case VM_MMAP_MEMSEG:
476 case VM_MUNMAP_MEMSEG:
477 case VM_WRLOCK_CYCLE:
478 case VM_PMTMR_LOCATE:
479 vmm_write_lock(sc);
480 lock_type = LOCK_WRITE_HOLD;
481 break;
482
483 case VM_GET_GPA_PMAP:
484 case VM_GET_MEMSEG:
485 case VM_MMAP_GETNEXT:
486 case VM_LAPIC_IRQ:
487 case VM_INJECT_NMI:
488 case VM_IOAPIC_ASSERT_IRQ:
489 case VM_IOAPIC_DEASSERT_IRQ:
490 case VM_IOAPIC_PULSE_IRQ:
491 case VM_LAPIC_MSI:
492 case VM_LAPIC_LOCAL_IRQ:
493 case VM_GET_X2APIC_STATE:
494 case VM_RTC_READ:
495 case VM_RTC_WRITE:
496 case VM_RTC_SETTIME:
497 case VM_RTC_GETTIME:
498 case VM_PPTDEV_DISABLE_MSIX:
499 case VM_DEVMEM_GETOFFSET:
500 vmm_read_lock(sc);
501 lock_type = LOCK_READ_HOLD;
502 break;
503
504 case VM_IOAPIC_PINCOUNT:
505 default:
506 break;
507 }
508
509 /* Execute the primary logic for the ioctl. */
510 switch (cmd) {
511 case VM_RUN: {
512 struct vm_entry entry;
513
514 if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
515 error = EFAULT;
516 break;
517 }
518
519 if (!(curthread->t_schedflag & TS_VCPU))
520 smt_mark_as_vcpu();
521
522 error = vm_run(sc->vmm_vm, vcpu, &entry);
523
524 /*
525 * Unexpected states in vm_run() are expressed through positive
526 * errno-oriented return values. VM states which expect further
527 * processing in userspace (necessary context via exitinfo) are
528 * expressed through negative return values. For the time being
529 * a return value of 0 is not expected from vm_run().
530 */
531 ASSERT(error != 0);
532 if (error < 0) {
533 const struct vm_exit *vme;
534 void *outp = entry.exit_data;
535
536 error = 0;
537 vme = vm_exitinfo(sc->vmm_vm, vcpu);
538 if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
539 error = EFAULT;
540 }
541 }
542 break;
543 }
544 case VM_SUSPEND: {
545 struct vm_suspend vmsuspend;
546
547 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
548 error = EFAULT;
549 break;
550 }
551 error = vm_suspend(sc->vmm_vm, vmsuspend.how);
552 break;
553 }
554 case VM_REINIT:
555 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
556 /*
557 * The VM instance should be free of driver-attached
558 * hooks during the reinitialization process.
559 */
560 break;
561 }
562 error = vm_reinit(sc->vmm_vm);
563 (void) vmm_drv_block_hook(sc, B_FALSE);
564 break;
565 case VM_STAT_DESC: {
566 struct vm_stat_desc statdesc;
567
568 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
569 error = EFAULT;
570 break;
571 }
572 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
573 sizeof (statdesc.desc));
574 if (error == 0 &&
575 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
576 error = EFAULT;
577 break;
578 }
579 break;
580 }
581 case VM_STATS_IOC: {
582 struct vm_stats vmstats;
583
584 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
585 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
586 error = EFAULT;
587 break;
588 }
589 hrt2tv(gethrtime(), &vmstats.tv);
590 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
591 &vmstats.num_entries, vmstats.statbuf);
592 if (error == 0 &&
593 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
594 error = EFAULT;
595 break;
596 }
597 break;
598 }
599
600 case VM_PPTDEV_MSI: {
601 struct vm_pptdev_msi pptmsi;
602
603 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
604 error = EFAULT;
605 break;
606 }
607 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
608 pptmsi.addr, pptmsi.msg, pptmsi.numvec);
609 break;
610 }
611 case VM_PPTDEV_MSIX: {
612 struct vm_pptdev_msix pptmsix;
613
614 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
615 error = EFAULT;
616 break;
617 }
618 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
619 pptmsix.idx, pptmsix.addr, pptmsix.msg,
620 pptmsix.vector_control);
621 break;
622 }
623 case VM_PPTDEV_DISABLE_MSIX: {
624 struct vm_pptdev pptdev;
625
626 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
627 error = EFAULT;
628 break;
629 }
630 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
631 break;
632 }
633 case VM_MAP_PPTDEV_MMIO: {
634 struct vm_pptdev_mmio pptmmio;
635
636 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
637 error = EFAULT;
638 break;
639 }
640 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
641 pptmmio.len, pptmmio.hpa);
642 break;
643 }
644 case VM_UNMAP_PPTDEV_MMIO: {
645 struct vm_pptdev_mmio pptmmio;
646
647 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
648 error = EFAULT;
649 break;
650 }
651 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
652 pptmmio.len);
653 break;
654 }
655 case VM_BIND_PPTDEV: {
656 struct vm_pptdev pptdev;
657
658 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
659 error = EFAULT;
660 break;
661 }
662 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
663 break;
664 }
665 case VM_UNBIND_PPTDEV: {
666 struct vm_pptdev pptdev;
667
668 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
669 error = EFAULT;
670 break;
671 }
672 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
673 break;
674 }
675 case VM_GET_PPTDEV_LIMITS: {
676 struct vm_pptdev_limits pptlimits;
677
678 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
679 error = EFAULT;
680 break;
681 }
682 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
683 &pptlimits.msi_limit, &pptlimits.msix_limit);
684 if (error == 0 &&
685 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
686 error = EFAULT;
687 break;
688 }
689 break;
690 }
691 case VM_INJECT_EXCEPTION: {
692 struct vm_exception vmexc;
693 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
694 error = EFAULT;
695 break;
696 }
697 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
698 vmexc.error_code_valid, vmexc.error_code,
699 vmexc.restart_instruction);
700 break;
701 }
702 case VM_INJECT_NMI: {
703 struct vm_nmi vmnmi;
704
705 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
706 error = EFAULT;
707 break;
708 }
709 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
710 break;
711 }
712 case VM_LAPIC_IRQ: {
713 struct vm_lapic_irq vmirq;
714
715 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
716 error = EFAULT;
717 break;
718 }
719 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
720 break;
721 }
722 case VM_LAPIC_LOCAL_IRQ: {
723 struct vm_lapic_irq vmirq;
724
725 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
726 error = EFAULT;
727 break;
728 }
729 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
730 vmirq.vector);
731 break;
732 }
733 case VM_LAPIC_MSI: {
734 struct vm_lapic_msi vmmsi;
735
736 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
737 error = EFAULT;
738 break;
739 }
740 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
741 break;
742 }
743
744 case VM_IOAPIC_ASSERT_IRQ: {
745 struct vm_ioapic_irq ioapic_irq;
746
747 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
748 error = EFAULT;
749 break;
750 }
751 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
752 break;
753 }
754 case VM_IOAPIC_DEASSERT_IRQ: {
755 struct vm_ioapic_irq ioapic_irq;
756
757 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
758 error = EFAULT;
759 break;
760 }
761 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
762 break;
763 }
764 case VM_IOAPIC_PULSE_IRQ: {
765 struct vm_ioapic_irq ioapic_irq;
766
767 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
768 error = EFAULT;
769 break;
770 }
771 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
772 break;
773 }
774 case VM_IOAPIC_PINCOUNT: {
775 int pincount;
776
777 pincount = vioapic_pincount(sc->vmm_vm);
778 if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
779 error = EFAULT;
780 break;
781 }
782 break;
783 }
784
785 case VM_ISA_ASSERT_IRQ: {
786 struct vm_isa_irq isa_irq;
787
788 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
789 error = EFAULT;
790 break;
791 }
792 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
793 if (error == 0 && isa_irq.ioapic_irq != -1) {
794 error = vioapic_assert_irq(sc->vmm_vm,
795 isa_irq.ioapic_irq);
796 }
797 break;
798 }
799 case VM_ISA_DEASSERT_IRQ: {
800 struct vm_isa_irq isa_irq;
801
802 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
803 error = EFAULT;
804 break;
805 }
806 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
807 if (error == 0 && isa_irq.ioapic_irq != -1) {
808 error = vioapic_deassert_irq(sc->vmm_vm,
809 isa_irq.ioapic_irq);
810 }
811 break;
812 }
813 case VM_ISA_PULSE_IRQ: {
814 struct vm_isa_irq isa_irq;
815
816 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
817 error = EFAULT;
818 break;
819 }
820 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
821 if (error == 0 && isa_irq.ioapic_irq != -1) {
822 error = vioapic_pulse_irq(sc->vmm_vm,
823 isa_irq.ioapic_irq);
824 }
825 break;
826 }
827 case VM_ISA_SET_IRQ_TRIGGER: {
828 struct vm_isa_irq_trigger isa_irq_trigger;
829
830 if (ddi_copyin(datap, &isa_irq_trigger,
831 sizeof (isa_irq_trigger), md)) {
832 error = EFAULT;
833 break;
834 }
835 error = vatpic_set_irq_trigger(sc->vmm_vm,
836 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
837 break;
838 }
839
840 case VM_MMAP_GETNEXT: {
841 struct vm_memmap mm;
842
843 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
844 error = EFAULT;
845 break;
846 }
847 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
848 &mm.segoff, &mm.len, &mm.prot, &mm.flags);
849 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
850 error = EFAULT;
851 break;
852 }
853 break;
854 }
855 case VM_MMAP_MEMSEG: {
856 struct vm_memmap mm;
857
858 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
859 error = EFAULT;
860 break;
861 }
862 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
863 mm.len, mm.prot, mm.flags);
864 break;
865 }
866 case VM_MUNMAP_MEMSEG: {
867 struct vm_munmap mu;
868
869 if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
870 error = EFAULT;
871 break;
872 }
873 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
874 break;
875 }
876 case VM_ALLOC_MEMSEG: {
877 struct vm_memseg vmseg;
878
879 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
880 error = EFAULT;
881 break;
882 }
883 error = vmmdev_alloc_memseg(sc, &vmseg);
884 break;
885 }
886 case VM_GET_MEMSEG: {
887 struct vm_memseg vmseg;
888
889 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
890 error = EFAULT;
891 break;
892 }
893 error = vmmdev_get_memseg(sc, &vmseg);
894 if (error == 0 &&
895 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
896 error = EFAULT;
897 break;
898 }
899 break;
900 }
901 case VM_GET_REGISTER: {
902 struct vm_register vmreg;
903
904 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
905 error = EFAULT;
906 break;
907 }
908 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
909 &vmreg.regval);
910 if (error == 0 &&
911 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
912 error = EFAULT;
913 break;
914 }
915 break;
916 }
917 case VM_SET_REGISTER: {
918 struct vm_register vmreg;
919
920 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
921 error = EFAULT;
922 break;
923 }
924 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
925 vmreg.regval);
926 break;
927 }
928 case VM_SET_SEGMENT_DESCRIPTOR: {
929 struct vm_seg_desc vmsegd;
930
931 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
932 error = EFAULT;
933 break;
934 }
935 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
936 &vmsegd.desc);
937 break;
938 }
939 case VM_GET_SEGMENT_DESCRIPTOR: {
940 struct vm_seg_desc vmsegd;
941
942 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
943 error = EFAULT;
944 break;
945 }
946 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
947 &vmsegd.desc);
948 if (error == 0 &&
949 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
950 error = EFAULT;
951 break;
952 }
953 break;
954 }
955 case VM_GET_REGISTER_SET: {
956 struct vm_register_set vrs;
957 int regnums[VM_REG_LAST];
958 uint64_t regvals[VM_REG_LAST];
959
960 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
961 error = EFAULT;
962 break;
963 }
964 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
965 error = EINVAL;
966 break;
967 }
968 if (ddi_copyin(vrs.regnums, regnums,
969 sizeof (int) * vrs.count, md)) {
970 error = EFAULT;
971 break;
972 }
973
974 error = 0;
975 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
976 if (regnums[i] < 0) {
977 error = EINVAL;
978 break;
979 }
980 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
981 &regvals[i]);
982 }
983 if (error == 0 && ddi_copyout(regvals, vrs.regvals,
984 sizeof (uint64_t) * vrs.count, md)) {
985 error = EFAULT;
986 }
987 break;
988 }
989 case VM_SET_REGISTER_SET: {
990 struct vm_register_set vrs;
991 int regnums[VM_REG_LAST];
992 uint64_t regvals[VM_REG_LAST];
993
994 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
995 error = EFAULT;
996 break;
997 }
998 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
999 error = EINVAL;
1000 break;
1001 }
1002 if (ddi_copyin(vrs.regnums, regnums,
1003 sizeof (int) * vrs.count, md)) {
1004 error = EFAULT;
1005 break;
1006 }
1007 if (ddi_copyin(vrs.regvals, regvals,
1008 sizeof (uint64_t) * vrs.count, md)) {
1009 error = EFAULT;
1010 break;
1011 }
1012
1013 error = 0;
1014 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1015 /*
1016 * Setting registers in a set is not atomic, since a
1017 * failure in the middle of the set will cause a
1018 * bail-out and inconsistent register state. Callers
1019 * should be wary of this.
1020 */
1021 if (regnums[i] < 0) {
1022 error = EINVAL;
1023 break;
1024 }
1025 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1026 regvals[i]);
1027 }
1028 break;
1029 }
1030 case VM_RESET_CPU: {
1031 struct vm_vcpu_reset vvr;
1032
1033 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1034 error = EFAULT;
1035 break;
1036 }
1037 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1038 error = EINVAL;
break;
1039 }
1040 
1041 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1042 break;
1043 }
1044 case VM_GET_RUN_STATE: {
1045 struct vm_run_state vrs;
1046
1047 bzero(&vrs, sizeof (vrs));
1048 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1049 &vrs.sipi_vector);
1050 if (error == 0) {
1051 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1052 error = EFAULT;
1053 break;
1054 }
1055 }
1056 break;
1057 }
1058 case VM_SET_RUN_STATE: {
1059 struct vm_run_state vrs;
1060
1061 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1062 error = EFAULT;
1063 break;
1064 }
1065 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1066 vrs.sipi_vector);
1067 break;
1068 }
1069
1070 case VM_SET_KERNEMU_DEV:
1071 case VM_GET_KERNEMU_DEV: {
1072 struct vm_readwrite_kernemu_device kemu;
1073 size_t size = 0;
1074
1075 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1076 error = EFAULT;
1077 break;
1078 }
1079
1080 if (kemu.access_width > 3) {
1081 error = EINVAL;
1082 break;
1083 }
1084 size = (1 << kemu.access_width);
1085 ASSERT(size >= 1 && size <= 8);
1086
1087 if (cmd == VM_SET_KERNEMU_DEV) {
1088 error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1089 kemu.gpa, kemu.value, size);
1090 } else {
1091 error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1092 kemu.gpa, &kemu.value, size);
1093 }
1094
1095 if (error == 0) {
1096 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1097 error = EFAULT;
1098 break;
1099 }
1100 }
1101 break;
1102 }
1103
1104 case VM_GET_CAPABILITY: {
1105 struct vm_capability vmcap;
1106
1107 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1108 error = EFAULT;
1109 break;
1110 }
1111 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1112 &vmcap.capval);
1113 if (error == 0 &&
1114 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1115 error = EFAULT;
1116 break;
1117 }
1118 break;
1119 }
1120 case VM_SET_CAPABILITY: {
1121 struct vm_capability vmcap;
1122
1123 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1124 error = EFAULT;
1125 break;
1126 }
1127 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1128 vmcap.capval);
1129 break;
1130 }
1131 case VM_SET_X2APIC_STATE: {
1132 struct vm_x2apic x2apic;
1133
1134 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1135 error = EFAULT;
1136 break;
1137 }
1138 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1139 break;
1140 }
1141 case VM_GET_X2APIC_STATE: {
1142 struct vm_x2apic x2apic;
1143
1144 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1145 error = EFAULT;
1146 break;
1147 }
1148 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1149 &x2apic.state);
1150 if (error == 0 &&
1151 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1152 error = EFAULT;
1153 break;
1154 }
1155 break;
1156 }
1157 case VM_GET_GPA_PMAP: {
1158 struct vm_gpa_pte gpapte;
1159
1160 if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
1161 error = EFAULT;
1162 break;
1163 }
1164 #ifdef __FreeBSD__
1165 /* XXXJOY: add function? */
1166 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
1167 gpapte.gpa, gpapte.pte, &gpapte.ptenum);
1168 #endif
1169 error = 0;
1170 break;
1171 }
1172 case VM_GET_HPET_CAPABILITIES: {
1173 struct vm_hpet_cap hpetcap;
1174
1175 error = vhpet_getcap(&hpetcap);
1176 if (error == 0 &&
1177 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1178 error = EFAULT;
1179 break;
1180 }
1181 break;
1182 }
1183 case VM_GLA2GPA: {
1184 struct vm_gla2gpa gg;
1185
1186 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1187 error = EFAULT;
1188 break;
1189 }
1190 gg.vcpuid = vcpu;
1191 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1192 gg.prot, &gg.gpa, &gg.fault);
1193 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1194 error = EFAULT;
1195 break;
1196 }
1197 break;
1198 }
1199 case VM_GLA2GPA_NOFAULT: {
1200 struct vm_gla2gpa gg;
1201
1202 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1203 error = EFAULT;
1204 break;
1205 }
1206 gg.vcpuid = vcpu;
1207 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1208 gg.gla, gg.prot, &gg.gpa, &gg.fault);
1209 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1210 error = EFAULT;
1211 break;
1212 }
1213 break;
1214 }
1215
1216 case VM_ACTIVATE_CPU:
1217 error = vm_activate_cpu(sc->vmm_vm, vcpu);
1218 break;
1219
1220 case VM_SUSPEND_CPU:
1221 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1222 error = EFAULT;
1223 } else {
1224 error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1225 }
1226 break;
1227
1228 case VM_RESUME_CPU:
1229 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1230 error = EFAULT;
1231 } else {
1232 error = vm_resume_cpu(sc->vmm_vm, vcpu);
1233 }
1234 break;
1235
1236 case VM_GET_CPUS: {
1237 struct vm_cpuset vm_cpuset;
1238 cpuset_t tempset;
1239 void *srcp = &tempset;
1240 int size;
1241
1242 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1243 error = EFAULT;
1244 break;
1245 }
1246
1247 /* Be more generous about sizing since our cpuset_t is large. */
1248 size = vm_cpuset.cpusetsize;
1249 if (size <= 0 || size > sizeof (cpuset_t)) {
1250 error = ERANGE;
break;
1251 }
1252 /*
1253 * If they want a ulong_t or less, make sure they receive the
1254 * low bits with all the useful information.
1255 */
1256 if (size <= sizeof (tempset.cpub[0])) {
1257 srcp = &tempset.cpub[0];
1258 }
1259
1260 if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1261 tempset = vm_active_cpus(sc->vmm_vm);
1262 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1263 tempset = vm_suspended_cpus(sc->vmm_vm);
1264 } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1265 tempset = vm_debug_cpus(sc->vmm_vm);
1266 } else {
1267 error = EINVAL;
1268 }
1269
1270 ASSERT(size > 0 && size <= sizeof (tempset));
1271 if (error == 0 &&
1272 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1273 error = EFAULT;
1274 break;
1275 }
1276 break;
1277 }
1278 case VM_SET_INTINFO: {
1279 struct vm_intinfo vmii;
1280
1281 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1282 error = EFAULT;
1283 break;
1284 }
1285 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1286 break;
1287 }
1288 case VM_GET_INTINFO: {
1289 struct vm_intinfo vmii;
1290
1291 vmii.vcpuid = vcpu;
1292 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1293 &vmii.info2);
1294 if (error == 0 &&
1295 ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1296 error = EFAULT;
1297 break;
1298 }
1299 break;
1300 }
1301 case VM_RTC_WRITE: {
1302 struct vm_rtc_data rtcdata;
1303
1304 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1305 error = EFAULT;
1306 break;
1307 }
1308 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1309 rtcdata.value);
1310 break;
1311 }
1312 case VM_RTC_READ: {
1313 struct vm_rtc_data rtcdata;
1314
1315 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1316 error = EFAULT;
1317 break;
1318 }
1319 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1320 &rtcdata.value);
1321 if (error == 0 &&
1322 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1323 error = EFAULT;
1324 break;
1325 }
1326 break;
1327 }
1328 case VM_RTC_SETTIME: {
1329 struct vm_rtc_time rtctime;
1330
1331 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1332 error = EFAULT;
1333 break;
1334 }
1335 error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1336 break;
1337 }
1338 case VM_RTC_GETTIME: {
1339 struct vm_rtc_time rtctime;
1340
1341 rtctime.secs = vrtc_get_time(sc->vmm_vm);
1342 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1343 error = EFAULT;
1344 break;
1345 }
1346 break;
1347 }
1348
1349 case VM_PMTMR_LOCATE: {
1350 uint16_t port = arg;
1351 error = vpmtmr_set_location(sc->vmm_vm, port);
1352 break;
1353 }
1354
1355 case VM_RESTART_INSTRUCTION:
1356 error = vm_restart_instruction(sc->vmm_vm, vcpu);
1357 break;
1358
1359 case VM_SET_TOPOLOGY: {
1360 struct vm_cpu_topology topo;
1361
1362 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1363 error = EFAULT;
1364 break;
1365 }
1366 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1367 topo.threads, topo.maxcpus);
1368 break;
1369 }
1370 case VM_GET_TOPOLOGY: {
1371 struct vm_cpu_topology topo;
1372
1373 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1374 &topo.threads, &topo.maxcpus);
1375 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1376 error = EFAULT;
1377 break;
1378 }
1379 break;
1380 }
1381
1382 case VM_DEVMEM_GETOFFSET: {
1383 struct vm_devmem_offset vdo;
1384 list_t *dl = &sc->vmm_devmem_list;
1385 vmm_devmem_entry_t *de = NULL;
1386
1387 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1388 error = EFAULT;
1389 break;
1390 }
1391
1392 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
1393 if (de->vde_segid == vdo.segid) {
1394 break;
1395 }
1396 }
1397 if (de != NULL) {
1398 vdo.offset = de->vde_off;
1399 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1400 error = EFAULT;
1401 }
1402 } else {
1403 error = ENOENT;
1404 }
1405 break;
1406 }
1407 case VM_WRLOCK_CYCLE: {
1408 /*
1409 * Present a test mechanism to acquire/release the write lock
1410 * on the VM without any other effects.
1411 */
1412 break;
1413 }
1414
1415 default:
1416 error = ENOTTY;
1417 break;
1418 }
1419
1420 /* Release exclusion resources */
1421 switch (lock_type) {
1422 case LOCK_NONE:
1423 break;
1424 case LOCK_VCPU:
1425 vcpu_unlock_one(sc, vcpu);
1426 break;
1427 case LOCK_READ_HOLD:
1428 vmm_read_unlock(sc);
1429 break;
1430 case LOCK_WRITE_HOLD:
1431 vmm_write_unlock(sc);
1432 break;
1433 default:
1434 panic("unexpected lock type");
1435 break;
1436 }
1437
1438 return (error);
1439 }
1440
1441 static vmm_softc_t *
1442 vmm_lookup(const char *name)
1443 {
1444 list_t *vml = &vmm_list;
1445 vmm_softc_t *sc;
1446
1447 ASSERT(MUTEX_HELD(&vmm_mtx));
1448
1449 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1450 if (strcmp(sc->vmm_name, name) == 0) {
1451 break;
1452 }
1453 }
1454
1455 return (sc);
1456 }
1457
1458 /*
1459 * Acquire an HMA registration if not already held.
1460 */
1461 static boolean_t
1462 vmm_hma_acquire(void)
1463 {
1464 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1465
1466 mutex_enter(&vmmdev_mtx);
1467
1468 if (vmmdev_hma_reg == NULL) {
1469 VERIFY3U(vmmdev_hma_ref, ==, 0);
1470 vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1471 if (vmmdev_hma_reg == NULL) {
1472 cmn_err(CE_WARN, "%s HMA registration failed.",
1473 vmmdev_hvm_name);
1474 mutex_exit(&vmmdev_mtx);
1475 return (B_FALSE);
1476 }
1477 }
1478
1479 vmmdev_hma_ref++;
1480
1481 mutex_exit(&vmmdev_mtx);
1482
1483 return (B_TRUE);
1484 }
1485
1486 /*
1487 * Release the HMA registration if held and there are no remaining VMs.
1488 */
1489 static void
1490 vmm_hma_release(void)
1491 {
1492 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1493
1494 mutex_enter(&vmmdev_mtx);
1495
1496 VERIFY3U(vmmdev_hma_ref, !=, 0);
1497
1498 vmmdev_hma_ref--;
1499
1500 if (vmmdev_hma_ref == 0) {
1501 VERIFY(vmmdev_hma_reg != NULL);
1502 hma_unregister(vmmdev_hma_reg);
1503 vmmdev_hma_reg = NULL;
1504 }
1505 mutex_exit(&vmmdev_mtx);
1506 }
1507
1508 static int
1509 vmmdev_do_vm_create(char *name, cred_t *cr)
1510 {
1511 vmm_softc_t *sc = NULL;
1512 minor_t minor;
1513 int error = ENOMEM;
1514
1515 if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
1516 return (EINVAL);
1517 }
1518
1519 if (!vmm_hma_acquire())
1520 return (ENXIO);
1521
1522 mutex_enter(&vmm_mtx);
1523
1524 /* Look for duplicate names */
1525 if (vmm_lookup(name) != NULL) {
1526 mutex_exit(&vmm_mtx);
1527 vmm_hma_release();
1528 return (EEXIST);
1529 }
1530
1531 /* Allow only one instance per non-global zone. */
1532 if (!INGLOBALZONE(curproc)) {
1533 for (sc = list_head(&vmm_list); sc != NULL;
1534 sc = list_next(&vmm_list, sc)) {
1535 if (sc->vmm_zone == curzone) {
1536 mutex_exit(&vmm_mtx);
1537 vmm_hma_release();
1538 return (EINVAL);
1539 }
1540 }
1541 }
1542
1543 minor = id_alloc(vmm_minors);
1544 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1545 goto fail;
1546 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1547 ddi_soft_state_free(vmm_statep, minor);
1548 goto fail;
1549 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1550 DDI_PSEUDO, 0) != DDI_SUCCESS) {
1551 goto fail;
1552 }
1553
1554 if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1555 goto fail;
1556 }
1557
1558 error = vm_create(name, &sc->vmm_vm);
1559 if (error == 0) {
1560 /* Complete VM initialization and report success. */
1561 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1562 sc->vmm_minor = minor;
1563 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1564 offsetof(vmm_devmem_entry_t, vde_node));
1565
1566 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1567 offsetof(vmm_hold_t, vmh_node));
1568 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1569
1570 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1571 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1572 offsetof(vmm_lease_t, vml_node));
1573 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1574 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1575
1576 sc->vmm_zone = crgetzone(cr);
1577 zone_hold(sc->vmm_zone);
1578 vmm_zsd_add_vm(sc);
1579 vmm_kstat_init(sc);
1580
1581 list_insert_tail(&vmm_list, sc);
1582 mutex_exit(&vmm_mtx);
1583 return (0);
1584 }
1585
1586 vmm_kstat_fini(sc);
1587 ddi_remove_minor_node(vmmdev_dip, name);
1588 fail:
1589 id_free(vmm_minors, minor);
1590 if (sc != NULL) {
1591 ddi_soft_state_free(vmm_statep, minor);
1592 }
1593 mutex_exit(&vmm_mtx);
1594 vmm_hma_release();
1595
1596 return (error);
1597 }
1598
1599 /*
1600 * Bhyve 'Driver' Interface
1601 *
1602 * While many devices are emulated in the bhyve userspace process, there are
1603 * others with performance constraints which require that they run mostly or
1604 * entirely in-kernel. For those not integrated directly into bhyve, an API is
1605 * needed so they can query/manipulate the portions of VM state needed to
1606 * fulfill their purpose.
1607 *
1608 * This includes:
1609 * - Translating guest-physical addresses to host-virtual pointers
1610 * - Injecting MSIs
1611 * - Hooking IO port addresses
1612 *
1613 * The vmm_drv interface exists to provide that functionality to its consumers.
1614 * (At this time, 'viona' is the only user)
1615 */
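
/*
 * A rough usage sketch for such a consumer (illustrative only; names other
 * than the vmm_drv_* functions are hypothetical):
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *
 *	// 'fp' is a file_t for an open vmm instance handed in by userspace
 *	if (vmm_drv_hold(fp, cr, &hold) != 0)
 *		return (error);
 *	lease = vmm_drv_lease_sign(hold, example_lease_expired, state);
 *	if (lease != NULL) {
 *		void *kva = vmm_drv_gpa2kva(lease, gpa, len);
 *		(void) vmm_drv_msi(lease, msi_addr, msi_msg);
 *		vmm_drv_lease_break(hold, lease);
 *	}
 *	vmm_drv_rele(hold);
 */
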
1616 int
1617 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1618 {
1619 vnode_t *vp = fp->f_vnode;
1620 const dev_t dev = vp->v_rdev;
1621 vmm_softc_t *sc;
1622 vmm_hold_t *hold;
1623 int err = 0;
1624
1625 if (vp->v_type != VCHR) {
1626 return (ENXIO);
1627 }
1628 const major_t major = getmajor(dev);
1629 const minor_t minor = getminor(dev);
1630
1631 mutex_enter(&vmmdev_mtx);
1632 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1633 mutex_exit(&vmmdev_mtx);
1634 return (ENOENT);
1635 }
1636 mutex_enter(&vmm_mtx);
1637 mutex_exit(&vmmdev_mtx);
1638
1639 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1640 err = ENOENT;
1641 goto out;
1642 }
1643 /* XXXJOY: check cred permissions against instance */
1644
1645 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1646 err = EBUSY;
1647 goto out;
1648 }
1649
1650 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1651 hold->vmh_sc = sc;
1652 hold->vmh_release_req = B_FALSE;
1653
1654 list_insert_tail(&sc->vmm_holds, hold);
1655 sc->vmm_flags |= VMM_HELD;
1656 *holdp = hold;
1657
1658 out:
1659 mutex_exit(&vmm_mtx);
1660 return (err);
1661 }
1662
1663 void
1664 vmm_drv_rele(vmm_hold_t *hold)
1665 {
1666 vmm_softc_t *sc;
1667
1668 ASSERT(hold != NULL);
1669 ASSERT(hold->vmh_sc != NULL);
1670 VERIFY(hold->vmh_ioport_hook_cnt == 0);
1671
1672 mutex_enter(&vmm_mtx);
1673 sc = hold->vmh_sc;
1674 list_remove(&sc->vmm_holds, hold);
1675 if (list_is_empty(&sc->vmm_holds)) {
1676 sc->vmm_flags &= ~VMM_HELD;
1677 cv_broadcast(&sc->vmm_cv);
1678 }
1679 mutex_exit(&vmm_mtx);
1680 kmem_free(hold, sizeof (*hold));
1681 }
1682
1683 boolean_t
1684 vmm_drv_release_reqd(vmm_hold_t *hold)
1685 {
1686 ASSERT(hold != NULL);
1687
1688 return (hold->vmh_release_req);
1689 }
1690
1691 vmm_lease_t *
1692 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1693 {
1694 vmm_softc_t *sc = hold->vmh_sc;
1695 vmm_lease_t *lease;
1696
1697 ASSERT3P(expiref, !=, NULL);
1698
1699 if (hold->vmh_release_req) {
1700 return (NULL);
1701 }
1702
1703 lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1704 list_link_init(&lease->vml_node);
1705 lease->vml_expire_func = expiref;
1706 lease->vml_expire_arg = arg;
1707 lease->vml_expired = B_FALSE;
1708 lease->vml_hold = hold;
1709 /* cache the VM pointer for one less pointer chase */
1710 lease->vml_vm = sc->vmm_vm;
1711
1712 mutex_enter(&sc->vmm_lease_lock);
1713 while (sc->vmm_lease_blocker != 0) {
1714 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1715 }
1716 list_insert_tail(&sc->vmm_lease_list, lease);
1717 vmm_read_lock(sc);
1718 mutex_exit(&sc->vmm_lease_lock);
1719
1720 return (lease);
1721 }
1722
1723 static void
1724 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1725 {
1726 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1727
1728 list_remove(&sc->vmm_lease_list, lease);
1729 vmm_read_unlock(sc);
1730 kmem_free(lease, sizeof (*lease));
1731 }
1732
1733 void
1734 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1735 {
1736 vmm_softc_t *sc = hold->vmh_sc;
1737
1738 VERIFY3P(hold, ==, lease->vml_hold);
1739
1740 mutex_enter(&sc->vmm_lease_lock);
1741 vmm_lease_break_locked(sc, lease);
1742 mutex_exit(&sc->vmm_lease_lock);
1743 }
1744
1745 boolean_t
1746 vmm_drv_lease_expired(vmm_lease_t *lease)
1747 {
1748 return (lease->vml_expired);
1749 }
1750
1751 void *
1752 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1753 {
1754 ASSERT(lease != NULL);
1755
1756 return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
1757 }
1758
1759 int
1760 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1761 {
1762 ASSERT(lease != NULL);
1763
1764 return (lapic_intr_msi(lease->vml_vm, addr, msg));
1765 }
1766
1767 int
1768 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
1769 void *arg, void **cookie)
1770 {
1771 vmm_softc_t *sc;
1772 int err;
1773
1774 ASSERT(hold != NULL);
1775 ASSERT(cookie != NULL);
1776
1777 sc = hold->vmh_sc;
1778 mutex_enter(&vmm_mtx);
1779 /* Confirm that hook installation is not blocked */
1780 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1781 mutex_exit(&vmm_mtx);
1782 return (EBUSY);
1783 }
1784 /*
1785 * Optimistically record an installed hook which will prevent a block
1786 * from being asserted while the mutex is dropped.
1787 */
1788 hold->vmh_ioport_hook_cnt++;
1789 mutex_exit(&vmm_mtx);
1790
1791 vmm_write_lock(sc);
1792 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
1793 arg, cookie);
1794 vmm_write_unlock(sc);
1795
1796 if (err != 0) {
1797 mutex_enter(&vmm_mtx);
1798 /* Walk back optimism about the hook installation */
1799 hold->vmh_ioport_hook_cnt--;
1800 mutex_exit(&vmm_mtx);
1801 }
1802 return (err);
1803 }
1804
1805 void
1806 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1807 {
1808 vmm_softc_t *sc;
1809
1810 ASSERT(hold != NULL);
1811 ASSERT(cookie != NULL);
1812 ASSERT(hold->vmh_ioport_hook_cnt != 0);
1813
1814 sc = hold->vmh_sc;
1815 vmm_write_lock(sc);
1816 vm_ioport_unhook(sc->vmm_vm, cookie);
1817 vmm_write_unlock(sc);
1818
1819 mutex_enter(&vmm_mtx);
1820 hold->vmh_ioport_hook_cnt--;
1821 mutex_exit(&vmm_mtx);
1822 }
1823
1824 static int
1825 vmm_drv_purge(vmm_softc_t *sc)
1826 {
1827 ASSERT(MUTEX_HELD(&vmm_mtx));
1828
1829 if ((sc->vmm_flags & VMM_HELD) != 0) {
1830 vmm_hold_t *hold;
1831
1832 sc->vmm_flags |= VMM_CLEANUP;
1833 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1834 hold = list_next(&sc->vmm_holds, hold)) {
1835 hold->vmh_release_req = B_TRUE;
1836 }
1837 while ((sc->vmm_flags & VMM_HELD) != 0) {
1838 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1839 return (EINTR);
1840 }
1841 }
1842 sc->vmm_flags &= ~VMM_CLEANUP;
1843 }
1844
1845 VERIFY(list_is_empty(&sc->vmm_holds));
1846 sc->vmm_flags |= VMM_PURGED;
1847 return (0);
1848 }
1849
1850 static int
1851 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1852 {
1853 int err = 0;
1854
1855 mutex_enter(&vmm_mtx);
1856 if (!enable_block) {
1857 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1858
1859 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1860 goto done;
1861 }
1862
1863 /* If any holds have hooks installed, the block is a failure */
1864 if (!list_is_empty(&sc->vmm_holds)) {
1865 vmm_hold_t *hold;
1866
1867 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1868 hold = list_next(&sc->vmm_holds, hold)) {
1869 if (hold->vmh_ioport_hook_cnt != 0) {
1870 err = EBUSY;
1871 goto done;
1872 }
1873 }
1874 }
1875 sc->vmm_flags |= VMM_BLOCK_HOOK;
1876
1877 done:
1878 mutex_exit(&vmm_mtx);
1879 return (err);
1880 }
1881
1882 static int
1883 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1884 boolean_t *hma_release)
1885 {
1886 dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
1887 minor_t minor;
1888
1889 ASSERT(MUTEX_HELD(&vmm_mtx));
1890
1891 *hma_release = B_FALSE;
1892
1893 if (vmm_drv_purge(sc) != 0) {
1894 return (EINTR);
1895 }
1896
1897 if (clean_zsd) {
1898 vmm_zsd_rem_vm(sc);
1899 }
1900
1901 /* Clean up devmem entries */
1902 vmmdev_devmem_purge(sc);
1903
1904 list_remove(&vmm_list, sc);
1905 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
1906 minor = sc->vmm_minor;
1907 zone_rele(sc->vmm_zone);
1908 if (sc->vmm_is_open) {
1909 list_insert_tail(&vmm_destroy_list, sc);
1910 sc->vmm_flags |= VMM_DESTROY;
1911 } else {
1912 vm_destroy(sc->vmm_vm);
1913 vmm_kstat_fini(sc);
1914 ddi_soft_state_free(vmm_statep, minor);
1915 id_free(vmm_minors, minor);
1916 *hma_release = B_TRUE;
1917 }
1918 (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
1919
1920 return (0);
1921 }
1922
1923 int
1924 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
1925 {
1926 boolean_t hma_release = B_FALSE;
1927 int err;
1928
1929 mutex_enter(&vmm_mtx);
1930 err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
1931 mutex_exit(&vmm_mtx);
1932
1933 if (hma_release)
1934 vmm_hma_release();
1935
1936 return (err);
1937 }
1938
1939 /* ARGSUSED */
1940 static int
1941 vmmdev_do_vm_destroy(const char *name, cred_t *cr)
1942 {
1943 boolean_t hma_release = B_FALSE;
1944 vmm_softc_t *sc;
1945 int err;
1946
1947 if (crgetuid(cr) != 0)
1948 return (EPERM);
1949
1950 mutex_enter(&vmm_mtx);
1951
1952 if ((sc = vmm_lookup(name)) == NULL) {
1953 mutex_exit(&vmm_mtx);
1954 return (ENOENT);
1955 }
1956 /*
1957 * We don't check this in vmm_lookup() since that function is also used
1958 * for validation during create and currently vmm names must be unique.
1959 */
1960 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
1961 mutex_exit(&vmm_mtx);
1962 return (EPERM);
1963 }
1964 err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
1965
1966 mutex_exit(&vmm_mtx);
1967
1968 if (hma_release)
1969 vmm_hma_release();
1970
1971 return (err);
1972 }
1973
1974 #define VCPU_NAME_BUFLEN 32
1975
1976 static int
1977 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
1978 {
1979 zoneid_t zid = crgetzoneid(cr);
1980 int instance = minor;
1981 kstat_t *ksp;
1982
1983 ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
1984
1985 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
1986 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
1987 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
1988
1989 if (ksp == NULL) {
1990 return (-1);
1991 }
1992 sc->vmm_kstat_vm = ksp;
1993
1994 for (uint_t i = 0; i < VM_MAXCPU; i++) {
1995 char namebuf[VCPU_NAME_BUFLEN];
1996
1997 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
1998
1999 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2000 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2001 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2002 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2003 0, zid);
2004 if (ksp == NULL) {
2005 goto fail;
2006 }
2007
2008 sc->vmm_kstat_vcpu[i] = ksp;
2009 }
2010
2011 /*
2012 * If this instance is associated with a non-global zone, make its
2013 * kstats visible from the GZ.
2014 */
2015 if (zid != GLOBAL_ZONEID) {
2016 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2017 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2018 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2019 }
2020 }
2021
2022 return (0);
2023
2024 fail:
2025 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2026 if (sc->vmm_kstat_vcpu[i] != NULL) {
2027 kstat_delete(sc->vmm_kstat_vcpu[i]);
2028 sc->vmm_kstat_vcpu[i] = NULL;
2029 } else {
2030 break;
2031 }
2032 }
2033 kstat_delete(sc->vmm_kstat_vm);
2034 sc->vmm_kstat_vm = NULL;
2035 return (-1);
2036 }
2037
2038 static void
2039 vmm_kstat_init(vmm_softc_t *sc)
2040 {
2041 kstat_t *ksp;
2042
2043 ASSERT3P(sc->vmm_vm, !=, NULL);
2044 ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2045
2046 ksp = sc->vmm_kstat_vm;
2047 vmm_kstats_t *vk = ksp->ks_data;
2048 ksp->ks_private = sc->vmm_vm;
2049 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2050 kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2051
2052 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2053 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2054
2055 ksp = sc->vmm_kstat_vcpu[i];
2056 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2057
2058 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2059 vvk->vvk_vcpu.value.ui32 = i;
2060 kstat_named_init(&vvk->vvk_time_init, "time_init",
2061 KSTAT_DATA_UINT64);
2062 kstat_named_init(&vvk->vvk_time_run, "time_run",
2063 KSTAT_DATA_UINT64);
2064 kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2065 KSTAT_DATA_UINT64);
2066 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2067 KSTAT_DATA_UINT64);
2068 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2069 KSTAT_DATA_UINT64);
2070 kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2071 KSTAT_DATA_UINT64);
2072 ksp->ks_private = sc->vmm_vm;
2073 ksp->ks_update = vmm_kstat_update_vcpu;
2074 }
2075
2076 kstat_install(sc->vmm_kstat_vm);
2077 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2078 kstat_install(sc->vmm_kstat_vcpu[i]);
2079 }
2080 }
2081
2082 static void
2083 vmm_kstat_fini(vmm_softc_t *sc)
2084 {
2085 ASSERT(sc->vmm_kstat_vm != NULL);
2086
2087 kstat_delete(sc->vmm_kstat_vm);
2088 sc->vmm_kstat_vm = NULL;
2089
2090 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2091 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2092
2093 kstat_delete(sc->vmm_kstat_vcpu[i]);
2094 sc->vmm_kstat_vcpu[i] = NULL;
2095 }
2096 }
2097
2098 static int
2099 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2100 {
2101 minor_t minor;
2102 vmm_softc_t *sc;
2103
2104 minor = getminor(*devp);
2105 if (minor == VMM_CTL_MINOR) {
2106 /*
2107 * Master control device must be opened exclusively.
2108 */
2109 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2110 return (EINVAL);
2111 }
2112
2113 return (0);
2114 }
2115
2116 mutex_enter(&vmm_mtx);
2117 sc = ddi_get_soft_state(vmm_statep, minor);
2118 if (sc == NULL) {
2119 mutex_exit(&vmm_mtx);
2120 return (ENXIO);
2121 }
2122
2123 sc->vmm_is_open = B_TRUE;
2124 mutex_exit(&vmm_mtx);
2125
2126 return (0);
2127 }
2128
2129 static int
2130 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2131 {
2132 minor_t minor;
2133 vmm_softc_t *sc;
2134 boolean_t hma_release = B_FALSE;
2135
2136 minor = getminor(dev);
2137 if (minor == VMM_CTL_MINOR)
2138 return (0);
2139
2140 mutex_enter(&vmm_mtx);
2141 sc = ddi_get_soft_state(vmm_statep, minor);
2142 if (sc == NULL) {
2143 mutex_exit(&vmm_mtx);
2144 return (ENXIO);
2145 }
2146
2147 VERIFY(sc->vmm_is_open);
2148 sc->vmm_is_open = B_FALSE;
2149
2150 /*
2151 * If this VM was destroyed while the vmm device was open, then
2152 * clean it up now that it is closed.
2153 */
2154 if (sc->vmm_flags & VMM_DESTROY) {
2155 list_remove(&vmm_destroy_list, sc);
2156 vm_destroy(sc->vmm_vm);
2157 ddi_soft_state_free(vmm_statep, minor);
2158 id_free(vmm_minors, minor);
2159 hma_release = B_TRUE;
2160 }
2161 mutex_exit(&vmm_mtx);
2162
2163 if (hma_release)
2164 vmm_hma_release();
2165
2166 return (0);
2167 }
2168
static int
vmm_is_supported(intptr_t arg)
{
	int r;
	const char *msg;

	if (vmm_is_intel()) {
		r = vmx_x86_supported(&msg);
	} else if (vmm_is_svm()) {
		/*
		 * HMA already ensured that the features necessary for SVM
		 * operation were present and online during vmm_attach().
		 */
		r = 0;
	} else {
		r = ENXIO;
		msg = "Unsupported CPU vendor";
	}

	if (r != 0 && arg != (intptr_t)NULL) {
		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
			return (EFAULT);
	}
	return (r);
}

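/*
 * ioctl(9E) entry point.  Commands on the control minor copy in a string
 * argument (the VM name for create/destroy) and are handled here directly;
 * all other commands are dispatched to vmmdev_do_ioctl() for the VM instance
 * backing the minor.
 */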
static int
vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	vmm_softc_t *sc;
	minor_t minor;

	/* The structs in bhyve ioctls assume a 64-bit datamodel */
	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
		return (ENOTSUP);
	}

	minor = getminor(dev);

	if (minor == VMM_CTL_MINOR) {
		void *argp = (void *)arg;
		char name[VM_MAX_NAMELEN] = { 0 };
		size_t len = 0;

		if ((mode & FKIOCTL) != 0) {
			len = strlcpy(name, argp, sizeof (name));
		} else {
			if (copyinstr(argp, name, sizeof (name), &len) != 0) {
				return (EFAULT);
			}
		}
		if (len >= VM_MAX_NAMELEN) {
			return (ENAMETOOLONG);
		}

		switch (cmd) {
		case VMM_CREATE_VM:
			if ((mode & FWRITE) == 0)
				return (EPERM);
			return (vmmdev_do_vm_create(name, credp));
		case VMM_DESTROY_VM:
			if ((mode & FWRITE) == 0)
				return (EPERM);
			return (vmmdev_do_vm_destroy(name, credp));
		case VMM_VM_SUPPORTED:
			return (vmm_is_supported(arg));
		default:
			/* No other actions are legal on ctl device */
			return (ENOTTY);
		}
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
}

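/*
 * segmap(9E) entry point, backing mmap(2) of the vmm device.  Offsets at or
 * above VM_DEVMEM_START map devmem segments, while lower offsets map guest
 * physical address space.  The VM is read-locked for the duration so the
 * memory map cannot change underneath the mapping.
 */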
static int
vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
{
	vmm_softc_t *sc;
	const minor_t minor = getminor(dev);
	struct vm *vm;
	int err;
	vm_object_t vmo = NULL;
	struct vmspace *vms;

	if (minor == VMM_CTL_MINOR) {
		return (ENODEV);
	}
	if (off < 0 || (off + len) <= 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (EACCES);
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	/* Grab read lock on the VM to prevent any changes to the memory map */
	vmm_read_lock(sc);

	vm = sc->vmm_vm;
	vms = vm_get_vmspace(vm);
	if (off >= VM_DEVMEM_START) {
		int segid;
		off_t map_off = 0;

		/* Mapping a devmem "device" */
		if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) {
			err = ENODEV;
			goto out;
		}
		err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot,
		    flags);
	} else {
		/* Mapping a part of the guest physical space */
		err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
		    flags);
	}

out:
	vmm_read_unlock(sc);
	return (err);
}

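/*
 * sdev plugin callback: check whether an existing /dev/vmm node still refers
 * to a live VM whose minor number matches, so stale nodes can be pruned.
 */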
static sdev_plugin_validate_t
vmm_sdev_validate(sdev_ctx_t ctx)
{
	const char *name = sdev_ctx_name(ctx);
	vmm_softc_t *sc;
	sdev_plugin_validate_t ret;
	minor_t minor;

	if (sdev_ctx_vtype(ctx) != VCHR)
		return (SDEV_VTOR_INVALID);

	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);

	mutex_enter(&vmm_mtx);
	if ((sc = vmm_lookup(name)) == NULL)
		ret = SDEV_VTOR_INVALID;
	else if (sc->vmm_minor != minor)
		ret = SDEV_VTOR_STALE;
	else
		ret = SDEV_VTOR_VALID;
	mutex_exit(&vmm_mtx);

	return (ret);
}

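/*
 * sdev plugin callback: populate /dev/vmm with a character node for each VM
 * visible to the caller, skipping VMs belonging to other zones unless the
 * caller is in the global zone.
 */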
static int
vmm_sdev_filldir(sdev_ctx_t ctx)
{
	vmm_softc_t *sc;
	int ret;

	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
		return (EINVAL);
	}

	mutex_enter(&vmm_mtx);
	ASSERT(vmmdev_dip != NULL);
	for (sc = list_head(&vmm_list); sc != NULL;
	    sc = list_next(&vmm_list, sc)) {
		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
			    S_IFCHR | 0600,
			    makedevice(ddi_driver_major(vmmdev_dip),
			    sc->vmm_minor));
		} else {
			continue;
		}
		if (ret != 0 && ret != EEXIST)
			goto out;
	}

	ret = 0;

out:
	mutex_exit(&vmm_mtx);
	return (ret);
}

/* ARGSUSED */
static void
vmm_sdev_inactive(sdev_ctx_t ctx)
{
}

static sdev_plugin_ops_t vmm_sdev_ops = {
	.spo_version = SDEV_PLUGIN_VERSION,
	.spo_flags = SDEV_PLUGIN_SUBDIR,
	.spo_validate = vmm_sdev_validate,
	.spo_filldir = vmm_sdev_filldir,
	.spo_inactive = vmm_sdev_inactive
};

/* ARGSUSED */
static int
vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)vmmdev_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

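/*
 * attach(9E) entry point: verify (via a temporary HMA registration and module
 * load) that the system can run bhyve, create the "ctl" minor node, and
 * register the sdev plugin that populates /dev/vmm.
 */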
static int
vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	sdev_plugin_hdl_t sph;
	hma_reg_t *reg = NULL;
	boolean_t vmm_loaded = B_FALSE;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vmmdev_mtx);
	/* Ensure we are not already attached. */
	if (vmmdev_dip != NULL) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}

	vmm_sol_glue_init();
	vmm_arena_init();

	/*
	 * Perform temporary HMA registration to determine if the system
	 * is capable.
	 */
	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
		goto fail;
	} else if (vmm_mod_load() != 0) {
		goto fail;
	}
	vmm_loaded = B_TRUE;
	hma_unregister(reg);
	reg = NULL;

	/* Create control node. Other nodes will be created on demand. */
	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
		goto fail;
	}

	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
	if (sph == (sdev_plugin_hdl_t)NULL) {
		ddi_remove_minor_node(dip, NULL);
		goto fail;
	}

	ddi_report_dev(dip);
	vmmdev_sdev_hdl = sph;
	vmmdev_dip = dip;
	mutex_exit(&vmmdev_mtx);
	return (DDI_SUCCESS);

fail:
	if (vmm_loaded) {
		VERIFY0(vmm_mod_unload());
	}
	if (reg != NULL) {
		hma_unregister(reg);
	}
	vmm_arena_fini();
	vmm_sol_glue_cleanup();
	mutex_exit(&vmmdev_mtx);
	return (DDI_FAILURE);
}

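/*
 * detach(9E) entry point: refuse to detach while any VM (live or pending
 * destruction) exists, then unregister the sdev plugin, remove the control
 * node, and unwind the work done in vmm_attach().
 */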
static int
vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	/*
	 * Ensure that all resources have been cleaned up.
	 *
	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
	 * devinfo locked as iommu_cleanup() tries to recursively lock each
	 * devinfo, including our own, while holding vmmdev_mtx.
	 */
	if (mutex_tryenter(&vmmdev_mtx) == 0)
		return (DDI_FAILURE);

	mutex_enter(&vmm_mtx);
	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
		mutex_exit(&vmm_mtx);
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	mutex_exit(&vmm_mtx);

	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;

	/* Remove the control node. */
	ddi_remove_minor_node(dip, "ctl");
	vmmdev_dip = NULL;

	VERIFY0(vmm_mod_unload());
	VERIFY3U(vmmdev_hma_ref, ==, 0);
	vmm_arena_fini();
	vmm_sol_glue_cleanup();

	mutex_exit(&vmmdev_mtx);

	return (DDI_SUCCESS);
}

static struct cb_ops vmm_cb_ops = {
	vmm_open,
	vmm_close,
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	vmm_ioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	vmm_segmap,
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,
	D_NEW | D_MP | D_DEVMAP
};

static struct dev_ops vmm_ops = {
	DEVO_REV,
	0,
	vmm_info,
	nulldev,	/* identify */
	nulldev,	/* probe */
	vmm_attach,
	vmm_detach,
	nodev,		/* reset */
	&vmm_cb_ops,
	(struct bus_ops *)NULL
};

static struct modldrv modldrv = {
	&mod_driverops,
	"bhyve vmm",
	&vmm_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

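/*
 * Loadable module entry points.
 */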
int
_init(void)
{
	int error;

	sysinit();

	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
	list_create(&vmm_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);

	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
	if (error) {
		return (error);
	}

	vmm_zsd_init();

	error = mod_install(&modlinkage);
	if (error) {
		ddi_soft_state_fini(&vmm_statep);
		vmm_zsd_fini();
	}

	return (error);
}

int
_fini(void)
{
	int error;

	error = mod_remove(&modlinkage);
	if (error) {
		return (error);
	}

	vmm_zsd_fini();

	ddi_soft_state_fini(&vmm_statep);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}