1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12
13 /*
14 * Copyright 2015 Pluribus Networks Inc.
15 * Copyright 2020 Joyent, Inc.
16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17 * Copyright 2021 Oxide Computer Company
18 */
19
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48
49 #include <vm/seg_dev.h>
50
51 #include "io/ppt.h"
52 #include "io/vatpic.h"
53 #include "io/vioapic.h"
54 #include "io/vrtc.h"
55 #include "io/vhpet.h"
56 #include "io/vpmtmr.h"
57 #include "vmm_lapic.h"
58 #include "vmm_stat.h"
59 #include "vmm_util.h"
60
61 /*
62 * Locking details:
63 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
65 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
66 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
67 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
68 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
69 */
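/*
 * A minimal sketch of the required ordering when both locks are needed
 * (illustrative only, not a function in this driver):
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	(operate on vmmdev_* state and the vmm_list entries)
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 *
 * Paths such as vmm_drv_hold() below follow this order, dropping vmmdev_mtx
 * once vmm_mtx is held.
 */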
70
71 static kmutex_t vmmdev_mtx;
72 static dev_info_t *vmmdev_dip;
73 static hma_reg_t *vmmdev_hma_reg;
74 static uint_t vmmdev_hma_ref;
75 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
76
77 static kmutex_t vmm_mtx;
78 static list_t vmm_list;
79 static list_t vmm_destroy_list;
80 static id_space_t *vmm_minors;
81 static void *vmm_statep;
82
83 static const char *vmmdev_hvm_name = "bhyve";
84
85 /* For sdev plugin (/dev) */
86 #define VMM_SDEV_ROOT "/dev/vmm"
87
88 /* From uts/i86pc/io/vmm/intel/vmx.c */
89 extern int vmx_x86_supported(const char **);
90
91 /* Holds and hooks from drivers external to vmm */
92 struct vmm_hold {
93 list_node_t vmh_node;
94 vmm_softc_t *vmh_sc;
95 boolean_t vmh_release_req;
96 uint_t vmh_ioport_hook_cnt;
97 };
98
99 struct vmm_lease {
100 list_node_t vml_node;
101 struct vm *vml_vm;
102 boolean_t vml_expired;
103 boolean_t (*vml_expire_func)(void *);
104 void *vml_expire_arg;
105 list_node_t vml_expire_node;
106 struct vmm_hold *vml_hold;
107 };
108
109 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
110 static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);
111 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
112 static void vmm_kstat_init(vmm_softc_t *);
113 static void vmm_kstat_fini(vmm_softc_t *);
114
115 static int
116 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
117 {
118 int error;
119 bool sysmem;
120
121 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
122 NULL);
123 if (error || mseg->len == 0)
124 return (error);
125
126 if (!sysmem) {
127 vmm_devmem_entry_t *de;
128 list_t *dl = &sc->vmm_devmem_list;
129
130 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
131 if (de->vde_segid == mseg->segid) {
132 break;
133 }
134 }
135 if (de != NULL) {
136 (void) strlcpy(mseg->name, de->vde_name,
137 sizeof (mseg->name));
138 }
139 } else {
140 bzero(mseg->name, sizeof (mseg->name));
141 }
142
143 return (error);
144 }
145
146 /*
147 * The 'devmem' hack:
148 *
149 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
150 * in the vm which appear with their own name related to the vm under /dev.
151 * Since this would be a hassle from an sdev perspective and would require a
152 * new cdev interface (or complicate the existing one), we choose to implement
153 * this in a different manner. When 'devmem' mappings are created, an
154 * identifying off_t is communicated back out to userspace. That off_t,
155 * residing above the normal guest memory space, can be used to mmap the
156 * 'devmem' mapping from the already-open vm device.
157 */
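/*
 * A minimal userspace sketch of that flow (illustrative only; 'vmfd', the
 * segment ID, and the segment name are hypothetical, and error handling is
 * omitted):
 *
 *	struct vm_memseg seg = { .segid = 1, .len = len };
 *	(void) strlcpy(seg.name, "bootrom", sizeof (seg.name));
 *	(void) ioctl(vmfd, VM_ALLOC_MEMSEG, &seg);
 *
 *	struct vm_devmem_offset vdo = { .segid = 1 };
 *	(void) ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo);
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    vmfd, vdo.offset);
 *
 * The offset returned by VM_DEVMEM_GETOFFSET lies at or above
 * VM_DEVMEM_START, so it cannot collide with mappings of guest-physical
 * memory made through the same descriptor.
 */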
158
159 static int
160 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
161 {
162 off_t map_offset;
163 vmm_devmem_entry_t *entry;
164
165 if (list_is_empty(&sc->vmm_devmem_list)) {
166 map_offset = VM_DEVMEM_START;
167 } else {
168 entry = list_tail(&sc->vmm_devmem_list);
169 map_offset = entry->vde_off + entry->vde_len;
170 if (map_offset < entry->vde_off) {
171 /* Do not tolerate overflow */
172 return (ERANGE);
173 }
174 /*
175 * XXXJOY: We could choose to search the list for duplicate
176 * names and toss an error. Since we're using the offset
177 * method for now, it does not make much of a difference.
178 */
179 }
180
181 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
182 entry->vde_segid = mseg->segid;
183 entry->vde_len = mseg->len;
184 entry->vde_off = map_offset;
185 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
186 list_insert_tail(&sc->vmm_devmem_list, entry);
187
188 return (0);
189 }
190
191 static boolean_t
192 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
193 off_t *map_offp)
194 {
195 list_t *dl = &sc->vmm_devmem_list;
196 vmm_devmem_entry_t *de = NULL;
197 const off_t map_end = off + len;
198
199 VERIFY(off >= VM_DEVMEM_START);
200
201 if (map_end < off) {
202 /* No match on overflow */
203 return (B_FALSE);
204 }
205
206 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
207 const off_t item_end = de->vde_off + de->vde_len;
208
209 if (de->vde_off <= off && item_end >= map_end) {
210 *segidp = de->vde_segid;
211 *map_offp = off - de->vde_off;
212 return (B_TRUE);
213 }
214 }
215 return (B_FALSE);
216 }
217
218 static void
219 vmmdev_devmem_purge(vmm_softc_t *sc)
220 {
221 vmm_devmem_entry_t *entry;
222
223 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
224 kmem_free(entry, sizeof (*entry));
225 }
226 }
227
228 static int
229 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
230 {
231 int error;
232 bool sysmem = true;
233
234 if (VM_MEMSEG_NAME(mseg)) {
235 sysmem = false;
236 }
237 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
238
239 if (error == 0 && VM_MEMSEG_NAME(mseg)) {
240 /*
241 * Rather than create a whole fresh device from which userspace
242 * can mmap this segment, instead make it available at an
243 * offset above where the main guest memory resides.
244 */
245 error = vmmdev_devmem_create(sc, mseg, mseg->name);
246 if (error != 0) {
247 vm_free_memseg(sc->vmm_vm, mseg->segid);
248 }
249 }
250 return (error);
251 }
252
253 /*
254 * Resource Locking and Exclusion
255 *
256 * Much of bhyve depends on key portions of VM state, such as the guest memory
257 * map, to remain unchanged while the guest is running. As ported from
258 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
259 * access to the instance vCPUs. Threads acting on a single vCPU, like those
260 * performing the work of actually running the guest in VMX/SVM, would lock
261 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
262 * state, all of the vCPUs would be first locked, ensuring that the
263 * operation(s) could complete without any other threads stumbling into
264 * intermediate states.
265 *
266 * This approach is largely effective for bhyve. Common operations, such as
267 * running the vCPUs, steer clear of lock contention. The model begins to
268 * break down for operations which do not occur in the context of a specific
269 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
270 * thread in the bhyve process. In order to properly protect those vCPU-less
271 * operations from encountering invalid states, additional locking is required.
272 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
273 * It does mean that class of operations will be serialized on locking the
274 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
275 * undue contention on the VM_MAXCPU-1 vCPU.
276 *
277 * In order to address the shortcomings of this model, the concept of a
278 * read/write lock has been added to bhyve. Operations which change
279 * fundamental aspects of a VM (such as the memory map) must acquire the write
280 * lock, which also implies locking all of the vCPUs and waiting for all read
281 * lock holders to release. While it increases the cost and waiting time for
282 * those few operations, it allows most hot-path operations on the VM (which
283 * depend on its configuration remaining stable) to occur with minimal locking.
284 *
285 * Consumers of the Driver API (see below) are a special case when it comes to
286 * this locking, since they may hold a read lock via the drv_lease mechanism
287 * for an extended period of time. Rather than forcing those consumers to
288 * continuously poll for a write lock attempt, the lease system forces them to
289 * provide a release callback to trigger their clean-up (and potential later
290 * reacquisition) of the read lock.
291 */
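/*
 * A minimal sketch of how the two lock classes are intended to be used
 * (illustrative only):
 *
 *	vmm_read_lock(sc);
 *	(query state which depends on a stable configuration,
 *	 e.g. vm_mmap_getnext())
 *	vmm_read_unlock(sc);
 *
 *	vmm_write_lock(sc);
 *	(change fundamental VM state, e.g. vm_mmap_memseg();
 *	 all vCPUs are frozen and leases expired for the duration)
 *	vmm_write_unlock(sc);
 *
 * vmmdev_do_ioctl() below maps each ioctl onto one of these classes via its
 * LOCK_READ_HOLD and LOCK_WRITE_HOLD acquisitions (or LOCK_VCPU for per-vCPU
 * operations).
 */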
292
293 static void
294 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
295 {
296 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
297
298 /*
	 * Since this state transition uses from_idle=true, it should not
	 * fail, but rather block until it can succeed.
301 */
302 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
303 }
304
305 static void
306 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
307 {
308 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
309
310 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
311 vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
312 }
313
314 static void
315 vmm_read_lock(vmm_softc_t *sc)
316 {
317 rw_enter(&sc->vmm_rwlock, RW_READER);
318 }
319
320 static void
321 vmm_read_unlock(vmm_softc_t *sc)
322 {
323 rw_exit(&sc->vmm_rwlock);
324 }
325
326 static void
327 vmm_write_lock(vmm_softc_t *sc)
328 {
329 int maxcpus;
330
331 /* First lock all the vCPUs */
332 maxcpus = vm_get_maxcpus(sc->vmm_vm);
333 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
334 vcpu_lock_one(sc, vcpu);
335 }
336
337 mutex_enter(&sc->vmm_lease_lock);
338 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
339 sc->vmm_lease_blocker++;
340 if (sc->vmm_lease_blocker == 1) {
341 list_t *list = &sc->vmm_lease_list;
342 vmm_lease_t *lease = list_head(list);
343
344 while (lease != NULL) {
345 boolean_t sync_break = B_FALSE;
346
347 if (!lease->vml_expired) {
348 void *arg = lease->vml_expire_arg;
349 lease->vml_expired = B_TRUE;
350 sync_break = lease->vml_expire_func(arg);
351 }
352
353 if (sync_break) {
354 vmm_lease_t *next;
355
356 /*
357 * These leases which are synchronously broken
358 * result in vmm_read_unlock() calls from a
359 * different thread than the corresponding
360 * vmm_read_lock(). This is acceptable, given
361 * that the rwlock underpinning the whole
362 * mechanism tolerates the behavior. This
363 * flexibility is _only_ afforded to VM read
364 * lock (RW_READER) holders.
365 */
366 next = list_next(list, lease);
367 vmm_lease_break_locked(sc, lease);
368 lease = next;
369 } else {
370 lease = list_next(list, lease);
371 }
372 }
373 }
374 mutex_exit(&sc->vmm_lease_lock);
375
376 rw_enter(&sc->vmm_rwlock, RW_WRITER);
377 /*
378 * For now, the 'maxcpus' value for an instance is fixed at the
379 * compile-time constant of VM_MAXCPU at creation. If this changes in
380 * the future, allowing for dynamic vCPU resource sizing, acquisition
381 * of the write lock will need to be wary of such changes.
382 */
383 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
384 }
385
386 static void
387 vmm_write_unlock(vmm_softc_t *sc)
388 {
389 int maxcpus;
390
391 mutex_enter(&sc->vmm_lease_lock);
392 VERIFY3U(sc->vmm_lease_blocker, !=, 0);
393 sc->vmm_lease_blocker--;
394 if (sc->vmm_lease_blocker == 0) {
395 cv_broadcast(&sc->vmm_lease_cv);
396 }
397 mutex_exit(&sc->vmm_lease_lock);
398
399 /*
400 * The VM write lock _must_ be released from the same thread it was
401 * acquired in, unlike the read lock.
402 */
403 VERIFY(rw_write_held(&sc->vmm_rwlock));
404 rw_exit(&sc->vmm_rwlock);
405
406 /* Unlock all the vCPUs */
407 maxcpus = vm_get_maxcpus(sc->vmm_vm);
408 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
409 vcpu_unlock_one(sc, vcpu);
410 }
411 }
412
413 static int
414 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
415 cred_t *credp, int *rvalp)
416 {
417 int error = 0, vcpu = -1;
418 void *datap = (void *)arg;
419 enum vm_lock_type {
420 LOCK_NONE = 0,
421 LOCK_VCPU,
422 LOCK_READ_HOLD,
423 LOCK_WRITE_HOLD
424 } lock_type = LOCK_NONE;
425
426 /* Acquire any exclusion resources needed for the operation. */
427 switch (cmd) {
428 case VM_RUN:
429 case VM_GET_REGISTER:
430 case VM_SET_REGISTER:
431 case VM_GET_SEGMENT_DESCRIPTOR:
432 case VM_SET_SEGMENT_DESCRIPTOR:
433 case VM_GET_REGISTER_SET:
434 case VM_SET_REGISTER_SET:
435 case VM_INJECT_EXCEPTION:
436 case VM_GET_CAPABILITY:
437 case VM_SET_CAPABILITY:
438 case VM_PPTDEV_MSI:
439 case VM_PPTDEV_MSIX:
440 case VM_SET_X2APIC_STATE:
441 case VM_GLA2GPA:
442 case VM_GLA2GPA_NOFAULT:
443 case VM_ACTIVATE_CPU:
444 case VM_SET_INTINFO:
445 case VM_GET_INTINFO:
446 case VM_RESTART_INSTRUCTION:
447 case VM_SET_KERNEMU_DEV:
448 case VM_GET_KERNEMU_DEV:
449 case VM_RESET_CPU:
450 case VM_GET_RUN_STATE:
451 case VM_SET_RUN_STATE:
452 /*
453 * Copy in the ID of the vCPU chosen for this operation.
454 * Since a nefarious caller could update their struct between
455 * this locking and when the rest of the ioctl data is copied
456 * in, it is _critical_ that this local 'vcpu' variable be used
457 * rather than the in-struct one when performing the ioctl.
458 */
459 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
460 return (EFAULT);
461 }
		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
463 return (EINVAL);
464 }
465 vcpu_lock_one(sc, vcpu);
466 lock_type = LOCK_VCPU;
467 break;
468
469 case VM_REINIT:
470 case VM_BIND_PPTDEV:
471 case VM_UNBIND_PPTDEV:
472 case VM_MAP_PPTDEV_MMIO:
473 case VM_UNMAP_PPTDEV_MMIO:
474 case VM_ALLOC_MEMSEG:
475 case VM_MMAP_MEMSEG:
476 case VM_MUNMAP_MEMSEG:
477 case VM_WRLOCK_CYCLE:
478 case VM_PMTMR_LOCATE:
479 case VM_ARC_RESV:
480 vmm_write_lock(sc);
481 lock_type = LOCK_WRITE_HOLD;
482 break;
483
484 case VM_GET_GPA_PMAP:
485 case VM_GET_MEMSEG:
486 case VM_MMAP_GETNEXT:
487 case VM_LAPIC_IRQ:
488 case VM_INJECT_NMI:
489 case VM_IOAPIC_ASSERT_IRQ:
490 case VM_IOAPIC_DEASSERT_IRQ:
491 case VM_IOAPIC_PULSE_IRQ:
492 case VM_LAPIC_MSI:
493 case VM_LAPIC_LOCAL_IRQ:
494 case VM_GET_X2APIC_STATE:
495 case VM_RTC_READ:
496 case VM_RTC_WRITE:
497 case VM_RTC_SETTIME:
498 case VM_RTC_GETTIME:
499 case VM_PPTDEV_DISABLE_MSIX:
500 case VM_DEVMEM_GETOFFSET:
501 vmm_read_lock(sc);
502 lock_type = LOCK_READ_HOLD;
503 break;
504
505 case VM_IOAPIC_PINCOUNT:
506 default:
507 break;
508 }
509
510 /* Execute the primary logic for the ioctl. */
511 switch (cmd) {
512 case VM_RUN: {
513 struct vm_entry entry;
514
515 if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
516 error = EFAULT;
517 break;
518 }
519
520 if (!(curthread->t_schedflag & TS_VCPU))
521 smt_mark_as_vcpu();
522
523 error = vm_run(sc->vmm_vm, vcpu, &entry);
524
525 /*
526 * Unexpected states in vm_run() are expressed through positive
527 * errno-oriented return values. VM states which expect further
528 * processing in userspace (necessary context via exitinfo) are
529 * expressed through negative return values. For the time being
530 * a return value of 0 is not expected from vm_run().
531 */
532 ASSERT(error != 0);
533 if (error < 0) {
534 const struct vm_exit *vme;
535 void *outp = entry.exit_data;
536
537 error = 0;
538 vme = vm_exitinfo(sc->vmm_vm, vcpu);
539 if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
540 error = EFAULT;
541 }
542 }
543 break;
544 }
545 case VM_SUSPEND: {
546 struct vm_suspend vmsuspend;
547
548 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
549 error = EFAULT;
550 break;
551 }
552 error = vm_suspend(sc->vmm_vm, vmsuspend.how);
553 break;
554 }
555 case VM_REINIT:
556 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
557 /*
558 * The VM instance should be free of driver-attached
559 * hooks during the reinitialization process.
560 */
561 break;
562 }
563 error = vm_reinit(sc->vmm_vm);
564 (void) vmm_drv_block_hook(sc, B_FALSE);
565 break;
566 case VM_STAT_DESC: {
567 struct vm_stat_desc statdesc;
568
569 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
570 error = EFAULT;
571 break;
572 }
573 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
574 sizeof (statdesc.desc));
575 if (error == 0 &&
576 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
577 error = EFAULT;
578 break;
579 }
580 break;
581 }
582 case VM_STATS_IOC: {
583 struct vm_stats vmstats;
584
585 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
586 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
587 error = EFAULT;
588 break;
589 }
590 hrt2tv(gethrtime(), &vmstats.tv);
591 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
592 &vmstats.num_entries, vmstats.statbuf);
593 if (error == 0 &&
594 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
595 error = EFAULT;
596 break;
597 }
598 break;
599 }
600
601 case VM_PPTDEV_MSI: {
602 struct vm_pptdev_msi pptmsi;
603
604 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
605 error = EFAULT;
606 break;
607 }
608 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
609 pptmsi.addr, pptmsi.msg, pptmsi.numvec);
610 break;
611 }
612 case VM_PPTDEV_MSIX: {
613 struct vm_pptdev_msix pptmsix;
614
615 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
616 error = EFAULT;
617 break;
618 }
619 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
620 pptmsix.idx, pptmsix.addr, pptmsix.msg,
621 pptmsix.vector_control);
622 break;
623 }
624 case VM_PPTDEV_DISABLE_MSIX: {
625 struct vm_pptdev pptdev;
626
627 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
628 error = EFAULT;
629 break;
630 }
631 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
632 break;
633 }
634 case VM_MAP_PPTDEV_MMIO: {
635 struct vm_pptdev_mmio pptmmio;
636
637 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
638 error = EFAULT;
639 break;
640 }
641 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
642 pptmmio.len, pptmmio.hpa);
643 break;
644 }
645 case VM_UNMAP_PPTDEV_MMIO: {
646 struct vm_pptdev_mmio pptmmio;
647
648 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
649 error = EFAULT;
650 break;
651 }
652 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
653 pptmmio.len);
654 break;
655 }
656 case VM_BIND_PPTDEV: {
657 struct vm_pptdev pptdev;
658
659 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
660 error = EFAULT;
661 break;
662 }
663 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
664 break;
665 }
666 case VM_UNBIND_PPTDEV: {
667 struct vm_pptdev pptdev;
668
669 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
670 error = EFAULT;
671 break;
672 }
673 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
674 break;
675 }
676 case VM_GET_PPTDEV_LIMITS: {
677 struct vm_pptdev_limits pptlimits;
678
679 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
680 error = EFAULT;
681 break;
682 }
683 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
684 &pptlimits.msi_limit, &pptlimits.msix_limit);
685 if (error == 0 &&
686 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
687 error = EFAULT;
688 break;
689 }
690 break;
691 }
692 case VM_INJECT_EXCEPTION: {
693 struct vm_exception vmexc;
694 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
695 error = EFAULT;
696 break;
697 }
698 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
699 vmexc.error_code_valid, vmexc.error_code,
700 vmexc.restart_instruction);
701 break;
702 }
703 case VM_INJECT_NMI: {
704 struct vm_nmi vmnmi;
705
706 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
707 error = EFAULT;
708 break;
709 }
710 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
711 break;
712 }
713 case VM_LAPIC_IRQ: {
714 struct vm_lapic_irq vmirq;
715
716 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
717 error = EFAULT;
718 break;
719 }
720 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
721 break;
722 }
723 case VM_LAPIC_LOCAL_IRQ: {
724 struct vm_lapic_irq vmirq;
725
726 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
727 error = EFAULT;
728 break;
729 }
730 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
731 vmirq.vector);
732 break;
733 }
734 case VM_LAPIC_MSI: {
735 struct vm_lapic_msi vmmsi;
736
737 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
738 error = EFAULT;
739 break;
740 }
741 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
742 break;
743 }
744
745 case VM_IOAPIC_ASSERT_IRQ: {
746 struct vm_ioapic_irq ioapic_irq;
747
748 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
749 error = EFAULT;
750 break;
751 }
752 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
753 break;
754 }
755 case VM_IOAPIC_DEASSERT_IRQ: {
756 struct vm_ioapic_irq ioapic_irq;
757
758 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
759 error = EFAULT;
760 break;
761 }
762 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
763 break;
764 }
765 case VM_IOAPIC_PULSE_IRQ: {
766 struct vm_ioapic_irq ioapic_irq;
767
768 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
769 error = EFAULT;
770 break;
771 }
772 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
773 break;
774 }
775 case VM_IOAPIC_PINCOUNT: {
776 int pincount;
777
778 pincount = vioapic_pincount(sc->vmm_vm);
779 if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
780 error = EFAULT;
781 break;
782 }
783 break;
784 }
785
786 case VM_ISA_ASSERT_IRQ: {
787 struct vm_isa_irq isa_irq;
788
789 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
790 error = EFAULT;
791 break;
792 }
793 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
794 if (error == 0 && isa_irq.ioapic_irq != -1) {
795 error = vioapic_assert_irq(sc->vmm_vm,
796 isa_irq.ioapic_irq);
797 }
798 break;
799 }
800 case VM_ISA_DEASSERT_IRQ: {
801 struct vm_isa_irq isa_irq;
802
803 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
804 error = EFAULT;
805 break;
806 }
807 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
808 if (error == 0 && isa_irq.ioapic_irq != -1) {
809 error = vioapic_deassert_irq(sc->vmm_vm,
810 isa_irq.ioapic_irq);
811 }
812 break;
813 }
814 case VM_ISA_PULSE_IRQ: {
815 struct vm_isa_irq isa_irq;
816
817 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
818 error = EFAULT;
819 break;
820 }
821 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
822 if (error == 0 && isa_irq.ioapic_irq != -1) {
823 error = vioapic_pulse_irq(sc->vmm_vm,
824 isa_irq.ioapic_irq);
825 }
826 break;
827 }
828 case VM_ISA_SET_IRQ_TRIGGER: {
829 struct vm_isa_irq_trigger isa_irq_trigger;
830
831 if (ddi_copyin(datap, &isa_irq_trigger,
832 sizeof (isa_irq_trigger), md)) {
833 error = EFAULT;
834 break;
835 }
836 error = vatpic_set_irq_trigger(sc->vmm_vm,
837 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
838 break;
839 }
840
841 case VM_MMAP_GETNEXT: {
842 struct vm_memmap mm;
843
844 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
845 error = EFAULT;
846 break;
847 }
848 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
849 &mm.segoff, &mm.len, &mm.prot, &mm.flags);
850 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
851 error = EFAULT;
852 break;
853 }
854 break;
855 }
856 case VM_MMAP_MEMSEG: {
857 struct vm_memmap mm;
858
859 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
860 error = EFAULT;
861 break;
862 }
863 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
864 mm.len, mm.prot, mm.flags);
865 break;
866 }
867 case VM_MUNMAP_MEMSEG: {
868 struct vm_munmap mu;
869
870 if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
871 error = EFAULT;
872 break;
873 }
874 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
875 break;
876 }
877 case VM_ALLOC_MEMSEG: {
878 struct vm_memseg vmseg;
879
880 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
881 error = EFAULT;
882 break;
883 }
884 error = vmmdev_alloc_memseg(sc, &vmseg);
885 break;
886 }
887 case VM_GET_MEMSEG: {
888 struct vm_memseg vmseg;
889
890 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
891 error = EFAULT;
892 break;
893 }
894 error = vmmdev_get_memseg(sc, &vmseg);
895 if (error == 0 &&
896 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
897 error = EFAULT;
898 break;
899 }
900 break;
901 }
902 case VM_GET_REGISTER: {
903 struct vm_register vmreg;
904
905 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
906 error = EFAULT;
907 break;
908 }
909 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
910 &vmreg.regval);
911 if (error == 0 &&
912 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
913 error = EFAULT;
914 break;
915 }
916 break;
917 }
918 case VM_SET_REGISTER: {
919 struct vm_register vmreg;
920
921 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
922 error = EFAULT;
923 break;
924 }
925 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
926 vmreg.regval);
927 break;
928 }
929 case VM_SET_SEGMENT_DESCRIPTOR: {
930 struct vm_seg_desc vmsegd;
931
932 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
933 error = EFAULT;
934 break;
935 }
936 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
937 &vmsegd.desc);
938 break;
939 }
940 case VM_GET_SEGMENT_DESCRIPTOR: {
941 struct vm_seg_desc vmsegd;
942
943 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
944 error = EFAULT;
945 break;
946 }
947 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
948 &vmsegd.desc);
949 if (error == 0 &&
950 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
951 error = EFAULT;
952 break;
953 }
954 break;
955 }
956 case VM_GET_REGISTER_SET: {
957 struct vm_register_set vrs;
958 int regnums[VM_REG_LAST];
959 uint64_t regvals[VM_REG_LAST];
960
961 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
962 error = EFAULT;
963 break;
964 }
965 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
966 error = EINVAL;
967 break;
968 }
969 if (ddi_copyin(vrs.regnums, regnums,
970 sizeof (int) * vrs.count, md)) {
971 error = EFAULT;
972 break;
973 }
974
975 error = 0;
976 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
977 if (regnums[i] < 0) {
978 error = EINVAL;
979 break;
980 }
981 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
			    &regvals[i]);
983 }
984 if (error == 0 && ddi_copyout(regvals, vrs.regvals,
985 sizeof (uint64_t) * vrs.count, md)) {
986 error = EFAULT;
987 }
988 break;
989 }
990 case VM_SET_REGISTER_SET: {
991 struct vm_register_set vrs;
992 int regnums[VM_REG_LAST];
993 uint64_t regvals[VM_REG_LAST];
994
995 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
996 error = EFAULT;
997 break;
998 }
999 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1000 error = EINVAL;
1001 break;
1002 }
1003 if (ddi_copyin(vrs.regnums, regnums,
1004 sizeof (int) * vrs.count, md)) {
1005 error = EFAULT;
1006 break;
1007 }
1008 if (ddi_copyin(vrs.regvals, regvals,
1009 sizeof (uint64_t) * vrs.count, md)) {
1010 error = EFAULT;
1011 break;
1012 }
1013
1014 error = 0;
1015 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1016 /*
1017 * Setting registers in a set is not atomic, since a
1018 * failure in the middle of the set will cause a
1019 * bail-out and inconsistent register state. Callers
1020 * should be wary of this.
1021 */
1022 if (regnums[i] < 0) {
1023 error = EINVAL;
1024 break;
1025 }
1026 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1027 regvals[i]);
1028 }
1029 break;
1030 }
1031 case VM_RESET_CPU: {
1032 struct vm_vcpu_reset vvr;
1033
1034 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1035 error = EFAULT;
1036 break;
1037 }
		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
			error = EINVAL;
			break;
		}
1041
1042 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1043 break;
1044 }
1045 case VM_GET_RUN_STATE: {
1046 struct vm_run_state vrs;
1047
1048 bzero(&vrs, sizeof (vrs));
1049 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1050 &vrs.sipi_vector);
1051 if (error == 0) {
1052 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1053 error = EFAULT;
1054 break;
1055 }
1056 }
1057 break;
1058 }
1059 case VM_SET_RUN_STATE: {
1060 struct vm_run_state vrs;
1061
1062 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1063 error = EFAULT;
1064 break;
1065 }
1066 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1067 vrs.sipi_vector);
1068 break;
1069 }
1070
1071 case VM_SET_KERNEMU_DEV:
1072 case VM_GET_KERNEMU_DEV: {
1073 struct vm_readwrite_kernemu_device kemu;
1074 size_t size = 0;
1075
1076 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1077 error = EFAULT;
1078 break;
1079 }
1080
1081 if (kemu.access_width > 3) {
1082 error = EINVAL;
1083 break;
1084 }
1085 size = (1 << kemu.access_width);
1086 ASSERT(size >= 1 && size <= 8);
1087
1088 if (cmd == VM_SET_KERNEMU_DEV) {
1089 error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1090 kemu.gpa, kemu.value, size);
1091 } else {
1092 error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1093 kemu.gpa, &kemu.value, size);
1094 }
1095
1096 if (error == 0) {
1097 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1098 error = EFAULT;
1099 break;
1100 }
1101 }
1102 break;
1103 }
1104
1105 case VM_GET_CAPABILITY: {
1106 struct vm_capability vmcap;
1107
1108 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1109 error = EFAULT;
1110 break;
1111 }
1112 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1113 &vmcap.capval);
1114 if (error == 0 &&
1115 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1116 error = EFAULT;
1117 break;
1118 }
1119 break;
1120 }
1121 case VM_SET_CAPABILITY: {
1122 struct vm_capability vmcap;
1123
1124 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1125 error = EFAULT;
1126 break;
1127 }
1128 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1129 vmcap.capval);
1130 break;
1131 }
1132 case VM_SET_X2APIC_STATE: {
1133 struct vm_x2apic x2apic;
1134
1135 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1136 error = EFAULT;
1137 break;
1138 }
1139 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1140 break;
1141 }
1142 case VM_GET_X2APIC_STATE: {
1143 struct vm_x2apic x2apic;
1144
1145 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1146 error = EFAULT;
1147 break;
1148 }
1149 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1150 &x2apic.state);
1151 if (error == 0 &&
1152 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1153 error = EFAULT;
1154 break;
1155 }
1156 break;
1157 }
1158 case VM_GET_GPA_PMAP: {
1159 struct vm_gpa_pte gpapte;
1160
1161 if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
1162 error = EFAULT;
1163 break;
1164 }
1165 #ifdef __FreeBSD__
1166 /* XXXJOY: add function? */
1167 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
1168 gpapte.gpa, gpapte.pte, &gpapte.ptenum);
1169 #endif
1170 error = 0;
1171 break;
1172 }
1173 case VM_GET_HPET_CAPABILITIES: {
1174 struct vm_hpet_cap hpetcap;
1175
1176 error = vhpet_getcap(&hpetcap);
1177 if (error == 0 &&
1178 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1179 error = EFAULT;
1180 break;
1181 }
1182 break;
1183 }
1184 case VM_GLA2GPA: {
1185 struct vm_gla2gpa gg;
1186
1187 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1188 error = EFAULT;
1189 break;
1190 }
1191 gg.vcpuid = vcpu;
1192 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1193 gg.prot, &gg.gpa, &gg.fault);
1194 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1195 error = EFAULT;
1196 break;
1197 }
1198 break;
1199 }
1200 case VM_GLA2GPA_NOFAULT: {
1201 struct vm_gla2gpa gg;
1202
1203 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1204 error = EFAULT;
1205 break;
1206 }
1207 gg.vcpuid = vcpu;
1208 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1209 gg.gla, gg.prot, &gg.gpa, &gg.fault);
1210 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1211 error = EFAULT;
1212 break;
1213 }
1214 break;
1215 }
1216
1217 case VM_ACTIVATE_CPU:
1218 error = vm_activate_cpu(sc->vmm_vm, vcpu);
1219 break;
1220
1221 case VM_SUSPEND_CPU:
1222 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1223 error = EFAULT;
1224 } else {
1225 error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1226 }
1227 break;
1228
1229 case VM_RESUME_CPU:
1230 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1231 error = EFAULT;
1232 } else {
1233 error = vm_resume_cpu(sc->vmm_vm, vcpu);
1234 }
1235 break;
1236
1237 case VM_GET_CPUS: {
1238 struct vm_cpuset vm_cpuset;
1239 cpuset_t tempset;
1240 void *srcp = &tempset;
1241 int size;
1242
1243 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1244 error = EFAULT;
1245 break;
1246 }
1247
1248 /* Be more generous about sizing since our cpuset_t is large. */
1249 size = vm_cpuset.cpusetsize;
		if (size <= 0 || size > sizeof (cpuset_t)) {
			error = ERANGE;
			break;
		}
1253 /*
1254 * If they want a ulong_t or less, make sure they receive the
1255 * low bits with all the useful information.
1256 */
1257 if (size <= sizeof (tempset.cpub[0])) {
1258 srcp = &tempset.cpub[0];
1259 }
1260
1261 if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1262 tempset = vm_active_cpus(sc->vmm_vm);
1263 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1264 tempset = vm_suspended_cpus(sc->vmm_vm);
1265 } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1266 tempset = vm_debug_cpus(sc->vmm_vm);
1267 } else {
1268 error = EINVAL;
1269 }
1270
1271 ASSERT(size > 0 && size <= sizeof (tempset));
1272 if (error == 0 &&
1273 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1274 error = EFAULT;
1275 break;
1276 }
1277 break;
1278 }
1279 case VM_SET_INTINFO: {
1280 struct vm_intinfo vmii;
1281
1282 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1283 error = EFAULT;
1284 break;
1285 }
1286 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1287 break;
1288 }
1289 case VM_GET_INTINFO: {
1290 struct vm_intinfo vmii;
1291
1292 vmii.vcpuid = vcpu;
1293 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1294 &vmii.info2);
1295 if (error == 0 &&
1296 ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1297 error = EFAULT;
1298 break;
1299 }
1300 break;
1301 }
1302 case VM_RTC_WRITE: {
1303 struct vm_rtc_data rtcdata;
1304
1305 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1306 error = EFAULT;
1307 break;
1308 }
1309 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1310 rtcdata.value);
1311 break;
1312 }
1313 case VM_RTC_READ: {
1314 struct vm_rtc_data rtcdata;
1315
1316 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1317 error = EFAULT;
1318 break;
1319 }
1320 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1321 &rtcdata.value);
1322 if (error == 0 &&
1323 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1324 error = EFAULT;
1325 break;
1326 }
1327 break;
1328 }
1329 case VM_RTC_SETTIME: {
1330 struct vm_rtc_time rtctime;
1331
1332 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1333 error = EFAULT;
1334 break;
1335 }
1336 error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1337 break;
1338 }
1339 case VM_RTC_GETTIME: {
1340 struct vm_rtc_time rtctime;
1341
1342 rtctime.secs = vrtc_get_time(sc->vmm_vm);
1343 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1344 error = EFAULT;
1345 break;
1346 }
1347 break;
1348 }
1349
1350 case VM_PMTMR_LOCATE: {
1351 uint16_t port = arg;
1352 error = vpmtmr_set_location(sc->vmm_vm, port);
1353 break;
1354 }
1355
1356 case VM_RESTART_INSTRUCTION:
1357 error = vm_restart_instruction(sc->vmm_vm, vcpu);
1358 break;
1359
1360 case VM_SET_TOPOLOGY: {
1361 struct vm_cpu_topology topo;
1362
1363 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1364 error = EFAULT;
1365 break;
1366 }
1367 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1368 topo.threads, topo.maxcpus);
1369 break;
1370 }
1371 case VM_GET_TOPOLOGY: {
1372 struct vm_cpu_topology topo;
1373
1374 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1375 &topo.threads, &topo.maxcpus);
1376 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1377 error = EFAULT;
1378 break;
1379 }
1380 break;
1381 }
1382
1383 case VM_DEVMEM_GETOFFSET: {
1384 struct vm_devmem_offset vdo;
1385 list_t *dl = &sc->vmm_devmem_list;
1386 vmm_devmem_entry_t *de = NULL;
1387
1388 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1389 error = EFAULT;
1390 break;
1391 }
1392
1393 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
1394 if (de->vde_segid == vdo.segid) {
1395 break;
1396 }
1397 }
1398 if (de != NULL) {
1399 vdo.offset = de->vde_off;
1400 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1401 error = EFAULT;
1402 }
1403 } else {
1404 error = ENOENT;
1405 }
1406 break;
1407 }
1408 case VM_WRLOCK_CYCLE: {
1409 /*
1410 * Present a test mechanism to acquire/release the write lock
1411 * on the VM without any other effects.
1412 */
1413 break;
1414 }
1415 case VM_ARC_RESV:
1416 error = vm_arc_resv(sc->vmm_vm, (uint64_t)arg);
1417 break;
1418 default:
1419 error = ENOTTY;
1420 break;
1421 }
1422
1423 /* Release exclusion resources */
1424 switch (lock_type) {
1425 case LOCK_NONE:
1426 break;
1427 case LOCK_VCPU:
1428 vcpu_unlock_one(sc, vcpu);
1429 break;
1430 case LOCK_READ_HOLD:
1431 vmm_read_unlock(sc);
1432 break;
1433 case LOCK_WRITE_HOLD:
1434 vmm_write_unlock(sc);
1435 break;
1436 default:
1437 panic("unexpected lock type");
1438 break;
1439 }
1440
1441 return (error);
1442 }
1443
1444 static vmm_softc_t *
1445 vmm_lookup(const char *name)
1446 {
1447 list_t *vml = &vmm_list;
1448 vmm_softc_t *sc;
1449
1450 ASSERT(MUTEX_HELD(&vmm_mtx));
1451
1452 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1453 if (strcmp(sc->vmm_name, name) == 0) {
1454 break;
1455 }
1456 }
1457
1458 return (sc);
1459 }
1460
1461 /*
1462 * Acquire an HMA registration if not already held.
1463 */
1464 static boolean_t
1465 vmm_hma_acquire(void)
1466 {
1467 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1468
1469 mutex_enter(&vmmdev_mtx);
1470
1471 if (vmmdev_hma_reg == NULL) {
1472 VERIFY3U(vmmdev_hma_ref, ==, 0);
1473 vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1474 if (vmmdev_hma_reg == NULL) {
1475 cmn_err(CE_WARN, "%s HMA registration failed.",
1476 vmmdev_hvm_name);
1477 mutex_exit(&vmmdev_mtx);
1478 return (B_FALSE);
1479 }
1480 }
1481
1482 vmmdev_hma_ref++;
1483
1484 mutex_exit(&vmmdev_mtx);
1485
1486 return (B_TRUE);
1487 }
1488
1489 /*
1490 * Release the HMA registration if held and there are no remaining VMs.
1491 */
1492 static void
1493 vmm_hma_release(void)
1494 {
1495 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1496
1497 mutex_enter(&vmmdev_mtx);
1498
1499 VERIFY3U(vmmdev_hma_ref, !=, 0);
1500
1501 vmmdev_hma_ref--;
1502
1503 if (vmmdev_hma_ref == 0) {
1504 VERIFY(vmmdev_hma_reg != NULL);
1505 hma_unregister(vmmdev_hma_reg);
1506 vmmdev_hma_reg = NULL;
1507 }
1508 mutex_exit(&vmmdev_mtx);
1509 }
1510
1511 static int
1512 vmmdev_do_vm_create(char *name, cred_t *cr)
1513 {
1514 vmm_softc_t *sc = NULL;
1515 minor_t minor;
1516 int error = ENOMEM;
1517
1518 if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
1519 return (EINVAL);
1520 }
1521
1522 if (!vmm_hma_acquire())
1523 return (ENXIO);
1524
1525 mutex_enter(&vmm_mtx);
1526
1527 /* Look for duplicate names */
1528 if (vmm_lookup(name) != NULL) {
1529 mutex_exit(&vmm_mtx);
1530 vmm_hma_release();
1531 return (EEXIST);
1532 }
1533
1534 /* Allow only one instance per non-global zone. */
1535 if (!INGLOBALZONE(curproc)) {
1536 for (sc = list_head(&vmm_list); sc != NULL;
1537 sc = list_next(&vmm_list, sc)) {
1538 if (sc->vmm_zone == curzone) {
1539 mutex_exit(&vmm_mtx);
1540 vmm_hma_release();
1541 return (EINVAL);
1542 }
1543 }
1544 }
1545
1546 minor = id_alloc(vmm_minors);
1547 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1548 goto fail;
1549 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1550 ddi_soft_state_free(vmm_statep, minor);
1551 goto fail;
1552 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1553 DDI_PSEUDO, 0) != DDI_SUCCESS) {
1554 goto fail;
1555 }
1556
1557 if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1558 goto fail;
1559 }
1560
1561 error = vm_create(name, &sc->vmm_vm);
1562 if (error == 0) {
		/* Complete VM initialization and report success. */
1564 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1565 sc->vmm_minor = minor;
1566 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1567 offsetof(vmm_devmem_entry_t, vde_node));
1568
1569 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1570 offsetof(vmm_hold_t, vmh_node));
1571 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1572
1573 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1574 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1575 offsetof(vmm_lease_t, vml_node));
1576 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1577 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1578
1579 sc->vmm_zone = crgetzone(cr);
1580 zone_hold(sc->vmm_zone);
1581 vmm_zsd_add_vm(sc);
1582 vmm_kstat_init(sc);
1583
1584 list_insert_tail(&vmm_list, sc);
1585 mutex_exit(&vmm_mtx);
1586 return (0);
1587 }
1588
1589 vmm_kstat_fini(sc);
1590 ddi_remove_minor_node(vmmdev_dip, name);
1591 fail:
1592 id_free(vmm_minors, minor);
1593 if (sc != NULL) {
1594 ddi_soft_state_free(vmm_statep, minor);
1595 }
1596 mutex_exit(&vmm_mtx);
1597 vmm_hma_release();
1598
1599 return (error);
1600 }
1601
1602 /*
1603 * Bhyve 'Driver' Interface
1604 *
1605 * While many devices are emulated in the bhyve userspace process, there are
1606 * others with performance constraints which require that they run mostly or
1607 * entirely in-kernel. For those not integrated directly into bhyve, an API is
1608 * needed so they can query/manipulate the portions of VM state needed to
1609 * fulfill their purpose.
1610 *
1611 * This includes:
1612 * - Translating guest-physical addresses to host-virtual pointers
1613 * - Injecting MSIs
1614 * - Hooking IO port addresses
1615 *
1616 * The vmm_drv interface exists to provide that functionality to its consumers.
1617 * (At this time, 'viona' is the only user)
1618 */
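/*
 * A minimal consumer-side sketch (illustrative only; 'fp', 'gpa', 'len', the
 * MSI address/data, and the expire callback are hypothetical, and error
 * handling is omitted):
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *
 *	if (vmm_drv_hold(fp, CRED(), &hold) != 0)
 *		return;
 *	lease = vmm_drv_lease_sign(hold, my_expire_cb, my_arg);
 *	if (lease != NULL) {
 *		void *kva = vmm_drv_gpa2kva(lease, gpa, len);
 *		(void) vmm_drv_msi(lease, msi_addr, msi_msg);
 *		vmm_drv_lease_break(hold, lease);
 *	}
 *	vmm_drv_rele(hold);
 *
 * The expire callback must cause the consumer to stop using the lease and
 * break it (or synchronously allow vmm to break it), so that a pending
 * vmm_write_lock() can make progress.
 */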
1619 int
1620 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1621 {
1622 vnode_t *vp = fp->f_vnode;
1623 const dev_t dev = vp->v_rdev;
1624 vmm_softc_t *sc;
1625 vmm_hold_t *hold;
1626 int err = 0;
1627
1628 if (vp->v_type != VCHR) {
1629 return (ENXIO);
1630 }
1631 const major_t major = getmajor(dev);
1632 const minor_t minor = getminor(dev);
1633
1634 mutex_enter(&vmmdev_mtx);
1635 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1636 mutex_exit(&vmmdev_mtx);
1637 return (ENOENT);
1638 }
1639 mutex_enter(&vmm_mtx);
1640 mutex_exit(&vmmdev_mtx);
1641
1642 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1643 err = ENOENT;
1644 goto out;
1645 }
1646 /* XXXJOY: check cred permissions against instance */
1647
1648 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1649 err = EBUSY;
1650 goto out;
1651 }
1652
1653 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1654 hold->vmh_sc = sc;
1655 hold->vmh_release_req = B_FALSE;
1656
1657 list_insert_tail(&sc->vmm_holds, hold);
1658 sc->vmm_flags |= VMM_HELD;
1659 *holdp = hold;
1660
1661 out:
1662 mutex_exit(&vmm_mtx);
1663 return (err);
1664 }
1665
1666 void
1667 vmm_drv_rele(vmm_hold_t *hold)
1668 {
1669 vmm_softc_t *sc;
1670
1671 ASSERT(hold != NULL);
1672 ASSERT(hold->vmh_sc != NULL);
1673 VERIFY(hold->vmh_ioport_hook_cnt == 0);
1674
1675 mutex_enter(&vmm_mtx);
1676 sc = hold->vmh_sc;
1677 list_remove(&sc->vmm_holds, hold);
1678 if (list_is_empty(&sc->vmm_holds)) {
1679 sc->vmm_flags &= ~VMM_HELD;
1680 cv_broadcast(&sc->vmm_cv);
1681 }
1682 mutex_exit(&vmm_mtx);
1683 kmem_free(hold, sizeof (*hold));
1684 }
1685
1686 boolean_t
1687 vmm_drv_release_reqd(vmm_hold_t *hold)
1688 {
1689 ASSERT(hold != NULL);
1690
1691 return (hold->vmh_release_req);
1692 }
1693
1694 vmm_lease_t *
1695 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1696 {
1697 vmm_softc_t *sc = hold->vmh_sc;
1698 vmm_lease_t *lease;
1699
1700 ASSERT3P(expiref, !=, NULL);
1701
1702 if (hold->vmh_release_req) {
1703 return (NULL);
1704 }
1705
1706 lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1707 list_link_init(&lease->vml_node);
1708 lease->vml_expire_func = expiref;
1709 lease->vml_expire_arg = arg;
1710 lease->vml_expired = B_FALSE;
1711 lease->vml_hold = hold;
1712 /* cache the VM pointer for one less pointer chase */
1713 lease->vml_vm = sc->vmm_vm;
1714
1715 mutex_enter(&sc->vmm_lease_lock);
1716 while (sc->vmm_lease_blocker != 0) {
1717 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1718 }
1719 list_insert_tail(&sc->vmm_lease_list, lease);
1720 vmm_read_lock(sc);
1721 mutex_exit(&sc->vmm_lease_lock);
1722
1723 return (lease);
1724 }
1725
1726 static void
1727 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1728 {
1729 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1730
1731 list_remove(&sc->vmm_lease_list, lease);
1732 vmm_read_unlock(sc);
1733 kmem_free(lease, sizeof (*lease));
1734 }
1735
1736 void
1737 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1738 {
1739 vmm_softc_t *sc = hold->vmh_sc;
1740
1741 VERIFY3P(hold, ==, lease->vml_hold);
1742
1743 mutex_enter(&sc->vmm_lease_lock);
1744 vmm_lease_break_locked(sc, lease);
1745 mutex_exit(&sc->vmm_lease_lock);
1746 }
1747
1748 boolean_t
1749 vmm_drv_lease_expired(vmm_lease_t *lease)
1750 {
1751 return (lease->vml_expired);
1752 }
1753
1754 void *
1755 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1756 {
1757 ASSERT(lease != NULL);
1758
1759 return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
1760 }
1761
1762 int
1763 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1764 {
1765 ASSERT(lease != NULL);
1766
1767 return (lapic_intr_msi(lease->vml_vm, addr, msg));
1768 }
1769
1770 int
1771 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
1772 void *arg, void **cookie)
1773 {
1774 vmm_softc_t *sc;
1775 int err;
1776
1777 ASSERT(hold != NULL);
1778 ASSERT(cookie != NULL);
1779
1780 sc = hold->vmh_sc;
1781 mutex_enter(&vmm_mtx);
1782 /* Confirm that hook installation is not blocked */
1783 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1784 mutex_exit(&vmm_mtx);
1785 return (EBUSY);
1786 }
1787 /*
1788 * Optimistically record an installed hook which will prevent a block
1789 * from being asserted while the mutex is dropped.
1790 */
1791 hold->vmh_ioport_hook_cnt++;
1792 mutex_exit(&vmm_mtx);
1793
1794 vmm_write_lock(sc);
1795 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
1796 arg, cookie);
1797 vmm_write_unlock(sc);
1798
1799 if (err != 0) {
1800 mutex_enter(&vmm_mtx);
1801 /* Walk back optimism about the hook installation */
1802 hold->vmh_ioport_hook_cnt--;
1803 mutex_exit(&vmm_mtx);
1804 }
1805 return (err);
1806 }
1807
1808 void
1809 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1810 {
1811 vmm_softc_t *sc;
1812
1813 ASSERT(hold != NULL);
1814 ASSERT(cookie != NULL);
1815 ASSERT(hold->vmh_ioport_hook_cnt != 0);
1816
1817 sc = hold->vmh_sc;
1818 vmm_write_lock(sc);
1819 vm_ioport_unhook(sc->vmm_vm, cookie);
1820 vmm_write_unlock(sc);
1821
1822 mutex_enter(&vmm_mtx);
1823 hold->vmh_ioport_hook_cnt--;
1824 mutex_exit(&vmm_mtx);
1825 }
1826
1827 static int
1828 vmm_drv_purge(vmm_softc_t *sc)
1829 {
1830 ASSERT(MUTEX_HELD(&vmm_mtx));
1831
1832 if ((sc->vmm_flags & VMM_HELD) != 0) {
1833 vmm_hold_t *hold;
1834
1835 sc->vmm_flags |= VMM_CLEANUP;
1836 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1837 hold = list_next(&sc->vmm_holds, hold)) {
1838 hold->vmh_release_req = B_TRUE;
1839 }
1840 while ((sc->vmm_flags & VMM_HELD) != 0) {
1841 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1842 return (EINTR);
1843 }
1844 }
1845 sc->vmm_flags &= ~VMM_CLEANUP;
1846 }
1847
1848 VERIFY(list_is_empty(&sc->vmm_holds));
1849 sc->vmm_flags |= VMM_PURGED;
1850 return (0);
1851 }
1852
1853 static int
1854 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1855 {
1856 int err = 0;
1857
1858 mutex_enter(&vmm_mtx);
1859 if (!enable_block) {
1860 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1861
1862 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1863 goto done;
1864 }
1865
1866 /* If any holds have hooks installed, the block is a failure */
1867 if (!list_is_empty(&sc->vmm_holds)) {
1868 vmm_hold_t *hold;
1869
1870 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1871 hold = list_next(&sc->vmm_holds, hold)) {
1872 if (hold->vmh_ioport_hook_cnt != 0) {
1873 err = EBUSY;
1874 goto done;
1875 }
1876 }
1877 }
1878 sc->vmm_flags |= VMM_BLOCK_HOOK;
1879
1880 done:
1881 mutex_exit(&vmm_mtx);
1882 return (err);
1883 }
1884
1885 static int
1886 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1887 boolean_t *hma_release)
1888 {
1889 dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
1890 minor_t minor;
1891
1892 ASSERT(MUTEX_HELD(&vmm_mtx));
1893
1894 *hma_release = B_FALSE;
1895
1896 if (vmm_drv_purge(sc) != 0) {
1897 return (EINTR);
1898 }
1899
1900 if (clean_zsd) {
1901 vmm_zsd_rem_vm(sc);
1902 }
1903
1904 /* Clean up devmem entries */
1905 vmmdev_devmem_purge(sc);
1906
1907 list_remove(&vmm_list, sc);
1908 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
1909 minor = sc->vmm_minor;
1910 zone_rele(sc->vmm_zone);
1911 if (sc->vmm_is_open) {
1912 list_insert_tail(&vmm_destroy_list, sc);
1913 sc->vmm_flags |= VMM_DESTROY;
1914 } else {
1915 vm_destroy(sc->vmm_vm);
1916 vmm_kstat_fini(sc);
1917 ddi_soft_state_free(vmm_statep, minor);
1918 id_free(vmm_minors, minor);
1919 *hma_release = B_TRUE;
1920 }
1921 (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
1922
1923 return (0);
1924 }
1925
1926 int
1927 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
1928 {
1929 boolean_t hma_release = B_FALSE;
1930 int err;
1931
1932 mutex_enter(&vmm_mtx);
1933 err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
1934 mutex_exit(&vmm_mtx);
1935
1936 if (hma_release)
1937 vmm_hma_release();
1938
1939 return (err);
1940 }
1941
1942 /* ARGSUSED */
1943 static int
1944 vmmdev_do_vm_destroy(const char *name, cred_t *cr)
1945 {
1946 boolean_t hma_release = B_FALSE;
1947 vmm_softc_t *sc;
1948 int err;
1949
1950 if (crgetuid(cr) != 0)
1951 return (EPERM);
1952
1953 mutex_enter(&vmm_mtx);
1954
1955 if ((sc = vmm_lookup(name)) == NULL) {
1956 mutex_exit(&vmm_mtx);
1957 return (ENOENT);
1958 }
1959 /*
1960 * We don't check this in vmm_lookup() since that function is also used
1961 * for validation during create and currently vmm names must be unique.
1962 */
1963 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
1964 mutex_exit(&vmm_mtx);
1965 return (EPERM);
1966 }
1967 err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
1968
1969 mutex_exit(&vmm_mtx);
1970
1971 if (hma_release)
1972 vmm_hma_release();
1973
1974 return (err);
1975 }
1976
1977 #define VCPU_NAME_BUFLEN 32
1978
1979 static int
1980 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
1981 {
1982 zoneid_t zid = crgetzoneid(cr);
1983 int instance = minor;
1984 kstat_t *ksp;
1985
1986 ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
1987
1988 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
1989 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
1990 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
1991
1992 if (ksp == NULL) {
1993 return (-1);
1994 }
1995 sc->vmm_kstat_vm = ksp;
1996
1997 for (uint_t i = 0; i < VM_MAXCPU; i++) {
1998 char namebuf[VCPU_NAME_BUFLEN];
1999
2000 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2001
2002 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2003 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2004 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2005 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2006 0, zid);
2007 if (ksp == NULL) {
2008 goto fail;
2009 }
2010
2011 sc->vmm_kstat_vcpu[i] = ksp;
2012 }
2013
2014 /*
2015 * If this instance is associated with a non-global zone, make its
2016 * kstats visible from the GZ.
2017 */
2018 if (zid != GLOBAL_ZONEID) {
2019 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2020 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2021 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2022 }
2023 }
2024
2025 return (0);
2026
2027 fail:
2028 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2029 if (sc->vmm_kstat_vcpu[i] != NULL) {
2030 kstat_delete(sc->vmm_kstat_vcpu[i]);
2031 sc->vmm_kstat_vcpu[i] = NULL;
2032 } else {
2033 break;
2034 }
2035 }
2036 kstat_delete(sc->vmm_kstat_vm);
2037 sc->vmm_kstat_vm = NULL;
2038 return (-1);
2039 }
2040
2041 static void
2042 vmm_kstat_init(vmm_softc_t *sc)
2043 {
2044 kstat_t *ksp;
2045
2046 ASSERT3P(sc->vmm_vm, !=, NULL);
2047 ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2048
2049 ksp = sc->vmm_kstat_vm;
2050 vmm_kstats_t *vk = ksp->ks_data;
2051 ksp->ks_private = sc->vmm_vm;
2052 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2053 kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2054
2055 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2056 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2057
2058 ksp = sc->vmm_kstat_vcpu[i];
2059 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2060
2061 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2062 vvk->vvk_vcpu.value.ui32 = i;
2063 kstat_named_init(&vvk->vvk_time_init, "time_init",
2064 KSTAT_DATA_UINT64);
2065 kstat_named_init(&vvk->vvk_time_run, "time_run",
2066 KSTAT_DATA_UINT64);
2067 kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2068 KSTAT_DATA_UINT64);
2069 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2070 KSTAT_DATA_UINT64);
2071 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2072 KSTAT_DATA_UINT64);
2073 kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2074 KSTAT_DATA_UINT64);
2075 ksp->ks_private = sc->vmm_vm;
2076 ksp->ks_update = vmm_kstat_update_vcpu;
2077 }
2078
2079 kstat_install(sc->vmm_kstat_vm);
2080 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2081 kstat_install(sc->vmm_kstat_vcpu[i]);
2082 }
2083 }
2084
2085 static void
2086 vmm_kstat_fini(vmm_softc_t *sc)
2087 {
2088 ASSERT(sc->vmm_kstat_vm != NULL);
2089
2090 kstat_delete(sc->vmm_kstat_vm);
2091 sc->vmm_kstat_vm = NULL;
2092
2093 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2094 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2095
2096 kstat_delete(sc->vmm_kstat_vcpu[i]);
2097 sc->vmm_kstat_vcpu[i] = NULL;
2098 }
2099 }
2100
2101 static int
2102 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2103 {
2104 minor_t minor;
2105 vmm_softc_t *sc;
2106
2107 minor = getminor(*devp);
2108 if (minor == VMM_CTL_MINOR) {
2109 /*
2110 * Master control device must be opened exclusively.
2111 */
2112 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2113 return (EINVAL);
2114 }
2115
2116 return (0);
2117 }
2118
2119 mutex_enter(&vmm_mtx);
2120 sc = ddi_get_soft_state(vmm_statep, minor);
2121 if (sc == NULL) {
2122 mutex_exit(&vmm_mtx);
2123 return (ENXIO);
2124 }
2125
2126 sc->vmm_is_open = B_TRUE;
2127 mutex_exit(&vmm_mtx);
2128
2129 return (0);
2130 }
2131
2132 static int
2133 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2134 {
2135 minor_t minor;
2136 vmm_softc_t *sc;
2137 boolean_t hma_release = B_FALSE;
2138
2139 minor = getminor(dev);
2140 if (minor == VMM_CTL_MINOR)
2141 return (0);
2142
2143 mutex_enter(&vmm_mtx);
2144 sc = ddi_get_soft_state(vmm_statep, minor);
2145 if (sc == NULL) {
2146 mutex_exit(&vmm_mtx);
2147 return (ENXIO);
2148 }
2149
2150 VERIFY(sc->vmm_is_open);
2151 sc->vmm_is_open = B_FALSE;
2152
2153 /*
2154 * If this VM was destroyed while the vmm device was open, then
2155 * clean it up now that it is closed.
2156 */
2157 if (sc->vmm_flags & VMM_DESTROY) {
2158 list_remove(&vmm_destroy_list, sc);
2159 vm_destroy(sc->vmm_vm);
2160 ddi_soft_state_free(vmm_statep, minor);
2161 id_free(vmm_minors, minor);
2162 hma_release = B_TRUE;
2163 }
2164 mutex_exit(&vmm_mtx);
2165
2166 if (hma_release)
2167 vmm_hma_release();
2168
2169 return (0);
2170 }
2171
2172 static int
2173 vmm_is_supported(intptr_t arg)
2174 {
2175 int r;
2176 const char *msg;
2177
2178 if (vmm_is_intel()) {
2179 r = vmx_x86_supported(&msg);
2180 } else if (vmm_is_svm()) {
2181 /*
2182 * HMA already ensured that the features necessary for SVM
2183 * operation were present and online during vmm_attach().
2184 */
2185 r = 0;
2186 } else {
2187 r = ENXIO;
2188 msg = "Unsupported CPU vendor";
2189 }
2190
2191 if (r != 0 && arg != (intptr_t)NULL) {
2192 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2193 return (EFAULT);
2194 }
2195 return (r);
2196 }
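
/*
 * Illustrative sketch (not part of the driver): VMM_VM_SUPPORTED accepts an
 * optional user buffer which, on failure, receives the reason string copied
 * out above.  The buffer size below is an arbitrary choice for the example,
 * assumed large enough for the short messages involved.
 *
 *	char why[128] = { 0 };
 *
 *	if (ioctl(ctlfd, VMM_VM_SUPPORTED, why) != 0)
 *		warnx("VMM not supported: %s", why);
 */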
2197
2198 static int
2199 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2200 int *rvalp)
2201 {
2202 vmm_softc_t *sc;
2203 minor_t minor;
2204
2205 /* The structs in bhyve ioctls assume a 64-bit datamodel */
2206 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2207 return (ENOTSUP);
2208 }
2209
2210 minor = getminor(dev);
2211
2212 if (minor == VMM_CTL_MINOR) {
2213 void *argp = (void *)arg;
2214 char name[VM_MAX_NAMELEN] = { 0 };
2215 size_t len = 0;
2216
2217 if ((mode & FKIOCTL) != 0) {
2218 len = strlcpy(name, argp, sizeof (name));
2219 } else {
2220 if (copyinstr(argp, name, sizeof (name), &len) != 0) {
2221 return (EFAULT);
2222 }
2223 }
2224 if (len >= VM_MAX_NAMELEN) {
2225 return (ENAMETOOLONG);
2226 }
2227
2228 switch (cmd) {
2229 case VMM_CREATE_VM:
2230 if ((mode & FWRITE) == 0)
2231 return (EPERM);
2232 return (vmmdev_do_vm_create(name, credp));
2233 case VMM_DESTROY_VM:
2234 if ((mode & FWRITE) == 0)
2235 return (EPERM);
2236 return (vmmdev_do_vm_destroy(name, credp));
2237 case VMM_VM_SUPPORTED:
2238 return (vmm_is_supported(arg));
2239 default:
2240 /* No other actions are legal on ctl device */
2241 return (ENOTTY);
2242 }
2243 }
2244
2245 sc = ddi_get_soft_state(vmm_statep, minor);
2246 ASSERT(sc);
2247
2248 if (sc->vmm_flags & VMM_DESTROY)
2249 return (ENXIO);
2250
2251 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2252 }
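
/*
 * Illustrative sketch (not part of the driver): on the control node, the
 * create and destroy ioctls take the VM name directly as their argument,
 * matching the copyinstr() above.  The name is arbitrary and error handling
 * is elided.
 *
 *	(void) ioctl(ctlfd, VMM_CREATE_VM, "testvm");
 *	...
 *	(void) ioctl(ctlfd, VMM_DESTROY_VM, "testvm");
 */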
2253
2254 static int
2255 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2256 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2257 {
2258 vmm_softc_t *sc;
2259 const minor_t minor = getminor(dev);
2260 struct vm *vm;
2261 int err;
2262 vm_object_t vmo = NULL;
2263 struct vmspace *vms;
2264
2265 if (minor == VMM_CTL_MINOR) {
2266 return (ENODEV);
2267 }
2268 if (off < 0 || (off + len) <= 0) {
2269 return (EINVAL);
2270 }
2271 if ((prot & PROT_USER) == 0) {
2272 return (EACCES);
2273 }
2274
2275 sc = ddi_get_soft_state(vmm_statep, minor);
2276 ASSERT(sc);
2277
2278 if (sc->vmm_flags & VMM_DESTROY)
2279 return (ENXIO);
2280
2281 /* Grab read lock on the VM to prevent any changes to the memory map */
2282 vmm_read_lock(sc);
2283
2284 vm = sc->vmm_vm;
2285 vms = vm_get_vmspace(vm);
2286 if (off >= VM_DEVMEM_START) {
2287 int segid;
2288 off_t map_off = 0;
2289
2290 /* Mapping a devmem "device" */
2291 if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) {
2292 err = ENODEV;
2293 goto out;
2294 }
2295 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
2296 if (err != 0) {
2297 goto out;
2298 }
2299 err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot,
2300 flags);
2301 } else {
2302 /* Mapping a part of the guest physical space */
2303 err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
2304 flags);
2305 }
2306
2308 out:
2309 vmm_read_unlock(sc);
2310 return (err);
2311 }
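
/*
 * Illustrative sketch (not part of the driver): guest-physical memory can be
 * mapped into a userland process by mmap(2)ing the per-VM device at the
 * desired guest-physical offset; offsets at or above VM_DEVMEM_START map
 * devmem segments instead.  The vmfd, length, and guest_paddr below are
 * assumptions for the example.
 *
 *	#include <sys/mman.h>
 *
 *	void *gpa_map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    vmfd, (off_t)guest_paddr);
 *	if (gpa_map == MAP_FAILED)
 *		err(EXIT_FAILURE, "mmap of guest memory failed");
 */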
2312
2313 static sdev_plugin_validate_t
2314 vmm_sdev_validate(sdev_ctx_t ctx)
2315 {
2316 const char *name = sdev_ctx_name(ctx);
2317 vmm_softc_t *sc;
2318 sdev_plugin_validate_t ret;
2319 minor_t minor;
2320
2321 if (sdev_ctx_vtype(ctx) != VCHR)
2322 return (SDEV_VTOR_INVALID);
2323
2324 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2325
2326 mutex_enter(&vmm_mtx);
2327 if ((sc = vmm_lookup(name)) == NULL)
2328 ret = SDEV_VTOR_INVALID;
2329 else if (sc->vmm_minor != minor)
2330 ret = SDEV_VTOR_STALE;
2331 else
2332 ret = SDEV_VTOR_VALID;
2333 mutex_exit(&vmm_mtx);
2334
2335 return (ret);
2336 }
2337
2338 static int
2339 vmm_sdev_filldir(sdev_ctx_t ctx)
2340 {
2341 vmm_softc_t *sc;
2342 int ret;
2343
2344 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2345 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2346 sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2347 return (EINVAL);
2348 }
2349
2350 mutex_enter(&vmm_mtx);
2351 ASSERT(vmmdev_dip != NULL);
2352 for (sc = list_head(&vmm_list); sc != NULL;
2353 sc = list_next(&vmm_list, sc)) {
2354 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2355 ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2356 S_IFCHR | 0600,
2357 makedevice(ddi_driver_major(vmmdev_dip),
2358 sc->vmm_minor));
2359 } else {
2360 continue;
2361 }
2362 if (ret != 0 && ret != EEXIST)
2363 goto out;
2364 }
2365
2366 ret = 0;
2367
2368 out:
2369 mutex_exit(&vmm_mtx);
2370 return (ret);
2371 }
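
/*
 * Illustrative sketch (not part of the driver): the filldir hook above
 * exposes each visible VM as a character device named after the VM under
 * VMM_SDEV_ROOT, so a consumer in the same zone can open it by name.  The
 * VM name below is arbitrary and error handling is elided.
 *
 *	int vmfd = open("/dev/vmm/testvm", O_RDWR);
 */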
2372
2373 /* ARGSUSED */
2374 static void
2375 vmm_sdev_inactive(sdev_ctx_t ctx)
2376 {
2377 }
2378
2379 static sdev_plugin_ops_t vmm_sdev_ops = {
2380 .spo_version = SDEV_PLUGIN_VERSION,
2381 .spo_flags = SDEV_PLUGIN_SUBDIR,
2382 .spo_validate = vmm_sdev_validate,
2383 .spo_filldir = vmm_sdev_filldir,
2384 .spo_inactive = vmm_sdev_inactive
2385 };
2386
2387 /* ARGSUSED */
2388 static int
2389 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2390 {
2391 int error;
2392
2393 switch (cmd) {
2394 case DDI_INFO_DEVT2DEVINFO:
2395 *result = (void *)vmmdev_dip;
2396 error = DDI_SUCCESS;
2397 break;
2398 case DDI_INFO_DEVT2INSTANCE:
2399 *result = (void *)0;
2400 error = DDI_SUCCESS;
2401 break;
2402 default:
2403 error = DDI_FAILURE;
2404 break;
2405 }
2406 return (error);
2407 }
2408
2409 static int
2410 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2411 {
2412 sdev_plugin_hdl_t sph;
2413 hma_reg_t *reg = NULL;
2414 boolean_t vmm_loaded = B_FALSE;
2415
2416 if (cmd != DDI_ATTACH) {
2417 return (DDI_FAILURE);
2418 }
2419
2420 mutex_enter(&vmmdev_mtx);
2421 /* Ensure we are not already attached. */
2422 if (vmmdev_dip != NULL) {
2423 mutex_exit(&vmmdev_mtx);
2424 return (DDI_FAILURE);
2425 }
2426
2427 vmm_sol_glue_init();
2428 vmm_arena_init();
2429
2430 /*
2431 * Perform temporary HMA registration to determine if the system
2432 * is capable.
2433 */
2434 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2435 goto fail;
2436 } else if (vmm_mod_load() != 0) {
2437 goto fail;
2438 }
2439 vmm_loaded = B_TRUE;
2440 hma_unregister(reg);
2441 reg = NULL;
2442
2443 /* Create control node. Other nodes will be created on demand. */
2444 if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2445 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2446 goto fail;
2447 }
2448
2449 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
2450 if (sph == (sdev_plugin_hdl_t)NULL) {
2451 ddi_remove_minor_node(dip, NULL);
2452 goto fail;
2453 }
2454
2455 ddi_report_dev(dip);
2456 vmmdev_sdev_hdl = sph;
2457 vmmdev_dip = dip;
2458 mutex_exit(&vmmdev_mtx);
2459 return (DDI_SUCCESS);
2460
2461 fail:
2462 if (vmm_loaded) {
2463 VERIFY0(vmm_mod_unload());
2464 }
2465 if (reg != NULL) {
2466 hma_unregister(reg);
2467 }
2468 vmm_arena_fini();
2469 vmm_sol_glue_cleanup();
2470 mutex_exit(&vmmdev_mtx);
2471 return (DDI_FAILURE);
2472 }
2473
2474 static int
2475 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2476 {
2477 if (cmd != DDI_DETACH) {
2478 return (DDI_FAILURE);
2479 }
2480
2481 /*
2482 * Ensure that all resources have been cleaned up.
2483 *
2484 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2485 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2486 * devinfo locked as iommu_cleanup() tries to recursively lock each
2487 * devinfo, including our own, while holding vmmdev_mtx.
2488 */
2489 if (mutex_tryenter(&vmmdev_mtx) == 0)
2490 return (DDI_FAILURE);
2491
2492 mutex_enter(&vmm_mtx);
2493 if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2494 mutex_exit(&vmm_mtx);
2495 mutex_exit(&vmmdev_mtx);
2496 return (DDI_FAILURE);
2497 }
2498 mutex_exit(&vmm_mtx);
2499
2500 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2501 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2502 mutex_exit(&vmmdev_mtx);
2503 return (DDI_FAILURE);
2504 }
2505 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2506
2507 /* Remove the control node. */
2508 ddi_remove_minor_node(dip, "ctl");
2509 vmmdev_dip = NULL;
2510
2511 VERIFY0(vmm_mod_unload());
2512 VERIFY3P(vmmdev_hma_reg, ==, NULL);
2513 vmm_arena_fini();
2514 vmm_sol_glue_cleanup();
2515
2516 mutex_exit(&vmmdev_mtx);
2517
2518 return (DDI_SUCCESS);
2519 }
2520
2521 static struct cb_ops vmm_cb_ops = {
2522 vmm_open,
2523 vmm_close,
2524 nodev, /* strategy */
2525 nodev, /* print */
2526 nodev, /* dump */
2527 nodev, /* read */
2528 nodev, /* write */
2529 vmm_ioctl,
2530 nodev, /* devmap */
2531 nodev, /* mmap */
2532 vmm_segmap,
2533 nochpoll, /* poll */
2534 ddi_prop_op,
2535 NULL,
2536 D_NEW | D_MP | D_DEVMAP
2537 };
2538
2539 static struct dev_ops vmm_ops = {
2540 DEVO_REV,
2541 0,
2542 vmm_info,
2543 nulldev, /* identify */
2544 nulldev, /* probe */
2545 vmm_attach,
2546 vmm_detach,
2547 nodev, /* reset */
2548 &vmm_cb_ops,
2549 (struct bus_ops *)NULL
2550 };
2551
2552 static struct modldrv modldrv = {
2553 &mod_driverops,
2554 "bhyve vmm",
2555 &vmm_ops
2556 };
2557
2558 static struct modlinkage modlinkage = {
2559 MODREV_1,
2560 &modldrv,
2561 NULL
2562 };
2563
2564 int
2565 _init(void)
2566 {
2567 int error;
2568
2569 sysinit();
2570
2571 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2572 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2573 list_create(&vmm_list, sizeof (vmm_softc_t),
2574 offsetof(vmm_softc_t, vmm_node));
2575 list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2576 offsetof(vmm_softc_t, vmm_node));
2577 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2578
2579 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2580 if (error) {
2581 return (error);
2582 }
2583
2584 vmm_zsd_init();
2585
2586 error = mod_install(&modlinkage);
2587 if (error) {
2588 ddi_soft_state_fini(&vmm_statep);
2589 vmm_zsd_fini();
2590 }
2591
2592 return (error);
2593 }
2594
2595 int
2596 _fini(void)
2597 {
2598 int error;
2599
2600 error = mod_remove(&modlinkage);
2601 if (error) {
2602 return (error);
2603 }
2604
2605 vmm_zsd_fini();
2606
2607 ddi_soft_state_fini(&vmm_statep);
2608
2609 return (0);
2610 }
2611
2612 int
2613 _info(struct modinfo *modinfop)
2614 {
2615 return (mod_info(&modlinkage, modinfop));
2616 }