1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12
13 /*
14 * Copyright 2015 Pluribus Networks Inc.
15 * Copyright 2019 Joyent, Inc.
16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17 * Copyright 2020 Oxide Computer Company
18 */
19
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34
35 #include <sys/kernel.h>
36 #include <sys/hma.h>
37 #include <sys/x86_archext.h>
38 #include <x86/apicreg.h>
39
40 #include <sys/vmm.h>
41 #include <sys/vmm_kernel.h>
42 #include <sys/vmm_instruction_emul.h>
43 #include <sys/vmm_dev.h>
44 #include <sys/vmm_impl.h>
45 #include <sys/vmm_drv.h>
46
47 #include <vm/vm.h>
48 #include <vm/seg_dev.h>
49
50 #include "io/ppt.h"
51 #include "io/vatpic.h"
52 #include "io/vioapic.h"
53 #include "io/vrtc.h"
54 #include "io/vhpet.h"
55 #include "io/vpmtmr.h"
56 #include "vmm_lapic.h"
57 #include "vmm_stat.h"
58 #include "vmm_util.h"
59 #include "vm/vm_glue.h"
60
61 /*
62 * Locking details:
63 *
64 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
65 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
66 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
67 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
68 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
69 */
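
/*
 * A minimal sketch of the resulting lock-ordering rule for code which needs
 * both locks (vmm_drv_hold() below follows this pattern, dropping vmmdev_mtx
 * early once only instance data remains of interest):
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	...manipulate vmmdev_* and vmm_* state...
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 */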
70
71 static kmutex_t vmmdev_mtx;
72 static dev_info_t *vmmdev_dip;
73 static hma_reg_t *vmmdev_hma_reg;
74 static uint_t vmmdev_hma_ref;
75 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
76
77 static kmutex_t vmm_mtx;
78 static list_t vmm_list;
79 static list_t vmm_destroy_list;
80 static id_space_t *vmm_minors;
81 static void *vmm_statep;
82
83 static const char *vmmdev_hvm_name = "bhyve";
84
85 /* For sdev plugin (/dev) */
86 #define VMM_SDEV_ROOT "/dev/vmm"
87
88 /* From uts/i86pc/io/vmm/intel/vmx.c */
89 extern int vmx_x86_supported(const char **);
90
91 /* Holds and hooks from drivers external to vmm */
92 struct vmm_hold {
93 list_node_t vmh_node;
94 vmm_softc_t *vmh_sc;
95 boolean_t vmh_release_req;
96 uint_t vmh_ioport_hook_cnt;
97 };
98
99 struct vmm_lease {
100 list_node_t vml_node;
101 struct vm *vml_vm;
102 boolean_t vml_expired;
103 boolean_t (*vml_expire_func)(void *);
104 void *vml_expire_arg;
105 list_node_t vml_expire_node;
106 struct vmm_hold *vml_hold;
107 };
108
109 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
110 static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);
111
112 static int
113 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
114 {
115 int error;
116 bool sysmem;
117
118 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
119 NULL);
120 if (error || mseg->len == 0)
121 return (error);
122
123 if (!sysmem) {
124 vmm_devmem_entry_t *de;
125 list_t *dl = &sc->vmm_devmem_list;
126
127 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
128 if (de->vde_segid == mseg->segid) {
129 break;
130 }
131 }
132 if (de != NULL) {
133 (void) strlcpy(mseg->name, de->vde_name,
134 sizeof (mseg->name));
135 }
136 } else {
137 bzero(mseg->name, sizeof (mseg->name));
138 }
139
140 return (error);
141 }
142
143 /*
144 * The 'devmem' hack:
145 *
146 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
147 * in the vm which appear with their own name related to the vm under /dev.
148 * Since this would be a hassle from an sdev perspective and would require a
149 * new cdev interface (or complicate the existing one), we choose to implement
150 * this in a different manner. When 'devmem' mappings are created, an
151 * identifying off_t is communicated back out to userspace. That off_t,
152 * residing above the normal guest memory space, can be used to mmap the
153 * 'devmem' mapping from the already-open vm device.
154 */
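
/*
 * A hedged sketch of the userspace side of that arrangement (assuming a vm fd
 * already opened for the instance and 'seg_len' known from the earlier
 * VM_ALLOC_MEMSEG/VM_GET_MEMSEG calls; error handling elided):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *base = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *		...
 *	}
 */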
155
156 static int
157 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
158 {
159 off_t map_offset;
160 vmm_devmem_entry_t *entry;
161
162 if (list_is_empty(&sc->vmm_devmem_list)) {
163 map_offset = VM_DEVMEM_START;
164 } else {
165 entry = list_tail(&sc->vmm_devmem_list);
166 map_offset = entry->vde_off + entry->vde_len;
167 if (map_offset < entry->vde_off) {
168 /* Do not tolerate overflow */
169 return (ERANGE);
170 }
171 /*
172 * XXXJOY: We could choose to search the list for duplicate
173 * names and toss an error. Since we're using the offset
174 * method for now, it does not make much of a difference.
175 */
176 }
177
178 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
179 entry->vde_segid = mseg->segid;
180 entry->vde_len = mseg->len;
181 entry->vde_off = map_offset;
182 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
183 list_insert_tail(&sc->vmm_devmem_list, entry);
184
185 return (0);
186 }
187
188 static boolean_t
189 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
190 off_t *map_offp)
191 {
192 list_t *dl = &sc->vmm_devmem_list;
193 vmm_devmem_entry_t *de = NULL;
194 const off_t map_end = off + len;
195
196 VERIFY(off >= VM_DEVMEM_START);
197
198 if (map_end < off) {
199 /* No match on overflow */
200 return (B_FALSE);
201 }
202
203 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
204 const off_t item_end = de->vde_off + de->vde_len;
205
206 if (de->vde_off <= off && item_end >= map_end) {
207 *segidp = de->vde_segid;
208 *map_offp = off - de->vde_off;
209 return (B_TRUE);
210 }
211 }
212 return (B_FALSE);
213 }
214
215 static void
216 vmmdev_devmem_purge(vmm_softc_t *sc)
217 {
218 vmm_devmem_entry_t *entry;
219
220 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
221 kmem_free(entry, sizeof (*entry));
222 }
223 }
224
225 static int
226 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
227 {
228 int error;
229 bool sysmem = true;
230
231 if (VM_MEMSEG_NAME(mseg)) {
232 sysmem = false;
233 }
234 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
235
236 if (error == 0 && VM_MEMSEG_NAME(mseg)) {
237 /*
238 * Rather than create a whole fresh device from which userspace
239 * can mmap this segment, make it available at an offset above
240 * where the main guest memory resides.
241 */
242 error = vmmdev_devmem_create(sc, mseg, mseg->name);
243 if (error != 0) {
244 vm_free_memseg(sc->vmm_vm, mseg->segid);
245 }
246 }
247 return (error);
248 }
249
250 /*
251 * Resource Locking and Exclusion
252 *
253 * Much of bhyve depends on key portions of VM state, such as the guest memory
254 * map, to remain unchanged while the guest is running. As ported from
255 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
256 * access to the instance vCPUs. Threads acting on a single vCPU, like those
257 * performing the work of actually running the guest in VMX/SVM, would lock
258 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
259 * state, all of the vCPUs would be first locked, ensuring that the
260 * operation(s) could complete without any other threads stumbling into
261 * intermediate states.
262 *
263 * This approach is largely effective for bhyve. Common operations, such as
264 * running the vCPUs, steer clear of lock contention. The model begins to
265 * break down for operations which do not occur in the context of a specific
266 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
267 * thread in the bhyve process. In order to properly protect those vCPU-less
268 * operations from encountering invalid states, additional locking is required.
269 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
270 * It does mean that class of operations will be serialized on locking the
271 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
272 * undue contention on the VM_MAXCPU-1 vCPU.
273 *
274 * In order to address the shortcomings of this model, the concept of a
275 * read/write lock has been added to bhyve. Operations which change
276 * fundamental aspects of a VM (such as the memory map) must acquire the write
277 * lock, which also implies locking all of the vCPUs and waiting for all read
278 * lock holders to release. While it increases the cost and waiting time for
279 * those few operations, it allows most hot-path operations on the VM (which
280 * depend on its configuration remaining stable) to occur with minimal locking.
281 *
282 * Consumers of the Driver API (see below) are a special case when it comes to
283 * this locking, since they may hold a read lock via the drv_lease mechanism
284 * for an extended period of time. Rather than forcing those consumers to
285 * continuously poll for a write lock attempt, the lease system forces them to
286 * provide a release callback to trigger their clean-up (and potential later
287 * reacquisition) of the read lock.
288 */
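
/*
 * A minimal sketch of the contract this model imposes (using calls which
 * appear later in this file; the surrounding ioctl plumbing is elided):
 * operations which mutate VM-wide configuration bracket themselves with the
 * write lock, while queries which merely depend on that configuration staying
 * stable take the read lock.
 *
 *	vmm_write_lock(sc);
 *	error = vm_mmap_memseg(sc->vmm_vm, gpa, segid, off, len, prot, flags);
 *	vmm_write_unlock(sc);
 *
 *	vmm_read_lock(sc);
 *	error = vm_mmap_getnext(sc->vmm_vm, &gpa, &segid, &off, &len, &prot,
 *	    &flags);
 *	vmm_read_unlock(sc);
 */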
289
290 static void
291 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
292 {
293 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
294
295 /*
296 * Since this state transition uses from_idle=true, it should not
297 * fail, but rather block until it can succeed.
298 */
299 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
300 }
301
302 static void
303 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
304 {
305 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
306
307 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
308 vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
309 }
310
311 static void
312 vmm_read_lock(vmm_softc_t *sc)
313 {
314 rw_enter(&sc->vmm_rwlock, RW_READER);
315 }
316
317 static void
318 vmm_read_unlock(vmm_softc_t *sc)
319 {
320 rw_exit(&sc->vmm_rwlock);
321 }
322
323 static void
324 vmm_write_lock(vmm_softc_t *sc)
325 {
326 int maxcpus;
327
328 /* First lock all the vCPUs */
329 maxcpus = vm_get_maxcpus(sc->vmm_vm);
330 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
331 vcpu_lock_one(sc, vcpu);
332 }
333
334 mutex_enter(&sc->vmm_lease_lock);
335 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
336 sc->vmm_lease_blocker++;
337 if (sc->vmm_lease_blocker == 1) {
338 list_t *list = &sc->vmm_lease_list;
339 vmm_lease_t *lease = list_head(list);
340
341 while (lease != NULL) {
342 boolean_t sync_break = B_FALSE;
343
344 if (!lease->vml_expired) {
345 void *arg = lease->vml_expire_arg;
346 lease->vml_expired = B_TRUE;
347 sync_break = lease->vml_expire_func(arg);
348 }
349
350 if (sync_break) {
351 vmm_lease_t *next;
352
353 /*
354 * These leases which are synchronously broken
355 * result in vmm_read_unlock() calls from a
356 * different thread than the corresponding
357 * vmm_read_lock(). This is acceptable, given
358 * that the rwlock underpinning the whole
359 * mechanism tolerates the behavior. This
360 * flexibility is _only_ afforded to VM read
361 * lock (RW_READER) holders.
362 */
363 next = list_next(list, lease);
364 vmm_lease_break_locked(sc, lease);
365 lease = next;
366 } else {
367 lease = list_next(list, lease);
368 }
369 }
370 }
371 mutex_exit(&sc->vmm_lease_lock);
372
373 rw_enter(&sc->vmm_rwlock, RW_WRITER);
374 /*
375 * For now, the 'maxcpus' value for an instance is fixed at the
376 * compile-time constant of VM_MAXCPU at creation. If this changes in
377 * the future, allowing for dynamic vCPU resource sizing, acquisition
378 * of the write lock will need to be wary of such changes.
379 */
380 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
381 }
382
383 static void
384 vmm_write_unlock(vmm_softc_t *sc)
385 {
386 int maxcpus;
387
388 mutex_enter(&sc->vmm_lease_lock);
389 VERIFY3U(sc->vmm_lease_blocker, !=, 0);
390 sc->vmm_lease_blocker--;
391 if (sc->vmm_lease_blocker == 0) {
392 cv_broadcast(&sc->vmm_lease_cv);
393 }
394 mutex_exit(&sc->vmm_lease_lock);
395
396 /*
397 * The VM write lock _must_ be released from the same thread it was
398 * acquired in, unlike the read lock.
399 */
400 VERIFY(rw_write_held(&sc->vmm_rwlock));
401 rw_exit(&sc->vmm_rwlock);
402
403 /* Unlock all the vCPUs */
404 maxcpus = vm_get_maxcpus(sc->vmm_vm);
405 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
406 vcpu_unlock_one(sc, vcpu);
407 }
408 }
409
410 static int
411 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
412 cred_t *credp, int *rvalp)
413 {
414 int error = 0, vcpu = -1;
415 void *datap = (void *)arg;
416 enum vm_lock_type {
417 LOCK_NONE = 0,
418 LOCK_VCPU,
419 LOCK_READ_HOLD,
420 LOCK_WRITE_HOLD
421 } lock_type = LOCK_NONE;
422
423 /* Acquire any exclusion resources needed for the operation. */
424 switch (cmd) {
425 case VM_RUN:
426 case VM_GET_REGISTER:
427 case VM_SET_REGISTER:
428 case VM_GET_SEGMENT_DESCRIPTOR:
429 case VM_SET_SEGMENT_DESCRIPTOR:
430 case VM_GET_REGISTER_SET:
431 case VM_SET_REGISTER_SET:
432 case VM_INJECT_EXCEPTION:
433 case VM_GET_CAPABILITY:
434 case VM_SET_CAPABILITY:
435 case VM_PPTDEV_MSI:
436 case VM_PPTDEV_MSIX:
437 case VM_SET_X2APIC_STATE:
438 case VM_GLA2GPA:
439 case VM_GLA2GPA_NOFAULT:
440 case VM_ACTIVATE_CPU:
441 case VM_SET_INTINFO:
442 case VM_GET_INTINFO:
443 case VM_RESTART_INSTRUCTION:
444 case VM_SET_KERNEMU_DEV:
445 case VM_GET_KERNEMU_DEV:
446 case VM_RESET_CPU:
447 case VM_GET_RUN_STATE:
448 case VM_SET_RUN_STATE:
449 /*
450 * Copy in the ID of the vCPU chosen for this operation.
451 * Since a nefarious caller could update their struct between
452 * this locking and when the rest of the ioctl data is copied
453 * in, it is _critical_ that this local 'vcpu' variable be used
454 * rather than the in-struct one when performing the ioctl.
455 */
456 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
457 return (EFAULT);
458 }
459 if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
460 return (EINVAL);
461 }
462 vcpu_lock_one(sc, vcpu);
463 lock_type = LOCK_VCPU;
464 break;
465
466 case VM_REINIT:
467 case VM_BIND_PPTDEV:
468 case VM_UNBIND_PPTDEV:
469 case VM_MAP_PPTDEV_MMIO:
470 case VM_ALLOC_MEMSEG:
471 case VM_MMAP_MEMSEG:
472 case VM_WRLOCK_CYCLE:
473 case VM_PMTMR_LOCATE:
474 vmm_write_lock(sc);
475 lock_type = LOCK_WRITE_HOLD;
476 break;
477
478 case VM_GET_GPA_PMAP:
479 case VM_GET_MEMSEG:
480 case VM_MMAP_GETNEXT:
481 case VM_LAPIC_IRQ:
482 case VM_INJECT_NMI:
483 case VM_IOAPIC_ASSERT_IRQ:
484 case VM_IOAPIC_DEASSERT_IRQ:
485 case VM_IOAPIC_PULSE_IRQ:
486 case VM_LAPIC_MSI:
487 case VM_LAPIC_LOCAL_IRQ:
488 case VM_GET_X2APIC_STATE:
489 case VM_RTC_READ:
490 case VM_RTC_WRITE:
491 case VM_RTC_SETTIME:
492 case VM_RTC_GETTIME:
493 #ifndef __FreeBSD__
494 case VM_DEVMEM_GETOFFSET:
495 #endif
496 vmm_read_lock(sc);
497 lock_type = LOCK_READ_HOLD;
498 break;
499
500 case VM_IOAPIC_PINCOUNT:
501 default:
502 break;
503 }
504
505 /* Execute the primary logic for the ioctl. */
506 switch (cmd) {
507 case VM_RUN: {
508 struct vm_entry entry;
509
510 if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
511 error = EFAULT;
512 break;
513 }
514
515 if (!(curthread->t_schedflag & TS_VCPU))
516 smt_mark_as_vcpu();
517
518 error = vm_run(sc->vmm_vm, vcpu, &entry);
519
520 /*
521 * Unexpected states in vm_run() are expressed through positive
522 * errno-oriented return values. VM states which expect further
523 * processing in userspace (with necessary context via exitinfo) are
524 * expressed through negative return values. For the time being,
525 * a return value of 0 is not expected from vm_run().
526 */
527 ASSERT(error != 0);
528 if (error < 0) {
529 const struct vm_exit *vme;
530 void *outp = entry.exit_data;
531
532 error = 0;
533 vme = vm_exitinfo(sc->vmm_vm, vcpu);
534 if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
535 error = EFAULT;
536 }
537 }
538 break;
539 }
540 case VM_SUSPEND: {
541 struct vm_suspend vmsuspend;
542
543 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
544 error = EFAULT;
545 break;
546 }
547 error = vm_suspend(sc->vmm_vm, vmsuspend.how);
548 break;
549 }
550 case VM_REINIT:
551 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
552 /*
553 * The VM instance should be free of driver-attached
554 * hooks during the reinitialization process.
555 */
556 break;
557 }
558 error = vm_reinit(sc->vmm_vm);
559 (void) vmm_drv_block_hook(sc, B_FALSE);
560 break;
561 case VM_STAT_DESC: {
562 struct vm_stat_desc statdesc;
563
564 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
565 error = EFAULT;
566 break;
567 }
568 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
569 sizeof (statdesc.desc));
570 if (error == 0 &&
571 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
572 error = EFAULT;
573 break;
574 }
575 break;
576 }
577 case VM_STATS_IOC: {
578 struct vm_stats vmstats;
579
580 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
581 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
582 error = EFAULT;
583 break;
584 }
585 hrt2tv(gethrtime(), &vmstats.tv);
586 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
587 &vmstats.num_entries, vmstats.statbuf);
588 if (error == 0 &&
589 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
590 error = EFAULT;
591 break;
592 }
593 break;
594 }
595
596 case VM_PPTDEV_MSI: {
597 struct vm_pptdev_msi pptmsi;
598
599 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
600 error = EFAULT;
601 break;
602 }
603 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
604 pptmsi.addr, pptmsi.msg, pptmsi.numvec);
605 break;
606 }
607 case VM_PPTDEV_MSIX: {
608 struct vm_pptdev_msix pptmsix;
609
610 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
611 error = EFAULT;
612 break;
613 }
614 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
615 pptmsix.idx, pptmsix.addr, pptmsix.msg,
616 pptmsix.vector_control);
617 break;
618 }
619 case VM_MAP_PPTDEV_MMIO: {
620 struct vm_pptdev_mmio pptmmio;
621
622 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
623 error = EFAULT;
624 break;
625 }
626 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
627 pptmmio.len, pptmmio.hpa);
628 break;
629 }
630 case VM_BIND_PPTDEV: {
631 struct vm_pptdev pptdev;
632
633 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
634 error = EFAULT;
635 break;
636 }
637 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
638 break;
639 }
640 case VM_UNBIND_PPTDEV: {
641 struct vm_pptdev pptdev;
642
643 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
644 error = EFAULT;
645 break;
646 }
647 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
648 break;
649 }
650 case VM_GET_PPTDEV_LIMITS: {
651 struct vm_pptdev_limits pptlimits;
652
653 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
654 error = EFAULT;
655 break;
656 }
657 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
658 &pptlimits.msi_limit, &pptlimits.msix_limit);
659 if (error == 0 &&
660 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
661 error = EFAULT;
662 break;
663 }
664 break;
665 }
666 case VM_INJECT_EXCEPTION: {
667 struct vm_exception vmexc;
668 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
669 error = EFAULT;
670 break;
671 }
672 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
673 vmexc.error_code_valid, vmexc.error_code,
674 vmexc.restart_instruction);
675 break;
676 }
677 case VM_INJECT_NMI: {
678 struct vm_nmi vmnmi;
679
680 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
681 error = EFAULT;
682 break;
683 }
684 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
685 break;
686 }
687 case VM_LAPIC_IRQ: {
688 struct vm_lapic_irq vmirq;
689
690 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
691 error = EFAULT;
692 break;
693 }
694 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
695 break;
696 }
697 case VM_LAPIC_LOCAL_IRQ: {
698 struct vm_lapic_irq vmirq;
699
700 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
701 error = EFAULT;
702 break;
703 }
704 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
705 vmirq.vector);
706 break;
707 }
708 case VM_LAPIC_MSI: {
709 struct vm_lapic_msi vmmsi;
710
711 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
712 error = EFAULT;
713 break;
714 }
715 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
716 break;
717 }
718
719 case VM_IOAPIC_ASSERT_IRQ: {
720 struct vm_ioapic_irq ioapic_irq;
721
722 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
723 error = EFAULT;
724 break;
725 }
726 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
727 break;
728 }
729 case VM_IOAPIC_DEASSERT_IRQ: {
730 struct vm_ioapic_irq ioapic_irq;
731
732 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
733 error = EFAULT;
734 break;
735 }
736 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
737 break;
738 }
739 case VM_IOAPIC_PULSE_IRQ: {
740 struct vm_ioapic_irq ioapic_irq;
741
742 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
743 error = EFAULT;
744 break;
745 }
746 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
747 break;
748 }
749 case VM_IOAPIC_PINCOUNT: {
750 int pincount;
751
752 pincount = vioapic_pincount(sc->vmm_vm);
753 if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
754 error = EFAULT;
755 break;
756 }
757 break;
758 }
759
760 case VM_ISA_ASSERT_IRQ: {
761 struct vm_isa_irq isa_irq;
762
763 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
764 error = EFAULT;
765 break;
766 }
767 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
768 if (error == 0 && isa_irq.ioapic_irq != -1) {
769 error = vioapic_assert_irq(sc->vmm_vm,
770 isa_irq.ioapic_irq);
771 }
772 break;
773 }
774 case VM_ISA_DEASSERT_IRQ: {
775 struct vm_isa_irq isa_irq;
776
777 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
778 error = EFAULT;
779 break;
780 }
781 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
782 if (error == 0 && isa_irq.ioapic_irq != -1) {
783 error = vioapic_deassert_irq(sc->vmm_vm,
784 isa_irq.ioapic_irq);
785 }
786 break;
787 }
788 case VM_ISA_PULSE_IRQ: {
789 struct vm_isa_irq isa_irq;
790
791 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
792 error = EFAULT;
793 break;
794 }
795 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
796 if (error == 0 && isa_irq.ioapic_irq != -1) {
797 error = vioapic_pulse_irq(sc->vmm_vm,
798 isa_irq.ioapic_irq);
799 }
800 break;
801 }
802 case VM_ISA_SET_IRQ_TRIGGER: {
803 struct vm_isa_irq_trigger isa_irq_trigger;
804
805 if (ddi_copyin(datap, &isa_irq_trigger,
806 sizeof (isa_irq_trigger), md)) {
807 error = EFAULT;
808 break;
809 }
810 error = vatpic_set_irq_trigger(sc->vmm_vm,
811 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
812 break;
813 }
814
815 case VM_MMAP_GETNEXT: {
816 struct vm_memmap mm;
817
818 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
819 error = EFAULT;
820 break;
821 }
822 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
823 &mm.segoff, &mm.len, &mm.prot, &mm.flags);
824 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
825 error = EFAULT;
826 break;
827 }
828 break;
829 }
830 case VM_MMAP_MEMSEG: {
831 struct vm_memmap mm;
832
833 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
834 error = EFAULT;
835 break;
836 }
837 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
838 mm.len, mm.prot, mm.flags);
839 break;
840 }
841 case VM_ALLOC_MEMSEG: {
842 struct vm_memseg vmseg;
843
844 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
845 error = EFAULT;
846 break;
847 }
848 error = vmmdev_alloc_memseg(sc, &vmseg);
849 break;
850 }
851 case VM_GET_MEMSEG: {
852 struct vm_memseg vmseg;
853
854 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
855 error = EFAULT;
856 break;
857 }
858 error = vmmdev_get_memseg(sc, &vmseg);
859 if (error == 0 &&
860 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
861 error = EFAULT;
862 break;
863 }
864 break;
865 }
866 case VM_GET_REGISTER: {
867 struct vm_register vmreg;
868
869 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
870 error = EFAULT;
871 break;
872 }
873 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
874 &vmreg.regval);
875 if (error == 0 &&
876 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
877 error = EFAULT;
878 break;
879 }
880 break;
881 }
882 case VM_SET_REGISTER: {
883 struct vm_register vmreg;
884
885 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
886 error = EFAULT;
887 break;
888 }
889 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
890 vmreg.regval);
891 break;
892 }
893 case VM_SET_SEGMENT_DESCRIPTOR: {
894 struct vm_seg_desc vmsegd;
895
896 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
897 error = EFAULT;
898 break;
899 }
900 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
901 &vmsegd.desc);
902 break;
903 }
904 case VM_GET_SEGMENT_DESCRIPTOR: {
905 struct vm_seg_desc vmsegd;
906
907 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
908 error = EFAULT;
909 break;
910 }
911 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
912 &vmsegd.desc);
913 if (error == 0 &&
914 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
915 error = EFAULT;
916 break;
917 }
918 break;
919 }
920 case VM_GET_REGISTER_SET: {
921 struct vm_register_set vrs;
922 int regnums[VM_REG_LAST];
923 uint64_t regvals[VM_REG_LAST];
924
925 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
926 error = EFAULT;
927 break;
928 }
929 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
930 error = EINVAL;
931 break;
932 }
933 if (ddi_copyin(vrs.regnums, regnums,
934 sizeof (int) * vrs.count, md)) {
935 error = EFAULT;
936 break;
937 }
938
939 error = 0;
940 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
941 if (regnums[i] < 0) {
942 error = EINVAL;
943 break;
944 }
945 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
946 &regvals[i]);
947 }
948 if (error == 0 && ddi_copyout(regvals, vrs.regvals,
949 sizeof (uint64_t) * vrs.count, md)) {
950 error = EFAULT;
951 }
952 break;
953 }
954 case VM_SET_REGISTER_SET: {
955 struct vm_register_set vrs;
956 int regnums[VM_REG_LAST];
957 uint64_t regvals[VM_REG_LAST];
958
959 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
960 error = EFAULT;
961 break;
962 }
963 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
964 error = EINVAL;
965 break;
966 }
967 if (ddi_copyin(vrs.regnums, regnums,
968 sizeof (int) * vrs.count, md)) {
969 error = EFAULT;
970 break;
971 }
972 if (ddi_copyin(vrs.regvals, regvals,
973 sizeof (uint64_t) * vrs.count, md)) {
974 error = EFAULT;
975 break;
976 }
977
978 error = 0;
979 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
980 /*
981 * Setting registers in a set is not atomic, since a
982 * failure in the middle of the set will cause a
983 * bail-out and inconsistent register state. Callers
984 * should be wary of this.
985 */
986 if (regnums[i] < 0) {
987 error = EINVAL;
988 break;
989 }
990 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
991 regvals[i]);
992 }
993 break;
994 }
995 case VM_RESET_CPU: {
996 struct vm_vcpu_reset vvr;
997
998 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
999 error = EFAULT;
1000 break;
1001 }
1002 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1003 error = EINVAL;
1004 break;
1005 }
1006 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1007 break;
1008 }
1009 case VM_GET_RUN_STATE: {
1010 struct vm_run_state vrs;
1011
1012 bzero(&vrs, sizeof (vrs));
1013 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1014 &vrs.sipi_vector);
1015 if (error == 0) {
1016 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1017 error = EFAULT;
1018 break;
1019 }
1020 }
1021 break;
1022 }
1023 case VM_SET_RUN_STATE: {
1024 struct vm_run_state vrs;
1025
1026 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1027 error = EFAULT;
1028 break;
1029 }
1030 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1031 vrs.sipi_vector);
1032 break;
1033 }
1034
1035 case VM_SET_KERNEMU_DEV:
1036 case VM_GET_KERNEMU_DEV: {
1037 struct vm_readwrite_kernemu_device kemu;
1038 size_t size = 0;
1039
1040 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1041 error = EFAULT;
1042 break;
1043 }
1044
1045 if (kemu.access_width > 3) {
1046 error = EINVAL;
1047 break;
1048 }
1049 size = (1 << kemu.access_width);
1050 ASSERT(size >= 1 && size <= 8);
1051
1052 if (cmd == VM_SET_KERNEMU_DEV) {
1053 error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1054 kemu.gpa, kemu.value, size);
1055 } else {
1056 error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1057 kemu.gpa, &kemu.value, size);
1058 }
1059
1060 if (error == 0) {
1061 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1062 error = EFAULT;
1063 break;
1064 }
1065 }
1066 break;
1067 }
1068
1069 case VM_GET_CAPABILITY: {
1070 struct vm_capability vmcap;
1071
1072 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1073 error = EFAULT;
1074 break;
1075 }
1076 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1077 &vmcap.capval);
1078 if (error == 0 &&
1079 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1080 error = EFAULT;
1081 break;
1082 }
1083 break;
1084 }
1085 case VM_SET_CAPABILITY: {
1086 struct vm_capability vmcap;
1087
1088 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1089 error = EFAULT;
1090 break;
1091 }
1092 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1093 vmcap.capval);
1094 break;
1095 }
1096 case VM_SET_X2APIC_STATE: {
1097 struct vm_x2apic x2apic;
1098
1099 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1100 error = EFAULT;
1101 break;
1102 }
1103 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1104 break;
1105 }
1106 case VM_GET_X2APIC_STATE: {
1107 struct vm_x2apic x2apic;
1108
1109 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1110 error = EFAULT;
1111 break;
1112 }
1113 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1114 &x2apic.state);
1115 if (error == 0 &&
1116 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1117 error = EFAULT;
1118 break;
1119 }
1120 break;
1121 }
1122 case VM_GET_GPA_PMAP: {
1123 struct vm_gpa_pte gpapte;
1124
1125 if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
1126 error = EFAULT;
1127 break;
1128 }
1129 #ifdef __FreeBSD__
1130 /* XXXJOY: add function? */
1131 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
1132 gpapte.gpa, gpapte.pte, &gpapte.ptenum);
1133 #endif
1134 error = 0;
1135 break;
1136 }
1137 case VM_GET_HPET_CAPABILITIES: {
1138 struct vm_hpet_cap hpetcap;
1139
1140 error = vhpet_getcap(&hpetcap);
1141 if (error == 0 &&
1142 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1143 error = EFAULT;
1144 break;
1145 }
1146 break;
1147 }
1148 case VM_GLA2GPA: {
1149 struct vm_gla2gpa gg;
1150
1151 CTASSERT(PROT_READ == VM_PROT_READ);
1152 CTASSERT(PROT_WRITE == VM_PROT_WRITE);
1153 CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
1154
1155 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1156 error = EFAULT;
1157 break;
1158 }
1159 gg.vcpuid = vcpu;
1160 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1161 gg.prot, &gg.gpa, &gg.fault);
1162 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1163 error = EFAULT;
1164 break;
1165 }
1166 break;
1167 }
1168 case VM_GLA2GPA_NOFAULT: {
1169 struct vm_gla2gpa gg;
1170
1171 CTASSERT(PROT_READ == VM_PROT_READ);
1172 CTASSERT(PROT_WRITE == VM_PROT_WRITE);
1173 CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
1174
1175 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1176 error = EFAULT;
1177 break;
1178 }
1179 gg.vcpuid = vcpu;
1180 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1181 gg.gla, gg.prot, &gg.gpa, &gg.fault);
1182 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1183 error = EFAULT;
1184 break;
1185 }
1186 break;
1187 }
1188
1189 case VM_ACTIVATE_CPU:
1190 error = vm_activate_cpu(sc->vmm_vm, vcpu);
1191 break;
1192
1193 case VM_SUSPEND_CPU:
1194 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1195 error = EFAULT;
1196 } else {
1197 error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1198 }
1199 break;
1200
1201 case VM_RESUME_CPU:
1202 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1203 error = EFAULT;
1204 } else {
1205 error = vm_resume_cpu(sc->vmm_vm, vcpu);
1206 }
1207 break;
1208
1209 case VM_GET_CPUS: {
1210 struct vm_cpuset vm_cpuset;
1211 cpuset_t tempset;
1212 void *srcp = &tempset;
1213 int size;
1214
1215 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1216 error = EFAULT;
1217 break;
1218 }
1219
1220 /* Be more generous about sizing since our cpuset_t is large. */
1221 size = vm_cpuset.cpusetsize;
1222 if (size <= 0 || size > sizeof (cpuset_t)) {
1223 error = ERANGE;
1224 break;
1225 }
1226 /*
1227 * If they want a ulong_t or less, make sure they receive the
1228 * low bits with all the useful information.
1229 */
1230 if (size <= sizeof (tempset.cpub[0])) {
1231 srcp = &tempset.cpub[0];
1232 }
1233 if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1234 tempset = vm_active_cpus(sc->vmm_vm);
1235 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1236 tempset = vm_suspended_cpus(sc->vmm_vm);
1237 } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1238 tempset = vm_debug_cpus(sc->vmm_vm);
1239 } else {
1240 error = EINVAL;
1241 }
1242
1243 ASSERT(size > 0 && size <= sizeof (tempset));
1244 if (error == 0 &&
1245 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1246 error = EFAULT;
1247 break;
1248 }
1249 break;
1250 }
1251 case VM_SET_INTINFO: {
1252 struct vm_intinfo vmii;
1253
1254 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1255 error = EFAULT;
1256 break;
1257 }
1258 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1259 break;
1260 }
1261 case VM_GET_INTINFO: {
1262 struct vm_intinfo vmii;
1263
1264 vmii.vcpuid = vcpu;
1265 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1266 &vmii.info2);
1267 if (error == 0 &&
1268 ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1269 error = EFAULT;
1270 break;
1271 }
1272 break;
1273 }
1274 case VM_RTC_WRITE: {
1275 struct vm_rtc_data rtcdata;
1276
1277 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1278 error = EFAULT;
1279 break;
1280 }
1281 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1282 rtcdata.value);
1283 break;
1284 }
1285 case VM_RTC_READ: {
1286 struct vm_rtc_data rtcdata;
1287
1288 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1289 error = EFAULT;
1290 break;
1291 }
1292 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1293 &rtcdata.value);
1294 if (error == 0 &&
1295 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1296 error = EFAULT;
1297 break;
1298 }
1299 break;
1300 }
1301 case VM_RTC_SETTIME: {
1302 struct vm_rtc_time rtctime;
1303
1304 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1305 error = EFAULT;
1306 break;
1307 }
1308 error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1309 break;
1310 }
1311 case VM_RTC_GETTIME: {
1312 struct vm_rtc_time rtctime;
1313
1314 rtctime.secs = vrtc_get_time(sc->vmm_vm);
1315 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1316 error = EFAULT;
1317 break;
1318 }
1319 break;
1320 }
1321
1322 case VM_PMTMR_LOCATE: {
1323 uint16_t port = arg;
1324 error = vpmtmr_set_location(sc->vmm_vm, port);
1325 break;
1326 }
1327
1328 case VM_RESTART_INSTRUCTION:
1329 error = vm_restart_instruction(sc->vmm_vm, vcpu);
1330 break;
1331
1332 case VM_SET_TOPOLOGY: {
1333 struct vm_cpu_topology topo;
1334
1335 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1336 error = EFAULT;
1337 break;
1338 }
1339 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1340 topo.threads, topo.maxcpus);
1341 break;
1342 }
1343 case VM_GET_TOPOLOGY: {
1344 struct vm_cpu_topology topo;
1345
1346 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1347 &topo.threads, &topo.maxcpus);
1348 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1349 error = EFAULT;
1350 break;
1351 }
1352 break;
1353 }
1354
1355 #ifndef __FreeBSD__
1356 case VM_DEVMEM_GETOFFSET: {
1357 struct vm_devmem_offset vdo;
1358 list_t *dl = &sc->vmm_devmem_list;
1359 vmm_devmem_entry_t *de = NULL;
1360
1361 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1362 error = EFAULT;
1363 break;
1364 }
1365
1366 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
1367 if (de->vde_segid == vdo.segid) {
1368 break;
1369 }
1370 }
1371 if (de != NULL) {
1372 vdo.offset = de->vde_off;
1373 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1374 error = EFAULT;
1375 }
1376 } else {
1377 error = ENOENT;
1378 }
1379 break;
1380 }
1381 case VM_WRLOCK_CYCLE: {
1382 /*
1383 * Present a test mechanism to acquire/release the write lock
1384 * on the VM without any other effects.
1385 */
1386 break;
1387 }
1388 #endif
1389 default:
1390 error = ENOTTY;
1391 break;
1392 }
1393
1394 /* Release exclusion resources */
1395 switch (lock_type) {
1396 case LOCK_NONE:
1397 break;
1398 case LOCK_VCPU:
1399 vcpu_unlock_one(sc, vcpu);
1400 break;
1401 case LOCK_READ_HOLD:
1402 vmm_read_unlock(sc);
1403 break;
1404 case LOCK_WRITE_HOLD:
1405 vmm_write_unlock(sc);
1406 break;
1407 default:
1408 panic("unexpected lock type");
1409 break;
1410 }
1411
1412 return (error);
1413 }
1414
1415 static vmm_softc_t *
1416 vmm_lookup(const char *name)
1417 {
1418 list_t *vml = &vmm_list;
1419 vmm_softc_t *sc;
1420
1421 ASSERT(MUTEX_HELD(&vmm_mtx));
1422
1423 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1424 if (strcmp(sc->vmm_name, name) == 0) {
1425 break;
1426 }
1427 }
1428
1429 return (sc);
1430 }
1431
1432 /*
1433 * Acquire an HMA registration if not already held.
1434 */
1435 static boolean_t
1436 vmm_hma_acquire(void)
1437 {
1438 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1439
1440 mutex_enter(&vmmdev_mtx);
1441
1442 if (vmmdev_hma_reg == NULL) {
1443 VERIFY3U(vmmdev_hma_ref, ==, 0);
1444 vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1445 if (vmmdev_hma_reg == NULL) {
1446 cmn_err(CE_WARN, "%s HMA registration failed.",
1447 vmmdev_hvm_name);
1448 mutex_exit(&vmmdev_mtx);
1449 return (B_FALSE);
1450 }
1451 }
1452
1453 vmmdev_hma_ref++;
1454
1455 mutex_exit(&vmmdev_mtx);
1456
1457 return (B_TRUE);
1458 }
1459
1460 /*
1461 * Release the HMA registration if held and there are no remaining VMs.
1462 */
1463 static void
1464 vmm_hma_release(void)
1465 {
1466 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1467
1468 mutex_enter(&vmmdev_mtx);
1469
1470 VERIFY3U(vmmdev_hma_ref, !=, 0);
1471
1472 vmmdev_hma_ref--;
1473
1474 if (vmmdev_hma_ref == 0) {
1475 VERIFY(vmmdev_hma_reg != NULL);
1476 hma_unregister(vmmdev_hma_reg);
1477 vmmdev_hma_reg = NULL;
1478 }
1479 mutex_exit(&vmmdev_mtx);
1480 }
1481
1482 static int
1483 vmmdev_do_vm_create(char *name, cred_t *cr)
1484 {
1485 vmm_softc_t *sc = NULL;
1486 minor_t minor;
1487 int error = ENOMEM;
1488
1489 if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
1490 return (EINVAL);
1491 }
1492
1493 if (!vmm_hma_acquire())
1494 return (ENXIO);
1495
1496 mutex_enter(&vmm_mtx);
1497
1498 /* Look for duplicate names */
1499 if (vmm_lookup(name) != NULL) {
1500 mutex_exit(&vmm_mtx);
1501 vmm_hma_release();
1502 return (EEXIST);
1503 }
1504
1505 /* Allow only one instance per non-global zone. */
1506 if (!INGLOBALZONE(curproc)) {
1507 for (sc = list_head(&vmm_list); sc != NULL;
1508 sc = list_next(&vmm_list, sc)) {
1509 if (sc->vmm_zone == curzone) {
1510 mutex_exit(&vmm_mtx);
1511 vmm_hma_release();
1512 return (EINVAL);
1513 }
1514 }
1515 }
1516
1517 minor = id_alloc(vmm_minors);
1518 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1519 goto fail;
1520 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1521 ddi_soft_state_free(vmm_statep, minor);
1522 goto fail;
1523 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1524 DDI_PSEUDO, 0) != DDI_SUCCESS) {
1525 goto fail;
1526 }
1527
1528 error = vm_create(name, &sc->vmm_vm);
1529 if (error == 0) {
1530 /* Complete VM initialization and report success. */
1531 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1532 sc->vmm_minor = minor;
1533 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1534 offsetof(vmm_devmem_entry_t, vde_node));
1535
1536 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1537 offsetof(vmm_hold_t, vmh_node));
1538 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1539
1540 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1541 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1542 offsetof(vmm_lease_t, vml_node));
1543 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1544 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1545
1546 sc->vmm_zone = crgetzone(cr);
1547 zone_hold(sc->vmm_zone);
1548 vmm_zsd_add_vm(sc);
1549
1550 list_insert_tail(&vmm_list, sc);
1551 mutex_exit(&vmm_mtx);
1552 return (0);
1553 }
1554
1555 ddi_remove_minor_node(vmmdev_dip, name);
1556 fail:
1557 id_free(vmm_minors, minor);
1558 if (sc != NULL) {
1559 ddi_soft_state_free(vmm_statep, minor);
1560 }
1561 mutex_exit(&vmm_mtx);
1562 vmm_hma_release();
1563
1564 return (error);
1565 }
1566
1567 /*
1568 * Bhyve 'Driver' Interface
1569 *
1570 * While many devices are emulated in the bhyve userspace process, there are
1571 * others with performance constraints which require that they run mostly or
1572 * entirely in-kernel. For those not integrated directly into bhyve, an API is
1573 * needed so they can query/manipulate the portions of VM state needed to
1574 * fulfill their purpose.
1575 *
1576 * This includes:
1577 * - Translating guest-physical addresses to host-virtual pointers
1578 * - Injecting MSIs
1579 * - Hooking IO port addresses
1580 *
1581 * The vmm_drv interface exists to provide that functionality to its consumers.
1582 * (At this time, 'viona' is the only user)
1583 */
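
/*
 * A hedged usage sketch of this interface from a hypothetical consumer (the
 * 'demo_*' names are illustrative, not taken from any real driver; locking,
 * error handling, and lease re-acquisition are elided).  The expire callback
 * must see to it that the lease is broken promptly, since a pending
 * vmm_write_lock() waits on the read lock held by every outstanding lease.
 *
 *	static boolean_t
 *	demo_lease_expired(void *arg)
 *	{
 *		demo_state_t *ds = arg;
 *
 *		ds->ds_lease = NULL;
 *		return (B_TRUE);	// have vmm break the lease on our behalf
 *	}
 *
 *	int
 *	demo_attach_vm(demo_state_t *ds, file_t *fp, cred_t *cr)
 *	{
 *		int err;
 *
 *		if ((err = vmm_drv_hold(fp, cr, &ds->ds_hold)) != 0)
 *			return (err);
 *		ds->ds_lease = vmm_drv_lease_sign(ds->ds_hold,
 *		    demo_lease_expired, ds);
 *		if (ds->ds_lease == NULL) {
 *			vmm_drv_rele(ds->ds_hold);
 *			return (EBUSY);
 *		}
 *		ds->ds_ring = vmm_drv_gpa2kva(ds->ds_lease, ring_gpa, ring_sz);
 *		return (0);
 *	}
 *
 *	On teardown the consumer breaks any live lease and drops the hold:
 *
 *	if (ds->ds_lease != NULL)
 *		vmm_drv_lease_break(ds->ds_hold, ds->ds_lease);
 *	vmm_drv_rele(ds->ds_hold);
 */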
1584 int
1585 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1586 {
1587 vnode_t *vp = fp->f_vnode;
1588 const dev_t dev = vp->v_rdev;
1589 vmm_softc_t *sc;
1590 vmm_hold_t *hold;
1591 int err = 0;
1592
1593 if (vp->v_type != VCHR) {
1594 return (ENXIO);
1595 }
1596 const major_t major = getmajor(dev);
1597 const minor_t minor = getminor(dev);
1598
1599 mutex_enter(&vmmdev_mtx);
1600 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1601 mutex_exit(&vmmdev_mtx);
1602 return (ENOENT);
1603 }
1604 mutex_enter(&vmm_mtx);
1605 mutex_exit(&vmmdev_mtx);
1606
1607 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1608 err = ENOENT;
1609 goto out;
1610 }
1611 /* XXXJOY: check cred permissions against instance */
1612
1613 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1614 err = EBUSY;
1615 goto out;
1616 }
1617
1618 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1619 hold->vmh_sc = sc;
1620 hold->vmh_release_req = B_FALSE;
1621
1622 list_insert_tail(&sc->vmm_holds, hold);
1623 sc->vmm_flags |= VMM_HELD;
1624 *holdp = hold;
1625
1626 out:
1627 mutex_exit(&vmm_mtx);
1628 return (err);
1629 }
1630
1631 void
1632 vmm_drv_rele(vmm_hold_t *hold)
1633 {
1634 vmm_softc_t *sc;
1635
1636 ASSERT(hold != NULL);
1637 ASSERT(hold->vmh_sc != NULL);
1638 VERIFY(hold->vmh_ioport_hook_cnt == 0);
1639
1640 mutex_enter(&vmm_mtx);
1641 sc = hold->vmh_sc;
1642 list_remove(&sc->vmm_holds, hold);
1643 if (list_is_empty(&sc->vmm_holds)) {
1644 sc->vmm_flags &= ~VMM_HELD;
1645 cv_broadcast(&sc->vmm_cv);
1646 }
1647 mutex_exit(&vmm_mtx);
1648 kmem_free(hold, sizeof (*hold));
1649 }
1650
1651 boolean_t
1652 vmm_drv_release_reqd(vmm_hold_t *hold)
1653 {
1654 ASSERT(hold != NULL);
1655
1656 return (hold->vmh_release_req);
1657 }
1658
1659 vmm_lease_t *
1660 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1661 {
1662 vmm_softc_t *sc = hold->vmh_sc;
1663 vmm_lease_t *lease;
1664
1665 ASSERT3P(expiref, !=, NULL);
1666
1667 if (hold->vmh_release_req) {
1668 return (NULL);
1669 }
1670
1671 lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1672 list_link_init(&lease->vml_node);
1673 lease->vml_expire_func = expiref;
1674 lease->vml_expire_arg = arg;
1675 lease->vml_expired = B_FALSE;
1676 lease->vml_hold = hold;
1677 /* cache the VM pointer for one less pointer chase */
1678 lease->vml_vm = sc->vmm_vm;
1679
1680 mutex_enter(&sc->vmm_lease_lock);
1681 while (sc->vmm_lease_blocker != 0) {
1682 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1683 }
1684 list_insert_tail(&sc->vmm_lease_list, lease);
1685 vmm_read_lock(sc);
1686 mutex_exit(&sc->vmm_lease_lock);
1687
1688 return (lease);
1689 }
1690
1691 static void
1692 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1693 {
1694 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1695
1696 list_remove(&sc->vmm_lease_list, lease);
1697 vmm_read_unlock(sc);
1698 kmem_free(lease, sizeof (*lease));
1699 }
1700
1701 void
1702 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1703 {
1704 vmm_softc_t *sc = hold->vmh_sc;
1705
1706 VERIFY3P(hold, ==, lease->vml_hold);
1707
1708 mutex_enter(&sc->vmm_lease_lock);
1709 vmm_lease_break_locked(sc, lease);
1710 mutex_exit(&sc->vmm_lease_lock);
1711 }
1712
1713 boolean_t
1714 vmm_drv_lease_expired(vmm_lease_t *lease)
1715 {
1716 return (lease->vml_expired);
1717 }
1718
1719 void *
1720 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1721 {
1722 ASSERT(lease != NULL);
1723
1724 return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
1725 }
1726
1727 int
1728 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1729 {
1730 ASSERT(lease != NULL);
1731
1732 return (lapic_intr_msi(lease->vml_vm, addr, msg));
1733 }
1734
1735 int
1736 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
1737 void *arg, void **cookie)
1738 {
1739 vmm_softc_t *sc;
1740 int err;
1741
1742 ASSERT(hold != NULL);
1743 ASSERT(cookie != NULL);
1744
1745 sc = hold->vmh_sc;
1746 mutex_enter(&vmm_mtx);
1747 /* Confirm that hook installation is not blocked */
1748 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1749 mutex_exit(&vmm_mtx);
1750 return (EBUSY);
1751 }
1752 /*
1753 * Optimistically record an installed hook which will prevent a block
1754 * from being asserted while the mutex is dropped.
1755 */
1756 hold->vmh_ioport_hook_cnt++;
1757 mutex_exit(&vmm_mtx);
1758
1759 vmm_write_lock(sc);
1760 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
1761 arg, cookie);
1762 vmm_write_unlock(sc);
1763
1764 if (err != 0) {
1765 mutex_enter(&vmm_mtx);
1766 /* Walk back optimism about the hook installation */
1767 hold->vmh_ioport_hook_cnt--;
1768 mutex_exit(&vmm_mtx);
1769 }
1770 return (err);
1771 }
1772
1773 void
1774 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1775 {
1776 vmm_softc_t *sc;
1777
1778 ASSERT(hold != NULL);
1779 ASSERT(cookie != NULL);
1780 ASSERT(hold->vmh_ioport_hook_cnt != 0);
1781
1782 sc = hold->vmh_sc;
1783 vmm_write_lock(sc);
1784 vm_ioport_unhook(sc->vmm_vm, cookie);
1785 vmm_write_unlock(sc);
1786
1787 mutex_enter(&vmm_mtx);
1788 hold->vmh_ioport_hook_cnt--;
1789 mutex_exit(&vmm_mtx);
1790 }
1791
1792 static int
1793 vmm_drv_purge(vmm_softc_t *sc)
1794 {
1795 ASSERT(MUTEX_HELD(&vmm_mtx));
1796
1797 if ((sc->vmm_flags & VMM_HELD) != 0) {
1798 vmm_hold_t *hold;
1799
1800 sc->vmm_flags |= VMM_CLEANUP;
1801 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1802 hold = list_next(&sc->vmm_holds, hold)) {
1803 hold->vmh_release_req = B_TRUE;
1804 }
1805 while ((sc->vmm_flags & VMM_HELD) != 0) {
1806 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1807 return (EINTR);
1808 }
1809 }
1810 sc->vmm_flags &= ~VMM_CLEANUP;
1811 }
1812
1813 VERIFY(list_is_empty(&sc->vmm_holds));
1814 sc->vmm_flags |= VMM_PURGED;
1815 return (0);
1816 }
1817
1818 static int
1819 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1820 {
1821 int err = 0;
1822
1823 mutex_enter(&vmm_mtx);
1824 if (!enable_block) {
1825 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1826
1827 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1828 goto done;
1829 }
1830
1831 /* If any holds have hooks installed, the block is a failure */
1832 if (!list_is_empty(&sc->vmm_holds)) {
1833 vmm_hold_t *hold;
1834
1835 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1836 hold = list_next(&sc->vmm_holds, hold)) {
1837 if (hold->vmh_ioport_hook_cnt != 0) {
1838 err = EBUSY;
1839 goto done;
1840 }
1841 }
1842 }
1843 sc->vmm_flags |= VMM_BLOCK_HOOK;
1844
1845 done:
1846 mutex_exit(&vmm_mtx);
1847 return (err);
1848 }
1849
1850 static int
1851 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1852 boolean_t *hma_release)
1853 {
1854 dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
1855 minor_t minor;
1856
1857 ASSERT(MUTEX_HELD(&vmm_mtx));
1858
1859 *hma_release = B_FALSE;
1860
1861 if (clean_zsd) {
1862 vmm_zsd_rem_vm(sc);
1863 }
1864
1865 if (vmm_drv_purge(sc) != 0) {
1866 return (EINTR);
1867 }
1868
1869 /* Clean up devmem entries */
1870 vmmdev_devmem_purge(sc);
1871
1872 list_remove(&vmm_list, sc);
1873 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
1874 minor = sc->vmm_minor;
1875 zone_rele(sc->vmm_zone);
1876 if (sc->vmm_is_open) {
1877 list_insert_tail(&vmm_destroy_list, sc);
1878 sc->vmm_flags |= VMM_DESTROY;
1879 } else {
1880 vm_destroy(sc->vmm_vm);
1881 ddi_soft_state_free(vmm_statep, minor);
1882 id_free(vmm_minors, minor);
1883 *hma_release = B_TRUE;
1884 }
1885 (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
1886
1887 return (0);
1888 }
1889
1890 int
1891 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
1892 {
1893 boolean_t hma_release = B_FALSE;
1894 int err;
1895
1896 mutex_enter(&vmm_mtx);
1897 err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
1898 mutex_exit(&vmm_mtx);
1899
1900 if (hma_release)
1901 vmm_hma_release();
1902
1903 return (err);
1904 }
1905
1906 /* ARGSUSED */
1907 static int
1908 vmmdev_do_vm_destroy(const char *name, cred_t *cr)
1909 {
1910 boolean_t hma_release = B_FALSE;
1911 vmm_softc_t *sc;
1912 int err;
1913
1914 if (crgetuid(cr) != 0)
1915 return (EPERM);
1916
1917 mutex_enter(&vmm_mtx);
1918
1919 if ((sc = vmm_lookup(name)) == NULL) {
1920 mutex_exit(&vmm_mtx);
1921 return (ENOENT);
1922 }
1923 /*
1924 * We don't check this in vmm_lookup() since that function is also used
1925 * for validation during create and currently vmm names must be unique.
1926 */
1927 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
1928 mutex_exit(&vmm_mtx);
1929 return (EPERM);
1930 }
1931 err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
1932
1933 mutex_exit(&vmm_mtx);
1934
1935 if (hma_release)
1936 vmm_hma_release();
1937
1938 return (err);
1939 }
1940
1941 static int
1942 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1943 {
1944 minor_t minor;
1945 vmm_softc_t *sc;
1946
1947 minor = getminor(*devp);
1948 if (minor == VMM_CTL_MINOR) {
1949 /*
1950 * Master control device must be opened exclusively.
1951 */
1952 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
1953 return (EINVAL);
1954 }
1955
1956 return (0);
1957 }
1958
1959 mutex_enter(&vmm_mtx);
1960 sc = ddi_get_soft_state(vmm_statep, minor);
1961 if (sc == NULL) {
1962 mutex_exit(&vmm_mtx);
1963 return (ENXIO);
1964 }
1965
1966 sc->vmm_is_open = B_TRUE;
1967 mutex_exit(&vmm_mtx);
1968
1969 return (0);
1970 }
1971
1972 static int
1973 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
1974 {
1975 minor_t minor;
1976 vmm_softc_t *sc;
1977 boolean_t hma_release = B_FALSE;
1978
1979 minor = getminor(dev);
1980 if (minor == VMM_CTL_MINOR)
1981 return (0);
1982
1983 mutex_enter(&vmm_mtx);
1984 sc = ddi_get_soft_state(vmm_statep, minor);
1985 if (sc == NULL) {
1986 mutex_exit(&vmm_mtx);
1987 return (ENXIO);
1988 }
1989
1990 VERIFY(sc->vmm_is_open);
1991 sc->vmm_is_open = B_FALSE;
1992
1993 /*
1994 * If this VM was destroyed while the vmm device was open, then
1995 * clean it up now that it is closed.
1996 */
1997 if (sc->vmm_flags & VMM_DESTROY) {
1998 list_remove(&vmm_destroy_list, sc);
1999 vm_destroy(sc->vmm_vm);
2000 ddi_soft_state_free(vmm_statep, minor);
2001 id_free(vmm_minors, minor);
2002 hma_release = B_TRUE;
2003 }
2004 mutex_exit(&vmm_mtx);
2005
2006 if (hma_release)
2007 vmm_hma_release();
2008
2009 return (0);
2010 }
2011
2012 static int
2013 vmm_is_supported(intptr_t arg)
2014 {
2015 int r;
2016 const char *msg;
2017
2018 if (vmm_is_intel()) {
2019 r = vmx_x86_supported(&msg);
2020 } else if (vmm_is_svm()) {
2021 /*
2022 * HMA already ensured that the features necessary for SVM
2023 * operation were present and online during vmm_attach().
2024 */
2025 r = 0;
2026 } else {
2027 r = ENXIO;
2028 msg = "Unsupported CPU vendor";
2029 }
2030
2031 if (r != 0 && arg != (intptr_t)NULL) {
2032 if (copyoutstr(msg, (char *)arg, strlen(msg), NULL) != 0)
2033 return (EFAULT);
2034 }
2035 return (r);
2036 }
2037
2038 static int
2039 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2040 int *rvalp)
2041 {
2042 vmm_softc_t *sc;
2043 minor_t minor;
2044
2045 /* The structs in bhyve ioctls assume a 64-bit datamodel */
2046 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2047 return (ENOTSUP);
2048 }
2049
2050 minor = getminor(dev);
2051
2052 if (minor == VMM_CTL_MINOR) {
2053 void *argp = (void *)arg;
2054 char name[VM_MAX_NAMELEN] = { 0 };
2055 size_t len = 0;
2056
2057 if ((mode & FKIOCTL) != 0) {
2058 len = strlcpy(name, argp, sizeof (name));
2059 } else {
2060 if (copyinstr(argp, name, sizeof (name), &len) != 0) {
2061 return (EFAULT);
2062 }
2063 }
2064 if (len >= VM_MAX_NAMELEN) {
2065 return (ENAMETOOLONG);
2066 }
2067
2068 switch (cmd) {
2069 case VMM_CREATE_VM:
2070 if ((mode & FWRITE) == 0)
2071 return (EPERM);
2072 return (vmmdev_do_vm_create(name, credp));
2073 case VMM_DESTROY_VM:
2074 if ((mode & FWRITE) == 0)
2075 return (EPERM);
2076 return (vmmdev_do_vm_destroy(name, credp));
2077 case VMM_VM_SUPPORTED:
2078 return (vmm_is_supported(arg));
2079 default:
2080 /* No other actions are legal on ctl device */
2081 return (ENOTTY);
2082 }
2083 }
2084
2085 sc = ddi_get_soft_state(vmm_statep, minor);
2086 ASSERT(sc);
2087
2088 if (sc->vmm_flags & VMM_DESTROY)
2089 return (ENXIO);
2090
2091 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2092 }
2093
2094 static int
2095 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2096 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2097 {
2098 vmm_softc_t *sc;
2099 const minor_t minor = getminor(dev);
2100 struct vm *vm;
2101 int err;
2102 vm_object_t vmo = NULL;
2103 struct vmspace *vms;
2104
2105 if (minor == VMM_CTL_MINOR) {
2106 return (ENODEV);
2107 }
2108 if (off < 0 || (off + len) <= 0) {
2109 return (EINVAL);
2110 }
2111 if ((prot & PROT_USER) == 0) {
2112 return (EACCES);
2113 }
2114
2115 sc = ddi_get_soft_state(vmm_statep, minor);
2116 ASSERT(sc);
2117
2118 if (sc->vmm_flags & VMM_DESTROY)
2119 return (ENXIO);
2120
2121 /* Grab read lock on the VM to prevent any changes to the memory map */
2122 vmm_read_lock(sc);
2123
2124 vm = sc->vmm_vm;
2125 vms = vm_get_vmspace(vm);
2126 if (off >= VM_DEVMEM_START) {
2127 int segid;
2128 off_t map_off = 0;
2129
2130 /* Mapping a devmem "device" */
2131 if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) {
2132 err = ENODEV;
2133 goto out;
2134 }
2135 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
2136 if (err != 0) {
2137 goto out;
2138 }
2139 err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot,
2140 flags);
2141 } else {
2142 /* Mapping a part of the guest physical space */
2143 err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
2144 flags);
2145 }
2146
2147
2148 out:
2149 vmm_read_unlock(sc);
2150 return (err);
2151 }
2152
2153 static sdev_plugin_validate_t
2154 vmm_sdev_validate(sdev_ctx_t ctx)
2155 {
2156 const char *name = sdev_ctx_name(ctx);
2157 vmm_softc_t *sc;
2158 sdev_plugin_validate_t ret;
2159 minor_t minor;
2160
2161 if (sdev_ctx_vtype(ctx) != VCHR)
2162 return (SDEV_VTOR_INVALID);
2163
2164 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2165
2166 mutex_enter(&vmm_mtx);
2167 if ((sc = vmm_lookup(name)) == NULL)
2168 ret = SDEV_VTOR_INVALID;
2169 else if (sc->vmm_minor != minor)
2170 ret = SDEV_VTOR_STALE;
2171 else
2172 ret = SDEV_VTOR_VALID;
2173 mutex_exit(&vmm_mtx);
2174
2175 return (ret);
2176 }
2177
2178 static int
2179 vmm_sdev_filldir(sdev_ctx_t ctx)
2180 {
2181 vmm_softc_t *sc;
2182 int ret;
2183
2184 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2185 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2186 sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2187 return (EINVAL);
2188 }
2189
2190 mutex_enter(&vmm_mtx);
2191 ASSERT(vmmdev_dip != NULL);
2192 for (sc = list_head(&vmm_list); sc != NULL;
2193 sc = list_next(&vmm_list, sc)) {
2194 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2195 ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2196 S_IFCHR | 0600,
2197 makedevice(ddi_driver_major(vmmdev_dip),
2198 sc->vmm_minor));
2199 } else {
2200 continue;
2201 }
2202 if (ret != 0 && ret != EEXIST)
2203 goto out;
2204 }
2205
2206 ret = 0;
2207
2208 out:
2209 mutex_exit(&vmm_mtx);
2210 return (ret);
2211 }
2212
2213 /* ARGSUSED */
2214 static void
2215 vmm_sdev_inactive(sdev_ctx_t ctx)
2216 {
2217 }
2218
2219 static sdev_plugin_ops_t vmm_sdev_ops = {
2220 .spo_version = SDEV_PLUGIN_VERSION,
2221 .spo_flags = SDEV_PLUGIN_SUBDIR,
2222 .spo_validate = vmm_sdev_validate,
2223 .spo_filldir = vmm_sdev_filldir,
2224 .spo_inactive = vmm_sdev_inactive
2225 };
2226
2227 /* ARGSUSED */
2228 static int
2229 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2230 {
2231 int error;
2232
2233 switch (cmd) {
2234 case DDI_INFO_DEVT2DEVINFO:
2235 *result = (void *)vmmdev_dip;
2236 error = DDI_SUCCESS;
2237 break;
2238 case DDI_INFO_DEVT2INSTANCE:
2239 *result = (void *)0;
2240 error = DDI_SUCCESS;
2241 break;
2242 default:
2243 error = DDI_FAILURE;
2244 break;
2245 }
2246 return (error);
2247 }
2248
2249 static int
2250 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2251 {
2252 sdev_plugin_hdl_t sph;
2253 hma_reg_t *reg = NULL;
2254 boolean_t vmm_loaded = B_FALSE;
2255
2256 if (cmd != DDI_ATTACH) {
2257 return (DDI_FAILURE);
2258 }
2259
2260 mutex_enter(&vmmdev_mtx);
2261 /* Ensure we are not already attached. */
2262 if (vmmdev_dip != NULL) {
2263 mutex_exit(&vmmdev_mtx);
2264 return (DDI_FAILURE);
2265 }
2266
2267 vmm_sol_glue_init();
2268 vmm_arena_init();
2269
2270 /*
2271 * Perform temporary HMA registration to determine if the system
2272 * is capable.
2273 */
2274 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2275 goto fail;
2276 } else if (vmm_mod_load() != 0) {
2277 goto fail;
2278 }
2279 vmm_loaded = B_TRUE;
2280 hma_unregister(reg);
2281 reg = NULL;
2282
2283 /* Create control node. Other nodes will be created on demand. */
2284 if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2285 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2286 goto fail;
2287 }
2288
2289 if ((sph = sdev_plugin_register("vmm", &vmm_sdev_ops, NULL)) ==
2290 (sdev_plugin_hdl_t)NULL) {
2291 ddi_remove_minor_node(dip, NULL);
2292 goto fail;
2293 }
2294
2295 ddi_report_dev(dip);
2296 vmmdev_sdev_hdl = sph;
2297 vmmdev_dip = dip;
2298 mutex_exit(&vmmdev_mtx);
2299 return (DDI_SUCCESS);
2300
2301 fail:
2302 if (vmm_loaded) {
2303 VERIFY0(vmm_mod_unload());
2304 }
2305 if (reg != NULL) {
2306 hma_unregister(reg);
2307 }
2308 vmm_arena_fini();
2309 vmm_sol_glue_cleanup();
2310 mutex_exit(&vmmdev_mtx);
2311 return (DDI_FAILURE);
2312 }
2313
2314 static int
2315 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2316 {
2317 if (cmd != DDI_DETACH) {
2318 return (DDI_FAILURE);
2319 }
2320
2321 /*
2322 * Ensure that all resources have been cleaned up.
2323 *
2324 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2325 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2326 * devinfo locked as iommu_cleanup() tries to recursively lock each
2327 * devinfo, including our own, while holding vmmdev_mtx.
2328 */
2329 if (mutex_tryenter(&vmmdev_mtx) == 0)
2330 return (DDI_FAILURE);
2331
2332 mutex_enter(&vmm_mtx);
2333 if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2334 mutex_exit(&vmm_mtx);
2335 mutex_exit(&vmmdev_mtx);
2336 return (DDI_FAILURE);
2337 }
2338 mutex_exit(&vmm_mtx);
2339
2340 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2341 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2342 mutex_exit(&vmmdev_mtx);
2343 return (DDI_FAILURE);
2344 }
2345 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2346
2347 /* Remove the control node. */
2348 ddi_remove_minor_node(dip, "ctl");
2349 vmmdev_dip = NULL;
2350
2351 VERIFY0(vmm_mod_unload());
2352 VERIFY3U(vmmdev_hma_reg, ==, NULL);
2353 vmm_arena_fini();
2354 vmm_sol_glue_cleanup();
2355
2356 mutex_exit(&vmmdev_mtx);
2357
2358 return (DDI_SUCCESS);
2359 }
2360
2361 static struct cb_ops vmm_cb_ops = {
2362 vmm_open,
2363 vmm_close,
2364 nodev, /* strategy */
2365 nodev, /* print */
2366 nodev, /* dump */
2367 nodev, /* read */
2368 nodev, /* write */
2369 vmm_ioctl,
2370 nodev, /* devmap */
2371 nodev, /* mmap */
2372 vmm_segmap,
2373 nochpoll, /* poll */
2374 ddi_prop_op,
2375 NULL,
2376 D_NEW | D_MP | D_DEVMAP
2377 };
2378
2379 static struct dev_ops vmm_ops = {
2380 DEVO_REV,
2381 0,
2382 vmm_info,
2383 nulldev, /* identify */
2384 nulldev, /* probe */
2385 vmm_attach,
2386 vmm_detach,
2387 nodev, /* reset */
2388 &vmm_cb_ops,
2389 (struct bus_ops *)NULL
2390 };
2391
2392 static struct modldrv modldrv = {
2393 &mod_driverops,
2394 "bhyve vmm",
2395 &vmm_ops
2396 };
2397
2398 static struct modlinkage modlinkage = {
2399 MODREV_1,
2400 &modldrv,
2401 NULL
2402 };
2403
2404 int
2405 _init(void)
2406 {
2407 int error;
2408
2409 sysinit();
2410
2411 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2412 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2413 list_create(&vmm_list, sizeof (vmm_softc_t),
2414 offsetof(vmm_softc_t, vmm_node));
2415 list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2416 offsetof(vmm_softc_t, vmm_node));
2417 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2418
2419 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2420 if (error) {
2421 return (error);
2422 }
2423
2424 vmm_zsd_init();
2425
2426 error = mod_install(&modlinkage);
2427 if (error) {
2428 ddi_soft_state_fini(&vmm_statep);
2429 vmm_zsd_fini();
2430 }
2431
2432 return (error);
2433 }
2434
2435 int
2436 _fini(void)
2437 {
2438 int error;
2439
2440 error = mod_remove(&modlinkage);
2441 if (error) {
2442 return (error);
2443 }
2444
2445 vmm_zsd_fini();
2446
2447 ddi_soft_state_fini(&vmm_statep);
2448
2449 return (0);
2450 }
2451
2452 int
2453 _info(struct modinfo *modinfop)
2454 {
2455 return (mod_info(&modlinkage, modinfop));
2456 }