1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12
13 /*
14 * Copyright 2015 Pluribus Networks Inc.
15 * Copyright 2019 Joyent, Inc.
16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17 * Copyright 2020 Oxide Computer Company
18 */
19
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34
35 #include <sys/kernel.h>
36 #include <sys/hma.h>
37 #include <sys/x86_archext.h>
38 #include <x86/apicreg.h>
39
40 #include <sys/vmm.h>
41 #include <sys/vmm_kernel.h>
42 #include <sys/vmm_instruction_emul.h>
43 #include <sys/vmm_dev.h>
44 #include <sys/vmm_impl.h>
45 #include <sys/vmm_drv.h>
46
47 #include <vm/vm.h>
48 #include <vm/seg_dev.h>
49
50 #include "io/ppt.h"
51 #include "io/vatpic.h"
52 #include "io/vioapic.h"
53 #include "io/vrtc.h"
54 #include "io/vhpet.h"
55 #include "io/vpmtmr.h"
56 #include "vmm_lapic.h"
57 #include "vmm_stat.h"
58 #include "vmm_util.h"
59 #include "vm/vm_glue.h"
60
61 /*
62 * Locking details:
63 *
64 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
65 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
66 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
67 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
68 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
69 */
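/*
 * A minimal sketch of that ordering when both locks are required (the
 * surrounding context is hypothetical; it is not a helper in this driver):
 *
 *	mutex_enter(&vmmdev_mtx);	(driver-wide state first)
 *	mutex_enter(&vmm_mtx);		(then the instance list)
 *	... inspect or update vmmdev_* and vmm_* data ...
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 *
 * vmm_drv_hold() below follows this order, dropping vmmdev_mtx as soon as
 * vmm_mtx is held.
 */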
70
71 static kmutex_t vmmdev_mtx;
72 static dev_info_t *vmmdev_dip;
73 static hma_reg_t *vmmdev_hma_reg;
74 static uint_t vmmdev_hma_ref;
75 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
76
77 static kmutex_t vmm_mtx;
78 static list_t vmm_list;
79 static list_t vmm_destroy_list;
80 static id_space_t *vmm_minors;
81 static void *vmm_statep;
82
83 static const char *vmmdev_hvm_name = "bhyve";
84
85 /* For sdev plugin (/dev) */
86 #define VMM_SDEV_ROOT "/dev/vmm"
87
88 /* From uts/i86pc/io/vmm/intel/vmx.c */
89 extern int vmx_x86_supported(const char **);
90
91 /* Holds and hooks from drivers external to vmm */
92 struct vmm_hold {
93 list_node_t vmh_node;
94 vmm_softc_t *vmh_sc;
95 boolean_t vmh_release_req;
96 uint_t vmh_ioport_hook_cnt;
97 };
98
99 struct vmm_lease {
100 list_node_t vml_node;
101 struct vm *vml_vm;
102 boolean_t vml_expired;
103 boolean_t (*vml_expire_func)(void *);
104 void *vml_expire_arg;
105 list_node_t vml_expire_node;
106 struct vmm_hold *vml_hold;
107 };
108
109 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
110 static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);
111
112 static int
113 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
114 {
115 int error;
116 bool sysmem;
117
118 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
119 NULL);
120 if (error || mseg->len == 0)
121 return (error);
122
123 if (!sysmem) {
124 vmm_devmem_entry_t *de;
125 list_t *dl = &sc->vmm_devmem_list;
126
127 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
128 if (de->vde_segid == mseg->segid) {
129 break;
130 }
131 }
132 if (de != NULL) {
133 (void) strlcpy(mseg->name, de->vde_name,
134 sizeof (mseg->name));
135 }
136 } else {
137 bzero(mseg->name, sizeof (mseg->name));
138 }
139
140 return (error);
141 }
142
143 /*
144 * The 'devmem' hack:
145 *
146 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
147 * in the vm which appear with their own name related to the vm under /dev.
148 * Since this would be a hassle from an sdev perspective and would require a
149 * new cdev interface (or complicate the existing one), we choose to implement
150 * this in a different manner. When 'devmem' mappings are created, an
151 * identifying off_t is communicated back out to userspace. That off_t,
152 * residing above the normal guest memory space, can be used to mmap the
153 * 'devmem' mapping from the already-open vm device.
154 */
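/*
 * A hedged userspace sketch of consuming such a mapping ('vmfd', 'segid' and
 * 'len' are assumed to already be in hand; error handling is elided):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	(void) ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo);
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    vmfd, vdo.offset);
 *
 * The offset handed back is at or above VM_DEVMEM_START, which is how
 * vmm_segmap() below tells a devmem mapping apart from a mapping of guest
 * physical space.
 */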
155
156 static int
157 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
158 {
159 off_t map_offset;
160 vmm_devmem_entry_t *entry;
161
162 if (list_is_empty(&sc->vmm_devmem_list)) {
163 map_offset = VM_DEVMEM_START;
164 } else {
165 entry = list_tail(&sc->vmm_devmem_list);
166 map_offset = entry->vde_off + entry->vde_len;
167 if (map_offset < entry->vde_off) {
168 /* Do not tolerate overflow */
169 return (ERANGE);
170 }
171 /*
172 * XXXJOY: We could choose to search the list for duplicate
173 * names and toss an error. Since we're using the offset
174 * method for now, it does not make much of a difference.
175 */
176 }
177
178 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
179 entry->vde_segid = mseg->segid;
180 entry->vde_len = mseg->len;
181 entry->vde_off = map_offset;
182 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
183 list_insert_tail(&sc->vmm_devmem_list, entry);
184
185 return (0);
186 }
187
188 static boolean_t
189 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
190 off_t *map_offp)
191 {
192 list_t *dl = &sc->vmm_devmem_list;
193 vmm_devmem_entry_t *de = NULL;
194 const off_t map_end = off + len;
195
196 VERIFY(off >= VM_DEVMEM_START);
197
198 if (map_end < off) {
199 /* No match on overflow */
200 return (B_FALSE);
201 }
202
203 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
204 const off_t item_end = de->vde_off + de->vde_len;
205
206 if (de->vde_off <= off && item_end >= map_end) {
207 *segidp = de->vde_segid;
208 *map_offp = off - de->vde_off;
209 return (B_TRUE);
210 }
211 }
212 return (B_FALSE);
213 }
214
215 static void
216 vmmdev_devmem_purge(vmm_softc_t *sc)
217 {
218 vmm_devmem_entry_t *entry;
219
220 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
221 kmem_free(entry, sizeof (*entry));
222 }
223 }
224
225 static int
226 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
227 {
228 int error;
229 bool sysmem = true;
230
231 if (VM_MEMSEG_NAME(mseg)) {
232 sysmem = false;
233 }
234 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
235
236 if (error == 0 && VM_MEMSEG_NAME(mseg)) {
237 /*
238 * Rather than create a whole fresh device from which userspace
239 * can mmap this segment, instead make it available at an
240 * offset above where the main guest memory resides.
241 */
242 error = vmmdev_devmem_create(sc, mseg, mseg->name);
243 if (error != 0) {
244 vm_free_memseg(sc->vmm_vm, mseg->segid);
245 }
246 }
247 return (error);
248 }
249
250 /*
251 * Resource Locking and Exclusion
252 *
253 * Much of bhyve depends on key portions of VM state, such as the guest memory
254 * map, to remain unchanged while the guest is running. As ported from
255 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
256 * access to the instance vCPUs. Threads acting on a single vCPU, like those
257 * performing the work of actually running the guest in VMX/SVM, would lock
258 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
259 * state, all of the vCPUs would be first locked, ensuring that the
260 * operation(s) could complete without any other threads stumbling into
261 * intermediate states.
262 *
263 * This approach is largely effective for bhyve. Common operations, such as
264 * running the vCPUs, steer clear of lock contention. The model begins to
265 * break down for operations which do not occur in the context of a specific
266 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
267 * thread in the bhyve process. In order to properly protect those vCPU-less
268 * operations from encountering invalid states, additional locking is required.
269 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
270 * It does mean that this class of operations will be serialized on locking the
271 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
272 * undue contention on the VM_MAXCPU-1 vCPU.
273 *
274 * In order to address the shortcomings of this model, the concept of a
275 * read/write lock has been added to bhyve. Operations which change
276 * fundamental aspects of a VM (such as the memory map) must acquire the write
277 * lock, which also implies locking all of the vCPUs and waiting for all read
278 * lock holders to release. While it increases the cost and waiting time for
279 * those few operations, it allows most hot-path operations on the VM (which
280 * depend on its configuration remaining stable) to occur with minimal locking.
281 *
282 * Consumers of the Driver API (see below) are a special case when it comes to
283 * this locking, since they may hold a read lock via the drv_lease mechanism
284 * for an extended period of time. Rather than forcing those consumers to
285 * continuously poll for a write lock attempt, the lease system forces them to
286 * provide a release callback to trigger their clean-up (and potential later
287 * reacquisition) of the read lock.
288 */
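/*
 * A condensed sketch of how the two modes are used in vmmdev_do_ioctl()
 * below (no new interfaces are implied here):
 *
 *	vmm_write_lock(sc);		(e.g. VM_ALLOC_MEMSEG: every vCPU is
 *	... change the memory map ...	 frozen and active leases expired)
 *	vmm_write_unlock(sc);
 *
 *	vmm_read_lock(sc);		(e.g. VM_LAPIC_MSI: the configuration
 *	... use stable VM state ...	 need only remain stable)
 *	vmm_read_unlock(sc);
 */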
289
290 static void
291 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
292 {
293 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
294
295 /*
296 * Since this state transition is utilizing from_idle=true, it should
297 * not fail, but rather block until it can be successful.
298 */
299 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
300 }
301
302 static void
303 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
304 {
305 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
306
307 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
308 vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
309 }
310
311 static void
312 vmm_read_lock(vmm_softc_t *sc)
313 {
314 rw_enter(&sc->vmm_rwlock, RW_READER);
315 }
316
317 static void
318 vmm_read_unlock(vmm_softc_t *sc)
319 {
320 rw_exit(&sc->vmm_rwlock);
321 }
322
323 static void
324 vmm_write_lock(vmm_softc_t *sc)
325 {
326 int maxcpus;
327
328 /* First lock all the vCPUs */
329 maxcpus = vm_get_maxcpus(sc->vmm_vm);
330 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
331 vcpu_lock_one(sc, vcpu);
332 }
333
334 mutex_enter(&sc->vmm_lease_lock);
335 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
336 sc->vmm_lease_blocker++;
337 if (sc->vmm_lease_blocker == 1) {
338 list_t *list = &sc->vmm_lease_list;
339 vmm_lease_t *lease = list_head(list);
340
341 while (lease != NULL) {
342 boolean_t sync_break = B_FALSE;
343
344 if (!lease->vml_expired) {
345 void *arg = lease->vml_expire_arg;
346 lease->vml_expired = B_TRUE;
347 sync_break = lease->vml_expire_func(arg);
348 }
349
350 if (sync_break) {
351 vmm_lease_t *next;
352
353 /*
354 * These leases which are synchronously broken
355 * result in vmm_read_unlock() calls from a
356 * different thread than the corresponding
357 * vmm_read_lock(). This is acceptable, given
358 * that the rwlock underpinning the whole
359 * mechanism tolerates the behavior. This
360 * flexibility is _only_ afforded to VM read
361 * lock (RW_READER) holders.
362 */
363 next = list_next(list, lease);
364 vmm_lease_break_locked(sc, lease);
365 lease = next;
366 } else {
367 lease = list_next(list, lease);
368 }
369 }
370 }
371 mutex_exit(&sc->vmm_lease_lock);
372
373 rw_enter(&sc->vmm_rwlock, RW_WRITER);
374 /*
375 * For now, the 'maxcpus' value for an instance is fixed at the
376 * compile-time constant of VM_MAXCPU at creation. If this changes in
377 * the future, allowing for dynamic vCPU resource sizing, acquisition
378 * of the write lock will need to be wary of such changes.
379 */
380 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
381 }
382
383 static void
384 vmm_write_unlock(vmm_softc_t *sc)
385 {
386 int maxcpus;
387
388 mutex_enter(&sc->vmm_lease_lock);
389 VERIFY3U(sc->vmm_lease_blocker, !=, 0);
390 sc->vmm_lease_blocker--;
391 if (sc->vmm_lease_blocker == 0) {
392 cv_broadcast(&sc->vmm_lease_cv);
393 }
394 mutex_exit(&sc->vmm_lease_lock);
395
396 /*
397 * The VM write lock _must_ be released from the same thread it was
398 * acquired in, unlike the read lock.
399 */
400 VERIFY(rw_write_held(&sc->vmm_rwlock));
401 rw_exit(&sc->vmm_rwlock);
402
403 /* Unlock all the vCPUs */
404 maxcpus = vm_get_maxcpus(sc->vmm_vm);
405 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
406 vcpu_unlock_one(sc, vcpu);
407 }
408 }
409
410 static int
411 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
412 cred_t *credp, int *rvalp)
413 {
414 int error = 0, vcpu = -1;
415 void *datap = (void *)arg;
416 enum vm_lock_type {
417 LOCK_NONE = 0,
418 LOCK_VCPU,
419 LOCK_READ_HOLD,
420 LOCK_WRITE_HOLD
421 } lock_type = LOCK_NONE;
422
423 /* Acquire any exclusion resources needed for the operation. */
424 switch (cmd) {
425 case VM_RUN:
426 case VM_GET_REGISTER:
427 case VM_SET_REGISTER:
428 case VM_GET_SEGMENT_DESCRIPTOR:
429 case VM_SET_SEGMENT_DESCRIPTOR:
430 case VM_GET_REGISTER_SET:
431 case VM_SET_REGISTER_SET:
432 case VM_INJECT_EXCEPTION:
433 case VM_GET_CAPABILITY:
434 case VM_SET_CAPABILITY:
435 case VM_PPTDEV_MSI:
436 case VM_PPTDEV_MSIX:
437 case VM_SET_X2APIC_STATE:
438 case VM_GLA2GPA:
439 case VM_GLA2GPA_NOFAULT:
440 case VM_ACTIVATE_CPU:
441 case VM_SET_INTINFO:
442 case VM_GET_INTINFO:
443 case VM_RESTART_INSTRUCTION:
444 case VM_SET_KERNEMU_DEV:
445 case VM_GET_KERNEMU_DEV:
446 /*
447 * Copy in the ID of the vCPU chosen for this operation.
448 * Since a nefarious caller could update their struct between
449 * this locking and when the rest of the ioctl data is copied
450 * in, it is _critical_ that this local 'vcpu' variable be used
451 * rather than the in-struct one when performing the ioctl.
452 */
453 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
454 return (EFAULT);
455 }
456 if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
457 return (EINVAL);
458 }
459 vcpu_lock_one(sc, vcpu);
460 lock_type = LOCK_VCPU;
461 break;
462
463 case VM_REINIT:
464 case VM_BIND_PPTDEV:
465 case VM_UNBIND_PPTDEV:
466 case VM_MAP_PPTDEV_MMIO:
467 case VM_ALLOC_MEMSEG:
468 case VM_MMAP_MEMSEG:
469 case VM_WRLOCK_CYCLE:
470 case VM_PMTMR_LOCATE:
471 vmm_write_lock(sc);
472 lock_type = LOCK_WRITE_HOLD;
473 break;
474
475 case VM_GET_GPA_PMAP:
476 case VM_GET_MEMSEG:
477 case VM_MMAP_GETNEXT:
478 case VM_LAPIC_IRQ:
479 case VM_INJECT_NMI:
480 case VM_IOAPIC_ASSERT_IRQ:
481 case VM_IOAPIC_DEASSERT_IRQ:
482 case VM_IOAPIC_PULSE_IRQ:
483 case VM_LAPIC_MSI:
484 case VM_LAPIC_LOCAL_IRQ:
485 case VM_GET_X2APIC_STATE:
486 case VM_RTC_READ:
487 case VM_RTC_WRITE:
488 case VM_RTC_SETTIME:
489 case VM_RTC_GETTIME:
490 #ifndef __FreeBSD__
491 case VM_DEVMEM_GETOFFSET:
492 #endif
493 vmm_read_lock(sc);
494 lock_type = LOCK_READ_HOLD;
495 break;
496
497 case VM_IOAPIC_PINCOUNT:
498 default:
499 break;
500 }
501
502 /* Execute the primary logic for the ioctl. */
503 switch (cmd) {
504 case VM_RUN: {
505 struct vm_entry entry;
506
507 if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
508 error = EFAULT;
509 break;
510 }
511
512 if (!(curthread->t_schedflag & TS_VCPU))
513 smt_mark_as_vcpu();
514
515 error = vm_run(sc->vmm_vm, vcpu, &entry);
516
517 /*
518 * Unexpected states in vm_run() are expressed through positive
519 * errno-oriented return values. VM states which expect further
520 * processing in userspace (necessary context via exitinfo) are
521 * expressed through negative return values. For the time being
522 * a return value of 0 is not expected from vm_run().
523 */
524 ASSERT(error != 0);
525 if (error < 0) {
526 const struct vm_exit *vme;
527 void *outp = entry.exit_data;
528
529 error = 0;
530 vme = vm_exitinfo(sc->vmm_vm, vcpu);
531 if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
532 error = EFAULT;
533 }
534 }
535 break;
536 }
537 case VM_SUSPEND: {
538 struct vm_suspend vmsuspend;
539
540 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
541 error = EFAULT;
542 break;
543 }
544 error = vm_suspend(sc->vmm_vm, vmsuspend.how);
545 break;
546 }
547 case VM_REINIT:
548 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
549 /*
550 * The VM instance should be free of driver-attached
551 * hooks during the reinitialization process.
552 */
553 break;
554 }
555 error = vm_reinit(sc->vmm_vm);
556 (void) vmm_drv_block_hook(sc, B_FALSE);
557 break;
558 case VM_STAT_DESC: {
559 struct vm_stat_desc statdesc;
560
561 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
562 error = EFAULT;
563 break;
564 }
565 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
566 sizeof (statdesc.desc));
567 if (error == 0 &&
568 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
569 error = EFAULT;
570 break;
571 }
572 break;
573 }
574 case VM_STATS_IOC: {
575 struct vm_stats vmstats;
576
577 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
578 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
579 error = EFAULT;
580 break;
581 }
582 hrt2tv(gethrtime(), &vmstats.tv);
583 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
584 &vmstats.num_entries, vmstats.statbuf);
585 if (error == 0 &&
586 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
587 error = EFAULT;
588 break;
589 }
590 break;
591 }
592
593 case VM_PPTDEV_MSI: {
594 struct vm_pptdev_msi pptmsi;
595
596 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
597 error = EFAULT;
598 break;
599 }
600 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
601 pptmsi.addr, pptmsi.msg, pptmsi.numvec);
602 break;
603 }
604 case VM_PPTDEV_MSIX: {
605 struct vm_pptdev_msix pptmsix;
606
607 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
608 error = EFAULT;
609 break;
610 }
611 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
612 pptmsix.idx, pptmsix.addr, pptmsix.msg,
613 pptmsix.vector_control);
614 break;
615 }
616 case VM_MAP_PPTDEV_MMIO: {
617 struct vm_pptdev_mmio pptmmio;
618
619 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
620 error = EFAULT;
621 break;
622 }
623 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
624 pptmmio.len, pptmmio.hpa);
625 break;
626 }
627 case VM_BIND_PPTDEV: {
628 struct vm_pptdev pptdev;
629
630 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
631 error = EFAULT;
632 break;
633 }
634 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
635 break;
636 }
637 case VM_UNBIND_PPTDEV: {
638 struct vm_pptdev pptdev;
639
640 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
641 error = EFAULT;
642 break;
643 }
644 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
645 break;
646 }
647 case VM_GET_PPTDEV_LIMITS: {
648 struct vm_pptdev_limits pptlimits;
649
650 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
651 error = EFAULT;
652 break;
653 }
654 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
655 &pptlimits.msi_limit, &pptlimits.msix_limit);
656 if (error == 0 &&
657 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
658 error = EFAULT;
659 break;
660 }
661 break;
662 }
663 case VM_INJECT_EXCEPTION: {
664 struct vm_exception vmexc;
665 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
666 error = EFAULT;
667 break;
668 }
669 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
670 vmexc.error_code_valid, vmexc.error_code,
671 vmexc.restart_instruction);
672 break;
673 }
674 case VM_INJECT_NMI: {
675 struct vm_nmi vmnmi;
676
677 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
678 error = EFAULT;
679 break;
680 }
681 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
682 break;
683 }
684 case VM_LAPIC_IRQ: {
685 struct vm_lapic_irq vmirq;
686
687 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
688 error = EFAULT;
689 break;
690 }
691 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
692 break;
693 }
694 case VM_LAPIC_LOCAL_IRQ: {
695 struct vm_lapic_irq vmirq;
696
697 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
698 error = EFAULT;
699 break;
700 }
701 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
702 vmirq.vector);
703 break;
704 }
705 case VM_LAPIC_MSI: {
706 struct vm_lapic_msi vmmsi;
707
708 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
709 error = EFAULT;
710 break;
711 }
712 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
713 break;
714 }
715
716 case VM_IOAPIC_ASSERT_IRQ: {
717 struct vm_ioapic_irq ioapic_irq;
718
719 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
720 error = EFAULT;
721 break;
722 }
723 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
724 break;
725 }
726 case VM_IOAPIC_DEASSERT_IRQ: {
727 struct vm_ioapic_irq ioapic_irq;
728
729 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
730 error = EFAULT;
731 break;
732 }
733 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
734 break;
735 }
736 case VM_IOAPIC_PULSE_IRQ: {
737 struct vm_ioapic_irq ioapic_irq;
738
739 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
740 error = EFAULT;
741 break;
742 }
743 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
744 break;
745 }
746 case VM_IOAPIC_PINCOUNT: {
747 int pincount;
748
749 pincount = vioapic_pincount(sc->vmm_vm);
750 if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
751 error = EFAULT;
752 break;
753 }
754 break;
755 }
756
757 case VM_ISA_ASSERT_IRQ: {
758 struct vm_isa_irq isa_irq;
759
760 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
761 error = EFAULT;
762 break;
763 }
764 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
765 if (error == 0 && isa_irq.ioapic_irq != -1) {
766 error = vioapic_assert_irq(sc->vmm_vm,
767 isa_irq.ioapic_irq);
768 }
769 break;
770 }
771 case VM_ISA_DEASSERT_IRQ: {
772 struct vm_isa_irq isa_irq;
773
774 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
775 error = EFAULT;
776 break;
777 }
778 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
779 if (error == 0 && isa_irq.ioapic_irq != -1) {
780 error = vioapic_deassert_irq(sc->vmm_vm,
781 isa_irq.ioapic_irq);
782 }
783 break;
784 }
785 case VM_ISA_PULSE_IRQ: {
786 struct vm_isa_irq isa_irq;
787
788 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
789 error = EFAULT;
790 break;
791 }
792 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
793 if (error == 0 && isa_irq.ioapic_irq != -1) {
794 error = vioapic_pulse_irq(sc->vmm_vm,
795 isa_irq.ioapic_irq);
796 }
797 break;
798 }
799 case VM_ISA_SET_IRQ_TRIGGER: {
800 struct vm_isa_irq_trigger isa_irq_trigger;
801
802 if (ddi_copyin(datap, &isa_irq_trigger,
803 sizeof (isa_irq_trigger), md)) {
804 error = EFAULT;
805 break;
806 }
807 error = vatpic_set_irq_trigger(sc->vmm_vm,
808 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
809 break;
810 }
811
812 case VM_MMAP_GETNEXT: {
813 struct vm_memmap mm;
814
815 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
816 error = EFAULT;
817 break;
818 }
819 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
820 &mm.segoff, &mm.len, &mm.prot, &mm.flags);
821 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
822 error = EFAULT;
823 break;
824 }
825 break;
826 }
827 case VM_MMAP_MEMSEG: {
828 struct vm_memmap mm;
829
830 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
831 error = EFAULT;
832 break;
833 }
834 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
835 mm.len, mm.prot, mm.flags);
836 break;
837 }
838 case VM_ALLOC_MEMSEG: {
839 struct vm_memseg vmseg;
840
841 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
842 error = EFAULT;
843 break;
844 }
845 error = vmmdev_alloc_memseg(sc, &vmseg);
846 break;
847 }
848 case VM_GET_MEMSEG: {
849 struct vm_memseg vmseg;
850
851 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
852 error = EFAULT;
853 break;
854 }
855 error = vmmdev_get_memseg(sc, &vmseg);
856 if (error == 0 &&
857 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
858 error = EFAULT;
859 break;
860 }
861 break;
862 }
863 case VM_GET_REGISTER: {
864 struct vm_register vmreg;
865
866 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
867 error = EFAULT;
868 break;
869 }
870 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
871 &vmreg.regval);
872 if (error == 0 &&
873 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
874 error = EFAULT;
875 break;
876 }
877 break;
878 }
879 case VM_SET_REGISTER: {
880 struct vm_register vmreg;
881
882 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
883 error = EFAULT;
884 break;
885 }
886 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
887 vmreg.regval);
888 break;
889 }
890 case VM_SET_SEGMENT_DESCRIPTOR: {
891 struct vm_seg_desc vmsegd;
892
893 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
894 error = EFAULT;
895 break;
896 }
897 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
898 &vmsegd.desc);
899 break;
900 }
901 case VM_GET_SEGMENT_DESCRIPTOR: {
902 struct vm_seg_desc vmsegd;
903
904 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
905 error = EFAULT;
906 break;
907 }
908 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
909 &vmsegd.desc);
910 if (error == 0 &&
911 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
912 error = EFAULT;
913 break;
914 }
915 break;
916 }
917 case VM_GET_REGISTER_SET: {
918 struct vm_register_set vrs;
919 int regnums[VM_REG_LAST];
920 uint64_t regvals[VM_REG_LAST];
921
922 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
923 error = EFAULT;
924 break;
925 }
926 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
927 error = EINVAL;
928 break;
929 }
930 if (ddi_copyin(vrs.regnums, regnums,
931 sizeof (int) * vrs.count, md)) {
932 error = EFAULT;
933 break;
934 }
935
936 error = 0;
937 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
938 if (regnums[i] < 0) {
939 error = EINVAL;
940 break;
941 }
942 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
943 &regvals[i]);
944 }
945 if (error == 0 && ddi_copyout(regvals, vrs.regvals,
946 sizeof (uint64_t) * vrs.count, md)) {
947 error = EFAULT;
948 }
949 break;
950 }
951 case VM_SET_REGISTER_SET: {
952 struct vm_register_set vrs;
953 int regnums[VM_REG_LAST];
954 uint64_t regvals[VM_REG_LAST];
955
956 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
957 error = EFAULT;
958 break;
959 }
960 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
961 error = EINVAL;
962 break;
963 }
964 if (ddi_copyin(vrs.regnums, regnums,
965 sizeof (int) * vrs.count, md)) {
966 error = EFAULT;
967 break;
968 }
969 if (ddi_copyin(vrs.regvals, regvals,
970 sizeof (uint64_t) * vrs.count, md)) {
971 error = EFAULT;
972 break;
973 }
974
975 error = 0;
976 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
977 /*
978 * Setting registers in a set is not atomic, since a
979 * failure in the middle of the set will cause a
980 * bail-out and inconsistent register state. Callers
981 * should be wary of this.
982 */
983 if (regnums[i] < 0) {
984 error = EINVAL;
985 break;
986 }
987 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
988 regvals[i]);
989 }
990 break;
991 }
992
993 case VM_SET_KERNEMU_DEV:
994 case VM_GET_KERNEMU_DEV: {
995 struct vm_readwrite_kernemu_device kemu;
996 size_t size = 0;
997
998 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
999 error = EFAULT;
1000 break;
1001 }
1002
1003 if (kemu.access_width > 3) {
1004 error = EINVAL;
1005 break;
1006 }
1007 size = (1 << kemu.access_width);
1008 ASSERT(size >= 1 && size <= 8);
1009
1010 if (cmd == VM_SET_KERNEMU_DEV) {
1011 error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1012 kemu.gpa, kemu.value, size);
1013 } else {
1014 error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1015 kemu.gpa, &kemu.value, size);
1016 }
1017
1018 if (error == 0) {
1019 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1020 error = EFAULT;
1021 break;
1022 }
1023 }
1024 break;
1025 }
1026
1027 case VM_GET_CAPABILITY: {
1028 struct vm_capability vmcap;
1029
1030 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1031 error = EFAULT;
1032 break;
1033 }
1034 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1035 &vmcap.capval);
1036 if (error == 0 &&
1037 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1038 error = EFAULT;
1039 break;
1040 }
1041 break;
1042 }
1043 case VM_SET_CAPABILITY: {
1044 struct vm_capability vmcap;
1045
1046 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1047 error = EFAULT;
1048 break;
1049 }
1050 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1051 vmcap.capval);
1052 break;
1053 }
1054 case VM_SET_X2APIC_STATE: {
1055 struct vm_x2apic x2apic;
1056
1057 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1058 error = EFAULT;
1059 break;
1060 }
1061 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1062 break;
1063 }
1064 case VM_GET_X2APIC_STATE: {
1065 struct vm_x2apic x2apic;
1066
1067 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1068 error = EFAULT;
1069 break;
1070 }
1071 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1072 &x2apic.state);
1073 if (error == 0 &&
1074 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1075 error = EFAULT;
1076 break;
1077 }
1078 break;
1079 }
1080 case VM_GET_GPA_PMAP: {
1081 struct vm_gpa_pte gpapte;
1082
1083 if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
1084 error = EFAULT;
1085 break;
1086 }
1087 #ifdef __FreeBSD__
1088 /* XXXJOY: add function? */
1089 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
1090 gpapte.gpa, gpapte.pte, &gpapte.ptenum);
1091 #endif
1092 error = 0;
1093 break;
1094 }
1095 case VM_GET_HPET_CAPABILITIES: {
1096 struct vm_hpet_cap hpetcap;
1097
1098 error = vhpet_getcap(&hpetcap);
1099 if (error == 0 &&
1100 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1101 error = EFAULT;
1102 break;
1103 }
1104 break;
1105 }
1106 case VM_GLA2GPA: {
1107 struct vm_gla2gpa gg;
1108
1109 CTASSERT(PROT_READ == VM_PROT_READ);
1110 CTASSERT(PROT_WRITE == VM_PROT_WRITE);
1111 CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
1112
1113 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1114 error = EFAULT;
1115 break;
1116 }
1117 gg.vcpuid = vcpu;
1118 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1119 gg.prot, &gg.gpa, &gg.fault);
1120 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1121 error = EFAULT;
1122 break;
1123 }
1124 break;
1125 }
1126 case VM_GLA2GPA_NOFAULT: {
1127 struct vm_gla2gpa gg;
1128
1129 CTASSERT(PROT_READ == VM_PROT_READ);
1130 CTASSERT(PROT_WRITE == VM_PROT_WRITE);
1131 CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
1132
1133 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1134 error = EFAULT;
1135 break;
1136 }
1137 gg.vcpuid = vcpu;
1138 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1139 gg.gla, gg.prot, &gg.gpa, &gg.fault);
1140 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1141 error = EFAULT;
1142 break;
1143 }
1144 break;
1145 }
1146
1147 case VM_ACTIVATE_CPU:
1148 error = vm_activate_cpu(sc->vmm_vm, vcpu);
1149 break;
1150
1151 case VM_SUSPEND_CPU:
1152 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1153 error = EFAULT;
1154 } else {
1155 error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1156 }
1157 break;
1158
1159 case VM_RESUME_CPU:
1160 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1161 error = EFAULT;
1162 } else {
1163 error = vm_resume_cpu(sc->vmm_vm, vcpu);
1164 }
1165 break;
1166
1167 case VM_GET_CPUS: {
1168 struct vm_cpuset vm_cpuset;
1169 cpuset_t tempset;
1170 void *srcp = &tempset;
1171 int size;
1172
1173 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1174 error = EFAULT;
1175 break;
1176 }
1177
1178 /* Be more generous about sizing since our cpuset_t is large. */
1179 size = vm_cpuset.cpusetsize;
1180 if (size <= 0 || size > sizeof (cpuset_t)) {
1181 error = ERANGE;
1182 }
1183 /*
1184 * If they want a ulong_t or less, make sure they receive the
1185 * low bits with all the useful information.
1186 */
1187 if (size <= sizeof (tempset.cpub[0])) {
1188 srcp = &tempset.cpub[0];
1189 }
1190
1191 if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1192 tempset = vm_active_cpus(sc->vmm_vm);
1193 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1194 tempset = vm_suspended_cpus(sc->vmm_vm);
1195 } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1196 tempset = vm_debug_cpus(sc->vmm_vm);
1197 } else {
1198 error = EINVAL;
1199 }
1200
1201 ASSERT(size > 0 && size <= sizeof (tempset));
1202 if (error == 0 &&
1203 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1204 error = EFAULT;
1205 break;
1206 }
1207 break;
1208 }
1209 case VM_SET_INTINFO: {
1210 struct vm_intinfo vmii;
1211
1212 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1213 error = EFAULT;
1214 break;
1215 }
1216 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1217 break;
1218 }
1219 case VM_GET_INTINFO: {
1220 struct vm_intinfo vmii;
1221
1222 vmii.vcpuid = vcpu;
1223 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1224 &vmii.info2);
1225 if (error == 0 &&
1226 ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1227 error = EFAULT;
1228 break;
1229 }
1230 break;
1231 }
1232 case VM_RTC_WRITE: {
1233 struct vm_rtc_data rtcdata;
1234
1235 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1236 error = EFAULT;
1237 break;
1238 }
1239 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1240 rtcdata.value);
1241 break;
1242 }
1243 case VM_RTC_READ: {
1244 struct vm_rtc_data rtcdata;
1245
1246 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1247 error = EFAULT;
1248 break;
1249 }
1250 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1251 &rtcdata.value);
1252 if (error == 0 &&
1253 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1254 error = EFAULT;
1255 break;
1256 }
1257 break;
1258 }
1259 case VM_RTC_SETTIME: {
1260 struct vm_rtc_time rtctime;
1261
1262 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1263 error = EFAULT;
1264 break;
1265 }
1266 error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1267 break;
1268 }
1269 case VM_RTC_GETTIME: {
1270 struct vm_rtc_time rtctime;
1271
1272 rtctime.secs = vrtc_get_time(sc->vmm_vm);
1273 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1274 error = EFAULT;
1275 break;
1276 }
1277 break;
1278 }
1279
1280 case VM_PMTMR_LOCATE: {
1281 uint16_t port = arg;
1282 error = vpmtmr_set_location(sc->vmm_vm, port);
1283 break;
1284 }
1285
1286 case VM_RESTART_INSTRUCTION:
1287 error = vm_restart_instruction(sc->vmm_vm, vcpu);
1288 break;
1289
1290 case VM_SET_TOPOLOGY: {
1291 struct vm_cpu_topology topo;
1292
1293 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1294 error = EFAULT;
1295 break;
1296 }
1297 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1298 topo.threads, topo.maxcpus);
1299 break;
1300 }
1301 case VM_GET_TOPOLOGY: {
1302 struct vm_cpu_topology topo;
1303
1304 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1305 &topo.threads, &topo.maxcpus);
1306 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1307 error = EFAULT;
1308 break;
1309 }
1310 break;
1311 }
1312
1313 #ifndef __FreeBSD__
1314 case VM_DEVMEM_GETOFFSET: {
1315 struct vm_devmem_offset vdo;
1316 list_t *dl = &sc->vmm_devmem_list;
1317 vmm_devmem_entry_t *de = NULL;
1318
1319 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1320 error = EFAULT;
1321 break;
1322 }
1323
1324 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
1325 if (de->vde_segid == vdo.segid) {
1326 break;
1327 }
1328 }
1329 if (de != NULL) {
1330 vdo.offset = de->vde_off;
1331 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1332 error = EFAULT;
1333 }
1334 } else {
1335 error = ENOENT;
1336 }
1337 break;
1338 }
1339 case VM_WRLOCK_CYCLE: {
1340 /*
1341 * Present a test mechanism to acquire/release the write lock
1342 * on the VM without any other effects.
1343 */
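/*
 * Illustrative use (an assumption, not a documented interface): a test
 * harness issuing ioctl(vmfd, VM_WRLOCK_CYCLE, 0) simply exercises the
 * write-lock acquisition and release performed for this command.
 */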
1344 break;
1345 }
1346 #endif
1347 default:
1348 error = ENOTTY;
1349 break;
1350 }
1351
1352 /* Release exclusion resources */
1353 switch (lock_type) {
1354 case LOCK_NONE:
1355 break;
1356 case LOCK_VCPU:
1357 vcpu_unlock_one(sc, vcpu);
1358 break;
1359 case LOCK_READ_HOLD:
1360 vmm_read_unlock(sc);
1361 break;
1362 case LOCK_WRITE_HOLD:
1363 vmm_write_unlock(sc);
1364 break;
1365 default:
1366 panic("unexpected lock type");
1367 break;
1368 }
1369
1370 return (error);
1371 }
1372
1373 static vmm_softc_t *
1374 vmm_lookup(const char *name)
1375 {
1376 list_t *vml = &vmm_list;
1377 vmm_softc_t *sc;
1378
1379 ASSERT(MUTEX_HELD(&vmm_mtx));
1380
1381 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1382 if (strcmp(sc->vmm_name, name) == 0) {
1383 break;
1384 }
1385 }
1386
1387 return (sc);
1388 }
1389
1390 /*
1391 * Acquire an HMA registration if not already held.
1392 */
1393 static boolean_t
1394 vmm_hma_acquire(void)
1395 {
1396 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1397
1398 mutex_enter(&vmmdev_mtx);
1399
1400 if (vmmdev_hma_reg == NULL) {
1401 VERIFY3U(vmmdev_hma_ref, ==, 0);
1402 vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1403 if (vmmdev_hma_reg == NULL) {
1404 cmn_err(CE_WARN, "%s HMA registration failed.",
1405 vmmdev_hvm_name);
1406 mutex_exit(&vmmdev_mtx);
1407 return (B_FALSE);
1408 }
1409 }
1410
1411 vmmdev_hma_ref++;
1412
1413 mutex_exit(&vmmdev_mtx);
1414
1415 return (B_TRUE);
1416 }
1417
1418 /*
1419 * Release the HMA registration if held and there are no remaining VMs.
1420 */
1421 static void
1422 vmm_hma_release(void)
1423 {
1424 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1425
1426 mutex_enter(&vmmdev_mtx);
1427
1428 VERIFY3U(vmmdev_hma_ref, !=, 0);
1429
1430 vmmdev_hma_ref--;
1431
1432 if (vmmdev_hma_ref == 0) {
1433 VERIFY(vmmdev_hma_reg != NULL);
1434 hma_unregister(vmmdev_hma_reg);
1435 vmmdev_hma_reg = NULL;
1436 }
1437 mutex_exit(&vmmdev_mtx);
1438 }
1439
1440 static int
1441 vmmdev_do_vm_create(char *name, cred_t *cr)
1442 {
1443 vmm_softc_t *sc = NULL;
1444 minor_t minor;
1445 int error = ENOMEM;
1446
1447 if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
1448 return (EINVAL);
1449 }
1450
1451 if (!vmm_hma_acquire())
1452 return (ENXIO);
1453
1454 mutex_enter(&vmm_mtx);
1455
1456 /* Look for duplicate names */
1457 if (vmm_lookup(name) != NULL) {
1458 mutex_exit(&vmm_mtx);
1459 vmm_hma_release();
1460 return (EEXIST);
1461 }
1462
1463 /* Allow only one instance per non-global zone. */
1464 if (!INGLOBALZONE(curproc)) {
1465 for (sc = list_head(&vmm_list); sc != NULL;
1466 sc = list_next(&vmm_list, sc)) {
1467 if (sc->vmm_zone == curzone) {
1468 mutex_exit(&vmm_mtx);
1469 vmm_hma_release();
1470 return (EINVAL);
1471 }
1472 }
1473 }
1474
1475 minor = id_alloc(vmm_minors);
1476 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1477 goto fail;
1478 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1479 ddi_soft_state_free(vmm_statep, minor);
1480 goto fail;
1481 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1482 DDI_PSEUDO, 0) != DDI_SUCCESS) {
1483 goto fail;
1484 }
1485
1486 error = vm_create(name, &sc->vmm_vm);
1487 if (error == 0) {
1488 /* Complete VM initialization and report success. */
1489 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1490 sc->vmm_minor = minor;
1491 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1492 offsetof(vmm_devmem_entry_t, vde_node));
1493
1494 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1495 offsetof(vmm_hold_t, vmh_node));
1496 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1497
1498 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1499 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1500 offsetof(vmm_lease_t, vml_node));
1501 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1502 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1503
1504 sc->vmm_zone = crgetzone(cr);
1505 zone_hold(sc->vmm_zone);
1506 vmm_zsd_add_vm(sc);
1507
1508 list_insert_tail(&vmm_list, sc);
1509 mutex_exit(&vmm_mtx);
1510 return (0);
1511 }
1512
1513 ddi_remove_minor_node(vmmdev_dip, name);
1514 fail:
1515 id_free(vmm_minors, minor);
1516 if (sc != NULL) {
1517 ddi_soft_state_free(vmm_statep, minor);
1518 }
1519 mutex_exit(&vmm_mtx);
1520 vmm_hma_release();
1521
1522 return (error);
1523 }
1524
1525 /*
1526 * Bhyve 'Driver' Interface
1527 *
1528 * While many devices are emulated in the bhyve userspace process, there are
1529 * others with performance constraints which require that they run mostly or
1530 * entirely in-kernel. For those not integrated directly into bhyve, an API is
1531 * needed so they can query/manipulate the portions of VM state needed to
1532 * fulfill their purpose.
1533 *
1534 * This includes:
1535 * - Translating guest-physical addresses to host-virtual pointers
1536 * - Injecting MSIs
1537 * - Hooking IO port addresses
1538 *
1539 * The vmm_drv interface exists to provide that functionality to its consumers.
1540 * (At this time, 'viona' is the only user)
1541 */
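/*
 * A rough sketch of the expected consumer life-cycle (variable names are
 * illustrative, error handling is elided, and 'expire_cb' is a hypothetical
 * consumer-supplied callback):
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *
 *	(void) vmm_drv_hold(fp, cr, &hold);
 *	lease = vmm_drv_lease_sign(hold, expire_cb, arg);
 *
 *	va = vmm_drv_gpa2kva(lease, gpa, sz);
 *	(void) vmm_drv_msi(lease, addr, msg);
 *
 *	vmm_drv_lease_break(hold, lease);
 *	vmm_drv_rele(hold);
 *
 * When an operation wants the write lock, expire_cb() is invoked; the
 * consumer is expected to break (and later re-sign) its lease so that the
 * writer can proceed.
 */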
1542 int
1543 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1544 {
1545 vnode_t *vp = fp->f_vnode;
1546 const dev_t dev = vp->v_rdev;
1547 vmm_softc_t *sc;
1548 vmm_hold_t *hold;
1549 int err = 0;
1550
1551 if (vp->v_type != VCHR) {
1552 return (ENXIO);
1553 }
1554 const major_t major = getmajor(dev);
1555 const minor_t minor = getminor(dev);
1556
1557 mutex_enter(&vmmdev_mtx);
1558 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1559 mutex_exit(&vmmdev_mtx);
1560 return (ENOENT);
1561 }
1562 mutex_enter(&vmm_mtx);
1563 mutex_exit(&vmmdev_mtx);
1564
1565 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1566 err = ENOENT;
1567 goto out;
1568 }
1569 /* XXXJOY: check cred permissions against instance */
1570
1571 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1572 err = EBUSY;
1573 goto out;
1574 }
1575
1576 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1577 hold->vmh_sc = sc;
1578 hold->vmh_release_req = B_FALSE;
1579
1580 list_insert_tail(&sc->vmm_holds, hold);
1581 sc->vmm_flags |= VMM_HELD;
1582 *holdp = hold;
1583
1584 out:
1585 mutex_exit(&vmm_mtx);
1586 return (err);
1587 }
1588
1589 void
1590 vmm_drv_rele(vmm_hold_t *hold)
1591 {
1592 vmm_softc_t *sc;
1593
1594 ASSERT(hold != NULL);
1595 ASSERT(hold->vmh_sc != NULL);
1596 VERIFY(hold->vmh_ioport_hook_cnt == 0);
1597
1598 mutex_enter(&vmm_mtx);
1599 sc = hold->vmh_sc;
1600 list_remove(&sc->vmm_holds, hold);
1601 if (list_is_empty(&sc->vmm_holds)) {
1602 sc->vmm_flags &= ~VMM_HELD;
1603 cv_broadcast(&sc->vmm_cv);
1604 }
1605 mutex_exit(&vmm_mtx);
1606 kmem_free(hold, sizeof (*hold));
1607 }
1608
1609 boolean_t
1610 vmm_drv_release_reqd(vmm_hold_t *hold)
1611 {
1612 ASSERT(hold != NULL);
1613
1614 return (hold->vmh_release_req);
1615 }
1616
1617 vmm_lease_t *
1618 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1619 {
1620 vmm_softc_t *sc = hold->vmh_sc;
1621 vmm_lease_t *lease;
1622
1623 ASSERT3P(expiref, !=, NULL);
1624
1625 if (hold->vmh_release_req) {
1626 return (NULL);
1627 }
1628
1629 lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1630 list_link_init(&lease->vml_node);
1631 lease->vml_expire_func = expiref;
1632 lease->vml_expire_arg = arg;
1633 lease->vml_expired = B_FALSE;
1634 lease->vml_hold = hold;
1635 /* cache the VM pointer for one less pointer chase */
1636 lease->vml_vm = sc->vmm_vm;
1637
1638 mutex_enter(&sc->vmm_lease_lock);
1639 while (sc->vmm_lease_blocker != 0) {
1640 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1641 }
1642 list_insert_tail(&sc->vmm_lease_list, lease);
1643 vmm_read_lock(sc);
1644 mutex_exit(&sc->vmm_lease_lock);
1645
1646 return (lease);
1647 }
1648
1649 static void
1650 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1651 {
1652 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1653
1654 list_remove(&sc->vmm_lease_list, lease);
1655 vmm_read_unlock(sc);
1656 kmem_free(lease, sizeof (*lease));
1657 }
1658
1659 void
1660 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1661 {
1662 vmm_softc_t *sc = hold->vmh_sc;
1663
1664 VERIFY3P(hold, ==, lease->vml_hold);
1665
1666 mutex_enter(&sc->vmm_lease_lock);
1667 vmm_lease_break_locked(sc, lease);
1668 mutex_exit(&sc->vmm_lease_lock);
1669 }
1670
1671 boolean_t
1672 vmm_drv_lease_expired(vmm_lease_t *lease)
1673 {
1674 return (lease->vml_expired);
1675 }
1676
1677 void *
1678 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1679 {
1680 ASSERT(lease != NULL);
1681
1682 return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
1683 }
1684
1685 int
1686 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1687 {
1688 ASSERT(lease != NULL);
1689
1690 return (lapic_intr_msi(lease->vml_vm, addr, msg));
1691 }
1692
1693 int
1694 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
1695 void *arg, void **cookie)
1696 {
1697 vmm_softc_t *sc;
1698 int err;
1699
1700 ASSERT(hold != NULL);
1701 ASSERT(cookie != NULL);
1702
1703 sc = hold->vmh_sc;
1704 mutex_enter(&vmm_mtx);
1705 /* Confirm that hook installation is not blocked */
1706 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1707 mutex_exit(&vmm_mtx);
1708 return (EBUSY);
1709 }
1710 /*
1711 * Optimistically record an installed hook which will prevent a block
1712 * from being asserted while the mutex is dropped.
1713 */
1714 hold->vmh_ioport_hook_cnt++;
1715 mutex_exit(&vmm_mtx);
1716
1717 vmm_write_lock(sc);
1718 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
1719 arg, cookie);
1720 vmm_write_unlock(sc);
1721
1722 if (err != 0) {
1723 mutex_enter(&vmm_mtx);
1724 /* Walk back optimism about the hook installation */
1725 hold->vmh_ioport_hook_cnt--;
1726 mutex_exit(&vmm_mtx);
1727 }
1728 return (err);
1729 }
1730
1731 void
1732 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1733 {
1734 vmm_softc_t *sc;
1735
1736 ASSERT(hold != NULL);
1737 ASSERT(cookie != NULL);
1738 ASSERT(hold->vmh_ioport_hook_cnt != 0);
1739
1740 sc = hold->vmh_sc;
1741 vmm_write_lock(sc);
1742 vm_ioport_unhook(sc->vmm_vm, cookie);
1743 vmm_write_unlock(sc);
1744
1745 mutex_enter(&vmm_mtx);
1746 hold->vmh_ioport_hook_cnt--;
1747 mutex_exit(&vmm_mtx);
1748 }
1749
1750 static int
1751 vmm_drv_purge(vmm_softc_t *sc)
1752 {
1753 ASSERT(MUTEX_HELD(&vmm_mtx));
1754
1755 if ((sc->vmm_flags & VMM_HELD) != 0) {
1756 vmm_hold_t *hold;
1757
1758 sc->vmm_flags |= VMM_CLEANUP;
1759 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1760 hold = list_next(&sc->vmm_holds, hold)) {
1761 hold->vmh_release_req = B_TRUE;
1762 }
1763 while ((sc->vmm_flags & VMM_HELD) != 0) {
1764 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1765 return (EINTR);
1766 }
1767 }
1768 sc->vmm_flags &= ~VMM_CLEANUP;
1769 }
1770
1771 VERIFY(list_is_empty(&sc->vmm_holds));
1772 sc->vmm_flags |= VMM_PURGED;
1773 return (0);
1774 }
1775
1776 static int
1777 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1778 {
1779 int err = 0;
1780
1781 mutex_enter(&vmm_mtx);
1782 if (!enable_block) {
1783 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1784
1785 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1786 goto done;
1787 }
1788
1789 /* If any holds have hooks installed, the block is a failure */
1790 if (!list_is_empty(&sc->vmm_holds)) {
1791 vmm_hold_t *hold;
1792
1793 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1794 hold = list_next(&sc->vmm_holds, hold)) {
1795 if (hold->vmh_ioport_hook_cnt != 0) {
1796 err = EBUSY;
1797 goto done;
1798 }
1799 }
1800 }
1801 sc->vmm_flags |= VMM_BLOCK_HOOK;
1802
1803 done:
1804 mutex_exit(&vmm_mtx);
1805 return (err);
1806 }
1807
1808 static int
1809 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1810 boolean_t *hma_release)
1811 {
1812 dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
1813 minor_t minor;
1814
1815 ASSERT(MUTEX_HELD(&vmm_mtx));
1816
1817 *hma_release = B_FALSE;
1818
1819 if (clean_zsd) {
1820 vmm_zsd_rem_vm(sc);
1821 }
1822
1823 if (vmm_drv_purge(sc) != 0) {
1824 return (EINTR);
1825 }
1826
1827 /* Clean up devmem entries */
1828 vmmdev_devmem_purge(sc);
1829
1830 list_remove(&vmm_list, sc);
1831 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
1832 minor = sc->vmm_minor;
1833 zone_rele(sc->vmm_zone);
1834 if (sc->vmm_is_open) {
1835 list_insert_tail(&vmm_destroy_list, sc);
1836 sc->vmm_flags |= VMM_DESTROY;
1837 } else {
1838 vm_destroy(sc->vmm_vm);
1839 ddi_soft_state_free(vmm_statep, minor);
1840 id_free(vmm_minors, minor);
1841 *hma_release = B_TRUE;
1842 }
1843 (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
1844
1845 return (0);
1846 }
1847
1848 int
1849 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
1850 {
1851 boolean_t hma_release = B_FALSE;
1852 int err;
1853
1854 mutex_enter(&vmm_mtx);
1855 err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
1856 mutex_exit(&vmm_mtx);
1857
1858 if (hma_release)
1859 vmm_hma_release();
1860
1861 return (err);
1862 }
1863
1864 /* ARGSUSED */
1865 static int
1866 vmmdev_do_vm_destroy(const char *name, cred_t *cr)
1867 {
1868 boolean_t hma_release = B_FALSE;
1869 vmm_softc_t *sc;
1870 int err;
1871
1872 if (crgetuid(cr) != 0)
1873 return (EPERM);
1874
1875 mutex_enter(&vmm_mtx);
1876
1877 if ((sc = vmm_lookup(name)) == NULL) {
1878 mutex_exit(&vmm_mtx);
1879 return (ENOENT);
1880 }
1881 /*
1882 * We don't check this in vmm_lookup() since that function is also used
1883 * for validation during create and currently vmm names must be unique.
1884 */
1885 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
1886 mutex_exit(&vmm_mtx);
1887 return (EPERM);
1888 }
1889 err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
1890
1891 mutex_exit(&vmm_mtx);
1892
1893 if (hma_release)
1894 vmm_hma_release();
1895
1896 return (err);
1897 }
1898
1899 static int
1900 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1901 {
1902 minor_t minor;
1903 vmm_softc_t *sc;
1904
1905 minor = getminor(*devp);
1906 if (minor == VMM_CTL_MINOR) {
1907 /*
1908 * Master control device must be opened exclusively.
1909 */
1910 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
1911 return (EINVAL);
1912 }
1913
1914 return (0);
1915 }
1916
1917 mutex_enter(&vmm_mtx);
1918 sc = ddi_get_soft_state(vmm_statep, minor);
1919 if (sc == NULL) {
1920 mutex_exit(&vmm_mtx);
1921 return (ENXIO);
1922 }
1923
1924 sc->vmm_is_open = B_TRUE;
1925 mutex_exit(&vmm_mtx);
1926
1927 return (0);
1928 }
1929
1930 static int
1931 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
1932 {
1933 minor_t minor;
1934 vmm_softc_t *sc;
1935 boolean_t hma_release = B_FALSE;
1936
1937 minor = getminor(dev);
1938 if (minor == VMM_CTL_MINOR)
1939 return (0);
1940
1941 mutex_enter(&vmm_mtx);
1942 sc = ddi_get_soft_state(vmm_statep, minor);
1943 if (sc == NULL) {
1944 mutex_exit(&vmm_mtx);
1945 return (ENXIO);
1946 }
1947
1948 VERIFY(sc->vmm_is_open);
1949 sc->vmm_is_open = B_FALSE;
1950
1951 /*
1952 * If this VM was destroyed while the vmm device was open, then
1953 * clean it up now that it is closed.
1954 */
1955 if (sc->vmm_flags & VMM_DESTROY) {
1956 list_remove(&vmm_destroy_list, sc);
1957 vm_destroy(sc->vmm_vm);
1958 ddi_soft_state_free(vmm_statep, minor);
1959 id_free(vmm_minors, minor);
1960 hma_release = B_TRUE;
1961 }
1962 mutex_exit(&vmm_mtx);
1963
1964 if (hma_release)
1965 vmm_hma_release();
1966
1967 return (0);
1968 }
1969
1970 static int
1971 vmm_is_supported(intptr_t arg)
1972 {
1973 int r;
1974 const char *msg;
1975
1976 if (vmm_is_intel()) {
1977 r = vmx_x86_supported(&msg);
1978 } else if (vmm_is_svm()) {
1979 /*
1980 * HMA already ensured that the features necessary for SVM
1981 * operation were present and online during vmm_attach().
1982 */
1983 r = 0;
1984 } else {
1985 r = ENXIO;
1986 msg = "Unsupported CPU vendor";
1987 }
1988
1989 if (r != 0 && arg != (intptr_t)NULL) {
1990 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
1991 return (EFAULT);
1992 }
1993 return (r);
1994 }
1995
1996 static int
1997 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1998 int *rvalp)
1999 {
2000 vmm_softc_t *sc;
2001 minor_t minor;
2002
2003 /* The structs in bhyve ioctls assume a 64-bit datamodel */
2004 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2005 return (ENOTSUP);
2006 }
2007
2008 minor = getminor(dev);
2009
2010 if (minor == VMM_CTL_MINOR) {
2011 void *argp = (void *)arg;
2012 char name[VM_MAX_NAMELEN] = { 0 };
2013 size_t len = 0;
2014
2015 if ((mode & FKIOCTL) != 0) {
2016 len = strlcpy(name, argp, sizeof (name));
2017 } else {
2018 if (copyinstr(argp, name, sizeof (name), &len) != 0) {
2019 return (EFAULT);
2020 }
2021 }
2022 if (len >= VM_MAX_NAMELEN) {
2023 return (ENAMETOOLONG);
2024 }
2025
2026 switch (cmd) {
2027 case VMM_CREATE_VM:
2028 if ((mode & FWRITE) == 0)
2029 return (EPERM);
2030 return (vmmdev_do_vm_create(name, credp));
2031 case VMM_DESTROY_VM:
2032 if ((mode & FWRITE) == 0)
2033 return (EPERM);
2034 return (vmmdev_do_vm_destroy(name, credp));
2035 case VMM_VM_SUPPORTED:
2036 return (vmm_is_supported(arg));
2037 default:
2038 /* No other actions are legal on ctl device */
2039 return (ENOTTY);
2040 }
2041 }
2042
2043 sc = ddi_get_soft_state(vmm_statep, minor);
2044 ASSERT(sc);
2045
2046 if (sc->vmm_flags & VMM_DESTROY)
2047 return (ENXIO);
2048
2049 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2050 }
2051
2052 static int
2053 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2054 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2055 {
2056 vmm_softc_t *sc;
2057 const minor_t minor = getminor(dev);
2058 struct vm *vm;
2059 int err;
2060 vm_object_t vmo = NULL;
2061 struct vmspace *vms;
2062
2063 if (minor == VMM_CTL_MINOR) {
2064 return (ENODEV);
2065 }
2066 if (off < 0 || (off + len) <= 0) {
2067 return (EINVAL);
2068 }
2069 if ((prot & PROT_USER) == 0) {
2070 return (EACCES);
2071 }
2072
2073 sc = ddi_get_soft_state(vmm_statep, minor);
2074 ASSERT(sc);
2075
2076 if (sc->vmm_flags & VMM_DESTROY)
2077 return (ENXIO);
2078
2079 /* Grab read lock on the VM to prevent any changes to the memory map */
2080 vmm_read_lock(sc);
2081
2082 vm = sc->vmm_vm;
2083 vms = vm_get_vmspace(vm);
2084 if (off >= VM_DEVMEM_START) {
2085 int segid;
2086 off_t map_off = 0;
2087
2088 /* Mapping a devmem "device" */
2089 if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) {
2090 err = ENODEV;
2091 goto out;
2092 }
2093 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
2094 if (err != 0) {
2095 goto out;
2096 }
2097 err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot,
2098 flags);
2099 } else {
2100 /* Mapping a part of the guest physical space */
2101 err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
2102 flags);
2103 }
2104
2105
2106 out:
2107 vmm_read_unlock(sc);
2108 return (err);
2109 }
2110
2111 static sdev_plugin_validate_t
2112 vmm_sdev_validate(sdev_ctx_t ctx)
2113 {
2114 const char *name = sdev_ctx_name(ctx);
2115 vmm_softc_t *sc;
2116 sdev_plugin_validate_t ret;
2117 minor_t minor;
2118
2119 if (sdev_ctx_vtype(ctx) != VCHR)
2120 return (SDEV_VTOR_INVALID);
2121
2122 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2123
2124 mutex_enter(&vmm_mtx);
2125 if ((sc = vmm_lookup(name)) == NULL)
2126 ret = SDEV_VTOR_INVALID;
2127 else if (sc->vmm_minor != minor)
2128 ret = SDEV_VTOR_STALE;
2129 else
2130 ret = SDEV_VTOR_VALID;
2131 mutex_exit(&vmm_mtx);
2132
2133 return (ret);
2134 }
2135
2136 static int
2137 vmm_sdev_filldir(sdev_ctx_t ctx)
2138 {
2139 vmm_softc_t *sc;
2140 int ret;
2141
2142 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2143 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2144 sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2145 return (EINVAL);
2146 }
2147
2148 mutex_enter(&vmm_mtx);
2149 ASSERT(vmmdev_dip != NULL);
2150 for (sc = list_head(&vmm_list); sc != NULL;
2151 sc = list_next(&vmm_list, sc)) {
2152 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2153 ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2154 S_IFCHR | 0600,
2155 makedevice(ddi_driver_major(vmmdev_dip),
2156 sc->vmm_minor));
2157 } else {
2158 continue;
2159 }
2160 if (ret != 0 && ret != EEXIST)
2161 goto out;
2162 }
2163
2164 ret = 0;
2165
2166 out:
2167 mutex_exit(&vmm_mtx);
2168 return (ret);
2169 }
2170
2171 /* ARGSUSED */
2172 static void
2173 vmm_sdev_inactive(sdev_ctx_t ctx)
2174 {
2175 }
2176
2177 static sdev_plugin_ops_t vmm_sdev_ops = {
2178 .spo_version = SDEV_PLUGIN_VERSION,
2179 .spo_flags = SDEV_PLUGIN_SUBDIR,
2180 .spo_validate = vmm_sdev_validate,
2181 .spo_filldir = vmm_sdev_filldir,
2182 .spo_inactive = vmm_sdev_inactive
2183 };
2184
2185 /* ARGSUSED */
2186 static int
2187 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2188 {
2189 int error;
2190
2191 switch (cmd) {
2192 case DDI_INFO_DEVT2DEVINFO:
2193 *result = (void *)vmmdev_dip;
2194 error = DDI_SUCCESS;
2195 break;
2196 case DDI_INFO_DEVT2INSTANCE:
2197 *result = (void *)0;
2198 error = DDI_SUCCESS;
2199 break;
2200 default:
2201 error = DDI_FAILURE;
2202 break;
2203 }
2204 return (error);
2205 }
2206
2207 static int
2208 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2209 {
2210 sdev_plugin_hdl_t sph;
2211 hma_reg_t *reg = NULL;
2212 boolean_t vmm_loaded = B_FALSE;
2213
2214 if (cmd != DDI_ATTACH) {
2215 return (DDI_FAILURE);
2216 }
2217
2218 mutex_enter(&vmmdev_mtx);
2219 /* Ensure we are not already attached. */
2220 if (vmmdev_dip != NULL) {
2221 mutex_exit(&vmmdev_mtx);
2222 return (DDI_FAILURE);
2223 }
2224
2225 vmm_sol_glue_init();
2226 vmm_arena_init();
2227
2228 /*
2229 * Perform temporary HMA registration to determine if the system
2230 * is capable.
2231 */
2232 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2233 goto fail;
2234 } else if (vmm_mod_load() != 0) {
2235 goto fail;
2236 }
2237 vmm_loaded = B_TRUE;
2238 hma_unregister(reg);
2239 reg = NULL;
2240
2241 /* Create control node. Other nodes will be created on demand. */
2242 if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2243 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2244 goto fail;
2245 }
2246
2247 if ((sph = sdev_plugin_register("vmm", &vmm_sdev_ops, NULL)) ==
2248 (sdev_plugin_hdl_t)NULL) {
2249 ddi_remove_minor_node(dip, NULL);
2250 goto fail;
2251 }
2252
2253 ddi_report_dev(dip);
2254 vmmdev_sdev_hdl = sph;
2255 vmmdev_dip = dip;
2256 mutex_exit(&vmmdev_mtx);
2257 return (DDI_SUCCESS);
2258
2259 fail:
2260 if (vmm_loaded) {
2261 VERIFY0(vmm_mod_unload());
2262 }
2263 if (reg != NULL) {
2264 hma_unregister(reg);
2265 }
2266 vmm_arena_fini();
2267 vmm_sol_glue_cleanup();
2268 mutex_exit(&vmmdev_mtx);
2269 return (DDI_FAILURE);
2270 }
2271
2272 static int
2273 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2274 {
2275 if (cmd != DDI_DETACH) {
2276 return (DDI_FAILURE);
2277 }
2278
2279 /*
2280 * Ensure that all resources have been cleaned up.
2281 *
2282 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2283 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2284 * devinfo locked as iommu_cleanup() tries to recursively lock each
2285 * devinfo, including our own, while holding vmmdev_mtx.
2286 */
2287 if (mutex_tryenter(&vmmdev_mtx) == 0)
2288 return (DDI_FAILURE);
2289
2290 mutex_enter(&vmm_mtx);
2291 if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2292 mutex_exit(&vmm_mtx);
2293 mutex_exit(&vmmdev_mtx);
2294 return (DDI_FAILURE);
2295 }
2296 mutex_exit(&vmm_mtx);
2297
2298 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2299 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2300 mutex_exit(&vmmdev_mtx);
2301 return (DDI_FAILURE);
2302 }
2303 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2304
2305 /* Remove the control node. */
2306 ddi_remove_minor_node(dip, "ctl");
2307 vmmdev_dip = NULL;
2308
2309 VERIFY0(vmm_mod_unload());
2310 VERIFY3U(vmmdev_hma_reg, ==, NULL);
2311 vmm_arena_fini();
2312 vmm_sol_glue_cleanup();
2313
2314 mutex_exit(&vmmdev_mtx);
2315
2316 return (DDI_SUCCESS);
2317 }
2318
2319 static struct cb_ops vmm_cb_ops = {
2320 vmm_open,
2321 vmm_close,
2322 nodev, /* strategy */
2323 nodev, /* print */
2324 nodev, /* dump */
2325 nodev, /* read */
2326 nodev, /* write */
2327 vmm_ioctl,
2328 nodev, /* devmap */
2329 nodev, /* mmap */
2330 vmm_segmap,
2331 nochpoll, /* poll */
2332 ddi_prop_op,
2333 NULL,
2334 D_NEW | D_MP | D_DEVMAP
2335 };
2336
2337 static struct dev_ops vmm_ops = {
2338 DEVO_REV,
2339 0,
2340 vmm_info,
2341 nulldev, /* identify */
2342 nulldev, /* probe */
2343 vmm_attach,
2344 vmm_detach,
2345 nodev, /* reset */
2346 &vmm_cb_ops,
2347 (struct bus_ops *)NULL
2348 };
2349
2350 static struct modldrv modldrv = {
2351 &mod_driverops,
2352 "bhyve vmm",
2353 &vmm_ops
2354 };
2355
2356 static struct modlinkage modlinkage = {
2357 MODREV_1,
2358 &modldrv,
2359 NULL
2360 };
2361
2362 int
2363 _init(void)
2364 {
2365 int error;
2366
2367 sysinit();
2368
2369 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2370 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2371 list_create(&vmm_list, sizeof (vmm_softc_t),
2372 offsetof(vmm_softc_t, vmm_node));
2373 list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2374 offsetof(vmm_softc_t, vmm_node));
2375 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2376
2377 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2378 if (error) {
2379 return (error);
2380 }
2381
2382 vmm_zsd_init();
2383
2384 error = mod_install(&modlinkage);
2385 if (error) {
2386 ddi_soft_state_fini(&vmm_statep);
2387 vmm_zsd_fini();
2388 }
2389
2390 return (error);
2391 }
2392
2393 int
2394 _fini(void)
2395 {
2396 int error;
2397
2398 error = mod_remove(&modlinkage);
2399 if (error) {
2400 return (error);
2401 }
2402
2403 vmm_zsd_fini();
2404
2405 ddi_soft_state_fini(&vmm_statep);
2406
2407 return (0);
2408 }
2409
2410 int
2411 _info(struct modinfo *modinfop)
2412 {
2413 return (mod_info(&modlinkage, modinfop));
2414 }