/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2021 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/ioccom.h>
#include <sys/stat.h>
#include <sys/vmsystm.h>
#include <sys/ddi.h>
#include <sys/mkdev.h>
#include <sys/sunddi.h>
#include <sys/fs/dv_node.h>
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/smt.h>
#include <sys/kstat.h>

#include <sys/kernel.h>
#include <sys/hma.h>
#include <sys/x86_archext.h>
#include <x86/apicreg.h>

#include <sys/vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>
#include <sys/vmm_drv.h>
#include <sys/vmm_vm.h>

#include <vm/seg_dev.h>

#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vrtc.h"
#include "io/vhpet.h"
#include "io/vpmtmr.h"
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_util.h"

/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
 * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */
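
/*
 * As an illustrative sketch (for clarity only, not driver code), a
 * hypothetical path needing both locks must honor that ordering:
 *
 *	mutex_enter(&vmmdev_mtx);	(driver-wide state)
 *	mutex_enter(&vmm_mtx);		(instance list state)
 *	...
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 *
 * Taking them in the opposite order risks deadlock against threads which
 * follow the documented ordering.
 */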

static kmutex_t		vmmdev_mtx;
static dev_info_t	*vmmdev_dip;
static hma_reg_t	*vmmdev_hma_reg;
static uint_t		vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

static kmutex_t		vmm_mtx;
static list_t		vmm_list;
static list_t		vmm_destroy_list;
static id_space_t	*vmm_minors;
static void		*vmm_statep;

static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define	VMM_SDEV_ROOT "/dev/vmm"

/* From uts/i86pc/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);

/* Holds and hooks from drivers external to vmm */
struct vmm_hold {
	list_node_t	vmh_node;
	vmm_softc_t	*vmh_sc;
	boolean_t	vmh_release_req;
	uint_t		vmh_ioport_hook_cnt;
};

struct vmm_lease {
	list_node_t		vml_node;
	struct vm		*vml_vm;
	boolean_t		vml_expired;
	boolean_t		(*vml_expire_func)(void *);
	void			*vml_expire_arg;
	list_node_t		vml_expire_node;
	struct vmm_hold		*vml_hold;
};

static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);
static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
static void vmm_kstat_init(vmm_softc_t *);
static void vmm_kstat_fini(vmm_softc_t *);

static int
vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
{
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
	    NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		vmm_devmem_entry_t *de;
		list_t *dl = &sc->vmm_devmem_list;

		for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
			if (de->vde_segid == mseg->segid) {
				break;
			}
		}
		if (de != NULL) {
			(void) strlcpy(mseg->name, de->vde_name,
			    sizeof (mseg->name));
		}
	} else {
		bzero(mseg->name, sizeof (mseg->name));
	}

	return (error);
}

/*
 * The 'devmem' hack:
 *
 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
 * in the vm which appear with their own name related to the vm under /dev.
 * Since this would be a hassle from an sdev perspective and would require a
 * new cdev interface (or complicate the existing one), we choose to implement
 * this in a different manner.  When 'devmem' mappings are created, an
 * identifying off_t is communicated back out to userspace.  That off_t,
 * residing above the normal guest memory space, can be used to mmap the
 * 'devmem' mapping from the already-open vm device.
 */
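
/*
 * A minimal sketch of that userspace flow (hedged: 'vmfd', 'segid', and
 * 'len' are hypothetical variables; see struct vm_devmem_offset in
 * vmm_dev.h):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *	void *base = MAP_FAILED;
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		    vmfd, vdo.offset);
 *	}
 *
 * The returned offset sits at or above VM_DEVMEM_START, keeping it clear
 * of offsets used for normal guest-physical memory mappings.
 */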

static int
vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
{
	off_t map_offset;
	vmm_devmem_entry_t *entry;

	if (list_is_empty(&sc->vmm_devmem_list)) {
		map_offset = VM_DEVMEM_START;
	} else {
		entry = list_tail(&sc->vmm_devmem_list);
		map_offset = entry->vde_off + entry->vde_len;
		if (map_offset < entry->vde_off) {
			/* Do not tolerate overflow */
			return (ERANGE);
		}
		/*
		 * XXXJOY: We could choose to search the list for duplicate
		 * names and toss an error.  Since we're using the offset
		 * method for now, it does not make much of a difference.
		 */
	}

	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
	entry->vde_segid = mseg->segid;
	entry->vde_len = mseg->len;
	entry->vde_off = map_offset;
	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
	list_insert_tail(&sc->vmm_devmem_list, entry);

	return (0);
}

static boolean_t
vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
    off_t *map_offp)
{
	list_t *dl = &sc->vmm_devmem_list;
	vmm_devmem_entry_t *de = NULL;
	const off_t map_end = off + len;

	VERIFY(off >= VM_DEVMEM_START);

	if (map_end < off) {
		/* No match on overflow */
		return (B_FALSE);
	}

	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
		const off_t item_end = de->vde_off + de->vde_len;

		if (de->vde_off <= off && item_end >= map_end) {
			*segidp = de->vde_segid;
			*map_offp = off - de->vde_off;
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}

static void
vmmdev_devmem_purge(vmm_softc_t *sc)
{
	vmm_devmem_entry_t *entry;

	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
		kmem_free(entry, sizeof (*entry));
	}
}

static int
vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
{
	int error;
	bool sysmem = true;

	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
	}
	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);

	if (error == 0 && VM_MEMSEG_NAME(mseg)) {
		/*
		 * Rather than create a whole fresh device from which userspace
		 * can mmap this segment, instead make it available at an
		 * offset above where the main guest memory resides.
		 */
		error = vmmdev_devmem_create(sc, mseg, mseg->name);
		if (error != 0) {
			vm_free_memseg(sc->vmm_vm, mseg->segid);
		}
	}
	return (error);
}

/*
 * Resource Locking and Exclusion
 *
 * Much of bhyve depends on key portions of VM state, such as the guest memory
 * map, to remain unchanged while the guest is running.  As ported from
 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
 * access to the instance vCPUs.  Threads acting on a single vCPU, like those
 * performing the work of actually running the guest in VMX/SVM, would lock
 * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
 * state, all of the vCPUs would be first locked, ensuring that the
 * operation(s) could complete without any other threads stumbling into
 * intermediate states.
 *
 * This approach is largely effective for bhyve.  Common operations, such as
 * running the vCPUs, steer clear of lock contention.  The model begins to
 * break down for operations which do not occur in the context of a specific
 * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
 * thread in the bhyve process.  In order to properly protect those vCPU-less
 * operations from encountering invalid states, additional locking is required.
 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
 * It does mean that class of operations will be serialized on locking the
 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
 * undue contention on the VM_MAXCPU-1 vCPU.
 *
 * In order to address the shortcomings of this model, the concept of a
 * read/write lock has been added to bhyve.  Operations which change
 * fundamental aspects of a VM (such as the memory map) must acquire the write
 * lock, which also implies locking all of the vCPUs and waiting for all read
 * lock holders to release.  While it increases the cost and waiting time for
 * those few operations, it allows most hot-path operations on the VM (which
 * depend on its configuration remaining stable) to occur with minimal locking.
 *
 * Consumers of the Driver API (see below) are a special case when it comes to
 * this locking, since they may hold a read lock via the drv_lease mechanism
 * for an extended period of time.  Rather than forcing those consumers to
 * continuously poll for a write lock attempt, the lease system forces them to
 * provide a release callback to trigger their clean-up (and potential later
 * reacquisition) of the read lock.
 */
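
/*
 * A minimal sketch of that lease flow, assuming the vmm_drv interfaces
 * declared in vmm_drv.h (hold acquisition and error handling elided):
 *
 *	static boolean_t
 *	example_lease_expired(void *arg)
 *	{
 *		(schedule work to drop, and later re-acquire, the lease)
 *		return (B_TRUE);	(permit a synchronous break)
 *	}
 *
 *	lease = vmm_drv_lease_sign(hold, example_lease_expired, arg);
 *	(fast-path operations, performed under the read lock)
 *	vmm_drv_lease_break(hold, lease);
 */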

static void
vcpu_lock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	/*
	 * Since this state transition is performed with from_idle=true, it
	 * should not fail, but rather block until it can succeed.
	 */
	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
	vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
}

static void
vmm_read_lock(vmm_softc_t *sc)
{
	rw_enter(&sc->vmm_rwlock, RW_READER);
}

static void
vmm_read_unlock(vmm_softc_t *sc)
{
	rw_exit(&sc->vmm_rwlock);
}

static void
vmm_write_lock(vmm_softc_t *sc)
{
	int maxcpus;

	/* First lock all the vCPUs */
	maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_lock_one(sc, vcpu);
	}

	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
	sc->vmm_lease_blocker++;
	if (sc->vmm_lease_blocker == 1) {
		list_t *list = &sc->vmm_lease_list;
		vmm_lease_t *lease = list_head(list);

		while (lease != NULL) {
			boolean_t sync_break = B_FALSE;

			if (!lease->vml_expired) {
				void *arg = lease->vml_expire_arg;
				lease->vml_expired = B_TRUE;
				sync_break = lease->vml_expire_func(arg);
			}

			if (sync_break) {
				vmm_lease_t *next;

				/*
				 * These leases which are synchronously broken
				 * result in vmm_read_unlock() calls from a
				 * different thread than the corresponding
				 * vmm_read_lock().  This is acceptable, given
				 * that the rwlock underpinning the whole
				 * mechanism tolerates the behavior.  This
				 * flexibility is _only_ afforded to VM read
				 * lock (RW_READER) holders.
				 */
				next = list_next(list, lease);
				vmm_lease_break_locked(sc, lease);
				lease = next;
			} else {
				lease = list_next(list, lease);
			}
		}
	}
	mutex_exit(&sc->vmm_lease_lock);

	rw_enter(&sc->vmm_rwlock, RW_WRITER);
	/*
	 * For now, the 'maxcpus' value for an instance is fixed at the
	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
	 * the future, allowing for dynamic vCPU resource sizing, acquisition
	 * of the write lock will need to be wary of such changes.
	 */
	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
}

static void
vmm_write_unlock(vmm_softc_t *sc)
{
	int maxcpus;

	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
	sc->vmm_lease_blocker--;
	if (sc->vmm_lease_blocker == 0) {
		cv_broadcast(&sc->vmm_lease_cv);
	}
	mutex_exit(&sc->vmm_lease_lock);

	/*
	 * The VM write lock _must_ be released from the same thread it was
	 * acquired in, unlike the read lock.
	 */
	VERIFY(rw_write_held(&sc->vmm_rwlock));
	rw_exit(&sc->vmm_rwlock);

	/* Unlock all the vCPUs */
	maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_unlock_one(sc, vcpu);
	}
}

static int
vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
    cred_t *credp, int *rvalp)
{
	int error = 0, vcpu = -1;
	void *datap = (void *)arg;
	enum vm_lock_type {
		LOCK_NONE = 0,
		LOCK_VCPU,
		LOCK_READ_HOLD,
		LOCK_WRITE_HOLD
	} lock_type = LOCK_NONE;

	/* Acquire any exclusion resources needed for the operation. */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV:
	case VM_RESET_CPU:
	case VM_GET_RUN_STATE:
	case VM_SET_RUN_STATE:
		/*
		 * Copy in the ID of the vCPU chosen for this operation.
		 * Since a nefarious caller could update their struct between
		 * this locking and when the rest of the ioctl data is copied
		 * in, it is _critical_ that this local 'vcpu' variable be used
		 * rather than the in-struct one when performing the ioctl.
		 */
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			return (EFAULT);
		}
		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
			return (EINVAL);
		}
		vcpu_lock_one(sc, vcpu);
		lock_type = LOCK_VCPU;
		break;

	case VM_REINIT:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MAP_PPTDEV_MMIO:
	case VM_UNMAP_PPTDEV_MMIO:
	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_MUNMAP_MEMSEG:
	case VM_WRLOCK_CYCLE:
	case VM_PMTMR_LOCATE:
	case VM_ARC_RESV:
		vmm_write_lock(sc);
		lock_type = LOCK_WRITE_HOLD;
		break;

	case VM_GET_GPA_PMAP:
	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
	case VM_LAPIC_IRQ:
	case VM_INJECT_NMI:
	case VM_IOAPIC_ASSERT_IRQ:
	case VM_IOAPIC_DEASSERT_IRQ:
	case VM_IOAPIC_PULSE_IRQ:
	case VM_LAPIC_MSI:
	case VM_LAPIC_LOCAL_IRQ:
	case VM_GET_X2APIC_STATE:
	case VM_RTC_READ:
	case VM_RTC_WRITE:
	case VM_RTC_SETTIME:
	case VM_RTC_GETTIME:
	case VM_PPTDEV_DISABLE_MSIX:
	case VM_DEVMEM_GETOFFSET:
		vmm_read_lock(sc);
		lock_type = LOCK_READ_HOLD;
		break;

	case VM_IOAPIC_PINCOUNT:
	default:
		break;
	}

	/* Execute the primary logic for the ioctl. */
	switch (cmd) {
	case VM_RUN: {
		struct vm_entry entry;

		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
			error = EFAULT;
			break;
		}

		if (!(curthread->t_schedflag & TS_VCPU))
			smt_mark_as_vcpu();

		error = vm_run(sc->vmm_vm, vcpu, &entry);

		/*
		 * Unexpected states in vm_run() are expressed through positive
		 * errno-oriented return values.  VM states which expect further
		 * processing in userspace (necessary context via exitinfo) are
		 * expressed through negative return values.  For the time being
		 * a return value of 0 is not expected from vm_run().
		 */
		ASSERT(error != 0);
		if (error < 0) {
			const struct vm_exit *vme;
			void *outp = entry.exit_data;

			error = 0;
			vme = vm_exitinfo(sc->vmm_vm, vcpu);
			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
				error = EFAULT;
			}
		}
		break;
	}
	case VM_SUSPEND: {
		struct vm_suspend vmsuspend;

		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
			error = EFAULT;
			break;
		}
		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
		break;
	}
	case VM_REINIT:
		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
			/*
			 * The VM instance should be free of driver-attached
			 * hooks during the reinitialization process.
			 */
			break;
		}
		error = vm_reinit(sc->vmm_vm);
		(void) vmm_drv_block_hook(sc, B_FALSE);
		break;
	case VM_STAT_DESC: {
		struct vm_stat_desc statdesc;

		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
			error = EFAULT;
			break;
		}
		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
		    sizeof (statdesc.desc));
		if (error == 0 &&
		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_STATS_IOC: {
		struct vm_stats vmstats;

		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
			error = EFAULT;
			break;
		}
		hrt2tv(gethrtime(), &vmstats.tv);
		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
		    &vmstats.num_entries, vmstats.statbuf);
		if (error == 0 &&
		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_PPTDEV_MSI: {
		struct vm_pptdev_msi pptmsi;

		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
		break;
	}
	case VM_PPTDEV_MSIX: {
		struct vm_pptdev_msix pptmsix;

		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
		    pptmsix.vector_control);
		break;
	}
	case VM_PPTDEV_DISABLE_MSIX: {
		struct vm_pptdev pptdev;

		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
		break;
	}
	case VM_MAP_PPTDEV_MMIO: {
		struct vm_pptdev_mmio pptmmio;

		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
		    pptmmio.len, pptmmio.hpa);
		break;
	}
	case VM_UNMAP_PPTDEV_MMIO: {
		struct vm_pptdev_mmio pptmmio;

		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
		    pptmmio.len);
		break;
	}
	case VM_BIND_PPTDEV: {
		struct vm_pptdev pptdev;

		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
			error = EFAULT;
			break;
		}
		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
		break;
	}
	case VM_UNBIND_PPTDEV: {
		struct vm_pptdev pptdev;

		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
			error = EFAULT;
			break;
		}
		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
		break;
	}
	case VM_GET_PPTDEV_LIMITS: {
		struct vm_pptdev_limits pptlimits;

		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
		    &pptlimits.msi_limit, &pptlimits.msix_limit);
		if (error == 0 &&
		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_INJECT_EXCEPTION: {
		struct vm_exception vmexc;
		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
			error = EFAULT;
			break;
		}
		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
		    vmexc.error_code_valid, vmexc.error_code,
		    vmexc.restart_instruction);
		break;
	}
	case VM_INJECT_NMI: {
		struct vm_nmi vmnmi;

		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
			error = EFAULT;
			break;
		}
		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
		break;
	}
	case VM_LAPIC_IRQ: {
		struct vm_lapic_irq vmirq;

		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
		break;
	}
	case VM_LAPIC_LOCAL_IRQ: {
		struct vm_lapic_irq vmirq;

		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
		    vmirq.vector);
		break;
	}
	case VM_LAPIC_MSI: {
		struct vm_lapic_msi vmmsi;

		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
		break;
	}

	case VM_IOAPIC_ASSERT_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_DEASSERT_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_PULSE_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_PINCOUNT: {
		int pincount;

		pincount = vioapic_pincount(sc->vmm_vm);
		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_ISA_ASSERT_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_assert_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_DEASSERT_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_deassert_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_PULSE_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_pulse_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_SET_IRQ_TRIGGER: {
		struct vm_isa_irq_trigger isa_irq_trigger;

		if (ddi_copyin(datap, &isa_irq_trigger,
		    sizeof (isa_irq_trigger), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_set_irq_trigger(sc->vmm_vm,
		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
		break;
	}

	case VM_MMAP_GETNEXT: {
		struct vm_memmap mm;

		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_MMAP_MEMSEG: {
		struct vm_memmap mm;

		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
		    mm.len, mm.prot, mm.flags);
		break;
	}
	case VM_MUNMAP_MEMSEG: {
		struct vm_munmap mu;

		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
			error = EFAULT;
			break;
		}
		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
		break;
	}
	case VM_ALLOC_MEMSEG: {
		struct vm_memseg vmseg;

		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		error = vmmdev_alloc_memseg(sc, &vmseg);
		break;
	}
	case VM_GET_MEMSEG: {
		struct vm_memseg vmseg;

		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		error = vmmdev_get_memseg(sc, &vmseg);
		if (error == 0 &&
		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    &vmreg.regval);
		if (error == 0 &&
		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    vmreg.regval);
		break;
	}
	case VM_SET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		break;
	}
	case VM_GET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		if (error == 0 &&
		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
			    &regvals[i]);
		}
		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
		}
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}
		if (ddi_copyin(vrs.regvals, regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			/*
			 * Setting registers in a set is not atomic, since a
			 * failure in the middle of the set will cause a
			 * bail-out and inconsistent register state.  Callers
			 * should be wary of this.
			 */
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
			    regvals[i]);
		}
		break;
	}
	case VM_RESET_CPU: {
		struct vm_vcpu_reset vvr;

		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
			error = EFAULT;
			break;
		}
		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
			error = EINVAL;
			break;
		}

		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
		break;
	}
	case VM_GET_RUN_STATE: {
		struct vm_run_state vrs;

		bzero(&vrs, sizeof (vrs));
		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
		    &vrs.sipi_vector);
		if (error == 0) {
			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
				error = EFAULT;
				break;
			}
		}
		break;
	}
	case VM_SET_RUN_STATE: {
		struct vm_run_state vrs;

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
		    vrs.sipi_vector);
		break;
	}

	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV: {
		struct vm_readwrite_kernemu_device kemu;
		size_t size = 0;

		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
			error = EFAULT;
			break;
		}

		if (kemu.access_width > 3) {
			error = EINVAL;
			break;
		}
		size = (1 << kemu.access_width);
		ASSERT(size >= 1 && size <= 8);

		if (cmd == VM_SET_KERNEMU_DEV) {
			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
			    kemu.gpa, kemu.value, size);
		} else {
			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
			    kemu.gpa, &kemu.value, size);
		}

		if (error == 0) {
			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
				error = EFAULT;
				break;
			}
		}
		break;
	}

	case VM_GET_CAPABILITY: {
		struct vm_capability vmcap;

		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
		    &vmcap.capval);
		if (error == 0 &&
		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_CAPABILITY: {
		struct vm_capability vmcap;

		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
		    vmcap.capval);
		break;
	}
	case VM_SET_X2APIC_STATE: {
		struct vm_x2apic x2apic;

		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
		break;
	}
	case VM_GET_X2APIC_STATE: {
		struct vm_x2apic x2apic;

		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
		    &x2apic.state);
		if (error == 0 &&
		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_GPA_PMAP: {
		struct vm_gpa_pte gpapte;

		if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
			error = EFAULT;
			break;
		}
#ifdef __FreeBSD__
		/* XXXJOY: add function? */
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
		    gpapte.gpa, gpapte.pte, &gpapte.ptenum);
#endif
		error = 0;
		break;
	}
	case VM_GET_HPET_CAPABILITIES: {
		struct vm_hpet_cap hpetcap;

		error = vhpet_getcap(&hpetcap);
		if (error == 0 &&
		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GLA2GPA: {
		struct vm_gla2gpa gg;

		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		gg.vcpuid = vcpu;
		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
		    gg.prot, &gg.gpa, &gg.fault);
		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GLA2GPA_NOFAULT: {
		struct vm_gla2gpa gg;

		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		gg.vcpuid = vcpu;
		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(sc->vmm_vm, vcpu);
		break;

	case VM_SUSPEND_CPU:
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			error = EFAULT;
		} else {
			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
		}
		break;

	case VM_RESUME_CPU:
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			error = EFAULT;
		} else {
			error = vm_resume_cpu(sc->vmm_vm, vcpu);
		}
		break;

	case VM_GET_CPUS: {
		struct vm_cpuset vm_cpuset;
		cpuset_t tempset;
		void *srcp = &tempset;
		int size;

		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
			error = EFAULT;
			break;
		}

		/* Be more generous about sizing since our cpuset_t is large. */
		size = vm_cpuset.cpusetsize;
		if (size <= 0 || size > sizeof (cpuset_t)) {
			error = ERANGE;
			break;
		}
1253                 /*
1254                  * If they want a ulong_t or less, make sure they receive the
1255                  * low bits with all the useful information.
1256                  */
1257                 if (size <= sizeof (tempset.cpub[0])) {
1258                         srcp = &tempset.cpub[0];
1259                 }
1260 
1261                 if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1262                         tempset = vm_active_cpus(sc->vmm_vm);
1263                 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1264                         tempset = vm_suspended_cpus(sc->vmm_vm);
1265                 } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1266                         tempset = vm_debug_cpus(sc->vmm_vm);
1267                 } else {
1268                         error = EINVAL;
1269                 }
1270 
1271                 ASSERT(size > 0 && size <= sizeof (tempset));
1272                 if (error == 0 &&
1273                     ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1274                         error = EFAULT;
1275                         break;
1276                 }
1277                 break;
1278         }
1279         case VM_SET_INTINFO: {
1280                 struct vm_intinfo vmii;
1281 
1282                 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1283                         error = EFAULT;
1284                         break;
1285                 }
1286                 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1287                 break;
1288         }
1289         case VM_GET_INTINFO: {
1290                 struct vm_intinfo vmii;
1291 
1292                 vmii.vcpuid = vcpu;
1293                 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1294                     &vmii.info2);
1295                 if (error == 0 &&
1296                     ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1297                         error = EFAULT;
1298                         break;
1299                 }
1300                 break;
1301         }
1302         case VM_RTC_WRITE: {
1303                 struct vm_rtc_data rtcdata;
1304 
1305                 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1306                         error = EFAULT;
1307                         break;
1308                 }
1309                 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1310                     rtcdata.value);
1311                 break;
1312         }
1313         case VM_RTC_READ: {
1314                 struct vm_rtc_data rtcdata;
1315 
1316                 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1317                         error = EFAULT;
1318                         break;
1319                 }
1320                 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1321                     &rtcdata.value);
1322                 if (error == 0 &&
1323                     ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1324                         error = EFAULT;
			break;
		}
		break;
	}
	case VM_RTC_SETTIME: {
		struct vm_rtc_time rtctime;

		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
			error = EFAULT;
			break;
		}
		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
		break;
	}
	case VM_RTC_GETTIME: {
		struct vm_rtc_time rtctime;

		rtctime.secs = vrtc_get_time(sc->vmm_vm);
		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_PMTMR_LOCATE: {
		uint16_t port = arg;
		error = vpmtmr_set_location(sc->vmm_vm, port);
		break;
	}

	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(sc->vmm_vm, vcpu);
		break;

	case VM_SET_TOPOLOGY: {
		struct vm_cpu_topology topo;

		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
			error = EFAULT;
			break;
		}
		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
		    topo.threads, topo.maxcpus);
		break;
	}
	case VM_GET_TOPOLOGY: {
		struct vm_cpu_topology topo;

		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
		    &topo.threads, &topo.maxcpus);
		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_DEVMEM_GETOFFSET: {
		struct vm_devmem_offset vdo;
		list_t *dl = &sc->vmm_devmem_list;
		vmm_devmem_entry_t *de = NULL;

		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
			error = EFAULT;
			break;
		}

		for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
			if (de->vde_segid == vdo.segid) {
				break;
			}
		}
		if (de != NULL) {
			vdo.offset = de->vde_off;
			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
				error = EFAULT;
			}
		} else {
			error = ENOENT;
		}
		break;
	}
	case VM_WRLOCK_CYCLE: {
		/*
		 * Present a test mechanism to acquire/release the write lock
		 * on the VM without any other effects.
		 */
		break;
	}
	case VM_ARC_RESV:
		error = vm_arc_resv(sc->vmm_vm, (uint64_t)arg);
		break;
	default:
		error = ENOTTY;
		break;
	}

	/* Release exclusion resources */
	switch (lock_type) {
	case LOCK_NONE:
		break;
	case LOCK_VCPU:
		vcpu_unlock_one(sc, vcpu);
		break;
	case LOCK_READ_HOLD:
		vmm_read_unlock(sc);
		break;
	case LOCK_WRITE_HOLD:
		vmm_write_unlock(sc);
		break;
	default:
		panic("unexpected lock type");
		break;
	}

	return (error);
}
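
/*
 * Illustrative sketch (not part of the driver): userspace drives the ioctls
 * handled above against a descriptor for the VM's /dev/vmm/<name> node.
 * "vmfd" is a hypothetical descriptor and error handling is elided; the
 * structure layouts come from the bhyve ioctl headers.
 *
 *	struct vm_cpu_topology topo = {
 *		.sockets = 1, .cores = 4, .threads = 1, .maxcpus = 4
 *	};
 *	(void) ioctl(vmfd, VM_SET_TOPOLOGY, &topo);
 *
 *	struct vm_rtc_time rtctime = { .secs = time(NULL) };
 *	(void) ioctl(vmfd, VM_RTC_SETTIME, &rtctime);
 */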

static vmm_softc_t *
vmm_lookup(const char *name)
{
	list_t *vml = &vmm_list;
	vmm_softc_t *sc;

	ASSERT(MUTEX_HELD(&vmm_mtx));

	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
		if (strcmp(sc->vmm_name, name) == 0) {
			break;
		}
	}

	return (sc);
}

/*
 * Acquire an HMA registration if not already held.
 */
static boolean_t
vmm_hma_acquire(void)
{
	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

	mutex_enter(&vmmdev_mtx);

	if (vmmdev_hma_reg == NULL) {
		VERIFY3U(vmmdev_hma_ref, ==, 0);
		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
		if (vmmdev_hma_reg == NULL) {
			cmn_err(CE_WARN, "%s HMA registration failed.",
			    vmmdev_hvm_name);
			mutex_exit(&vmmdev_mtx);
			return (B_FALSE);
		}
	}

	vmmdev_hma_ref++;

	mutex_exit(&vmmdev_mtx);

	return (B_TRUE);
}

/*
 * Release the HMA registration if held and there are no remaining VMs.
 */
static void
vmm_hma_release(void)
{
	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

	mutex_enter(&vmmdev_mtx);

	VERIFY3U(vmmdev_hma_ref, !=, 0);

	vmmdev_hma_ref--;

	if (vmmdev_hma_ref == 0) {
		VERIFY(vmmdev_hma_reg != NULL);
		hma_unregister(vmmdev_hma_reg);
		vmmdev_hma_reg = NULL;
	}
	mutex_exit(&vmmdev_mtx);
}

static int
vmmdev_do_vm_create(char *name, cred_t *cr)
{
	vmm_softc_t	*sc = NULL;
	minor_t		minor;
	int		error = ENOMEM;

	if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
		return (EINVAL);
	}

	if (!vmm_hma_acquire())
		return (ENXIO);

	mutex_enter(&vmm_mtx);

	/* Look for duplicate names */
	if (vmm_lookup(name) != NULL) {
		mutex_exit(&vmm_mtx);
		vmm_hma_release();
		return (EEXIST);
	}

	/* Allow only one instance per non-global zone. */
	if (!INGLOBALZONE(curproc)) {
		for (sc = list_head(&vmm_list); sc != NULL;
		    sc = list_next(&vmm_list, sc)) {
			if (sc->vmm_zone == curzone) {
				mutex_exit(&vmm_mtx);
				vmm_hma_release();
				return (EINVAL);
			}
		}
	}

	minor = id_alloc(vmm_minors);
	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
		goto fail;
	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
		ddi_soft_state_free(vmm_statep, minor);
		goto fail;
	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		goto fail;
	}

	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
		goto fail;
	}

	error = vm_create(name, &sc->vmm_vm);
	if (error == 0) {
		/* Complete VM initialization and report success. */
		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
		sc->vmm_minor = minor;
		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
		    offsetof(vmm_devmem_entry_t, vde_node));

		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
		    offsetof(vmm_hold_t, vmh_node));
		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);

		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
		    offsetof(vmm_lease_t, vml_node));
		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);

		sc->vmm_zone = crgetzone(cr);
		zone_hold(sc->vmm_zone);
		vmm_zsd_add_vm(sc);
		vmm_kstat_init(sc);

		list_insert_tail(&vmm_list, sc);
		mutex_exit(&vmm_mtx);
		return (0);
	}

	vmm_kstat_fini(sc);
	ddi_remove_minor_node(vmmdev_dip, name);
fail:
	id_free(vmm_minors, minor);
	if (sc != NULL) {
		ddi_soft_state_free(vmm_statep, minor);
	}
	mutex_exit(&vmm_mtx);
	vmm_hma_release();

	return (error);
}

/*
 * Bhyve 'Driver' Interface
 *
 * While many devices are emulated in the bhyve userspace process, there are
 * others with performance constraints which require that they run mostly or
 * entirely in-kernel.  For those not integrated directly into bhyve, an API is
 * needed so they can query/manipulate the portions of VM state needed to
 * fulfill their purpose.
 *
 * This includes:
 * - Translating guest-physical addresses to host-virtual pointers
 * - Injecting MSIs
 * - Hooking IO port addresses
 *
 * The vmm_drv interface exists to provide that functionality to its consumers.
 * (At this time, 'viona' is the only user)
 */
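
/*
 * Sketch of the expected consumer pattern (hypothetical; modeled on what a
 * driver like viona would do).  Given a file_t for an open VM device node,
 * the consumer takes a hold, signs a lease before touching guest state, and
 * tears both down in the reverse order.  Names prefixed "my_" are stand-ins.
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *
 *	if (vmm_drv_hold(fp, cr, &hold) != 0)
 *		return (...);
 *	if ((lease = vmm_drv_lease_sign(hold, my_expire_cb, my_arg)) != NULL) {
 *		void *kva = vmm_drv_gpa2kva(lease, gpa, len);
 *		...
 *		vmm_drv_lease_break(hold, lease);
 *	}
 *	vmm_drv_rele(hold);
 */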
int
vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
{
	vnode_t *vp = fp->f_vnode;
	const dev_t dev = vp->v_rdev;
	vmm_softc_t *sc;
	vmm_hold_t *hold;
	int err = 0;

	if (vp->v_type != VCHR) {
		return (ENXIO);
	}
	const major_t major = getmajor(dev);
	const minor_t minor = getminor(dev);

	mutex_enter(&vmmdev_mtx);
	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
		mutex_exit(&vmmdev_mtx);
		return (ENOENT);
	}
	mutex_enter(&vmm_mtx);
	mutex_exit(&vmmdev_mtx);

	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
		err = ENOENT;
		goto out;
	}
	/* XXXJOY: check cred permissions against instance */

	if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
		err = EBUSY;
		goto out;
	}

	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
	hold->vmh_sc = sc;
	hold->vmh_release_req = B_FALSE;

	list_insert_tail(&sc->vmm_holds, hold);
	sc->vmm_flags |= VMM_HELD;
	*holdp = hold;

out:
	mutex_exit(&vmm_mtx);
	return (err);
}

void
vmm_drv_rele(vmm_hold_t *hold)
{
	vmm_softc_t *sc;

	ASSERT(hold != NULL);
	ASSERT(hold->vmh_sc != NULL);
	VERIFY(hold->vmh_ioport_hook_cnt == 0);

	mutex_enter(&vmm_mtx);
	sc = hold->vmh_sc;
	list_remove(&sc->vmm_holds, hold);
	if (list_is_empty(&sc->vmm_holds)) {
		sc->vmm_flags &= ~VMM_HELD;
		cv_broadcast(&sc->vmm_cv);
	}
	mutex_exit(&vmm_mtx);
	kmem_free(hold, sizeof (*hold));
}

boolean_t
vmm_drv_release_reqd(vmm_hold_t *hold)
{
	ASSERT(hold != NULL);

	return (hold->vmh_release_req);
}

vmm_lease_t *
vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
{
	vmm_softc_t *sc = hold->vmh_sc;
	vmm_lease_t *lease;

	ASSERT3P(expiref, !=, NULL);

	if (hold->vmh_release_req) {
		return (NULL);
	}

	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
	list_link_init(&lease->vml_node);
	lease->vml_expire_func = expiref;
	lease->vml_expire_arg = arg;
	lease->vml_expired = B_FALSE;
	lease->vml_hold = hold;
	/* cache the VM pointer for one less pointer chase */
	lease->vml_vm = sc->vmm_vm;

	mutex_enter(&sc->vmm_lease_lock);
	while (sc->vmm_lease_blocker != 0) {
		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
	}
	list_insert_tail(&sc->vmm_lease_list, lease);
	vmm_read_lock(sc);
	mutex_exit(&sc->vmm_lease_lock);

	return (lease);
}

static void
vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
{
	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));

	list_remove(&sc->vmm_lease_list, lease);
	vmm_read_unlock(sc);
	kmem_free(lease, sizeof (*lease));
}

void
vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
{
	vmm_softc_t *sc = hold->vmh_sc;

	VERIFY3P(hold, ==, lease->vml_hold);

	mutex_enter(&sc->vmm_lease_lock);
	vmm_lease_break_locked(sc, lease);
	mutex_exit(&sc->vmm_lease_lock);
}

boolean_t
vmm_drv_lease_expired(vmm_lease_t *lease)
{
	return (lease->vml_expired);
}
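
/*
 * Sketch of one way a lessee might cope with expiration (an assumed pattern,
 * not mandated by this interface): a worker loop polls
 * vmm_drv_lease_expired() and, once the lease has expired, breaks it and
 * signs a fresh one before continuing.
 *
 *	while (keep_running) {
 *		if (vmm_drv_lease_expired(lease)) {
 *			vmm_drv_lease_break(hold, lease);
 *			lease = vmm_drv_lease_sign(hold, my_expire_cb, my_arg);
 *			if (lease == NULL)
 *				break;
 *		}
 *		... operate on guest state under the lease ...
 *	}
 */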

void *
vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
{
	ASSERT(lease != NULL);

	return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
}

int
vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
{
	ASSERT(lease != NULL);

	return (lapic_intr_msi(lease->vml_vm, addr, msg));
}

int
vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
    void *arg, void **cookie)
{
	vmm_softc_t *sc;
	int err;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);

	sc = hold->vmh_sc;
	mutex_enter(&vmm_mtx);
	/* Confirm that hook installation is not blocked */
	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
		mutex_exit(&vmm_mtx);
		return (EBUSY);
	}
	/*
	 * Optimistically record an installed hook which will prevent a block
	 * from being asserted while the mutex is dropped.
	 */
	hold->vmh_ioport_hook_cnt++;
	mutex_exit(&vmm_mtx);

	vmm_write_lock(sc);
	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
	    arg, cookie);
	vmm_write_unlock(sc);

	if (err != 0) {
		mutex_enter(&vmm_mtx);
		/* Walk back optimism about the hook installation */
		hold->vmh_ioport_hook_cnt--;
		mutex_exit(&vmm_mtx);
	}
	return (err);
}

void
vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
{
	vmm_softc_t *sc;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);
	ASSERT(hold->vmh_ioport_hook_cnt != 0);

	sc = hold->vmh_sc;
	vmm_write_lock(sc);
	vm_ioport_unhook(sc->vmm_vm, cookie);
	vmm_write_unlock(sc);

	mutex_enter(&vmm_mtx);
	hold->vmh_ioport_hook_cnt--;
	mutex_exit(&vmm_mtx);
}
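
/*
 * Sketch of hook usage (hypothetical consumer; "my_iop_handler" and
 * "MY_IOPORT" are stand-ins, and the vmm_drv_iop_cb_t signature is defined
 * in vmm_drv.h):
 *
 *	void *cookie;
 *
 *	if (vmm_drv_ioport_hook(hold, MY_IOPORT, my_iop_handler, my_arg,
 *	    &cookie) == 0) {
 *		... guest in/out to MY_IOPORT now reaches my_iop_handler ...
 *		vmm_drv_ioport_unhook(hold, &cookie);
 *	}
 */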

static int
vmm_drv_purge(vmm_softc_t *sc)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));

	if ((sc->vmm_flags & VMM_HELD) != 0) {
		vmm_hold_t *hold;

		sc->vmm_flags |= VMM_CLEANUP;
		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			hold->vmh_release_req = B_TRUE;
		}
		while ((sc->vmm_flags & VMM_HELD) != 0) {
			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
				return (EINTR);
			}
		}
		sc->vmm_flags &= ~VMM_CLEANUP;
	}

	VERIFY(list_is_empty(&sc->vmm_holds));
	sc->vmm_flags |= VMM_PURGED;
	return (0);
}

static int
vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
{
	int err = 0;

	mutex_enter(&vmm_mtx);
	if (!enable_block) {
		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);

		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
		goto done;
	}

	/* If any holds have hooks installed, the block is a failure */
	if (!list_is_empty(&sc->vmm_holds)) {
		vmm_hold_t *hold;

		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			if (hold->vmh_ioport_hook_cnt != 0) {
				err = EBUSY;
				goto done;
			}
		}
	}
	sc->vmm_flags |= VMM_BLOCK_HOOK;

done:
	mutex_exit(&vmm_mtx);
	return (err);
}

static int
vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
    boolean_t *hma_release)
{
	dev_info_t	*pdip = ddi_get_parent(vmmdev_dip);
	minor_t		minor;

	ASSERT(MUTEX_HELD(&vmm_mtx));

	*hma_release = B_FALSE;

	if (vmm_drv_purge(sc) != 0) {
		return (EINTR);
	}

	if (clean_zsd) {
		vmm_zsd_rem_vm(sc);
	}

	/* Clean up devmem entries */
	vmmdev_devmem_purge(sc);

	list_remove(&vmm_list, sc);
	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
	minor = sc->vmm_minor;
	zone_rele(sc->vmm_zone);
	if (sc->vmm_is_open) {
		list_insert_tail(&vmm_destroy_list, sc);
		sc->vmm_flags |= VMM_DESTROY;
	} else {
		vm_destroy(sc->vmm_vm);
		vmm_kstat_fini(sc);
		ddi_soft_state_free(vmm_statep, minor);
		id_free(vmm_minors, minor);
		*hma_release = B_TRUE;
	}
	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);

	return (0);
}

int
vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
{
	boolean_t	hma_release = B_FALSE;
	int		err;

	mutex_enter(&vmm_mtx);
	err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (err);
}

/* ARGSUSED */
static int
vmmdev_do_vm_destroy(const char *name, cred_t *cr)
{
	boolean_t	hma_release = B_FALSE;
	vmm_softc_t	*sc;
	int		err;

	if (crgetuid(cr) != 0)
		return (EPERM);

	mutex_enter(&vmm_mtx);

	if ((sc = vmm_lookup(name)) == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENOENT);
	}
	/*
	 * We don't check this in vmm_lookup() since that function is also used
	 * for validation during create and currently vmm names must be unique.
	 */
	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
		mutex_exit(&vmm_mtx);
		return (EPERM);
	}
	err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);

	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (err);
}

#define	VCPU_NAME_BUFLEN	32

static int
vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
{
	zoneid_t zid = crgetzoneid(cr);
	int instance = minor;
	kstat_t *ksp;

	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);

	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);

	if (ksp == NULL) {
		return (-1);
	}
	sc->vmm_kstat_vm = ksp;

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		char namebuf[VCPU_NAME_BUFLEN];

		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);

		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
		    0, zid);
		if (ksp == NULL) {
			goto fail;
		}

		sc->vmm_kstat_vcpu[i] = ksp;
	}

	/*
	 * If this instance is associated with a non-global zone, make its
	 * kstats visible from the GZ.
	 */
	if (zid != GLOBAL_ZONEID) {
		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
		for (uint_t i = 0; i < VM_MAXCPU; i++) {
			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
		}
	}

	return (0);

fail:
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		if (sc->vmm_kstat_vcpu[i] != NULL) {
			kstat_delete(sc->vmm_kstat_vcpu[i]);
			sc->vmm_kstat_vcpu[i] = NULL;
		} else {
			break;
		}
	}
	kstat_delete(sc->vmm_kstat_vm);
	sc->vmm_kstat_vm = NULL;
	return (-1);
}

static void
vmm_kstat_init(vmm_softc_t *sc)
{
	kstat_t *ksp;

	ASSERT3P(sc->vmm_vm, !=, NULL);
	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);

	ksp = sc->vmm_kstat_vm;
	vmm_kstats_t *vk = ksp->ks_data;
	ksp->ks_private = sc->vmm_vm;
	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
	kstat_named_setstr(&vk->vk_name, sc->vmm_name);

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);

		ksp = sc->vmm_kstat_vcpu[i];
		vmm_vcpu_kstats_t *vvk = ksp->ks_data;

		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
		vvk->vvk_vcpu.value.ui32 = i;
		kstat_named_init(&vvk->vvk_time_init, "time_init",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_run, "time_run",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
		    KSTAT_DATA_UINT64);
		ksp->ks_private = sc->vmm_vm;
		ksp->ks_update = vmm_kstat_update_vcpu;
	}

	kstat_install(sc->vmm_kstat_vm);
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		kstat_install(sc->vmm_kstat_vcpu[i]);
	}
}

static void
vmm_kstat_fini(vmm_softc_t *sc)
{
	ASSERT(sc->vmm_kstat_vm != NULL);

	kstat_delete(sc->vmm_kstat_vm);
	sc->vmm_kstat_vm = NULL;

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);

		kstat_delete(sc->vmm_kstat_vcpu[i]);
		sc->vmm_kstat_vcpu[i] = NULL;
	}
}
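
/*
 * Userspace can observe these kstats via libkstat (a sketch, assuming the
 * VMM_MODULE_NAME module name is "vmm" and that the instance number equals
 * the minor number, per vmm_kstat_alloc() above):
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "vmm", instance, "vcpu0");
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "time_run");
 *		...
 *	}
 *	(void) kstat_close(kc);
 */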

static int
vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	minor_t		minor;
	vmm_softc_t	*sc;

	minor = getminor(*devp);
	if (minor == VMM_CTL_MINOR) {
		/*
		 * Master control device must be opened exclusively.
		 */
		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
			return (EINVAL);
		}

		return (0);
	}

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	sc->vmm_is_open = B_TRUE;
	mutex_exit(&vmm_mtx);

	return (0);
}

static int
vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	minor_t		minor;
	vmm_softc_t	*sc;
	boolean_t	hma_release = B_FALSE;

	minor = getminor(dev);
	if (minor == VMM_CTL_MINOR)
		return (0);

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	VERIFY(sc->vmm_is_open);
	sc->vmm_is_open = B_FALSE;

	/*
	 * If this VM was destroyed while the vmm device was open, then
	 * clean it up now that it is closed.
	 */
	if (sc->vmm_flags & VMM_DESTROY) {
		list_remove(&vmm_destroy_list, sc);
		vm_destroy(sc->vmm_vm);
		ddi_soft_state_free(vmm_statep, minor);
		id_free(vmm_minors, minor);
		hma_release = B_TRUE;
	}
	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (0);
}

static int
vmm_is_supported(intptr_t arg)
{
	int r;
	const char *msg;

	if (vmm_is_intel()) {
		r = vmx_x86_supported(&msg);
	} else if (vmm_is_svm()) {
		/*
		 * HMA already ensured that the features necessary for SVM
		 * operation were present and online during vmm_attach().
		 */
		r = 0;
	} else {
		r = ENXIO;
		msg = "Unsupported CPU vendor";
	}

	if (r != 0 && arg != (intptr_t)NULL) {
		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
			return (EFAULT);
	}
	return (r);
}

static int
vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	vmm_softc_t	*sc;
	minor_t		minor;

	/* The structs in bhyve ioctls assume a 64-bit datamodel */
	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
		return (ENOTSUP);
	}

	minor = getminor(dev);

	if (minor == VMM_CTL_MINOR) {
		void *argp = (void *)arg;
		char name[VM_MAX_NAMELEN] = { 0 };
		size_t len = 0;

		if ((mode & FKIOCTL) != 0) {
			len = strlcpy(name, argp, sizeof (name));
		} else {
			if (copyinstr(argp, name, sizeof (name), &len) != 0) {
				return (EFAULT);
			}
		}
		if (len >= VM_MAX_NAMELEN) {
			return (ENAMETOOLONG);
		}

		switch (cmd) {
		case VMM_CREATE_VM:
			if ((mode & FWRITE) == 0)
				return (EPERM);
			return (vmmdev_do_vm_create(name, credp));
		case VMM_DESTROY_VM:
			if ((mode & FWRITE) == 0)
				return (EPERM);
			return (vmmdev_do_vm_destroy(name, credp));
		case VMM_VM_SUPPORTED:
			return (vmm_is_supported(arg));
		default:
			/* No other actions are legal on ctl device */
			return (ENOTTY);
		}
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
}
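
/*
 * Illustrative control-path sketch (userspace; "/dev/vmmctl" is an assumed
 * path for the VMM_CTL_MINOR node): the control device is opened exclusively
 * (see vmm_open()) and must be writable for create/destroy, both of which
 * take the VM name as their argument.
 *
 *	int ctlfd = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *	if (ioctl(ctlfd, VMM_CREATE_VM, "myvm") == 0) {
 *		... the VM node now appears as /dev/vmm/myvm ...
 *		(void) ioctl(ctlfd, VMM_DESTROY_VM, "myvm");
 *	}
 */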

static int
vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
{
	vmm_softc_t *sc;
	const minor_t minor = getminor(dev);
	struct vm *vm;
	int err;
	vm_object_t vmo = NULL;
	struct vmspace *vms;

	if (minor == VMM_CTL_MINOR) {
		return (ENODEV);
	}
	if (off < 0 || (off + len) <= 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (EACCES);
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	/* Grab read lock on the VM to prevent any changes to the memory map */
	vmm_read_lock(sc);

	vm = sc->vmm_vm;
	vms = vm_get_vmspace(vm);
	if (off >= VM_DEVMEM_START) {
		int segid;
		off_t map_off = 0;

		/* Mapping a devmem "device" */
		if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) {
			err = ENODEV;
			goto out;
		}
		err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
		if (err != 0) {
			goto out;
		}
		err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot,
		    flags);
	} else {
		/* Mapping a part of the guest physical space */
		err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
		    flags);
	}

out:
	vmm_read_unlock(sc);
	return (err);
}
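
/*
 * Illustrative mapping sketch (userspace, hypothetical "vmfd"): offsets below
 * VM_DEVMEM_START map guest-physical memory directly, while a devmem segment
 * is mapped at the offset reported by VM_DEVMEM_GETOFFSET.
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *base = mmap(NULL, seglen, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *	}
 */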

static sdev_plugin_validate_t
vmm_sdev_validate(sdev_ctx_t ctx)
{
	const char *name = sdev_ctx_name(ctx);
	vmm_softc_t *sc;
	sdev_plugin_validate_t ret;
	minor_t minor;

	if (sdev_ctx_vtype(ctx) != VCHR)
		return (SDEV_VTOR_INVALID);

	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);

	mutex_enter(&vmm_mtx);
	if ((sc = vmm_lookup(name)) == NULL)
		ret = SDEV_VTOR_INVALID;
	else if (sc->vmm_minor != minor)
		ret = SDEV_VTOR_STALE;
	else
		ret = SDEV_VTOR_VALID;
	mutex_exit(&vmm_mtx);

	return (ret);
}

static int
vmm_sdev_filldir(sdev_ctx_t ctx)
{
	vmm_softc_t *sc;
	int ret;

	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
		return (EINVAL);
	}

	mutex_enter(&vmm_mtx);
	ASSERT(vmmdev_dip != NULL);
	for (sc = list_head(&vmm_list); sc != NULL;
	    sc = list_next(&vmm_list, sc)) {
		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
			    S_IFCHR | 0600,
			    makedevice(ddi_driver_major(vmmdev_dip),
			    sc->vmm_minor));
		} else {
			continue;
		}
		if (ret != 0 && ret != EEXIST)
			goto out;
	}

	ret = 0;

out:
	mutex_exit(&vmm_mtx);
	return (ret);
}

/* ARGSUSED */
static void
vmm_sdev_inactive(sdev_ctx_t ctx)
{
}

static sdev_plugin_ops_t vmm_sdev_ops = {
	.spo_version = SDEV_PLUGIN_VERSION,
	.spo_flags = SDEV_PLUGIN_SUBDIR,
	.spo_validate = vmm_sdev_validate,
	.spo_filldir = vmm_sdev_filldir,
	.spo_inactive = vmm_sdev_inactive
};

/* ARGSUSED */
static int
vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)vmmdev_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	sdev_plugin_hdl_t sph;
	hma_reg_t *reg = NULL;
	boolean_t vmm_loaded = B_FALSE;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vmmdev_mtx);
	/* Ensure we are not already attached. */
	if (vmmdev_dip != NULL) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}

	vmm_sol_glue_init();
	vmm_arena_init();

	/*
	 * Perform temporary HMA registration to determine if the system
	 * is capable.
	 */
	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
		goto fail;
	} else if (vmm_mod_load() != 0) {
		goto fail;
	}
	vmm_loaded = B_TRUE;
	hma_unregister(reg);
	reg = NULL;

	/* Create control node.  Other nodes will be created on demand. */
	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
		goto fail;
	}

	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
	if (sph == (sdev_plugin_hdl_t)NULL) {
		ddi_remove_minor_node(dip, NULL);
		goto fail;
	}

	ddi_report_dev(dip);
	vmmdev_sdev_hdl = sph;
	vmmdev_dip = dip;
	mutex_exit(&vmmdev_mtx);
	return (DDI_SUCCESS);

fail:
	if (vmm_loaded) {
		VERIFY0(vmm_mod_unload());
	}
	if (reg != NULL) {
		hma_unregister(reg);
	}
	vmm_arena_fini();
	vmm_sol_glue_cleanup();
	mutex_exit(&vmmdev_mtx);
	return (DDI_FAILURE);
}

static int
vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	/*
	 * Ensure that all resources have been cleaned up.
	 *
	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
	 * vmmdev_mtx is already held.  We can't wait for vmmdev_mtx with our
	 * devinfo locked as iommu_cleanup() tries to recursively lock each
	 * devinfo, including our own, while holding vmmdev_mtx.
	 */
	if (mutex_tryenter(&vmmdev_mtx) == 0)
		return (DDI_FAILURE);

	mutex_enter(&vmm_mtx);
	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
		mutex_exit(&vmm_mtx);
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	mutex_exit(&vmm_mtx);

	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;

	/* Remove the control node. */
	ddi_remove_minor_node(dip, "ctl");
	vmmdev_dip = NULL;

	VERIFY0(vmm_mod_unload());
	VERIFY3P(vmmdev_hma_reg, ==, NULL);
	vmm_arena_fini();
	vmm_sol_glue_cleanup();

	mutex_exit(&vmmdev_mtx);

	return (DDI_SUCCESS);
}

static struct cb_ops vmm_cb_ops = {
	vmm_open,
	vmm_close,
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	vmm_ioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	vmm_segmap,
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,
	D_NEW | D_MP | D_DEVMAP
};

static struct dev_ops vmm_ops = {
	DEVO_REV,
	0,
	vmm_info,
	nulldev,	/* identify */
	nulldev,	/* probe */
	vmm_attach,
	vmm_detach,
	nodev,		/* reset */
	&vmm_cb_ops,
	(struct bus_ops *)NULL
};

static struct modldrv modldrv = {
	&mod_driverops,
	"bhyve vmm",
	&vmm_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	int	error;

	sysinit();

	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
	list_create(&vmm_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);

	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
	if (error) {
		return (error);
	}

	vmm_zsd_init();

	error = mod_install(&modlinkage);
	if (error) {
		ddi_soft_state_fini(&vmm_statep);
		vmm_zsd_fini();
	}

	return (error);
}

int
_fini(void)
{
	int	error;

	error = mod_remove(&modlinkage);
	if (error) {
		return (error);
	}

	vmm_zsd_fini();

	ddi_soft_state_fini(&vmm_statep);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}