/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/ioccom.h>
#include <sys/stat.h>
#include <sys/vmsystm.h>
#include <sys/ddi.h>
#include <sys/mkdev.h>
#include <sys/sunddi.h>
#include <sys/fs/dv_node.h>
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/smt.h>

#include <sys/kernel.h>
#include <sys/hma.h>
#include <sys/x86_archext.h>
#include <x86/apicreg.h>

#include <sys/vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>
#include <sys/vmm_drv.h>

#include <vm/vm.h>
#include <vm/seg_dev.h>

#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vrtc.h"
#include "io/vhpet.h"
#include "io/vpmtmr.h"
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_util.h"
#include "vm/vm_glue.h"

/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
 * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */
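
/*
 * As a purely illustrative sketch (no such operation exists in this file),
 * an action which required both locks would take them in the documented
 * order and release them in reverse:
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	...
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 */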

static kmutex_t         vmmdev_mtx;
static dev_info_t       *vmmdev_dip;
static hma_reg_t        *vmmdev_hma_reg;
static uint_t           vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

static kmutex_t         vmm_mtx;
static list_t           vmm_list;
static list_t           vmm_destroy_list;
static id_space_t       *vmm_minors;
static void             *vmm_statep;

static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define VMM_SDEV_ROOT "/dev/vmm"

/* From uts/i86pc/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);

/* Holds and hooks from drivers external to vmm */
struct vmm_hold {
        list_node_t     vmh_node;
        vmm_softc_t     *vmh_sc;
        boolean_t       vmh_release_req;
        uint_t          vmh_ioport_hook_cnt;
};

struct vmm_lease {
        list_node_t             vml_node;
        struct vm               *vml_vm;
        boolean_t               vml_expired;
        boolean_t               (*vml_expire_func)(void *);
        void                    *vml_expire_arg;
        list_node_t             vml_expire_node;
        struct vmm_hold         *vml_hold;
};

static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);

static int
vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
{
        int error;
        bool sysmem;

        error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
            NULL);
        if (error || mseg->len == 0)
                return (error);

        if (!sysmem) {
                vmm_devmem_entry_t *de;
                list_t *dl = &sc->vmm_devmem_list;

                for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
                        if (de->vde_segid == mseg->segid) {
                                break;
                        }
                }
                if (de != NULL) {
                        (void) strlcpy(mseg->name, de->vde_name,
                            sizeof (mseg->name));
                }
        } else {
                bzero(mseg->name, sizeof (mseg->name));
        }

        return (error);
}

/*
 * The 'devmem' hack:
 *
 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
 * in the vm which appear with their own name related to the vm under /dev.
 * Since this would be a hassle from an sdev perspective and would require a
 * new cdev interface (or complicate the existing one), we choose to implement
 * this in a different manner.  When 'devmem' mappings are created, an
 * identifying off_t is communicated back out to userspace.  That off_t,
 * residing above the normal guest memory space, can be used to mmap the
 * 'devmem' mapping from the already-open vm device.
 */
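
/*
 * For example, a userspace consumer could look up and map such a segment
 * roughly like this (a hypothetical sketch; 'vmfd' is the already-open vm
 * device and 'segid'/'len' are assumed to be known to the caller):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		    vmfd, vdo.offset);
 *	}
 */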

static int
vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
{
        off_t map_offset;
        vmm_devmem_entry_t *entry;

        if (list_is_empty(&sc->vmm_devmem_list)) {
                map_offset = VM_DEVMEM_START;
        } else {
                entry = list_tail(&sc->vmm_devmem_list);
                map_offset = entry->vde_off + entry->vde_len;
                if (map_offset < entry->vde_off) {
                        /* Do not tolerate overflow */
                        return (ERANGE);
                }
                /*
                 * XXXJOY: We could choose to search the list for duplicate
                 * names and toss an error.  Since we're using the offset
                 * method for now, it does not make much of a difference.
                 */
        }

        entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
        entry->vde_segid = mseg->segid;
        entry->vde_len = mseg->len;
        entry->vde_off = map_offset;
        (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
        list_insert_tail(&sc->vmm_devmem_list, entry);

        return (0);
}

static boolean_t
vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
    off_t *map_offp)
{
        list_t *dl = &sc->vmm_devmem_list;
        vmm_devmem_entry_t *de = NULL;
        const off_t map_end = off + len;

        VERIFY(off >= VM_DEVMEM_START);

        if (map_end < off) {
                /* No match on overflow */
                return (B_FALSE);
        }

        for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
                const off_t item_end = de->vde_off + de->vde_len;

                if (de->vde_off <= off && item_end >= map_end) {
                        *segidp = de->vde_segid;
                        *map_offp = off - de->vde_off;
                        return (B_TRUE);
                }
        }
        return (B_FALSE);
}

static void
vmmdev_devmem_purge(vmm_softc_t *sc)
{
        vmm_devmem_entry_t *entry;

        while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
                kmem_free(entry, sizeof (*entry));
        }
}

static int
vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
{
        int error;
        bool sysmem = true;

        if (VM_MEMSEG_NAME(mseg)) {
                sysmem = false;
        }
        error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);

        if (error == 0 && VM_MEMSEG_NAME(mseg)) {
                /*
                 * Rather than create a whole fresh device from which userspace
                 * can mmap this segment, instead make it available at an
                 * offset above where the main guest memory resides.
                 */
                error = vmmdev_devmem_create(sc, mseg, mseg->name);
                if (error != 0) {
                        vm_free_memseg(sc->vmm_vm, mseg->segid);
                }
        }
        return (error);
}

/*
 * Resource Locking and Exclusion
 *
 * Much of bhyve depends on key portions of VM state, such as the guest memory
 * map, to remain unchanged while the guest is running.  As ported from
 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
 * access to the instance vCPUs.  Threads acting on a single vCPU, like those
 * performing the work of actually running the guest in VMX/SVM, would lock
 * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
 * state, all of the vCPUs would be first locked, ensuring that the
 * operation(s) could complete without any other threads stumbling into
 * intermediate states.
 *
 * This approach is largely effective for bhyve.  Common operations, such as
 * running the vCPUs, steer clear of lock contention.  The model begins to
 * break down for operations which do not occur in the context of a specific
 * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
 * thread in the bhyve process.  In order to properly protect those vCPU-less
 * operations from encountering invalid states, additional locking is required.
 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
 * It does mean that class of operations will be serialized on locking the
 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
 * undue contention on the VM_MAXCPU-1 vCPU.
 *
 * In order to address the shortcomings of this model, the concept of a
 * read/write lock has been added to bhyve.  Operations which change
 * fundamental aspects of a VM (such as the memory map) must acquire the write
 * lock, which also implies locking all of the vCPUs and waiting for all read
 * lock holders to release.  While it increases the cost and waiting time for
 * those few operations, it allows most hot-path operations on the VM (which
 * depend on its configuration remaining stable) to occur with minimal locking.
 *
 * Consumers of the Driver API (see below) are a special case when it comes to
 * this locking, since they may hold a read lock via the drv_lease mechanism
 * for an extended period of time.  Rather than forcing those consumers to
 * continuously poll for a write lock attempt, the lease system forces them to
 * provide a release callback to trigger their clean-up (and potential later
 * reacquisition) of the read lock.
 */
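
/*
 * A driver-side consumer's expire callback might, for instance, look roughly
 * like the following (a hypothetical sketch; the consumer state and names
 * are illustrative only):
 *
 *	static boolean_t
 *	my_lease_expired(void *arg)
 *	{
 *		my_state_t *state = arg;
 *
 *		(dispatch clean-up which will release the lease)
 *		...
 *		return (B_FALSE);
 *	}
 *
 * Returning B_TRUE instead asks that the lease be broken synchronously
 * during vmm_write_lock(), subject to the caveats noted in that function.
 */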

static void
vcpu_lock_one(vmm_softc_t *sc, int vcpu)
{
        ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

        /*
         * Since this state transition uses from_idle=true, it should not
         * fail, but rather block until it can succeed.
         */
        VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
{
        ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

        VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
        vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
}

static void
vmm_read_lock(vmm_softc_t *sc)
{
        rw_enter(&sc->vmm_rwlock, RW_READER);
}

static void
vmm_read_unlock(vmm_softc_t *sc)
{
        rw_exit(&sc->vmm_rwlock);
}

static void
vmm_write_lock(vmm_softc_t *sc)
{
        int maxcpus;

        /* First lock all the vCPUs */
        maxcpus = vm_get_maxcpus(sc->vmm_vm);
        for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
                vcpu_lock_one(sc, vcpu);
        }

        mutex_enter(&sc->vmm_lease_lock);
        VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
        sc->vmm_lease_blocker++;
        if (sc->vmm_lease_blocker == 1) {
                list_t *list = &sc->vmm_lease_list;
                vmm_lease_t *lease = list_head(list);

                while (lease != NULL) {
                        boolean_t sync_break = B_FALSE;

                        if (!lease->vml_expired) {
                                void *arg = lease->vml_expire_arg;
                                lease->vml_expired = B_TRUE;
                                sync_break = lease->vml_expire_func(arg);
                        }

                        if (sync_break) {
                                vmm_lease_t *next;

                                /*
                                 * These leases which are synchronously broken
                                 * result in vmm_read_unlock() calls from a
                                 * different thread than the corresponding
                                 * vmm_read_lock().  This is acceptable, given
                                 * that the rwlock underpinning the whole
                                 * mechanism tolerates the behavior.  This
                                 * flexibility is _only_ afforded to VM read
                                 * lock (RW_READER) holders.
                                 */
                                next = list_next(list, lease);
                                vmm_lease_break_locked(sc, lease);
                                lease = next;
                        } else {
                                lease = list_next(list, lease);
                        }
                }
        }
        mutex_exit(&sc->vmm_lease_lock);

        rw_enter(&sc->vmm_rwlock, RW_WRITER);
        /*
         * For now, the 'maxcpus' value for an instance is fixed at the
         * compile-time constant of VM_MAXCPU at creation.  If this changes in
         * the future, allowing for dynamic vCPU resource sizing, acquisition
         * of the write lock will need to be wary of such changes.
         */
        VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
}

static void
vmm_write_unlock(vmm_softc_t *sc)
{
        int maxcpus;

        mutex_enter(&sc->vmm_lease_lock);
        VERIFY3U(sc->vmm_lease_blocker, !=, 0);
        sc->vmm_lease_blocker--;
        if (sc->vmm_lease_blocker == 0) {
                cv_broadcast(&sc->vmm_lease_cv);
        }
        mutex_exit(&sc->vmm_lease_lock);

        /*
         * The VM write lock _must_ be released from the same thread it was
         * acquired in, unlike the read lock.
         */
        VERIFY(rw_write_held(&sc->vmm_rwlock));
        rw_exit(&sc->vmm_rwlock);

        /* Unlock all the vCPUs */
        maxcpus = vm_get_maxcpus(sc->vmm_vm);
        for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
                vcpu_unlock_one(sc, vcpu);
        }
}

static int
vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
    cred_t *credp, int *rvalp)
{
        int error = 0, vcpu = -1;
        void *datap = (void *)arg;
        enum vm_lock_type {
                LOCK_NONE = 0,
                LOCK_VCPU,
                LOCK_READ_HOLD,
                LOCK_WRITE_HOLD
        } lock_type = LOCK_NONE;

        /* Acquire any exclusion resources needed for the operation. */
        switch (cmd) {
        case VM_RUN:
        case VM_GET_REGISTER:
        case VM_SET_REGISTER:
        case VM_GET_SEGMENT_DESCRIPTOR:
        case VM_SET_SEGMENT_DESCRIPTOR:
        case VM_GET_REGISTER_SET:
        case VM_SET_REGISTER_SET:
        case VM_INJECT_EXCEPTION:
        case VM_GET_CAPABILITY:
        case VM_SET_CAPABILITY:
        case VM_PPTDEV_MSI:
        case VM_PPTDEV_MSIX:
        case VM_SET_X2APIC_STATE:
        case VM_GLA2GPA:
        case VM_GLA2GPA_NOFAULT:
        case VM_ACTIVATE_CPU:
        case VM_SET_INTINFO:
        case VM_GET_INTINFO:
        case VM_RESTART_INSTRUCTION:
        case VM_SET_KERNEMU_DEV:
        case VM_GET_KERNEMU_DEV:
                /*
                 * Copy in the ID of the vCPU chosen for this operation.
                 * Since a nefarious caller could update their struct between
                 * this locking and when the rest of the ioctl data is copied
                 * in, it is _critical_ that this local 'vcpu' variable be used
                 * rather than the in-struct one when performing the ioctl.
                 */
                if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
                        return (EFAULT);
                }
                if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
                        return (EINVAL);
                }
                vcpu_lock_one(sc, vcpu);
                lock_type = LOCK_VCPU;
                break;

        case VM_REINIT:
        case VM_BIND_PPTDEV:
        case VM_UNBIND_PPTDEV:
        case VM_MAP_PPTDEV_MMIO:
        case VM_ALLOC_MEMSEG:
        case VM_MMAP_MEMSEG:
        case VM_WRLOCK_CYCLE:
        case VM_PMTMR_LOCATE:
                vmm_write_lock(sc);
                lock_type = LOCK_WRITE_HOLD;
                break;

        case VM_GET_GPA_PMAP:
        case VM_GET_MEMSEG:
        case VM_MMAP_GETNEXT:
        case VM_LAPIC_IRQ:
        case VM_INJECT_NMI:
        case VM_IOAPIC_ASSERT_IRQ:
        case VM_IOAPIC_DEASSERT_IRQ:
        case VM_IOAPIC_PULSE_IRQ:
        case VM_LAPIC_MSI:
        case VM_LAPIC_LOCAL_IRQ:
        case VM_GET_X2APIC_STATE:
        case VM_RTC_READ:
        case VM_RTC_WRITE:
        case VM_RTC_SETTIME:
        case VM_RTC_GETTIME:
#ifndef __FreeBSD__
        case VM_DEVMEM_GETOFFSET:
#endif
                vmm_read_lock(sc);
                lock_type = LOCK_READ_HOLD;
                break;

        case VM_IOAPIC_PINCOUNT:
        default:
                break;
        }

        /* Execute the primary logic for the ioctl. */
        switch (cmd) {
        case VM_RUN: {
                struct vm_entry entry;

                if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
                        error = EFAULT;
                        break;
                }

                if (!(curthread->t_schedflag & TS_VCPU))
                        smt_mark_as_vcpu();

                error = vm_run(sc->vmm_vm, vcpu, &entry);

                /*
                 * Unexpected states in vm_run() are expressed through positive
                 * errno-oriented return values.  VM states which expect
                 * further processing in userspace (with the necessary context
                 * made available via exitinfo) are expressed through negative
                 * return values.  For the time being, a return value of 0 is
                 * not expected from vm_run().
                 */
                ASSERT(error != 0);
                if (error < 0) {
                        const struct vm_exit *vme;
                        void *outp = entry.exit_data;

                        error = 0;
                        vme = vm_exitinfo(sc->vmm_vm, vcpu);
                        if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
                                error = EFAULT;
                        }
                }
                break;
        }
        case VM_SUSPEND: {
                struct vm_suspend vmsuspend;

                if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_suspend(sc->vmm_vm, vmsuspend.how);
                break;
        }
        case VM_REINIT:
                if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
                        /*
                         * The VM instance should be free of driver-attached
                         * hooks during the reinitialization process.
                         */
                        break;
                }
                error = vm_reinit(sc->vmm_vm);
                (void) vmm_drv_block_hook(sc, B_FALSE);
                break;
        case VM_STAT_DESC: {
                struct vm_stat_desc statdesc;

                if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
                        error = EFAULT;
                        break;
                }
                error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
                    sizeof (statdesc.desc));
                if (error == 0 &&
                    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_STATS_IOC: {
                struct vm_stats vmstats;

                CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
                if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
                        error = EFAULT;
                        break;
                }
                hrt2tv(gethrtime(), &vmstats.tv);
                error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
                    &vmstats.num_entries, vmstats.statbuf);
                if (error == 0 &&
                    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }

        case VM_PPTDEV_MSI: {
                struct vm_pptdev_msi pptmsi;

                if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
                        error = EFAULT;
                        break;
                }
                error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
                    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
                break;
        }
        case VM_PPTDEV_MSIX: {
                struct vm_pptdev_msix pptmsix;

                if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
                        error = EFAULT;
                        break;
                }
                error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
                    pptmsix.idx, pptmsix.addr, pptmsix.msg,
                    pptmsix.vector_control);
                break;
        }
        case VM_MAP_PPTDEV_MMIO: {
                struct vm_pptdev_mmio pptmmio;

                if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
                        error = EFAULT;
                        break;
                }
                error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
                    pptmmio.len, pptmmio.hpa);
                break;
        }
        case VM_BIND_PPTDEV: {
                struct vm_pptdev pptdev;

                if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
                break;
        }
        case VM_UNBIND_PPTDEV: {
                struct vm_pptdev pptdev;

                if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
                break;
        }
        case VM_GET_PPTDEV_LIMITS: {
                struct vm_pptdev_limits pptlimits;

                if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
                        error = EFAULT;
                        break;
                }
                error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
                    &pptlimits.msi_limit, &pptlimits.msix_limit);
                if (error == 0 &&
                    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_INJECT_EXCEPTION: {
                struct vm_exception vmexc;
                if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
                    vmexc.error_code_valid, vmexc.error_code,
                    vmexc.restart_instruction);
                break;
        }
        case VM_INJECT_NMI: {
                struct vm_nmi vmnmi;

                if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
                break;
        }
        case VM_LAPIC_IRQ: {
                struct vm_lapic_irq vmirq;

                if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
                        error = EFAULT;
                        break;
                }
                error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
                break;
        }
        case VM_LAPIC_LOCAL_IRQ: {
                struct vm_lapic_irq vmirq;

                if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
                        error = EFAULT;
                        break;
                }
                error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
                    vmirq.vector);
                break;
        }
        case VM_LAPIC_MSI: {
                struct vm_lapic_msi vmmsi;

                if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
                        error = EFAULT;
                        break;
                }
                error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
                break;
        }

        case VM_IOAPIC_ASSERT_IRQ: {
                struct vm_ioapic_irq ioapic_irq;

                if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
                        error = EFAULT;
                        break;
                }
                error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
                break;
        }
        case VM_IOAPIC_DEASSERT_IRQ: {
                struct vm_ioapic_irq ioapic_irq;

                if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
                        error = EFAULT;
                        break;
                }
                error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
                break;
        }
        case VM_IOAPIC_PULSE_IRQ: {
                struct vm_ioapic_irq ioapic_irq;

                if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
                        error = EFAULT;
                        break;
                }
                error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
                break;
        }
        case VM_IOAPIC_PINCOUNT: {
                int pincount;

                pincount = vioapic_pincount(sc->vmm_vm);
                if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }

        case VM_ISA_ASSERT_IRQ: {
                struct vm_isa_irq isa_irq;

                if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
                        error = EFAULT;
                        break;
                }
                error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
                if (error == 0 && isa_irq.ioapic_irq != -1) {
                        error = vioapic_assert_irq(sc->vmm_vm,
                            isa_irq.ioapic_irq);
                }
                break;
        }
        case VM_ISA_DEASSERT_IRQ: {
                struct vm_isa_irq isa_irq;

                if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
                        error = EFAULT;
                        break;
                }
                error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
                if (error == 0 && isa_irq.ioapic_irq != -1) {
                        error = vioapic_deassert_irq(sc->vmm_vm,
                            isa_irq.ioapic_irq);
                }
                break;
        }
        case VM_ISA_PULSE_IRQ: {
                struct vm_isa_irq isa_irq;

                if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
                        error = EFAULT;
                        break;
                }
                error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
                if (error == 0 && isa_irq.ioapic_irq != -1) {
                        error = vioapic_pulse_irq(sc->vmm_vm,
                            isa_irq.ioapic_irq);
                }
                break;
        }
        case VM_ISA_SET_IRQ_TRIGGER: {
                struct vm_isa_irq_trigger isa_irq_trigger;

                if (ddi_copyin(datap, &isa_irq_trigger,
                    sizeof (isa_irq_trigger), md)) {
                        error = EFAULT;
                        break;
                }
                error = vatpic_set_irq_trigger(sc->vmm_vm,
                    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
                break;
        }

        case VM_MMAP_GETNEXT: {
                struct vm_memmap mm;

                if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
                    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
                if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_MMAP_MEMSEG: {
                struct vm_memmap mm;

                if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
                    mm.len, mm.prot, mm.flags);
                break;
        }
        case VM_ALLOC_MEMSEG: {
                struct vm_memseg vmseg;

                if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
                        error = EFAULT;
                        break;
                }
                error = vmmdev_alloc_memseg(sc, &vmseg);
                break;
        }
        case VM_GET_MEMSEG: {
                struct vm_memseg vmseg;

                if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
                        error = EFAULT;
                        break;
                }
                error = vmmdev_get_memseg(sc, &vmseg);
                if (error == 0 &&
                    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_GET_REGISTER: {
                struct vm_register vmreg;

                if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
                    &vmreg.regval);
                if (error == 0 &&
                    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_SET_REGISTER: {
                struct vm_register vmreg;

                if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
                    vmreg.regval);
                break;
        }
        case VM_SET_SEGMENT_DESCRIPTOR: {
                struct vm_seg_desc vmsegd;

                if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
                    &vmsegd.desc);
                break;
        }
        case VM_GET_SEGMENT_DESCRIPTOR: {
                struct vm_seg_desc vmsegd;

                if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
                    &vmsegd.desc);
                if (error == 0 &&
                    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_GET_REGISTER_SET: {
                struct vm_register_set vrs;
                int regnums[VM_REG_LAST];
                uint64_t regvals[VM_REG_LAST];

                if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
                        error = EFAULT;
                        break;
                }
                if (vrs.count > VM_REG_LAST || vrs.count == 0) {
                        error = EINVAL;
                        break;
                }
                if (ddi_copyin(vrs.regnums, regnums,
                    sizeof (int) * vrs.count, md)) {
                        error = EFAULT;
                        break;
                }

                error = 0;
                for (uint_t i = 0; i < vrs.count && error == 0; i++) {
                        if (regnums[i] < 0) {
                                error = EINVAL;
                                break;
                        }
                        error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
                            &regvals[i]);
                }
                if (error == 0 && ddi_copyout(regvals, vrs.regvals,
                    sizeof (uint64_t) * vrs.count, md)) {
                        error = EFAULT;
                }
                break;
        }
        case VM_SET_REGISTER_SET: {
                struct vm_register_set vrs;
                int regnums[VM_REG_LAST];
                uint64_t regvals[VM_REG_LAST];

                if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
                        error = EFAULT;
                        break;
                }
                if (vrs.count > VM_REG_LAST || vrs.count == 0) {
                        error = EINVAL;
                        break;
                }
                if (ddi_copyin(vrs.regnums, regnums,
                    sizeof (int) * vrs.count, md)) {
                        error = EFAULT;
                        break;
                }
                if (ddi_copyin(vrs.regvals, regvals,
                    sizeof (uint64_t) * vrs.count, md)) {
                        error = EFAULT;
                        break;
                }

                error = 0;
                for (uint_t i = 0; i < vrs.count && error == 0; i++) {
                        /*
                         * Setting registers in a set is not atomic: a failure
                         * partway through causes a bail-out, leaving the
                         * register state inconsistent.  Callers should be
                         * wary of this.
                         */
                        if (regnums[i] < 0) {
                                error = EINVAL;
                                break;
                        }
                        error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
                            regvals[i]);
                }
                break;
        }

        case VM_SET_KERNEMU_DEV:
        case VM_GET_KERNEMU_DEV: {
                struct vm_readwrite_kernemu_device kemu;
                size_t size = 0;

                if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
                        error = EFAULT;
                        break;
                }

                if (kemu.access_width > 3) {
                        error = EINVAL;
                        break;
                }
                size = (1 << kemu.access_width);
                ASSERT(size >= 1 && size <= 8);

                if (cmd == VM_SET_KERNEMU_DEV) {
                        error = vm_service_mmio_write(sc->vmm_vm, vcpu,
                            kemu.gpa, kemu.value, size);
                } else {
                        error = vm_service_mmio_read(sc->vmm_vm, vcpu,
                            kemu.gpa, &kemu.value, size);
                }

                if (error == 0) {
                        if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
                                error = EFAULT;
                                break;
                        }
                }
                break;
        }

        case VM_GET_CAPABILITY: {
                struct vm_capability vmcap;

                if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
                    &vmcap.capval);
                if (error == 0 &&
                    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_SET_CAPABILITY: {
                struct vm_capability vmcap;

                if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
                    vmcap.capval);
                break;
        }
        case VM_SET_X2APIC_STATE: {
                struct vm_x2apic x2apic;

                if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
                break;
        }
        case VM_GET_X2APIC_STATE: {
                struct vm_x2apic x2apic;

                if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
                    &x2apic.state);
                if (error == 0 &&
                    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_GET_GPA_PMAP: {
                struct vm_gpa_pte gpapte;

                if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
                        error = EFAULT;
                        break;
                }
#ifdef __FreeBSD__
                /* XXXJOY: add function? */
                pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
                    gpapte.gpa, gpapte.pte, &gpapte.ptenum);
#endif
                error = 0;
                break;
        }
        case VM_GET_HPET_CAPABILITIES: {
                struct vm_hpet_cap hpetcap;

                error = vhpet_getcap(&hpetcap);
                if (error == 0 &&
                    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_GLA2GPA: {
                struct vm_gla2gpa gg;

                CTASSERT(PROT_READ == VM_PROT_READ);
                CTASSERT(PROT_WRITE == VM_PROT_WRITE);
                CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);

                if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
                        error = EFAULT;
                        break;
                }
                gg.vcpuid = vcpu;
                error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
                    gg.prot, &gg.gpa, &gg.fault);
                if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_GLA2GPA_NOFAULT: {
                struct vm_gla2gpa gg;

                CTASSERT(PROT_READ == VM_PROT_READ);
                CTASSERT(PROT_WRITE == VM_PROT_WRITE);
                CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);

                if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
                        error = EFAULT;
                        break;
                }
                gg.vcpuid = vcpu;
                error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
                    gg.gla, gg.prot, &gg.gpa, &gg.fault);
                if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }

        case VM_ACTIVATE_CPU:
                error = vm_activate_cpu(sc->vmm_vm, vcpu);
                break;

        case VM_SUSPEND_CPU:
                if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
                        error = EFAULT;
                } else {
                        error = vm_suspend_cpu(sc->vmm_vm, vcpu);
                }
                break;

        case VM_RESUME_CPU:
                if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
                        error = EFAULT;
                } else {
                        error = vm_resume_cpu(sc->vmm_vm, vcpu);
                }
                break;

        case VM_GET_CPUS: {
                struct vm_cpuset vm_cpuset;
                cpuset_t tempset;
                void *srcp = &tempset;
                int size;

                if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
                        error = EFAULT;
                        break;
                }

                /* Be more generous about sizing since our cpuset_t is large. */
                size = vm_cpuset.cpusetsize;
                if (size <= 0 || size > sizeof (cpuset_t)) {
                        error = ERANGE;
                        break;
                }
                /*
                 * If they want a ulong_t or less, make sure they receive the
                 * low bits with all the useful information.
                 */
                if (size <= sizeof (tempset.cpub[0])) {
                        srcp = &tempset.cpub[0];
                }

                if (vm_cpuset.which == VM_ACTIVE_CPUS) {
                        tempset = vm_active_cpus(sc->vmm_vm);
                } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
                        tempset = vm_suspended_cpus(sc->vmm_vm);
                } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
                        tempset = vm_debug_cpus(sc->vmm_vm);
                } else {
                        error = EINVAL;
                }

                ASSERT(size > 0 && size <= sizeof (tempset));
                if (error == 0 &&
                    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_SET_INTINFO: {
                struct vm_intinfo vmii;

                if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
                        error = EFAULT;
                        break;
                }
                error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
                break;
        }
        case VM_GET_INTINFO: {
                struct vm_intinfo vmii;

                vmii.vcpuid = vcpu;
                error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
                    &vmii.info2);
                if (error == 0 &&
                    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_RTC_WRITE: {
                struct vm_rtc_data rtcdata;

                if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
                        error = EFAULT;
                        break;
                }
                error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
                    rtcdata.value);
                break;
        }
        case VM_RTC_READ: {
                struct vm_rtc_data rtcdata;

                if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
                        error = EFAULT;
                        break;
                }
                error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
                    &rtcdata.value);
                if (error == 0 &&
                    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }
        case VM_RTC_SETTIME: {
                struct vm_rtc_time rtctime;

                if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
                        error = EFAULT;
                        break;
                }
                error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
                break;
        }
        case VM_RTC_GETTIME: {
                struct vm_rtc_time rtctime;

                rtctime.secs = vrtc_get_time(sc->vmm_vm);
                if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
                        error = EFAULT;
                        break;
                }
                break;
        }

        case VM_PMTMR_LOCATE: {
                uint16_t port = arg;
                error = vpmtmr_set_location(sc->vmm_vm, port);
                break;
        }

        case VM_RESTART_INSTRUCTION:
                error = vm_restart_instruction(sc->vmm_vm, vcpu);
                break;

        case VM_SET_TOPOLOGY: {
                struct vm_cpu_topology topo;

                if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
                        error = EFAULT;
                        break;
                }
                error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
                    topo.threads, topo.maxcpus);
                break;
        }
        case VM_GET_TOPOLOGY: {
                struct vm_cpu_topology topo;

                vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
                    &topo.threads, &topo.maxcpus);
                if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
                        error = EFAULT;
                        break;
                }
                break;
        }

#ifndef __FreeBSD__
        case VM_DEVMEM_GETOFFSET: {
                struct vm_devmem_offset vdo;
                list_t *dl = &sc->vmm_devmem_list;
                vmm_devmem_entry_t *de = NULL;

                if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
                        error = EFAULT;
                        break;
                }

                for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
                        if (de->vde_segid == vdo.segid) {
                                break;
                        }
                }
                if (de != NULL) {
                        vdo.offset = de->vde_off;
                        if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
                                error = EFAULT;
                        }
                } else {
                        error = ENOENT;
                }
                break;
        }
        case VM_WRLOCK_CYCLE: {
                /*
                 * Present a test mechanism to acquire/release the write lock
                 * on the VM without any other effects.
                 */
                break;
        }
#endif
        default:
                error = ENOTTY;
                break;
        }

        /* Release exclusion resources */
        switch (lock_type) {
        case LOCK_NONE:
                break;
        case LOCK_VCPU:
                vcpu_unlock_one(sc, vcpu);
                break;
        case LOCK_READ_HOLD:
                vmm_read_unlock(sc);
                break;
        case LOCK_WRITE_HOLD:
                vmm_write_unlock(sc);
                break;
        default:
                panic("unexpected lock type");
                break;
        }

        return (error);
}

static vmm_softc_t *
vmm_lookup(const char *name)
{
        list_t *vml = &vmm_list;
        vmm_softc_t *sc;

        ASSERT(MUTEX_HELD(&vmm_mtx));

        for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
                if (strcmp(sc->vmm_name, name) == 0) {
                        break;
                }
        }

        return (sc);
}

/*
 * Acquire an HMA registration if not already held.
 */
static boolean_t
vmm_hma_acquire(void)
{
        ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

        mutex_enter(&vmmdev_mtx);

        if (vmmdev_hma_reg == NULL) {
                VERIFY3U(vmmdev_hma_ref, ==, 0);
                vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
                if (vmmdev_hma_reg == NULL) {
                        cmn_err(CE_WARN, "%s HMA registration failed.",
                            vmmdev_hvm_name);
                        mutex_exit(&vmmdev_mtx);
                        return (B_FALSE);
                }
        }

        vmmdev_hma_ref++;

        mutex_exit(&vmmdev_mtx);

        return (B_TRUE);
}

/*
 * Release the HMA registration if held and there are no remaining VMs.
 */
static void
vmm_hma_release(void)
{
        ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

        mutex_enter(&vmmdev_mtx);

        VERIFY3U(vmmdev_hma_ref, !=, 0);

        vmmdev_hma_ref--;

        if (vmmdev_hma_ref == 0) {
                VERIFY(vmmdev_hma_reg != NULL);
                hma_unregister(vmmdev_hma_reg);
                vmmdev_hma_reg = NULL;
        }
        mutex_exit(&vmmdev_mtx);
}

static int
vmmdev_do_vm_create(char *name, cred_t *cr)
{
        vmm_softc_t     *sc = NULL;
        minor_t         minor;
        int             error = ENOMEM;

        if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
                return (EINVAL);
        }

        if (!vmm_hma_acquire())
                return (ENXIO);

        mutex_enter(&vmm_mtx);

        /* Look for duplicate names */
        if (vmm_lookup(name) != NULL) {
                mutex_exit(&vmm_mtx);
                vmm_hma_release();
                return (EEXIST);
        }

        /* Allow only one instance per non-global zone. */
        if (!INGLOBALZONE(curproc)) {
                for (sc = list_head(&vmm_list); sc != NULL;
                    sc = list_next(&vmm_list, sc)) {
                        if (sc->vmm_zone == curzone) {
                                mutex_exit(&vmm_mtx);
                                vmm_hma_release();
                                return (EINVAL);
                        }
                }
        }

        minor = id_alloc(vmm_minors);
        if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
                goto fail;
        } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
                ddi_soft_state_free(vmm_statep, minor);
                goto fail;
        } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
            DDI_PSEUDO, 0) != DDI_SUCCESS) {
                goto fail;
        }

        error = vm_create(name, &sc->vmm_vm);
        if (error == 0) {
1488                 /* Complete VM initialization and report success. */
1489                 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1490                 sc->vmm_minor = minor;
1491                 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1492                     offsetof(vmm_devmem_entry_t, vde_node));
1493 
1494                 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1495                     offsetof(vmm_hold_t, vmh_node));
1496                 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1497 
1498                 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1499                 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1500                     offsetof(vmm_lease_t, vml_node));
1501                 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1502                 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1503 
1504                 sc->vmm_zone = crgetzone(cr);
1505                 zone_hold(sc->vmm_zone);
1506                 vmm_zsd_add_vm(sc);
1507 
1508                 list_insert_tail(&vmm_list, sc);
1509                 mutex_exit(&vmm_mtx);
1510                 return (0);
1511         }
1512 
1513         ddi_remove_minor_node(vmmdev_dip, name);
1514 fail:
1515         id_free(vmm_minors, minor);
1516         if (sc != NULL) {
1517                 ddi_soft_state_free(vmm_statep, minor);
1518         }
1519         mutex_exit(&vmm_mtx);
1520         vmm_hma_release();
1521 
1522         return (error);
1523 }
1524 
1525 /*
1526  * Bhyve 'Driver' Interface
1527  *
1528  * While many devices are emulated in the bhyve userspace process, there are
1529  * others with performance constraints which require that they run mostly or
1530  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
1531  * needed so they can query and manipulate the portions of VM state required
1532  * to fulfill their purpose.
1533  *
1534  * This includes:
1535  * - Translating guest-physical addresses to host-virtual pointers
1536  * - Injecting MSIs
1537  * - Hooking IO port addresses
1538  *
1539  * The vmm_drv interface exists to provide that functionality to its consumers.
1540  * (At this time, 'viona' is the only user)
1541  */
1542 int
1543 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1544 {
1545         vnode_t *vp = fp->f_vnode;
1546         const dev_t dev = vp->v_rdev;
1547         vmm_softc_t *sc;
1548         vmm_hold_t *hold;
1549         int err = 0;
1550 
1551         if (vp->v_type != VCHR) {
1552                 return (ENXIO);
1553         }
1554         const major_t major = getmajor(dev);
1555         const minor_t minor = getminor(dev);
1556 
1557         mutex_enter(&vmmdev_mtx);
1558         if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1559                 mutex_exit(&vmmdev_mtx);
1560                 return (ENOENT);
1561         }
1562         mutex_enter(&vmm_mtx);
1563         mutex_exit(&vmmdev_mtx);
1564 
1565         if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1566                 err = ENOENT;
1567                 goto out;
1568         }
1569         /* XXXJOY: check cred permissions against instance */
1570 
1571         if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1572                 err = EBUSY;
1573                 goto out;
1574         }
1575 
1576         hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1577         hold->vmh_sc = sc;
1578         hold->vmh_release_req = B_FALSE;
1579 
1580         list_insert_tail(&sc->vmm_holds, hold);
1581         sc->vmm_flags |= VMM_HELD;
1582         *holdp = hold;
1583 
1584 out:
1585         mutex_exit(&vmm_mtx);
1586         return (err);
1587 }
1588 
1589 void
1590 vmm_drv_rele(vmm_hold_t *hold)
1591 {
1592         vmm_softc_t *sc;
1593 
1594         ASSERT(hold != NULL);
1595         ASSERT(hold->vmh_sc != NULL);
1596         VERIFY(hold->vmh_ioport_hook_cnt == 0);
1597 
1598         mutex_enter(&vmm_mtx);
1599         sc = hold->vmh_sc;
1600         list_remove(&sc->vmm_holds, hold);
1601         if (list_is_empty(&sc->vmm_holds)) {
1602                 sc->vmm_flags &= ~VMM_HELD;
1603                 cv_broadcast(&sc->vmm_cv);
1604         }
1605         mutex_exit(&vmm_mtx);
1606         kmem_free(hold, sizeof (*hold));
1607 }
1608 
1609 boolean_t
1610 vmm_drv_release_reqd(vmm_hold_t *hold)
1611 {
1612         ASSERT(hold != NULL);
1613 
1614         return (hold->vmh_release_req);
1615 }
1616 
1617 vmm_lease_t *
1618 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1619 {
1620         vmm_softc_t *sc = hold->vmh_sc;
1621         vmm_lease_t *lease;
1622 
1623         ASSERT3P(expiref, !=, NULL);
1624 
1625         if (hold->vmh_release_req) {
1626                 return (NULL);
1627         }
1628 
1629         lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1630         list_link_init(&lease->vml_node);
1631         lease->vml_expire_func = expiref;
1632         lease->vml_expire_arg = arg;
1633         lease->vml_expired = B_FALSE;
1634         lease->vml_hold = hold;
1635         /* cache the VM pointer for one less pointer chase */
1636         lease->vml_vm = sc->vmm_vm;
1637 
1638         mutex_enter(&sc->vmm_lease_lock);
1639         while (sc->vmm_lease_blocker != 0) {
1640                 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1641         }
1642         list_insert_tail(&sc->vmm_lease_list, lease);
1643         vmm_read_lock(sc);
1644         mutex_exit(&sc->vmm_lease_lock);
1645 
1646         return (lease);
1647 }
1648 
1649 static void
1650 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1651 {
1652         ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1653 
1654         list_remove(&sc->vmm_lease_list, lease);
1655         vmm_read_unlock(sc);
1656         kmem_free(lease, sizeof (*lease));
1657 }
1658 
1659 void
1660 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1661 {
1662         vmm_softc_t *sc = hold->vmh_sc;
1663 
1664         VERIFY3P(hold, ==, lease->vml_hold);
1665 
1666         mutex_enter(&sc->vmm_lease_lock);
1667         vmm_lease_break_locked(sc, lease);
1668         mutex_exit(&sc->vmm_lease_lock);
1669 }
1670 
1671 boolean_t
1672 vmm_drv_lease_expired(vmm_lease_t *lease)
1673 {
1674         return (lease->vml_expired);
1675 }
1676 
1677 void *
1678 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1679 {
1680         ASSERT(lease != NULL);
1681 
1682         return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
1683 }
1684 
1685 int
1686 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1687 {
1688         ASSERT(lease != NULL);
1689 
1690         return (lapic_intr_msi(lease->vml_vm, addr, msg));
1691 }
1692 
1693 int
1694 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
1695     void *arg, void **cookie)
1696 {
1697         vmm_softc_t *sc;
1698         int err;
1699 
1700         ASSERT(hold != NULL);
1701         ASSERT(cookie != NULL);
1702 
1703         sc = hold->vmh_sc;
1704         mutex_enter(&vmm_mtx);
1705         /* Confirm that hook installation is not blocked */
1706         if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1707                 mutex_exit(&vmm_mtx);
1708                 return (EBUSY);
1709         }
1710         /*
1711          * Optimistically record an installed hook which will prevent a block
1712          * from being asserted while the mutex is dropped.
1713          */
1714         hold->vmh_ioport_hook_cnt++;
1715         mutex_exit(&vmm_mtx);
1716 
1717         vmm_write_lock(sc);
1718         err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
1719             arg, cookie);
1720         vmm_write_unlock(sc);
1721 
1722         if (err != 0) {
1723                 mutex_enter(&vmm_mtx);
1724                 /* Walk back optimism about the hook installation */
1725                 hold->vmh_ioport_hook_cnt--;
1726                 mutex_exit(&vmm_mtx);
1727         }
1728         return (err);
1729 }
1730 
1731 void
1732 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1733 {
1734         vmm_softc_t *sc;
1735 
1736         ASSERT(hold != NULL);
1737         ASSERT(cookie != NULL);
1738         ASSERT(hold->vmh_ioport_hook_cnt != 0);
1739 
1740         sc = hold->vmh_sc;
1741         vmm_write_lock(sc);
1742         vm_ioport_unhook(sc->vmm_vm, cookie);
1743         vmm_write_unlock(sc);
1744 
1745         mutex_enter(&vmm_mtx);
1746         hold->vmh_ioport_hook_cnt--;
1747         mutex_exit(&vmm_mtx);
1748 }
1749 
1750 static int
1751 vmm_drv_purge(vmm_softc_t *sc)
1752 {
1753         ASSERT(MUTEX_HELD(&vmm_mtx));
1754 
1755         if ((sc->vmm_flags & VMM_HELD) != 0) {
1756                 vmm_hold_t *hold;
1757 
1758                 sc->vmm_flags |= VMM_CLEANUP;
1759                 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1760                     hold = list_next(&sc->vmm_holds, hold)) {
1761                         hold->vmh_release_req = B_TRUE;
1762                 }
1763                 while ((sc->vmm_flags & VMM_HELD) != 0) {
1764                         if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1765                                 return (EINTR);
1766                         }
1767                 }
1768                 sc->vmm_flags &= ~VMM_CLEANUP;
1769         }
1770 
1771         VERIFY(list_is_empty(&sc->vmm_holds));
1772         sc->vmm_flags |= VMM_PURGED;
1773         return (0);
1774 }
1775 
1776 static int
1777 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1778 {
1779         int err = 0;
1780 
1781         mutex_enter(&vmm_mtx);
1782         if (!enable_block) {
1783                 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1784 
1785                 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1786                 goto done;
1787         }
1788 
1789         /* If any holds have hooks installed, the block is a failure */
1790         if (!list_is_empty(&sc->vmm_holds)) {
1791                 vmm_hold_t *hold;
1792 
1793                 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1794                     hold = list_next(&sc->vmm_holds, hold)) {
1795                         if (hold->vmh_ioport_hook_cnt != 0) {
1796                                 err = EBUSY;
1797                                 goto done;
1798                         }
1799                 }
1800         }
1801         sc->vmm_flags |= VMM_BLOCK_HOOK;
1802 
1803 done:
1804         mutex_exit(&vmm_mtx);
1805         return (err);
1806 }
1807 
1808 static int
1809 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1810     boolean_t *hma_release)
1811 {
1812         dev_info_t      *pdip = ddi_get_parent(vmmdev_dip);
1813         minor_t         minor;
1814 
1815         ASSERT(MUTEX_HELD(&vmm_mtx));
1816 
1817         *hma_release = B_FALSE;
1818 
1819         if (clean_zsd) {
1820                 vmm_zsd_rem_vm(sc);
1821         }
1822 
1823         if (vmm_drv_purge(sc) != 0) {
1824                 return (EINTR);
1825         }
1826 
1827         /* Clean up devmem entries */
1828         vmmdev_devmem_purge(sc);
1829 
1830         list_remove(&vmm_list, sc);
1831         ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
1832         minor = sc->vmm_minor;
1833         zone_rele(sc->vmm_zone);
1834         if (sc->vmm_is_open) {
1835                 list_insert_tail(&vmm_destroy_list, sc);
1836                 sc->vmm_flags |= VMM_DESTROY;
1837         } else {
1838                 vm_destroy(sc->vmm_vm);
1839                 ddi_soft_state_free(vmm_statep, minor);
1840                 id_free(vmm_minors, minor);
1841                 *hma_release = B_TRUE;
1842         }
1843         (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
1844 
1845         return (0);
1846 }
1847 
1848 int
1849 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
1850 {
1851         boolean_t       hma_release = B_FALSE;
1852         int             err;
1853 
1854         mutex_enter(&vmm_mtx);
1855         err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
1856         mutex_exit(&vmm_mtx);
1857 
1858         if (hma_release)
1859                 vmm_hma_release();
1860 
1861         return (err);
1862 }
1863 
1864 /* ARGSUSED */
1865 static int
1866 vmmdev_do_vm_destroy(const char *name, cred_t *cr)
1867 {
1868         boolean_t       hma_release = B_FALSE;
1869         vmm_softc_t     *sc;
1870         int             err;
1871 
1872         if (crgetuid(cr) != 0)
1873                 return (EPERM);
1874 
1875         mutex_enter(&vmm_mtx);
1876 
1877         if ((sc = vmm_lookup(name)) == NULL) {
1878                 mutex_exit(&vmm_mtx);
1879                 return (ENOENT);
1880         }
1881         /*
1882          * We don't check this in vmm_lookup() since that function is also used
1883          * for validation during create, and vmm names must currently be unique.
1884          */
1885         if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
1886                 mutex_exit(&vmm_mtx);
1887                 return (EPERM);
1888         }
1889         err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
1890 
1891         mutex_exit(&vmm_mtx);
1892 
1893         if (hma_release)
1894                 vmm_hma_release();
1895 
1896         return (err);
1897 }
1898 
1899 static int
1900 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1901 {
1902         minor_t         minor;
1903         vmm_softc_t     *sc;
1904 
1905         minor = getminor(*devp);
1906         if (minor == VMM_CTL_MINOR) {
1907                 /*
1908                  * Master control device must be opened exclusively.
1909                  */
1910                 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
1911                         return (EINVAL);
1912                 }
1913 
1914                 return (0);
1915         }
1916 
1917         mutex_enter(&vmm_mtx);
1918         sc = ddi_get_soft_state(vmm_statep, minor);
1919         if (sc == NULL) {
1920                 mutex_exit(&vmm_mtx);
1921                 return (ENXIO);
1922         }
1923 
1924         sc->vmm_is_open = B_TRUE;
1925         mutex_exit(&vmm_mtx);
1926 
1927         return (0);
1928 }
1929 
1930 static int
1931 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
1932 {
1933         minor_t         minor;
1934         vmm_softc_t     *sc;
1935         boolean_t       hma_release = B_FALSE;
1936 
1937         minor = getminor(dev);
1938         if (minor == VMM_CTL_MINOR)
1939                 return (0);
1940 
1941         mutex_enter(&vmm_mtx);
1942         sc = ddi_get_soft_state(vmm_statep, minor);
1943         if (sc == NULL) {
1944                 mutex_exit(&vmm_mtx);
1945                 return (ENXIO);
1946         }
1947 
1948         VERIFY(sc->vmm_is_open);
1949         sc->vmm_is_open = B_FALSE;
1950 
1951         /*
1952          * If this VM was destroyed while the vmm device was open, then
1953          * clean it up now that it is closed.
1954          */
1955         if (sc->vmm_flags & VMM_DESTROY) {
1956                 list_remove(&vmm_destroy_list, sc);
1957                 vm_destroy(sc->vmm_vm);
1958                 ddi_soft_state_free(vmm_statep, minor);
1959                 id_free(vmm_minors, minor);
1960                 hma_release = B_TRUE;
1961         }
1962         mutex_exit(&vmm_mtx);
1963 
1964         if (hma_release)
1965                 vmm_hma_release();
1966 
1967         return (0);
1968 }
1969 
1970 static int
1971 vmm_is_supported(intptr_t arg)
1972 {
1973         int r;
1974         const char *msg;
1975 
1976         if (vmm_is_intel()) {
1977                 r = vmx_x86_supported(&msg);
1978         } else if (vmm_is_svm()) {
1979                 /*
1980                  * HMA already ensured that the features necessary for SVM
1981                  * operation were present and online during vmm_attach().
1982                  */
1983                 r = 0;
1984         } else {
1985                 r = ENXIO;
1986                 msg = "Unsupported CPU vendor";
1987         }
1988 
1989         if (r != 0 && arg != (intptr_t)NULL) {
1990                 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
1991                         return (EFAULT);
1992         }
1993         return (r);
1994 }
1995 
1996 static int
1997 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1998     int *rvalp)
1999 {
2000         vmm_softc_t     *sc;
2001         minor_t         minor;
2002 
2003         /* The structs in bhyve ioctls assume a 64-bit datamodel */
2004         if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2005                 return (ENOTSUP);
2006         }
2007 
2008         minor = getminor(dev);
2009 
2010         if (minor == VMM_CTL_MINOR) {
2011                 void *argp = (void *)arg;
2012                 char name[VM_MAX_NAMELEN] = { 0 };
2013                 size_t len = 0;
2014 
2015                 if ((mode & FKIOCTL) != 0) {
2016                         len = strlcpy(name, argp, sizeof (name));
2017                 } else {
2018                         if (copyinstr(argp, name, sizeof (name), &len) != 0) {
2019                                 return (EFAULT);
2020                         }
2021                 }
2022                 if (len >= VM_MAX_NAMELEN) {
2023                         return (ENAMETOOLONG);
2024                 }
2025 
2026                 switch (cmd) {
2027                 case VMM_CREATE_VM:
2028                         if ((mode & FWRITE) == 0)
2029                                 return (EPERM);
2030                         return (vmmdev_do_vm_create(name, credp));
2031                 case VMM_DESTROY_VM:
2032                         if ((mode & FWRITE) == 0)
2033                                 return (EPERM);
2034                         return (vmmdev_do_vm_destroy(name, credp));
2035                 case VMM_VM_SUPPORTED:
2036                         return (vmm_is_supported(arg));
2037                 default:
2038                         /* No other actions are legal on ctl device */
2039                         return (ENOTTY);
2040                 }
2041         }
2042 
2043         sc = ddi_get_soft_state(vmm_statep, minor);
2044         ASSERT(sc);
2045 
2046         if (sc->vmm_flags & VMM_DESTROY)
2047                 return (ENXIO);
2048 
2049         return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2050 }
2051 
2052 static int
2053 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2054     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2055 {
2056         vmm_softc_t *sc;
2057         const minor_t minor = getminor(dev);
2058         struct vm *vm;
2059         int err;
2060         vm_object_t vmo = NULL;
2061         struct vmspace *vms;
2062 
2063         if (minor == VMM_CTL_MINOR) {
2064                 return (ENODEV);
2065         }
2066         if (off < 0 || (off + len) <= 0) {
2067                 return (EINVAL);
2068         }
2069         if ((prot & PROT_USER) == 0) {
2070                 return (EACCES);
2071         }
2072 
2073         sc = ddi_get_soft_state(vmm_statep, minor);
2074         ASSERT(sc);
2075 
2076         if (sc->vmm_flags & VMM_DESTROY)
2077                 return (ENXIO);
2078 
2079         /* Grab read lock on the VM to prevent any changes to the memory map */
2080         vmm_read_lock(sc);
2081 
2082         vm = sc->vmm_vm;
2083         vms = vm_get_vmspace(vm);
2084         if (off >= VM_DEVMEM_START) {
2085                 int segid;
2086                 off_t map_off = 0;
2087 
2088                 /* Mapping a devmem "device" */
2089                 if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) {
2090                         err = ENODEV;
2091                         goto out;
2092                 }
2093                 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
2094                 if (err != 0) {
2095                         goto out;
2096                 }
2097                 err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot,
2098                     flags);
2099         } else {
2100                 /* Mapping a part of the guest physical space */
2101                 err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
2102                     flags);
2103         }
2104 
2106 out:
2107         vmm_read_unlock(sc);
2108         return (err);
2109 }
2110 
2111 static sdev_plugin_validate_t
2112 vmm_sdev_validate(sdev_ctx_t ctx)
2113 {
2114         const char *name = sdev_ctx_name(ctx);
2115         vmm_softc_t *sc;
2116         sdev_plugin_validate_t ret;
2117         minor_t minor;
2118 
2119         if (sdev_ctx_vtype(ctx) != VCHR)
2120                 return (SDEV_VTOR_INVALID);
2121 
2122         VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2123 
2124         mutex_enter(&vmm_mtx);
2125         if ((sc = vmm_lookup(name)) == NULL)
2126                 ret = SDEV_VTOR_INVALID;
2127         else if (sc->vmm_minor != minor)
2128                 ret = SDEV_VTOR_STALE;
2129         else
2130                 ret = SDEV_VTOR_VALID;
2131         mutex_exit(&vmm_mtx);
2132 
2133         return (ret);
2134 }
2135 
2136 static int
2137 vmm_sdev_filldir(sdev_ctx_t ctx)
2138 {
2139         vmm_softc_t *sc;
2140         int ret;
2141 
2142         if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2143                 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2144                     sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2145                 return (EINVAL);
2146         }
2147 
2148         mutex_enter(&vmm_mtx);
2149         ASSERT(vmmdev_dip != NULL);
2150         for (sc = list_head(&vmm_list); sc != NULL;
2151             sc = list_next(&vmm_list, sc)) {
2152                 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2153                         ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2154                             S_IFCHR | 0600,
2155                             makedevice(ddi_driver_major(vmmdev_dip),
2156                             sc->vmm_minor));
2157                 } else {
2158                         continue;
2159                 }
2160                 if (ret != 0 && ret != EEXIST)
2161                         goto out;
2162         }
2163 
2164         ret = 0;
2165 
2166 out:
2167         mutex_exit(&vmm_mtx);
2168         return (ret);
2169 }
2170 
2171 /* ARGSUSED */
2172 static void
2173 vmm_sdev_inactive(sdev_ctx_t ctx)
2174 {
2175 }
2176 
2177 static sdev_plugin_ops_t vmm_sdev_ops = {
2178         .spo_version = SDEV_PLUGIN_VERSION,
2179         .spo_flags = SDEV_PLUGIN_SUBDIR,
2180         .spo_validate = vmm_sdev_validate,
2181         .spo_filldir = vmm_sdev_filldir,
2182         .spo_inactive = vmm_sdev_inactive
2183 };
2184 
2185 /* ARGSUSED */
2186 static int
2187 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2188 {
2189         int error;
2190 
2191         switch (cmd) {
2192         case DDI_INFO_DEVT2DEVINFO:
2193                 *result = (void *)vmmdev_dip;
2194                 error = DDI_SUCCESS;
2195                 break;
2196         case DDI_INFO_DEVT2INSTANCE:
2197                 *result = (void *)0;
2198                 error = DDI_SUCCESS;
2199                 break;
2200         default:
2201                 error = DDI_FAILURE;
2202                 break;
2203         }
2204         return (error);
2205 }
2206 
2207 static int
2208 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2209 {
2210         sdev_plugin_hdl_t sph;
2211         hma_reg_t *reg = NULL;
2212         boolean_t vmm_loaded = B_FALSE;
2213 
2214         if (cmd != DDI_ATTACH) {
2215                 return (DDI_FAILURE);
2216         }
2217 
2218         mutex_enter(&vmmdev_mtx);
2219         /* Ensure we are not already attached. */
2220         if (vmmdev_dip != NULL) {
2221                 mutex_exit(&vmmdev_mtx);
2222                 return (DDI_FAILURE);
2223         }
2224 
2225         vmm_sol_glue_init();
2226         vmm_arena_init();
2227 
2228         /*
2229          * Perform temporary HMA registration to determine if the system is
2230          * capable of supporting hardware-assisted virtualization.
2231          */
2232         if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2233                 goto fail;
2234         } else if (vmm_mod_load() != 0) {
2235                 goto fail;
2236         }
2237         vmm_loaded = B_TRUE;
2238         hma_unregister(reg);
2239         reg = NULL;
2240 
2241         /* Create control node.  Other nodes will be created on demand. */
2242         if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2243             VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2244                 goto fail;
2245         }
2246 
2247         if ((sph = sdev_plugin_register("vmm", &vmm_sdev_ops, NULL)) ==
2248             (sdev_plugin_hdl_t)NULL) {
2249                 ddi_remove_minor_node(dip, NULL);
2250                 goto fail;
2251         }
2252 
2253         ddi_report_dev(dip);
2254         vmmdev_sdev_hdl = sph;
2255         vmmdev_dip = dip;
2256         mutex_exit(&vmmdev_mtx);
2257         return (DDI_SUCCESS);
2258 
2259 fail:
2260         if (vmm_loaded) {
2261                 VERIFY0(vmm_mod_unload());
2262         }
2263         if (reg != NULL) {
2264                 hma_unregister(reg);
2265         }
2266         vmm_arena_fini();
2267         vmm_sol_glue_cleanup();
2268         mutex_exit(&vmmdev_mtx);
2269         return (DDI_FAILURE);
2270 }
2271 
2272 static int
2273 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2274 {
2275         if (cmd != DDI_DETACH) {
2276                 return (DDI_FAILURE);
2277         }
2278 
2279         /*
2280          * Ensure that all resources have been cleaned up.
2281          *
2282          * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2283          * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2284          * devinfo locked as iommu_cleanup() tries to recursively lock each
2285          * devinfo, including our own, while holding vmmdev_mtx.
2286          */
2287         if (mutex_tryenter(&vmmdev_mtx) == 0)
2288                 return (DDI_FAILURE);
2289 
2290         mutex_enter(&vmm_mtx);
2291         if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2292                 mutex_exit(&vmm_mtx);
2293                 mutex_exit(&vmmdev_mtx);
2294                 return (DDI_FAILURE);
2295         }
2296         mutex_exit(&vmm_mtx);
2297 
2298         VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2299         if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2300                 mutex_exit(&vmmdev_mtx);
2301                 return (DDI_FAILURE);
2302         }
2303         vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2304 
2305         /* Remove the control node. */
2306         ddi_remove_minor_node(dip, "ctl");
2307         vmmdev_dip = NULL;
2308 
2309         VERIFY0(vmm_mod_unload());
2310         VERIFY3P(vmmdev_hma_reg, ==, NULL);
2311         vmm_arena_fini();
2312         vmm_sol_glue_cleanup();
2313 
2314         mutex_exit(&vmmdev_mtx);
2315 
2316         return (DDI_SUCCESS);
2317 }
2318 
2319 static struct cb_ops vmm_cb_ops = {
2320         vmm_open,
2321         vmm_close,
2322         nodev,          /* strategy */
2323         nodev,          /* print */
2324         nodev,          /* dump */
2325         nodev,          /* read */
2326         nodev,          /* write */
2327         vmm_ioctl,
2328         nodev,          /* devmap */
2329         nodev,          /* mmap */
2330         vmm_segmap,
2331         nochpoll,       /* poll */
2332         ddi_prop_op,
2333         NULL,
2334         D_NEW | D_MP | D_DEVMAP
2335 };
2336 
2337 static struct dev_ops vmm_ops = {
2338         DEVO_REV,
2339         0,
2340         vmm_info,
2341         nulldev,        /* identify */
2342         nulldev,        /* probe */
2343         vmm_attach,
2344         vmm_detach,
2345         nodev,          /* reset */
2346         &vmm_cb_ops,
2347         (struct bus_ops *)NULL
2348 };
2349 
2350 static struct modldrv modldrv = {
2351         &mod_driverops,
2352         "bhyve vmm",
2353         &vmm_ops
2354 };
2355 
2356 static struct modlinkage modlinkage = {
2357         MODREV_1,
2358         &modldrv,
2359         NULL
2360 };
2361 
2362 int
2363 _init(void)
2364 {
2365         int     error;
2366 
2367         sysinit();
2368 
2369         mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2370         mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2371         list_create(&vmm_list, sizeof (vmm_softc_t),
2372             offsetof(vmm_softc_t, vmm_node));
2373         list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2374             offsetof(vmm_softc_t, vmm_node));
2375         vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2376 
2377         error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2378         if (error) {
2379                 return (error);
2380         }
2381 
2382         vmm_zsd_init();
2383 
2384         error = mod_install(&modlinkage);
2385         if (error) {
2386                 ddi_soft_state_fini(&vmm_statep);
2387                 vmm_zsd_fini();
2388         }
2389 
2390         return (error);
2391 }
2392 
2393 int
2394 _fini(void)
2395 {
2396         int     error;
2397 
2398         error = mod_remove(&modlinkage);
2399         if (error) {
2400                 return (error);
2401         }
2402 
2403         vmm_zsd_fini();
2404 
2405         ddi_soft_state_fini(&vmm_statep);
2406 
2407         return (0);
2408 }
2409 
2410 int
2411 _info(struct modinfo *modinfop)
2412 {
2413         return (mod_info(&modlinkage, modinfop));
2414 }