/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/ioccom.h>
#include <sys/stat.h>
#include <sys/vmsystm.h>
#include <sys/ddi.h>
#include <sys/mkdev.h>
#include <sys/sunddi.h>
#include <sys/fs/dv_node.h>
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/smt.h>

#include <sys/kernel.h>
#include <sys/hma.h>
#include <sys/x86_archext.h>
#include <x86/apicreg.h>

#include <sys/vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>
#include <sys/vmm_drv.h>

#include <vm/vm.h>
#include <vm/seg_dev.h>

#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vrtc.h"
#include "io/vhpet.h"
#include "io/vpmtmr.h"
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_util.h"
#include "vm/vm_glue.h"

/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
 * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */

static kmutex_t		vmmdev_mtx;
static dev_info_t	*vmmdev_dip;
static hma_reg_t	*vmmdev_hma_reg;
static uint_t		vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

static kmutex_t		vmm_mtx;
static list_t		vmm_list;
static list_t		vmm_destroy_list;
static id_space_t	*vmm_minors;
static void		*vmm_statep;

static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define	VMM_SDEV_ROOT "/dev/vmm"

/* From uts/i86pc/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);

/* Holds and hooks from drivers external to vmm */
struct vmm_hold {
	list_node_t	vmh_node;
	vmm_softc_t	*vmh_sc;
	boolean_t	vmh_release_req;
	uint_t		vmh_ioport_hook_cnt;
};

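/*
 * A lease grants a vmm_drv consumer extended read-lock access to an instance.
 * When the write lock is sought, outstanding leases are expired via their
 * vml_expire_func callbacks so that holders release (and potentially later
 * reacquire) their read locks.
 */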
struct vmm_lease {
	list_node_t		vml_node;
	struct vm		*vml_vm;
	boolean_t		vml_expired;
	boolean_t		(*vml_expire_func)(void *);
	void			*vml_expire_arg;
	list_node_t		vml_expire_node;
	struct vmm_hold		*vml_hold;
};

static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);

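/*
 * Fetch the length and, for devmem segments, the name of an existing memory
 * segment on behalf of the VM_GET_MEMSEG ioctl.
 */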
static int
vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
{
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
	    NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		vmm_devmem_entry_t *de;
		list_t *dl = &sc->vmm_devmem_list;

		for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
			if (de->vde_segid == mseg->segid) {
				break;
			}
		}
		if (de != NULL) {
			(void) strlcpy(mseg->name, de->vde_name,
			    sizeof (mseg->name));
		}
	} else {
		bzero(mseg->name, sizeof (mseg->name));
	}

	return (error);
}

/*
 * The 'devmem' hack:
 *
 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
 * in the vm which appear with their own name related to the vm under /dev.
 * Since this would be a hassle from an sdev perspective and would require a
 * new cdev interface (or complicate the existing one), we choose to implement
 * this in a different manner.  When 'devmem' mappings are created, an
 * identifying off_t is communicated back out to userspace.  That off_t,
 * residing above the normal guest memory space, can be used to mmap the
 * 'devmem' mapping from the already-open vm device.
 */

static int
vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
{
	off_t map_offset;
	vmm_devmem_entry_t *entry;

	if (list_is_empty(&sc->vmm_devmem_list)) {
		map_offset = VM_DEVMEM_START;
	} else {
		entry = list_tail(&sc->vmm_devmem_list);
		map_offset = entry->vde_off + entry->vde_len;
		if (map_offset < entry->vde_off) {
			/* Do not tolerate overflow */
			return (ERANGE);
		}
		/*
		 * XXXJOY: We could choose to search the list for duplicate
		 * names and toss an error.  Since we're using the offset
		 * method for now, it does not make much of a difference.
		 */
	}

	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
	entry->vde_segid = mseg->segid;
	entry->vde_len = mseg->len;
	entry->vde_off = map_offset;
	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
	list_insert_tail(&sc->vmm_devmem_list, entry);

	return (0);
}

static boolean_t
vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
    off_t *map_offp)
{
	list_t *dl = &sc->vmm_devmem_list;
	vmm_devmem_entry_t *de = NULL;
	const off_t map_end = off + len;

	VERIFY(off >= VM_DEVMEM_START);

	if (map_end < off) {
		/* No match on overflow */
		return (B_FALSE);
	}

	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
		const off_t item_end = de->vde_off + de->vde_len;

		if (de->vde_off <= off && item_end >= map_end) {
			*segidp = de->vde_segid;
			*map_offp = off - de->vde_off;
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}

static void
vmmdev_devmem_purge(vmm_softc_t *sc)
{
	vmm_devmem_entry_t *entry;

	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
		kmem_free(entry, sizeof (*entry));
	}
}

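/*
 * Allocate a memory segment for the instance.  Named segments are treated as
 * 'devmem' and are assigned an mmap(2) offset above guest memory (see the
 * 'devmem' hack above); anonymous segments are ordinary system memory.
 */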
static int
vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
{
	int error;
	bool sysmem = true;

	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
	}
	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);

	if (error == 0 && VM_MEMSEG_NAME(mseg)) {
		/*
		 * Rather than create a whole fresh device from which userspace
		 * can mmap this segment, instead make it available at an
		 * offset above where the main guest memory resides.
		 */
		error = vmmdev_devmem_create(sc, mseg, mseg->name);
		if (error != 0) {
			vm_free_memseg(sc->vmm_vm, mseg->segid);
		}
	}
	return (error);
}

/*
 * Resource Locking and Exclusion
 *
 * Much of bhyve depends on key portions of VM state, such as the guest memory
 * map, to remain unchanged while the guest is running.  As ported from
 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
 * access to the instance vCPUs.  Threads acting on a single vCPU, like those
 * performing the work of actually running the guest in VMX/SVM, would lock
 * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
 * state, all of the vCPUs would be first locked, ensuring that the
 * operation(s) could complete without any other threads stumbling into
 * intermediate states.
 *
 * This approach is largely effective for bhyve.  Common operations, such as
 * running the vCPUs, steer clear of lock contention.  The model begins to
 * break down for operations which do not occur in the context of a specific
 * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
 * thread in the bhyve process.  In order to properly protect those vCPU-less
 * operations from encountering invalid states, additional locking is required.
 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
 * It does mean that class of operations will be serialized on locking the
 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
 * undue contention on the VM_MAXCPU-1 vCPU.
 *
 * In order to address the shortcomings of this model, the concept of a
 * read/write lock has been added to bhyve.  Operations which change
 * fundamental aspects of a VM (such as the memory map) must acquire the write
 * lock, which also implies locking all of the vCPUs and waiting for all read
 * lock holders to release.  While it increases the cost and waiting time for
 * those few operations, it allows most hot-path operations on the VM (which
 * depend on its configuration remaining stable) to occur with minimal locking.
 *
 * Consumers of the Driver API (see below) are a special case when it comes to
 * this locking, since they may hold a read lock via the drv_lease mechanism
 * for an extended period of time.  Rather than forcing those consumers to
 * continuously poll for a write lock attempt, the lease system forces them to
 * provide a release callback to trigger their clean-up (and potential later
 * reacquisition) of the read lock.
 */

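/*
 * Freeze a single vCPU, blocking until it can be transitioned out of its
 * current state.  Used both for per-vCPU ioctls and when write-locking the
 * whole instance.
 */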
static void
vcpu_lock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	/*
	 * Since this state transition is performed with from_idle=true, it
	 * should not fail, but rather block until it can succeed.
	 */
	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
	vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
}

static void
vmm_read_lock(vmm_softc_t *sc)
{
	rw_enter(&sc->vmm_rwlock, RW_READER);
}

static void
vmm_read_unlock(vmm_softc_t *sc)
{
	rw_exit(&sc->vmm_rwlock);
}

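/*
 * Acquire the instance-wide write lock: freeze every vCPU, expire any
 * outstanding leases, and then take vmm_rwlock as RW_WRITER.  See the
 * "Resource Locking and Exclusion" notes above.
 */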
static void
vmm_write_lock(vmm_softc_t *sc)
{
	int maxcpus;

	/* First lock all the vCPUs */
	maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_lock_one(sc, vcpu);
	}

	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
	sc->vmm_lease_blocker++;
	if (sc->vmm_lease_blocker == 1) {
		list_t *list = &sc->vmm_lease_list;
		vmm_lease_t *lease = list_head(list);

		while (lease != NULL) {
			boolean_t sync_break = B_FALSE;

			if (!lease->vml_expired) {
				void *arg = lease->vml_expire_arg;
				lease->vml_expired = B_TRUE;
				sync_break = lease->vml_expire_func(arg);
			}

			if (sync_break) {
				vmm_lease_t *next;

				/*
				 * These leases which are synchronously broken
				 * result in vmm_read_unlock() calls from a
				 * different thread than the corresponding
				 * vmm_read_lock().  This is acceptable, given
				 * that the rwlock underpinning the whole
				 * mechanism tolerates the behavior.  This
				 * flexibility is _only_ afforded to VM read
				 * lock (RW_READER) holders.
				 */
				next = list_next(list, lease);
				vmm_lease_break_locked(sc, lease);
				lease = next;
			} else {
				lease = list_next(list, lease);
			}
		}
	}
	mutex_exit(&sc->vmm_lease_lock);

	rw_enter(&sc->vmm_rwlock, RW_WRITER);
	/*
	 * For now, the 'maxcpus' value for an instance is fixed at the
	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
	 * the future, allowing for dynamic vCPU resource sizing, acquisition
	 * of the write lock will need to be wary of such changes.
	 */
	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
}

static void
vmm_write_unlock(vmm_softc_t *sc)
{
	int maxcpus;

	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
	sc->vmm_lease_blocker--;
	if (sc->vmm_lease_blocker == 0) {
		cv_broadcast(&sc->vmm_lease_cv);
	}
	mutex_exit(&sc->vmm_lease_lock);

	/*
	 * The VM write lock _must_ be released from the same thread it was
	 * acquired in, unlike the read lock.
	 */
	VERIFY(rw_write_held(&sc->vmm_rwlock));
	rw_exit(&sc->vmm_rwlock);

	/* Unlock all the vCPUs */
	maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_unlock_one(sc, vcpu);
	}
}

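/*
 * Primary ioctl handler for a VM instance.  Any required exclusion resource
 * (a single vCPU lock, or the instance-wide read or write lock) is acquired
 * up front based on the command, held across the operation, and released on
 * the way out.
 */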
static int
vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
    cred_t *credp, int *rvalp)
{
	int error = 0, vcpu = -1;
	void *datap = (void *)arg;
	enum vm_lock_type {
		LOCK_NONE = 0,
		LOCK_VCPU,
		LOCK_READ_HOLD,
		LOCK_WRITE_HOLD
	} lock_type = LOCK_NONE;

	/* Acquire any exclusion resources needed for the operation. */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV:
	case VM_RESET_CPU:
	case VM_GET_RUN_STATE:
	case VM_SET_RUN_STATE:
		/*
		 * Copy in the ID of the vCPU chosen for this operation.
		 * Since a nefarious caller could update their struct between
		 * this locking and when the rest of the ioctl data is copied
		 * in, it is _critical_ that this local 'vcpu' variable be used
		 * rather than the in-struct one when performing the ioctl.
		 */
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			return (EFAULT);
		}
		/* Valid vCPU IDs run from 0 to (maxcpus - 1), inclusive. */
		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
			return (EINVAL);
		}
		vcpu_lock_one(sc, vcpu);
		lock_type = LOCK_VCPU;
		break;

	case VM_REINIT:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MAP_PPTDEV_MMIO:
	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_WRLOCK_CYCLE:
	case VM_PMTMR_LOCATE:
		vmm_write_lock(sc);
		lock_type = LOCK_WRITE_HOLD;
		break;

	case VM_GET_GPA_PMAP:
	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
	case VM_LAPIC_IRQ:
	case VM_INJECT_NMI:
	case VM_IOAPIC_ASSERT_IRQ:
	case VM_IOAPIC_DEASSERT_IRQ:
	case VM_IOAPIC_PULSE_IRQ:
	case VM_LAPIC_MSI:
	case VM_LAPIC_LOCAL_IRQ:
	case VM_GET_X2APIC_STATE:
	case VM_RTC_READ:
	case VM_RTC_WRITE:
	case VM_RTC_SETTIME:
	case VM_RTC_GETTIME:
#ifndef __FreeBSD__
	case VM_DEVMEM_GETOFFSET:
#endif
		vmm_read_lock(sc);
		lock_type = LOCK_READ_HOLD;
		break;

	case VM_IOAPIC_PINCOUNT:
	default:
		break;
	}

	/* Execute the primary logic for the ioctl. */
	switch (cmd) {
	case VM_RUN: {
		struct vm_entry entry;

		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
			error = EFAULT;
			break;
		}

		if (!(curthread->t_schedflag & TS_VCPU))
			smt_mark_as_vcpu();

		error = vm_run(sc->vmm_vm, vcpu, &entry);

		/*
		 * Unexpected states in vm_run() are expressed through positive
		 * errno-oriented return values.  VM states which expect further
		 * processing in userspace (necessary context via exitinfo) are
		 * expressed through negative return values.  For the time being
		 * a return value of 0 is not expected from vm_run().
		 */
		ASSERT(error != 0);
		if (error < 0) {
			const struct vm_exit *vme;
			void *outp = entry.exit_data;

			error = 0;
			vme = vm_exitinfo(sc->vmm_vm, vcpu);
			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
				error = EFAULT;
			}
		}
		break;
	}
	case VM_SUSPEND: {
		struct vm_suspend vmsuspend;

		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
			error = EFAULT;
			break;
		}
		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
		break;
	}
	case VM_REINIT:
		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
			/*
			 * The VM instance should be free of driver-attached
			 * hooks during the reinitialization process.
			 */
			break;
		}
		error = vm_reinit(sc->vmm_vm);
		(void) vmm_drv_block_hook(sc, B_FALSE);
		break;
	case VM_STAT_DESC: {
		struct vm_stat_desc statdesc;

		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
			error = EFAULT;
			break;
		}
		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
		    sizeof (statdesc.desc));
		if (error == 0 &&
		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_STATS_IOC: {
		struct vm_stats vmstats;

		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
			error = EFAULT;
			break;
		}
		hrt2tv(gethrtime(), &vmstats.tv);
		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
		    &vmstats.num_entries, vmstats.statbuf);
		if (error == 0 &&
		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_PPTDEV_MSI: {
		struct vm_pptdev_msi pptmsi;

		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
		break;
	}
	case VM_PPTDEV_MSIX: {
		struct vm_pptdev_msix pptmsix;

		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
		    pptmsix.vector_control);
		break;
	}
	case VM_MAP_PPTDEV_MMIO: {
		struct vm_pptdev_mmio pptmmio;

		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
		    pptmmio.len, pptmmio.hpa);
		break;
	}
	case VM_BIND_PPTDEV: {
		struct vm_pptdev pptdev;

		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
			error = EFAULT;
			break;
		}
		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
		break;
	}
	case VM_UNBIND_PPTDEV: {
		struct vm_pptdev pptdev;

		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
			error = EFAULT;
			break;
		}
		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
		break;
	}
	case VM_GET_PPTDEV_LIMITS: {
		struct vm_pptdev_limits pptlimits;

		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
		    &pptlimits.msi_limit, &pptlimits.msix_limit);
		if (error == 0 &&
		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_INJECT_EXCEPTION: {
		struct vm_exception vmexc;

		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
			error = EFAULT;
			break;
		}
		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
		    vmexc.error_code_valid, vmexc.error_code,
		    vmexc.restart_instruction);
		break;
	}
	case VM_INJECT_NMI: {
		struct vm_nmi vmnmi;

		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
			error = EFAULT;
			break;
		}
		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
		break;
	}
	case VM_LAPIC_IRQ: {
		struct vm_lapic_irq vmirq;

		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
		break;
	}
	case VM_LAPIC_LOCAL_IRQ: {
		struct vm_lapic_irq vmirq;

		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
		    vmirq.vector);
		break;
	}
	case VM_LAPIC_MSI: {
		struct vm_lapic_msi vmmsi;

		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
		break;
	}

	case VM_IOAPIC_ASSERT_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_DEASSERT_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_PULSE_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_PINCOUNT: {
		int pincount;

		pincount = vioapic_pincount(sc->vmm_vm);
		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_ISA_ASSERT_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_assert_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_DEASSERT_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_deassert_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_PULSE_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_pulse_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_SET_IRQ_TRIGGER: {
		struct vm_isa_irq_trigger isa_irq_trigger;

		if (ddi_copyin(datap, &isa_irq_trigger,
		    sizeof (isa_irq_trigger), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_set_irq_trigger(sc->vmm_vm,
		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
		break;
	}

	case VM_MMAP_GETNEXT: {
		struct vm_memmap mm;

		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_MMAP_MEMSEG: {
		struct vm_memmap mm;

		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
		    mm.len, mm.prot, mm.flags);
		break;
	}
	case VM_ALLOC_MEMSEG: {
		struct vm_memseg vmseg;

		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		error = vmmdev_alloc_memseg(sc, &vmseg);
		break;
	}
	case VM_GET_MEMSEG: {
		struct vm_memseg vmseg;

		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		error = vmmdev_get_memseg(sc, &vmseg);
		if (error == 0 &&
		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    &vmreg.regval);
		if (error == 0 &&
		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    vmreg.regval);
		break;
	}
	case VM_SET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		break;
	}
	case VM_GET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		if (error == 0 &&
		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
			    &regvals[i]);
		}
		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
		}
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}
		if (ddi_copyin(vrs.regvals, regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			/*
			 * Setting registers in a set is not atomic, since a
			 * failure in the middle of the set will cause a
			 * bail-out and inconsistent register state.  Callers
			 * should be wary of this.
			 */
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
			    regvals[i]);
		}
		break;
	}
	case VM_RESET_CPU: {
		struct vm_vcpu_reset vvr;

		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
			error = EFAULT;
			break;
		}
		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
			error = EINVAL;
			break;
		}

		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
		break;
	}
	case VM_GET_RUN_STATE: {
		struct vm_run_state vrs;

		bzero(&vrs, sizeof (vrs));
		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
		    &vrs.sipi_vector);
		if (error == 0) {
			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
				error = EFAULT;
				break;
			}
		}
		break;
	}
	case VM_SET_RUN_STATE: {
		struct vm_run_state vrs;

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
		    vrs.sipi_vector);
		break;
	}

	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV: {
		struct vm_readwrite_kernemu_device kemu;
		size_t size = 0;

		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
			error = EFAULT;
			break;
		}

		if (kemu.access_width > 3) {
			error = EINVAL;
			break;
		}
		size = (1 << kemu.access_width);
		ASSERT(size >= 1 && size <= 8);

		if (cmd == VM_SET_KERNEMU_DEV) {
			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
			    kemu.gpa, kemu.value, size);
		} else {
			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
			    kemu.gpa, &kemu.value, size);
		}

		if (error == 0) {
			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
				error = EFAULT;
				break;
			}
		}
		break;
	}

	case VM_GET_CAPABILITY: {
		struct vm_capability vmcap;

		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
		    &vmcap.capval);
		if (error == 0 &&
		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_CAPABILITY: {
		struct vm_capability vmcap;

		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
		    vmcap.capval);
		break;
	}
	case VM_SET_X2APIC_STATE: {
		struct vm_x2apic x2apic;

		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
		break;
	}
	case VM_GET_X2APIC_STATE: {
		struct vm_x2apic x2apic;

		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
		    &x2apic.state);
		if (error == 0 &&
		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_GPA_PMAP: {
		struct vm_gpa_pte gpapte;

		if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
			error = EFAULT;
			break;
		}
#ifdef __FreeBSD__
		/* XXXJOY: add function? */
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
		    gpapte.gpa, gpapte.pte, &gpapte.ptenum);
#endif
		error = 0;
		break;
	}
	case VM_GET_HPET_CAPABILITIES: {
		struct vm_hpet_cap hpetcap;

		error = vhpet_getcap(&hpetcap);
		if (error == 0 &&
		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GLA2GPA: {
		struct vm_gla2gpa gg;

		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);

		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		gg.vcpuid = vcpu;
		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
		    gg.prot, &gg.gpa, &gg.fault);
		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GLA2GPA_NOFAULT: {
		struct vm_gla2gpa gg;

		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);

		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		gg.vcpuid = vcpu;
		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(sc->vmm_vm, vcpu);
		break;

	case VM_SUSPEND_CPU:
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			error = EFAULT;
		} else {
			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
		}
		break;

	case VM_RESUME_CPU:
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			error = EFAULT;
		} else {
			error = vm_resume_cpu(sc->vmm_vm, vcpu);
		}
		break;

	case VM_GET_CPUS: {
		struct vm_cpuset vm_cpuset;
		cpuset_t tempset;
		void *srcp = &tempset;
		int size;

		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
			error = EFAULT;
			break;
		}

		/* Be more generous about sizing since our cpuset_t is large. */
		size = vm_cpuset.cpusetsize;
		if (size <= 0 || size > sizeof (cpuset_t)) {
			error = ERANGE;
			break;
		}
		/*
		 * If they want a ulong_t or less, make sure they receive the
		 * low bits with all the useful information.
		 */
		if (size <= sizeof (tempset.cpub[0])) {
			srcp = &tempset.cpub[0];
		}

		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
			tempset = vm_active_cpus(sc->vmm_vm);
		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
			tempset = vm_suspended_cpus(sc->vmm_vm);
		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
			tempset = vm_debug_cpus(sc->vmm_vm);
		} else {
			error = EINVAL;
		}

		ASSERT(size > 0 && size <= sizeof (tempset));
		if (error == 0 &&
		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_INTINFO: {
		struct vm_intinfo vmii;

		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
			error = EFAULT;
			break;
		}
		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
		break;
	}
	case VM_GET_INTINFO: {
		struct vm_intinfo vmii;

		vmii.vcpuid = vcpu;
		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
		    &vmii.info2);
		if (error == 0 &&
		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_RTC_WRITE: {
		struct vm_rtc_data rtcdata;

		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
		    rtcdata.value);
		break;
	}
	case VM_RTC_READ: {
		struct vm_rtc_data rtcdata;

		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
		    &rtcdata.value);
		if (error == 0 &&
		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_RTC_SETTIME: {
		struct vm_rtc_time rtctime;

		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
			error = EFAULT;
			break;
		}
		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
		break;
	}
	case VM_RTC_GETTIME: {
		struct vm_rtc_time rtctime;

		rtctime.secs = vrtc_get_time(sc->vmm_vm);
		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_PMTMR_LOCATE: {
		uint16_t port = arg;

		error = vpmtmr_set_location(sc->vmm_vm, port);
		break;
	}

	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(sc->vmm_vm, vcpu);
		break;

	case VM_SET_TOPOLOGY: {
		struct vm_cpu_topology topo;

		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
			error = EFAULT;
			break;
		}
		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
		    topo.threads, topo.maxcpus);
		break;
	}
	case VM_GET_TOPOLOGY: {
		struct vm_cpu_topology topo;

		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
		    &topo.threads, &topo.maxcpus);
		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
			error = EFAULT;
			break;
		}
		break;
	}

#ifndef __FreeBSD__
	case VM_DEVMEM_GETOFFSET: {
		struct vm_devmem_offset vdo;
		list_t *dl = &sc->vmm_devmem_list;
		vmm_devmem_entry_t *de = NULL;

		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
			error = EFAULT;
			break;
		}

		for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
			if (de->vde_segid == vdo.segid) {
				break;
			}
		}
		if (de != NULL) {
			vdo.offset = de->vde_off;
			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
				error = EFAULT;
			}
		} else {
			error = ENOENT;
		}
		break;
	}
	case VM_WRLOCK_CYCLE: {
		/*
		 * Present a test mechanism to acquire/release the write lock
		 * on the VM without any other effects.
		 */
		break;
	}
#endif
	default:
		error = ENOTTY;
		break;
	}

	/* Release exclusion resources */
	switch (lock_type) {
	case LOCK_NONE:
		break;
	case LOCK_VCPU:
		vcpu_unlock_one(sc, vcpu);
		break;
	case LOCK_READ_HOLD:
		vmm_read_unlock(sc);
		break;
	case LOCK_WRITE_HOLD:
		vmm_write_unlock(sc);
		break;
	default:
		panic("unexpected lock type");
		break;
	}

	return (error);
}

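/*
 * Look up an instance by name.  Caller must hold vmm_mtx.
 */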
1415 static vmm_softc_t *
1416 vmm_lookup(const char *name)
1417 {
1418         list_t *vml = &vmm_list;
1419         vmm_softc_t *sc;
1420 
1421         ASSERT(MUTEX_HELD(&vmm_mtx));
1422 
1423         for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1424                 if (strcmp(sc->vmm_name, name) == 0) {
1425                         break;
1426                 }
1427         }
1428 
1429         return (sc);
1430 }
1431 
1432 /*
1433  * Acquire an HMA registration if not already held.
1434  */
1435 static boolean_t
1436 vmm_hma_acquire(void)
1437 {
1438         ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1439 
1440         mutex_enter(&vmmdev_mtx);
1441 
1442         if (vmmdev_hma_reg == NULL) {
1443                 VERIFY3U(vmmdev_hma_ref, ==, 0);
1444                 vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1445                 if (vmmdev_hma_reg == NULL) {
1446                         cmn_err(CE_WARN, "%s HMA registration failed.",
1447                             vmmdev_hvm_name);
1448                         mutex_exit(&vmmdev_mtx);
1449                         return (B_FALSE);
1450                 }
1451         }
1452 
1453         vmmdev_hma_ref++;
1454 
1455         mutex_exit(&vmmdev_mtx);
1456 
1457         return (B_TRUE);
1458 }
1459 
1460 /*
1461  * Release the HMA registration if held and there are no remaining VMs.
1462  */
1463 static void
1464 vmm_hma_release(void)
1465 {
1466         ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1467 
1468         mutex_enter(&vmmdev_mtx);
1469 
1470         VERIFY3U(vmmdev_hma_ref, !=, 0);
1471 
1472         vmmdev_hma_ref--;
1473 
1474         if (vmmdev_hma_ref == 0) {
1475                 VERIFY(vmmdev_hma_reg != NULL);
1476                 hma_unregister(vmmdev_hma_reg);
1477                 vmmdev_hma_reg = NULL;
1478         }
1479         mutex_exit(&vmmdev_mtx);
1480 }
1481 
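/*
 * Create a VM instance named 'name': allocate a minor number and softc,
 * create the minor node, and initialize the VM itself.  Only one instance is
 * permitted per non-global zone.
 */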
1482 static int
1483 vmmdev_do_vm_create(char *name, cred_t *cr)
1484 {
1485         vmm_softc_t     *sc = NULL;
1486         minor_t         minor;
1487         int             error = ENOMEM;
1488 
1489         if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
1490                 return (EINVAL);
1491         }
1492 
1493         if (!vmm_hma_acquire())
1494                 return (ENXIO);
1495 
1496         mutex_enter(&vmm_mtx);
1497 
1498         /* Look for duplicate names */
1499         if (vmm_lookup(name) != NULL) {
1500                 mutex_exit(&vmm_mtx);
1501                 vmm_hma_release();
1502                 return (EEXIST);
1503         }
1504 
1505         /* Allow only one instance per non-global zone. */
1506         if (!INGLOBALZONE(curproc)) {
1507                 for (sc = list_head(&vmm_list); sc != NULL;
1508                     sc = list_next(&vmm_list, sc)) {
1509                         if (sc->vmm_zone == curzone) {
1510                                 mutex_exit(&vmm_mtx);
1511                                 vmm_hma_release();
1512                                 return (EINVAL);
1513                         }
1514                 }
1515         }
1516 
1517         minor = id_alloc(vmm_minors);
1518         if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1519                 goto fail;
1520         } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1521                 ddi_soft_state_free(vmm_statep, minor);
1522                 goto fail;
1523         } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1524             DDI_PSEUDO, 0) != DDI_SUCCESS) {
1525                 goto fail;
1526         }
1527 
1528         error = vm_create(name, &sc->vmm_vm);
1529         if (error == 0) {
1530                 /* Complete VM initialization and report success. */
1531                 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1532                 sc->vmm_minor = minor;
1533                 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1534                     offsetof(vmm_devmem_entry_t, vde_node));
1535 
1536                 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1537                     offsetof(vmm_hold_t, vmh_node));
1538                 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1539 
1540                 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1541                 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1542                     offsetof(vmm_lease_t, vml_node));
1543                 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1544                 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1545 
1546                 sc->vmm_zone = crgetzone(cr);
1547                 zone_hold(sc->vmm_zone);
1548                 vmm_zsd_add_vm(sc);
1549 
1550                 list_insert_tail(&vmm_list, sc);
1551                 mutex_exit(&vmm_mtx);
1552                 return (0);
1553         }
1554 
1555         ddi_remove_minor_node(vmmdev_dip, name);
1556 fail:
1557         id_free(vmm_minors, minor);
1558         if (sc != NULL) {
1559                 ddi_soft_state_free(vmm_statep, minor);
1560         }
1561         mutex_exit(&vmm_mtx);
1562         vmm_hma_release();
1563 
1564         return (error);
1565 }
1566 
1567 /*
1568  * Bhyve 'Driver' Interface
1569  *
1570  * While many devices are emulated in the bhyve userspace process, there are
1571  * others whose performance constraints require that they run mostly or
1572  * entirely in-kernel.  For those not integrated directly into bhyve, an API
1573  * is needed so they can query and manipulate the portions of VM state
1574  * required to fulfill their purpose.
1575  *
1576  * This includes:
1577  * - Translating guest-physical addresses to host-virtual pointers
1578  * - Injecting MSIs
1579  * - Hooking IO port addresses
1580  *
1581  * The vmm_drv interface exists to provide that functionality to its consumers.
1582  * (At this time, 'viona' is the only user.)
1583  */
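
/*
 * As a sketch, a hypothetical consumer would bracket its use of a VM as
 * below ('fp', 'credp', 'my_expire_cb', and 'my_arg' are illustrative, not
 * part of vmm_drv):
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *
 *	if (vmm_drv_hold(fp, credp, &hold) != 0)
 *		return (ENXIO);
 *	lease = vmm_drv_lease_sign(hold, my_expire_cb, my_arg);
 *	if (lease == NULL) {
 *		vmm_drv_rele(hold);
 *		return (ENXIO);
 *	}
 *	... use vmm_drv_gpa2kva()/vmm_drv_msi() under the lease ...
 *	vmm_drv_lease_break(hold, lease);
 *	vmm_drv_rele(hold);
 *
 * The hold prevents the VM from being torn down while references remain; the
 * lease grants read-locked access to VM state between sign/break pairs.
 */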
1584 int
1585 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1586 {
1587         vnode_t *vp = fp->f_vnode;
1588         const dev_t dev = vp->v_rdev;
1589         vmm_softc_t *sc;
1590         vmm_hold_t *hold;
1591         int err = 0;
1592 
1593         if (vp->v_type != VCHR) {
1594                 return (ENXIO);
1595         }
1596         const major_t major = getmajor(dev);
1597         const minor_t minor = getminor(dev);
1598 
1599         mutex_enter(&vmmdev_mtx);
1600         if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1601                 mutex_exit(&vmmdev_mtx);
1602                 return (ENOENT);
1603         }
1604         mutex_enter(&vmm_mtx);
1605         mutex_exit(&vmmdev_mtx);
1606 
1607         if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1608                 err = ENOENT;
1609                 goto out;
1610         }
1611         /* XXXJOY: check cred permissions against instance */
1612 
1613         if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1614                 err = EBUSY;
1615                 goto out;
1616         }
1617 
1618         hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1619         hold->vmh_sc = sc;
1620         hold->vmh_release_req = B_FALSE;
1621 
1622         list_insert_tail(&sc->vmm_holds, hold);
1623         sc->vmm_flags |= VMM_HELD;
1624         *holdp = hold;
1625 
1626 out:
1627         mutex_exit(&vmm_mtx);
1628         return (err);
1629 }
1630 
1631 void
1632 vmm_drv_rele(vmm_hold_t *hold)
1633 {
1634         vmm_softc_t *sc;
1635 
1636         ASSERT(hold != NULL);
1637         ASSERT(hold->vmh_sc != NULL);
1638         VERIFY(hold->vmh_ioport_hook_cnt == 0);
1639 
1640         mutex_enter(&vmm_mtx);
1641         sc = hold->vmh_sc;
1642         list_remove(&sc->vmm_holds, hold);
1643         if (list_is_empty(&sc->vmm_holds)) {
1644                 sc->vmm_flags &= ~VMM_HELD;
1645                 cv_broadcast(&sc->vmm_cv);
1646         }
1647         mutex_exit(&vmm_mtx);
1648         kmem_free(hold, sizeof (*hold));
1649 }
1650 
1651 boolean_t
1652 vmm_drv_release_reqd(vmm_hold_t *hold)
1653 {
1654         ASSERT(hold != NULL);
1655 
1656         return (hold->vmh_release_req);
1657 }
1658 
1659 vmm_lease_t *
1660 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1661 {
1662         vmm_softc_t *sc = hold->vmh_sc;
1663         vmm_lease_t *lease;
1664 
1665         ASSERT3P(expiref, !=, NULL);
1666 
1667         if (hold->vmh_release_req) {
1668                 return (NULL);
1669         }
1670 
1671         lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1672         list_link_init(&lease->vml_node);
1673         lease->vml_expire_func = expiref;
1674         lease->vml_expire_arg = arg;
1675         lease->vml_expired = B_FALSE;
1676         lease->vml_hold = hold;
1677         /* cache the VM pointer for one less pointer chase */
1678         lease->vml_vm = sc->vmm_vm;
1679 
1680         mutex_enter(&sc->vmm_lease_lock);
1681         while (sc->vmm_lease_blocker != 0) {
1682                 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1683         }
1684         list_insert_tail(&sc->vmm_lease_list, lease);
1685         vmm_read_lock(sc);
1686         mutex_exit(&sc->vmm_lease_lock);
1687 
1688         return (lease);
1689 }
1690 
1691 static void
1692 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1693 {
1694         ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1695 
1696         list_remove(&sc->vmm_lease_list, lease);
1697         vmm_read_unlock(sc);
1698         kmem_free(lease, sizeof (*lease));
1699 }
1700 
1701 void
1702 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1703 {
1704         vmm_softc_t *sc = hold->vmh_sc;
1705 
1706         VERIFY3P(hold, ==, lease->vml_hold);
1707 
1708         mutex_enter(&sc->vmm_lease_lock);
1709         vmm_lease_break_locked(sc, lease);
1710         mutex_exit(&sc->vmm_lease_lock);
1711 }
1712 
1713 boolean_t
1714 vmm_drv_lease_expired(vmm_lease_t *lease)
1715 {
1716         return (lease->vml_expired);
1717 }
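
/*
 * When a lease expires (signalled via the expire callback or by
 * vmm_drv_lease_expired() returning B_TRUE), the holder is expected to break
 * it promptly and sign a new one once the blocking operation has completed.
 * A sketch, reusing the illustrative names from above:
 *
 *	if (vmm_drv_lease_expired(lease)) {
 *		vmm_drv_lease_break(hold, lease);
 *		lease = vmm_drv_lease_sign(hold, my_expire_cb, my_arg);
 *		if (lease == NULL) {
 *			... release of the hold was requested; clean up ...
 *		}
 *	}
 *
 * Note that vmm_drv_lease_sign() will block while a lease blocker is active.
 */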
1718 
1719 void *
1720 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1721 {
1722         ASSERT(lease != NULL);
1723 
1724         return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
1725 }
1726 
1727 int
1728 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1729 {
1730         ASSERT(lease != NULL);
1731 
1732         return (lapic_intr_msi(lease->vml_vm, addr, msg));
1733 }
1734 
1735 int
1736 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
1737     void *arg, void **cookie)
1738 {
1739         vmm_softc_t *sc;
1740         int err;
1741 
1742         ASSERT(hold != NULL);
1743         ASSERT(cookie != NULL);
1744 
1745         sc = hold->vmh_sc;
1746         mutex_enter(&vmm_mtx);
1747         /* Confirm that hook installation is not blocked */
1748         if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1749                 mutex_exit(&vmm_mtx);
1750                 return (EBUSY);
1751         }
1752         /*
1753          * Optimistically record an installed hook which will prevent a block
1754          * from being asserted while the mutex is dropped.
1755          */
1756         hold->vmh_ioport_hook_cnt++;
1757         mutex_exit(&vmm_mtx);
1758 
1759         vmm_write_lock(sc);
1760         err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
1761             arg, cookie);
1762         vmm_write_unlock(sc);
1763 
1764         if (err != 0) {
1765                 mutex_enter(&vmm_mtx);
1766                 /* Walk back optimism about the hook installation */
1767                 hold->vmh_ioport_hook_cnt--;
1768                 mutex_exit(&vmm_mtx);
1769         }
1770         return (err);
1771 }
1772 
1773 void
1774 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1775 {
1776         vmm_softc_t *sc;
1777 
1778         ASSERT(hold != NULL);
1779         ASSERT(cookie != NULL);
1780         ASSERT(hold->vmh_ioport_hook_cnt != 0);
1781 
1782         sc = hold->vmh_sc;
1783         vmm_write_lock(sc);
1784         vm_ioport_unhook(sc->vmm_vm, cookie);
1785         vmm_write_unlock(sc);
1786 
1787         mutex_enter(&vmm_mtx);
1788         hold->vmh_ioport_hook_cnt--;
1789         mutex_exit(&vmm_mtx);
1790 }
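
/*
 * A sketch of the ioport hook lifecycle ('my_ioport_handler' and 'my_arg'
 * are illustrative):
 *
 *	void *cookie;
 *
 *	if (vmm_drv_ioport_hook(hold, 0x2f8, my_ioport_handler, my_arg,
 *	    &cookie) == 0) {
 *		... handler is invoked for guest accesses to the port ...
 *		vmm_drv_ioport_unhook(hold, &cookie);
 *	}
 */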
1791 
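/*
 * Force release of all holds on a VM: flag each hold for release, then wait
 * (interruptibly) for the holders to comply.  Returns EINTR if the wait is
 * interrupted by a signal, 0 otherwise.
 */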
1792 static int
1793 vmm_drv_purge(vmm_softc_t *sc)
1794 {
1795         ASSERT(MUTEX_HELD(&vmm_mtx));
1796 
1797         if ((sc->vmm_flags & VMM_HELD) != 0) {
1798                 vmm_hold_t *hold;
1799 
1800                 sc->vmm_flags |= VMM_CLEANUP;
1801                 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1802                     hold = list_next(&sc->vmm_holds, hold)) {
1803                         hold->vmh_release_req = B_TRUE;
1804                 }
1805                 while ((sc->vmm_flags & VMM_HELD) != 0) {
1806                         if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1807                                 return (EINTR);
1808                         }
1809                 }
1810                 sc->vmm_flags &= ~VMM_CLEANUP;
1811         }
1812 
1813         VERIFY(list_is_empty(&sc->vmm_holds));
1814         sc->vmm_flags |= VMM_PURGED;
1815         return (0);
1816 }
1817 
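/*
 * Enable or disable blocking of new ioport hook installation on a VM.
 * Enabling the block fails with EBUSY if any existing hold has hooks
 * installed.
 */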
1818 static int
1819 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1820 {
1821         int err = 0;
1822 
1823         mutex_enter(&vmm_mtx);
1824         if (!enable_block) {
1825                 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1826 
1827                 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1828                 goto done;
1829         }
1830 
1831         /* Refuse to enable the block if any holds have hooks installed */
1832         if (!list_is_empty(&sc->vmm_holds)) {
1833                 vmm_hold_t *hold;
1834 
1835                 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1836                     hold = list_next(&sc->vmm_holds, hold)) {
1837                         if (hold->vmh_ioport_hook_cnt != 0) {
1838                                 err = EBUSY;
1839                                 goto done;
1840                         }
1841                 }
1842         }
1843         sc->vmm_flags |= VMM_BLOCK_HOOK;
1844 
1845 done:
1846         mutex_exit(&vmm_mtx);
1847         return (err);
1848 }
1849 
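/*
 * Tear down a VM: purge its holds, remove it from the global list and from
 * /dev, and either destroy it immediately or, if its device is still open,
 * defer destruction until last close.  '*hma_release' tells the caller
 * whether it must drop the HMA reference, which cannot be done here since
 * vmm_hma_release() must not be called with vmm_mtx held.
 */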
1850 static int
1851 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1852     boolean_t *hma_release)
1853 {
1854         dev_info_t      *pdip = ddi_get_parent(vmmdev_dip);
1855         minor_t         minor;
1856 
1857         ASSERT(MUTEX_HELD(&vmm_mtx));
1858 
1859         *hma_release = B_FALSE;
1860 
1861         if (clean_zsd) {
1862                 vmm_zsd_rem_vm(sc);
1863         }
1864 
1865         if (vmm_drv_purge(sc) != 0) {
1866                 return (EINTR);
1867         }
1868 
1869         /* Clean up devmem entries */
1870         vmmdev_devmem_purge(sc);
1871 
1872         list_remove(&vmm_list, sc);
1873         ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
1874         minor = sc->vmm_minor;
1875         zone_rele(sc->vmm_zone);
1876         if (sc->vmm_is_open) {
1877                 list_insert_tail(&vmm_destroy_list, sc);
1878                 sc->vmm_flags |= VMM_DESTROY;
1879         } else {
1880                 vm_destroy(sc->vmm_vm);
1881                 ddi_soft_state_free(vmm_statep, minor);
1882                 id_free(vmm_minors, minor);
1883                 *hma_release = B_TRUE;
1884         }
1885         (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
1886 
1887         return (0);
1888 }
1889 
1890 int
1891 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
1892 {
1893         boolean_t       hma_release = B_FALSE;
1894         int             err;
1895 
1896         mutex_enter(&vmm_mtx);
1897         err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
1898         mutex_exit(&vmm_mtx);
1899 
1900         if (hma_release)
1901                 vmm_hma_release();
1902 
1903         return (err);
1904 }
1905 
1906 /* ARGSUSED */
1907 static int
1908 vmmdev_do_vm_destroy(const char *name, cred_t *cr)
1909 {
1910         boolean_t       hma_release = B_FALSE;
1911         vmm_softc_t     *sc;
1912         int             err;
1913 
1914         if (crgetuid(cr) != 0)
1915                 return (EPERM);
1916 
1917         mutex_enter(&vmm_mtx);
1918 
1919         if ((sc = vmm_lookup(name)) == NULL) {
1920                 mutex_exit(&vmm_mtx);
1921                 return (ENOENT);
1922         }
1923         /*
1924          * We don't check this in vmm_lookup() since that function is also used
1925          * for validation during create, and vmm names must currently be unique.
1926          */
1927         if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
1928                 mutex_exit(&vmm_mtx);
1929                 return (EPERM);
1930         }
1931         err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
1932 
1933         mutex_exit(&vmm_mtx);
1934 
1935         if (hma_release)
1936                 vmm_hma_release();
1937 
1938         return (err);
1939 }
1940 
1941 static int
1942 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1943 {
1944         minor_t         minor;
1945         vmm_softc_t     *sc;
1946 
1947         minor = getminor(*devp);
1948         if (minor == VMM_CTL_MINOR) {
1949                 /*
1950                  * Master control device must be opened exclusively.
1951                  */
1952                 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
1953                         return (EINVAL);
1954                 }
1955 
1956                 return (0);
1957         }
1958 
1959         mutex_enter(&vmm_mtx);
1960         sc = ddi_get_soft_state(vmm_statep, minor);
1961         if (sc == NULL) {
1962                 mutex_exit(&vmm_mtx);
1963                 return (ENXIO);
1964         }
1965 
1966         sc->vmm_is_open = B_TRUE;
1967         mutex_exit(&vmm_mtx);
1968 
1969         return (0);
1970 }
1971 
1972 static int
1973 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
1974 {
1975         minor_t         minor;
1976         vmm_softc_t     *sc;
1977         boolean_t       hma_release = B_FALSE;
1978 
1979         minor = getminor(dev);
1980         if (minor == VMM_CTL_MINOR)
1981                 return (0);
1982 
1983         mutex_enter(&vmm_mtx);
1984         sc = ddi_get_soft_state(vmm_statep, minor);
1985         if (sc == NULL) {
1986                 mutex_exit(&vmm_mtx);
1987                 return (ENXIO);
1988         }
1989 
1990         VERIFY(sc->vmm_is_open);
1991         sc->vmm_is_open = B_FALSE;
1992 
1993         /*
1994          * If this VM was destroyed while the vmm device was open, then
1995          * clean it up now that it is closed.
1996          */
1997         if (sc->vmm_flags & VMM_DESTROY) {
1998                 list_remove(&vmm_destroy_list, sc);
1999                 vm_destroy(sc->vmm_vm);
2000                 ddi_soft_state_free(vmm_statep, minor);
2001                 id_free(vmm_minors, minor);
2002                 hma_release = B_TRUE;
2003         }
2004         mutex_exit(&vmm_mtx);
2005 
2006         if (hma_release)
2007                 vmm_hma_release();
2008 
2009         return (0);
2010 }
2011 
2012 static int
2013 vmm_is_supported(intptr_t arg)
2014 {
2015         int r;
2016         const char *msg;
2017 
2018         if (vmm_is_intel()) {
2019                 r = vmx_x86_supported(&msg);
2020         } else if (vmm_is_svm()) {
2021                 /*
2022                  * HMA already ensured that the features necessary for SVM
2023                  * operation were present and online during vmm_attach().
2024                  */
2025                 r = 0;
2026         } else {
2027                 r = ENXIO;
2028                 msg = "Unsupported CPU vendor";
2029         }
2030 
2031         if (r != 0 && arg != (intptr_t)NULL) {
2032                 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2033                         return (EFAULT);
2034         }
2035         return (r);
2036 }
2037 
2038 static int
2039 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2040     int *rvalp)
2041 {
2042         vmm_softc_t     *sc;
2043         minor_t         minor;
2044 
2045         /* The structs in bhyve ioctls assume a 64-bit datamodel */
2046         if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2047                 return (ENOTSUP);
2048         }
2049 
2050         minor = getminor(dev);
2051 
2052         if (minor == VMM_CTL_MINOR) {
2053                 void *argp = (void *)arg;
2054                 char name[VM_MAX_NAMELEN] = { 0 };
2055                 size_t len = 0;
2056 
2057                 if ((mode & FKIOCTL) != 0) {
2058                         len = strlcpy(name, argp, sizeof (name));
2059                 } else {
2060                         if (copyinstr(argp, name, sizeof (name), &len) != 0) {
2061                                 return (EFAULT);
2062                         }
2063                 }
2064                 if (len >= VM_MAX_NAMELEN) {
2065                         return (ENAMETOOLONG);
2066                 }
2067 
2068                 switch (cmd) {
2069                 case VMM_CREATE_VM:
2070                         if ((mode & FWRITE) == 0)
2071                                 return (EPERM);
2072                         return (vmmdev_do_vm_create(name, credp));
2073                 case VMM_DESTROY_VM:
2074                         if ((mode & FWRITE) == 0)
2075                                 return (EPERM);
2076                         return (vmmdev_do_vm_destroy(name, credp));
2077                 case VMM_VM_SUPPORTED:
2078                         return (vmm_is_supported(arg));
2079                 default:
2080                         /* No other actions are legal on ctl device */
2081                         return (ENOTTY);
2082                 }
2083         }
2084 
2085         sc = ddi_get_soft_state(vmm_statep, minor);
2086         ASSERT(sc);
2087 
2088         if (sc->vmm_flags & VMM_DESTROY)
2089                 return (ENXIO);
2090 
2091         return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2092 }
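
/*
 * From userspace, VM lifecycle operations are driven through ioctls on the
 * control node.  A sketch (the /dev path shown is illustrative of how the
 * 'ctl' minor node is typically exposed):
 *
 *	int ctl = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *
 *	if (ctl >= 0 && ioctl(ctl, VMM_CREATE_VM, "myvm") == 0) {
 *		... the per-VM device may now be opened ...
 *	}
 *
 * The exclusive (O_EXCL) and writable (O_RDWR) open flags correspond to the
 * FEXCL check in vmm_open() and the FWRITE checks above.
 */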
2093 
2094 static int
2095 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2096     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2097 {
2098         vmm_softc_t *sc;
2099         const minor_t minor = getminor(dev);
2100         struct vm *vm;
2101         int err;
2102         vm_object_t vmo = NULL;
2103         struct vmspace *vms;
2104 
2105         if (minor == VMM_CTL_MINOR) {
2106                 return (ENODEV);
2107         }
2108         if (off < 0 || (off + len) <= 0) {
2109                 return (EINVAL);
2110         }
2111         if ((prot & PROT_USER) == 0) {
2112                 return (EACCES);
2113         }
2114 
2115         sc = ddi_get_soft_state(vmm_statep, minor);
2116         ASSERT(sc);
2117 
2118         if (sc->vmm_flags & VMM_DESTROY)
2119                 return (ENXIO);
2120 
2121         /* Grab read lock on the VM to prevent any changes to the memory map */
2122         vmm_read_lock(sc);
2123 
2124         vm = sc->vmm_vm;
2125         vms = vm_get_vmspace(vm);
2126         if (off >= VM_DEVMEM_START) {
2127                 int segid;
2128                 off_t map_off = 0;
2129 
2130                 /* Mapping a devmem "device" */
2131                 if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) {
2132                         err = ENODEV;
2133                         goto out;
2134                 }
2135                 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
2136                 if (err != 0) {
2137                         goto out;
2138                 }
2139                 err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot,
2140                     flags);
2141         } else {
2142                 /* Mapping a part of the guest physical space */
2143                 err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
2144                     flags);
2145         }
2146 
2148 out:
2149         vmm_read_unlock(sc);
2150         return (err);
2151 }
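
/*
 * The mapping offset space of the per-VM device is split: offsets below
 * VM_DEVMEM_START map guest-physical address space directly, while offsets
 * at or above it map devmem segments, located via VM_DEVMEM_GETOFFSET.  A
 * userspace sketch ('vmfd', 'segid', and 'len' are illustrative):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *base = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *	}
 */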
2152 
2153 static sdev_plugin_validate_t
2154 vmm_sdev_validate(sdev_ctx_t ctx)
2155 {
2156         const char *name = sdev_ctx_name(ctx);
2157         vmm_softc_t *sc;
2158         sdev_plugin_validate_t ret;
2159         minor_t minor;
2160 
2161         if (sdev_ctx_vtype(ctx) != VCHR)
2162                 return (SDEV_VTOR_INVALID);
2163 
2164         VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2165 
2166         mutex_enter(&vmm_mtx);
2167         if ((sc = vmm_lookup(name)) == NULL)
2168                 ret = SDEV_VTOR_INVALID;
2169         else if (sc->vmm_minor != minor)
2170                 ret = SDEV_VTOR_STALE;
2171         else
2172                 ret = SDEV_VTOR_VALID;
2173         mutex_exit(&vmm_mtx);
2174 
2175         return (ret);
2176 }
2177 
2178 static int
2179 vmm_sdev_filldir(sdev_ctx_t ctx)
2180 {
2181         vmm_softc_t *sc;
2182         int ret;
2183 
2184         if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2185                 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2186                     sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2187                 return (EINVAL);
2188         }
2189 
2190         mutex_enter(&vmm_mtx);
2191         ASSERT(vmmdev_dip != NULL);
2192         for (sc = list_head(&vmm_list); sc != NULL;
2193             sc = list_next(&vmm_list, sc)) {
2194                 /* Skip VMs which are not visible from this zone. */
2195                 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone)
2196                         continue;
2197                 ret = sdev_plugin_mknod(ctx, sc->vmm_name, S_IFCHR | 0600,
2198                     makedevice(ddi_driver_major(vmmdev_dip),
2199                     sc->vmm_minor));
2200                 if (ret != 0 && ret != EEXIST)
2201                         goto out;
2202         }
2205 
2206         ret = 0;
2207 
2208 out:
2209         mutex_exit(&vmm_mtx);
2210         return (ret);
2211 }
2212 
2213 /* ARGSUSED */
2214 static void
2215 vmm_sdev_inactive(sdev_ctx_t ctx)
2216 {
2217 }
2218 
2219 static sdev_plugin_ops_t vmm_sdev_ops = {
2220         .spo_version = SDEV_PLUGIN_VERSION,
2221         .spo_flags = SDEV_PLUGIN_SUBDIR,
2222         .spo_validate = vmm_sdev_validate,
2223         .spo_filldir = vmm_sdev_filldir,
2224         .spo_inactive = vmm_sdev_inactive
2225 };
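
/*
 * With the plugin registered (see vmm_attach() below), a lookup or readdir
 * under /dev/vmm invokes vmm_sdev_filldir() to materialize a node for each
 * visible VM, while vmm_sdev_validate() invalidates cached nodes whose VM
 * has been destroyed or recreated under a different minor.
 */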
2226 
2227 /* ARGSUSED */
2228 static int
2229 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2230 {
2231         int error;
2232 
2233         switch (cmd) {
2234         case DDI_INFO_DEVT2DEVINFO:
2235                 *result = (void *)vmmdev_dip;
2236                 error = DDI_SUCCESS;
2237                 break;
2238         case DDI_INFO_DEVT2INSTANCE:
2239                 *result = (void *)0;
2240                 error = DDI_SUCCESS;
2241                 break;
2242         default:
2243                 error = DDI_FAILURE;
2244                 break;
2245         }
2246         return (error);
2247 }
2248 
2249 static int
2250 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2251 {
2252         sdev_plugin_hdl_t sph;
2253         hma_reg_t *reg = NULL;
2254         boolean_t vmm_loaded = B_FALSE;
2255 
2256         if (cmd != DDI_ATTACH) {
2257                 return (DDI_FAILURE);
2258         }
2259 
2260         mutex_enter(&vmmdev_mtx);
2261         /* Ensure we are not already attached. */
2262         if (vmmdev_dip != NULL) {
2263                 mutex_exit(&vmmdev_mtx);
2264                 return (DDI_FAILURE);
2265         }
2266 
2267         vmm_sol_glue_init();
2268         vmm_arena_init();
2269 
2270         /*
2271          * Perform a temporary HMA registration to determine if the system
2272          * is capable of hardware virtualization.
2273          */
2274         if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2275                 goto fail;
2276         } else if (vmm_mod_load() != 0) {
2277                 goto fail;
2278         }
2279         vmm_loaded = B_TRUE;
2280         hma_unregister(reg);
2281         reg = NULL;
2282 
2283         /* Create control node.  Other nodes will be created on demand. */
2284         if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2285             VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2286                 goto fail;
2287         }
2288 
2289         if ((sph = sdev_plugin_register("vmm", &vmm_sdev_ops, NULL)) ==
2290             (sdev_plugin_hdl_t)NULL) {
2291                 ddi_remove_minor_node(dip, NULL);
2292                 goto fail;
2293         }
2294 
2295         ddi_report_dev(dip);
2296         vmmdev_sdev_hdl = sph;
2297         vmmdev_dip = dip;
2298         mutex_exit(&vmmdev_mtx);
2299         return (DDI_SUCCESS);
2300 
2301 fail:
2302         if (vmm_loaded) {
2303                 VERIFY0(vmm_mod_unload());
2304         }
2305         if (reg != NULL) {
2306                 hma_unregister(reg);
2307         }
2308         vmm_arena_fini();
2309         vmm_sol_glue_cleanup();
2310         mutex_exit(&vmmdev_mtx);
2311         return (DDI_FAILURE);
2312 }
2313 
2314 static int
2315 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2316 {
2317         if (cmd != DDI_DETACH) {
2318                 return (DDI_FAILURE);
2319         }
2320 
2321         /*
2322          * Ensure that all resources have been cleaned up.
2323          *
2324          * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2325          * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2326          * devinfo locked as iommu_cleanup() tries to recursively lock each
2327          * devinfo, including our own, while holding vmmdev_mtx.
2328          */
2329         if (mutex_tryenter(&vmmdev_mtx) == 0)
2330                 return (DDI_FAILURE);
2331 
2332         mutex_enter(&vmm_mtx);
2333         if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2334                 mutex_exit(&vmm_mtx);
2335                 mutex_exit(&vmmdev_mtx);
2336                 return (DDI_FAILURE);
2337         }
2338         mutex_exit(&vmm_mtx);
2339 
2340         VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2341         if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2342                 mutex_exit(&vmmdev_mtx);
2343                 return (DDI_FAILURE);
2344         }
2345         vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2346 
2347         /* Remove the control node. */
2348         ddi_remove_minor_node(dip, "ctl");
2349         vmmdev_dip = NULL;
2350 
2351         VERIFY0(vmm_mod_unload());
2352         VERIFY3P(vmmdev_hma_reg, ==, NULL);
2353         vmm_arena_fini();
2354         vmm_sol_glue_cleanup();
2355 
2356         mutex_exit(&vmmdev_mtx);
2357 
2358         return (DDI_SUCCESS);
2359 }
2360 
2361 static struct cb_ops vmm_cb_ops = {
2362         vmm_open,
2363         vmm_close,
2364         nodev,          /* strategy */
2365         nodev,          /* print */
2366         nodev,          /* dump */
2367         nodev,          /* read */
2368         nodev,          /* write */
2369         vmm_ioctl,
2370         nodev,          /* devmap */
2371         nodev,          /* mmap */
2372         vmm_segmap,
2373         nochpoll,       /* poll */
2374         ddi_prop_op,
2375         NULL,
2376         D_NEW | D_MP | D_DEVMAP
2377 };
2378 
2379 static struct dev_ops vmm_ops = {
2380         DEVO_REV,
2381         0,
2382         vmm_info,
2383         nulldev,        /* identify */
2384         nulldev,        /* probe */
2385         vmm_attach,
2386         vmm_detach,
2387         nodev,          /* reset */
2388         &vmm_cb_ops,
2389         (struct bus_ops *)NULL
2390 };
2391 
2392 static struct modldrv modldrv = {
2393         &mod_driverops,
2394         "bhyve vmm",
2395         &vmm_ops
2396 };
2397 
2398 static struct modlinkage modlinkage = {
2399         MODREV_1,
2400         &modldrv,
2401         NULL
2402 };
2403 
2404 int
2405 _init(void)
2406 {
2407         int     error;
2408 
2409         sysinit();
2410 
2411         mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2412         mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2413         list_create(&vmm_list, sizeof (vmm_softc_t),
2414             offsetof(vmm_softc_t, vmm_node));
2415         list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2416             offsetof(vmm_softc_t, vmm_node));
2417         vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2418 
2419         error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2420         if (error) {
2421                 return (error);
2422         }
2423 
2424         vmm_zsd_init();
2425 
2426         error = mod_install(&modlinkage);
2427         if (error) {
2428                 ddi_soft_state_fini(&vmm_statep);
2429                 vmm_zsd_fini();
2430         }
2431 
2432         return (error);
2433 }
2434 
2435 int
2436 _fini(void)
2437 {
2438         int     error;
2439 
2440         error = mod_remove(&modlinkage);
2441         if (error) {
2442                 return (error);
2443         }
2444 
2445         vmm_zsd_fini();
2446 
2447         ddi_soft_state_fini(&vmm_statep);
2448 
2449         return (0);
2450 }
2451 
2452 int
2453 _info(struct modinfo *modinfop)
2454 {
2455         return (mod_info(&modlinkage, modinfop));
2456 }