13275 bhyve needs richer INIT/SIPI support
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
--- old/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
+++ new/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 12
13 13 /*
14 14 * Copyright 2015 Pluribus Networks Inc.
15 15 * Copyright 2019 Joyent, Inc.
16 16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17 17 * Copyright 2020 Oxide Computer Company
18 18 */
19 19
20 20 #include <sys/types.h>
21 21 #include <sys/conf.h>
22 22 #include <sys/cpuvar.h>
23 23 #include <sys/ioccom.h>
24 24 #include <sys/stat.h>
25 25 #include <sys/vmsystm.h>
26 26 #include <sys/ddi.h>
27 27 #include <sys/mkdev.h>
28 28 #include <sys/sunddi.h>
29 29 #include <sys/fs/dv_node.h>
30 30 #include <sys/cpuset.h>
31 31 #include <sys/id_space.h>
32 32 #include <sys/fs/sdev_plugin.h>
33 33 #include <sys/smt.h>
34 34
35 35 #include <sys/kernel.h>
36 36 #include <sys/hma.h>
37 37 #include <sys/x86_archext.h>
38 38 #include <x86/apicreg.h>
39 39
40 40 #include <sys/vmm.h>
41 41 #include <sys/vmm_kernel.h>
42 42 #include <sys/vmm_instruction_emul.h>
43 43 #include <sys/vmm_dev.h>
44 44 #include <sys/vmm_impl.h>
45 45 #include <sys/vmm_drv.h>
46 46
47 47 #include <vm/vm.h>
48 48 #include <vm/seg_dev.h>
49 49
50 50 #include "io/ppt.h"
51 51 #include "io/vatpic.h"
52 52 #include "io/vioapic.h"
53 53 #include "io/vrtc.h"
54 54 #include "io/vhpet.h"
55 55 #include "io/vpmtmr.h"
56 56 #include "vmm_lapic.h"
57 57 #include "vmm_stat.h"
58 58 #include "vmm_util.h"
59 59 #include "vm/vm_glue.h"
60 60
61 61 /*
62 62 * Locking details:
63 63 *
64 64 * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is
65 65 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
66 66 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
67 67 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
68 68 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
69 69 */
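A minimal sketch of the ordering described above (the helper name is illustrative): a routine that needs both driver-wide and per-instance state takes vmmdev_mtx before vmm_mtx and releases them in the reverse order.

	static void
	example_locked_walk(void)
	{
		mutex_enter(&vmmdev_mtx);	/* driver-wide (vmmdev_*) state first */
		mutex_enter(&vmm_mtx);		/* then the instance list (vmm_*) */

		/* ... examine vmm_list or vmmdev_* state here ... */

		mutex_exit(&vmm_mtx);
		mutex_exit(&vmmdev_mtx);
	}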
70 70
71 71 static kmutex_t vmmdev_mtx;
72 72 static dev_info_t *vmmdev_dip;
73 73 static hma_reg_t *vmmdev_hma_reg;
74 74 static uint_t vmmdev_hma_ref;
75 75 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
76 76
77 77 static kmutex_t vmm_mtx;
78 78 static list_t vmm_list;
79 79 static list_t vmm_destroy_list;
80 80 static id_space_t *vmm_minors;
81 81 static void *vmm_statep;
82 82
83 83 static const char *vmmdev_hvm_name = "bhyve";
84 84
85 85 /* For sdev plugin (/dev) */
86 86 #define VMM_SDEV_ROOT "/dev/vmm"
87 87
88 88 /* From uts/i86pc/io/vmm/intel/vmx.c */
89 89 extern int vmx_x86_supported(const char **);
90 90
91 91 /* Holds and hooks from drivers external to vmm */
92 92 struct vmm_hold {
93 93 list_node_t vmh_node;
94 94 vmm_softc_t *vmh_sc;
95 95 boolean_t vmh_release_req;
96 96 uint_t vmh_ioport_hook_cnt;
97 97 };
98 98
99 99 struct vmm_lease {
100 100 list_node_t vml_node;
101 101 struct vm *vml_vm;
102 102 boolean_t vml_expired;
103 103 boolean_t (*vml_expire_func)(void *);
104 104 void *vml_expire_arg;
105 105 list_node_t vml_expire_node;
106 106 struct vmm_hold *vml_hold;
107 107 };
108 108
109 109 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
110 110 static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);
111 111
112 112 static int
113 113 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
114 114 {
115 115 int error;
116 116 bool sysmem;
117 117
118 118 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
119 119 NULL);
120 120 if (error || mseg->len == 0)
121 121 return (error);
122 122
123 123 if (!sysmem) {
124 124 vmm_devmem_entry_t *de;
125 125 list_t *dl = &sc->vmm_devmem_list;
126 126
127 127 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
128 128 if (de->vde_segid == mseg->segid) {
129 129 break;
130 130 }
131 131 }
132 132 if (de != NULL) {
133 133 (void) strlcpy(mseg->name, de->vde_name,
134 134 sizeof (mseg->name));
135 135 }
136 136 } else {
137 137 bzero(mseg->name, sizeof (mseg->name));
138 138 }
139 139
140 140 return (error);
141 141 }
142 142
143 143 /*
144 144 * The 'devmem' hack:
145 145 *
146 146 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
147 147 * in the vm which appear with their own name related to the vm under /dev.
148 148 * Since this would be a hassle from an sdev perspective and would require a
149 149 * new cdev interface (or complicate the existing one), we choose to implement
150 150 * this in a different manner. When 'devmem' mappings are created, an
151 151 * identifying off_t is communicated back out to userspace. That off_t,
152 152 * residing above the normal guest memory space, can be used to mmap the
153 153 * 'devmem' mapping from the already-open vm device.
154 154 */
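A hedged userspace sketch of the scheme described above: the VM_DEVMEM_GETOFFSET ioctl (handled later in this file) returns the identifying off_t, which is then used to mmap the segment through the already-open vm device. The helper name is illustrative, and 'vmfd'/'seg_len' are assumed to be supplied by the caller.

	#include <sys/mman.h>
	#include <sys/ioctl.h>
	#include <sys/vmm_dev.h>

	void *
	example_map_devmem(int vmfd, int segid, size_t seg_len)
	{
		struct vm_devmem_offset vdo = { .segid = segid };

		/* Ask the kernel where this devmem segment lives in mmap space. */
		if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) != 0)
			return (NULL);

		/* Map it through the already-open vm device, above guest memory. */
		return (mmap(NULL, seg_len, PROT_READ | PROT_WRITE, MAP_SHARED,
		    vmfd, vdo.offset));
	}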
155 155
156 156 static int
157 157 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
158 158 {
159 159 off_t map_offset;
160 160 vmm_devmem_entry_t *entry;
161 161
162 162 if (list_is_empty(&sc->vmm_devmem_list)) {
163 163 map_offset = VM_DEVMEM_START;
164 164 } else {
165 165 entry = list_tail(&sc->vmm_devmem_list);
166 166 map_offset = entry->vde_off + entry->vde_len;
167 167 if (map_offset < entry->vde_off) {
168 168 /* Do not tolerate overflow */
169 169 return (ERANGE);
170 170 }
171 171 /*
172 172 * XXXJOY: We could choose to search the list for duplicate
173 173 * names and toss an error. Since we're using the offset
174 174 * method for now, it does not make much of a difference.
175 175 */
176 176 }
177 177
178 178 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
179 179 entry->vde_segid = mseg->segid;
180 180 entry->vde_len = mseg->len;
181 181 entry->vde_off = map_offset;
182 182 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
183 183 list_insert_tail(&sc->vmm_devmem_list, entry);
184 184
185 185 return (0);
186 186 }
187 187
188 188 static boolean_t
189 189 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
190 190 off_t *map_offp)
191 191 {
192 192 list_t *dl = &sc->vmm_devmem_list;
193 193 vmm_devmem_entry_t *de = NULL;
194 194 const off_t map_end = off + len;
195 195
196 196 VERIFY(off >= VM_DEVMEM_START);
197 197
198 198 if (map_end < off) {
199 199 /* No match on overflow */
200 200 return (B_FALSE);
201 201 }
202 202
203 203 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
204 204 const off_t item_end = de->vde_off + de->vde_len;
205 205
206 206 if (de->vde_off <= off && item_end >= map_end) {
207 207 *segidp = de->vde_segid;
208 208 *map_offp = off - de->vde_off;
209 209 return (B_TRUE);
210 210 }
211 211 }
212 212 return (B_FALSE);
213 213 }
214 214
215 215 static void
216 216 vmmdev_devmem_purge(vmm_softc_t *sc)
217 217 {
218 218 vmm_devmem_entry_t *entry;
219 219
220 220 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
221 221 kmem_free(entry, sizeof (*entry));
222 222 }
223 223 }
224 224
225 225 static int
226 226 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
227 227 {
228 228 int error;
229 229 bool sysmem = true;
230 230
231 231 if (VM_MEMSEG_NAME(mseg)) {
232 232 sysmem = false;
233 233 }
234 234 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
235 235
236 236 if (error == 0 && VM_MEMSEG_NAME(mseg)) {
237 237 /*
238 238 * Rather than create a whole fresh device from which userspace
239 239 * can mmap this segment, instead make it available at an
240 240 * offset above where the main guest memory resides.
241 241 */
242 242 error = vmmdev_devmem_create(sc, mseg, mseg->name);
243 243 if (error != 0) {
244 244 vm_free_memseg(sc->vmm_vm, mseg->segid);
245 245 }
246 246 }
247 247 return (error);
248 248 }
249 249
250 250 /*
251 251 * Resource Locking and Exclusion
252 252 *
253 253 * Much of bhyve depends on key portions of VM state, such as the guest memory
254 254 * map, to remain unchanged while the guest is running. As ported from
255 255 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
256 256 * access to the instance vCPUs. Threads acting on a single vCPU, like those
257 257 * performing the work of actually running the guest in VMX/SVM, would lock
258 258 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
259 259 * state, all of the vCPUs would be first locked, ensuring that the
260 260 * operation(s) could complete without any other threads stumbling into
261 261 * intermediate states.
262 262 *
263 263 * This approach is largely effective for bhyve. Common operations, such as
264 264 * running the vCPUs, steer clear of lock contention. The model begins to
265 265 * break down for operations which do not occur in the context of a specific
266 266 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
267 267 * thread in the bhyve process. In order to properly protect those vCPU-less
268 268 * operations from encountering invalid states, additional locking is required.
269 269 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
270 270 * It does mean that class of operations will be serialized on locking the
271 271 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
272 272 * undue contention on the VM_MAXCPU-1 vCPU.
273 273 *
274 274 * In order to address the shortcomings of this model, the concept of a
275 275 * read/write lock has been added to bhyve. Operations which change
276 276 * fundamental aspects of a VM (such as the memory map) must acquire the write
277 277 * lock, which also implies locking all of the vCPUs and waiting for all read
278 278 * lock holders to release. While it increases the cost and waiting time for
279 279 * those few operations, it allows most hot-path operations on the VM (which
280 280 * depend on its configuration remaining stable) to occur with minimal locking.
281 281 *
282 282 * Consumers of the Driver API (see below) are a special case when it comes to
283 283 * this locking, since they may hold a read lock via the drv_lease mechanism
284 284 * for an extended period of time. Rather than forcing those consumers to
285 285 * continuously poll for a write lock attempt, the lease system forces them to
286 286 * provide a release callback to trigger their clean-up (and potential later
287 287 * reacquisition) of the read lock.
288 288 */
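A sketch of the lease arrangement described above from a consumer's point of view. The 'example_ctx' structure and its fields are hypothetical; vmm_drv_lease_sign() and vmm_drv_lease_break() are the interfaces defined later in this file. Returning B_FALSE from the expire callback defers the actual break to the consumer's own context rather than breaking the lease synchronously.

	struct example_ctx {
		vmm_lease_t	*lease;
		boolean_t	stop_work;
	};

	static boolean_t
	example_lease_expired(void *arg)
	{
		struct example_ctx *ctx = arg;

		/* Flag the consumer to stop touching VM state. */
		ctx->stop_work = B_TRUE;
		return (B_FALSE);
	}

	static void
	example_reacquire(struct example_ctx *ctx, vmm_hold_t *hold)
	{
		/* Break the expired lease and sign a new one once work has stopped. */
		vmm_drv_lease_break(hold, ctx->lease);
		ctx->lease = vmm_drv_lease_sign(hold, example_lease_expired, ctx);
	}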
289 289
290 290 static void
291 291 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
292 292 {
293 293 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
294 294
295 295 /*
296 296 * Since this state transition is utilizing from_idle=true, it should
297 297 * not fail, but rather block until it can be successful.
298 298 */
299 299 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
300 300 }
301 301
302 302 static void
303 303 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
304 304 {
305 305 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
306 306
307 307 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
308 308 vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
309 309 }
310 310
311 311 static void
312 312 vmm_read_lock(vmm_softc_t *sc)
313 313 {
314 314 rw_enter(&sc->vmm_rwlock, RW_READER);
315 315 }
316 316
317 317 static void
318 318 vmm_read_unlock(vmm_softc_t *sc)
319 319 {
320 320 rw_exit(&sc->vmm_rwlock);
321 321 }
322 322
323 323 static void
324 324 vmm_write_lock(vmm_softc_t *sc)
325 325 {
326 326 int maxcpus;
327 327
328 328 /* First lock all the vCPUs */
329 329 maxcpus = vm_get_maxcpus(sc->vmm_vm);
330 330 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
331 331 vcpu_lock_one(sc, vcpu);
332 332 }
333 333
334 334 mutex_enter(&sc->vmm_lease_lock);
335 335 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
336 336 sc->vmm_lease_blocker++;
337 337 if (sc->vmm_lease_blocker == 1) {
338 338 list_t *list = &sc->vmm_lease_list;
339 339 vmm_lease_t *lease = list_head(list);
340 340
341 341 while (lease != NULL) {
342 342 boolean_t sync_break = B_FALSE;
343 343
344 344 if (!lease->vml_expired) {
345 345 void *arg = lease->vml_expire_arg;
346 346 lease->vml_expired = B_TRUE;
347 347 sync_break = lease->vml_expire_func(arg);
348 348 }
349 349
350 350 if (sync_break) {
351 351 vmm_lease_t *next;
352 352
353 353 /*
354 354 * These leases which are synchronously broken
355 355 * result in vmm_read_unlock() calls from a
356 356 * different thread than the corresponding
357 357 * vmm_read_lock(). This is acceptable, given
358 358 * that the rwlock underpinning the whole
359 359 * mechanism tolerates the behavior. This
360 360 * flexibility is _only_ afforded to VM read
361 361 * lock (RW_READER) holders.
362 362 */
363 363 next = list_next(list, lease);
364 364 vmm_lease_break_locked(sc, lease);
365 365 lease = next;
366 366 } else {
367 367 lease = list_next(list, lease);
368 368 }
369 369 }
370 370 }
371 371 mutex_exit(&sc->vmm_lease_lock);
372 372
373 373 rw_enter(&sc->vmm_rwlock, RW_WRITER);
374 374 /*
375 375 * For now, the 'maxcpus' value for an instance is fixed at the
376 376 * compile-time constant of VM_MAXCPU at creation. If this changes in
377 377 * the future, allowing for dynamic vCPU resource sizing, acquisition
378 378 * of the write lock will need to be wary of such changes.
379 379 */
380 380 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
381 381 }
382 382
383 383 static void
384 384 vmm_write_unlock(vmm_softc_t *sc)
385 385 {
386 386 int maxcpus;
387 387
388 388 mutex_enter(&sc->vmm_lease_lock);
389 389 VERIFY3U(sc->vmm_lease_blocker, !=, 0);
390 390 sc->vmm_lease_blocker--;
391 391 if (sc->vmm_lease_blocker == 0) {
392 392 cv_broadcast(&sc->vmm_lease_cv);
393 393 }
394 394 mutex_exit(&sc->vmm_lease_lock);
395 395
396 396 /*
397 397 * The VM write lock _must_ be released from the same thread it was
398 398 * acquired in, unlike the read lock.
399 399 */
400 400 VERIFY(rw_write_held(&sc->vmm_rwlock));
401 401 rw_exit(&sc->vmm_rwlock);
402 402
403 403 /* Unlock all the vCPUs */
404 404 maxcpus = vm_get_maxcpus(sc->vmm_vm);
405 405 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
406 406 vcpu_unlock_one(sc, vcpu);
407 407 }
408 408 }
409 409
410 410 static int
411 411 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
412 412 cred_t *credp, int *rvalp)
413 413 {
414 414 int error = 0, vcpu = -1;
415 415 void *datap = (void *)arg;
416 416 enum vm_lock_type {
417 417 LOCK_NONE = 0,
418 418 LOCK_VCPU,
419 419 LOCK_READ_HOLD,
420 420 LOCK_WRITE_HOLD
421 421 } lock_type = LOCK_NONE;
422 422
423 423 /* Acquire any exclusion resources needed for the operation. */
424 424 switch (cmd) {
425 425 case VM_RUN:
426 426 case VM_GET_REGISTER:
427 427 case VM_SET_REGISTER:
428 428 case VM_GET_SEGMENT_DESCRIPTOR:
429 429 case VM_SET_SEGMENT_DESCRIPTOR:
430 430 case VM_GET_REGISTER_SET:
431 431 case VM_SET_REGISTER_SET:
432 432 case VM_INJECT_EXCEPTION:
433 433 case VM_GET_CAPABILITY:
434 434 case VM_SET_CAPABILITY:
435 435 case VM_PPTDEV_MSI:
436 436 case VM_PPTDEV_MSIX:
437 437 case VM_SET_X2APIC_STATE:
438 438 case VM_GLA2GPA:
439 439 case VM_GLA2GPA_NOFAULT:
440 440 case VM_ACTIVATE_CPU:
441 441 case VM_SET_INTINFO:
442 442 case VM_GET_INTINFO:
443 443 case VM_RESTART_INSTRUCTION:
444 444 case VM_SET_KERNEMU_DEV:
445 445 case VM_GET_KERNEMU_DEV:
446 + case VM_RESET_CPU:
447 + case VM_GET_RUN_STATE:
448 + case VM_SET_RUN_STATE:
446 449 /*
447 450 * Copy in the ID of the vCPU chosen for this operation.
448 451 * Since a nefarious caller could update their struct between
449 452 * this locking and when the rest of the ioctl data is copied
450 453 * in, it is _critical_ that this local 'vcpu' variable be used
451 454 * rather than the in-struct one when performing the ioctl.
452 455 */
453 456 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
454 457 return (EFAULT);
455 458 }
456 459 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) {
457 460 return (EINVAL);
458 461 }
459 462 vcpu_lock_one(sc, vcpu);
460 463 lock_type = LOCK_VCPU;
461 464 break;
462 465
463 466 case VM_REINIT:
464 467 case VM_BIND_PPTDEV:
465 468 case VM_UNBIND_PPTDEV:
466 469 case VM_MAP_PPTDEV_MMIO:
467 470 case VM_ALLOC_MEMSEG:
468 471 case VM_MMAP_MEMSEG:
469 472 case VM_WRLOCK_CYCLE:
470 473 case VM_PMTMR_LOCATE:
471 474 vmm_write_lock(sc);
472 475 lock_type = LOCK_WRITE_HOLD;
473 476 break;
474 477
475 478 case VM_GET_GPA_PMAP:
476 479 case VM_GET_MEMSEG:
477 480 case VM_MMAP_GETNEXT:
478 481 case VM_LAPIC_IRQ:
479 482 case VM_INJECT_NMI:
480 483 case VM_IOAPIC_ASSERT_IRQ:
481 484 case VM_IOAPIC_DEASSERT_IRQ:
482 485 case VM_IOAPIC_PULSE_IRQ:
483 486 case VM_LAPIC_MSI:
484 487 case VM_LAPIC_LOCAL_IRQ:
485 488 case VM_GET_X2APIC_STATE:
486 489 case VM_RTC_READ:
487 490 case VM_RTC_WRITE:
488 491 case VM_RTC_SETTIME:
489 492 case VM_RTC_GETTIME:
490 493 #ifndef __FreeBSD__
491 494 case VM_DEVMEM_GETOFFSET:
492 495 #endif
493 496 vmm_read_lock(sc);
494 497 lock_type = LOCK_READ_HOLD;
495 498 break;
496 499
497 500 case VM_IOAPIC_PINCOUNT:
498 501 default:
499 502 break;
500 503 }
501 504
502 505 /* Execute the primary logic for the ioctl. */
503 506 switch (cmd) {
504 507 case VM_RUN: {
505 508 struct vm_entry entry;
506 509
507 510 if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
508 511 error = EFAULT;
509 512 break;
510 513 }
511 514
512 515 if (!(curthread->t_schedflag & TS_VCPU))
513 516 smt_mark_as_vcpu();
514 517
515 518 error = vm_run(sc->vmm_vm, vcpu, &entry);
516 519
517 520 /*
518 521 * Unexpected states in vm_run() are expressed through positive
519 522 * errno-oriented return values. VM states which expect further
520 523 * processing in userspace (necessary context via exitinfo) are
521 524 * expressed through negative return values. For the time being
522 525 * a return value of 0 is not expected from vm_run().
523 526 */
524 527 ASSERT(error != 0);
525 528 if (error < 0) {
526 529 const struct vm_exit *vme;
527 530 void *outp = entry.exit_data;
528 531
529 532 error = 0;
530 533 vme = vm_exitinfo(sc->vmm_vm, vcpu);
531 534 if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
532 535 error = EFAULT;
533 536 }
534 537 }
535 538 break;
536 539 }
537 540 case VM_SUSPEND: {
538 541 struct vm_suspend vmsuspend;
539 542
540 543 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
541 544 error = EFAULT;
542 545 break;
543 546 }
544 547 error = vm_suspend(sc->vmm_vm, vmsuspend.how);
545 548 break;
546 549 }
547 550 case VM_REINIT:
548 551 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
549 552 /*
550 553 * The VM instance should be free of driver-attached
551 554 * hooks during the reinitialization process.
552 555 */
553 556 break;
554 557 }
555 558 error = vm_reinit(sc->vmm_vm);
556 559 (void) vmm_drv_block_hook(sc, B_FALSE);
557 560 break;
558 561 case VM_STAT_DESC: {
559 562 struct vm_stat_desc statdesc;
560 563
561 564 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
562 565 error = EFAULT;
563 566 break;
564 567 }
565 568 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
566 569 sizeof (statdesc.desc));
567 570 if (error == 0 &&
568 571 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
569 572 error = EFAULT;
570 573 break;
571 574 }
572 575 break;
573 576 }
574 577 case VM_STATS_IOC: {
575 578 struct vm_stats vmstats;
576 579
577 580 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
578 581 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
579 582 error = EFAULT;
580 583 break;
581 584 }
582 585 hrt2tv(gethrtime(), &vmstats.tv);
583 586 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
584 587 &vmstats.num_entries, vmstats.statbuf);
585 588 if (error == 0 &&
586 589 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
587 590 error = EFAULT;
588 591 break;
589 592 }
590 593 break;
591 594 }
592 595
593 596 case VM_PPTDEV_MSI: {
594 597 struct vm_pptdev_msi pptmsi;
595 598
596 599 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
597 600 error = EFAULT;
598 601 break;
599 602 }
600 603 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
601 604 pptmsi.addr, pptmsi.msg, pptmsi.numvec);
602 605 break;
603 606 }
604 607 case VM_PPTDEV_MSIX: {
605 608 struct vm_pptdev_msix pptmsix;
606 609
607 610 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
608 611 error = EFAULT;
609 612 break;
610 613 }
611 614 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
612 615 pptmsix.idx, pptmsix.addr, pptmsix.msg,
613 616 pptmsix.vector_control);
614 617 break;
615 618 }
616 619 case VM_MAP_PPTDEV_MMIO: {
617 620 struct vm_pptdev_mmio pptmmio;
618 621
619 622 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
620 623 error = EFAULT;
621 624 break;
622 625 }
623 626 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
624 627 pptmmio.len, pptmmio.hpa);
625 628 break;
626 629 }
627 630 case VM_BIND_PPTDEV: {
628 631 struct vm_pptdev pptdev;
629 632
630 633 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
631 634 error = EFAULT;
632 635 break;
633 636 }
634 637 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
635 638 break;
636 639 }
637 640 case VM_UNBIND_PPTDEV: {
638 641 struct vm_pptdev pptdev;
639 642
640 643 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
641 644 error = EFAULT;
642 645 break;
643 646 }
644 647 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
645 648 break;
646 649 }
647 650 case VM_GET_PPTDEV_LIMITS: {
648 651 struct vm_pptdev_limits pptlimits;
649 652
650 653 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
651 654 error = EFAULT;
652 655 break;
653 656 }
654 657 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
655 658 &pptlimits.msi_limit, &pptlimits.msix_limit);
656 659 if (error == 0 &&
657 660 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
658 661 error = EFAULT;
659 662 break;
660 663 }
661 664 break;
662 665 }
663 666 case VM_INJECT_EXCEPTION: {
664 667 struct vm_exception vmexc;
665 668 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
666 669 error = EFAULT;
667 670 break;
668 671 }
669 672 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
670 673 vmexc.error_code_valid, vmexc.error_code,
671 674 vmexc.restart_instruction);
672 675 break;
673 676 }
674 677 case VM_INJECT_NMI: {
675 678 struct vm_nmi vmnmi;
676 679
677 680 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
678 681 error = EFAULT;
679 682 break;
680 683 }
681 684 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
682 685 break;
683 686 }
684 687 case VM_LAPIC_IRQ: {
685 688 struct vm_lapic_irq vmirq;
686 689
687 690 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
688 691 error = EFAULT;
689 692 break;
690 693 }
691 694 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
692 695 break;
693 696 }
694 697 case VM_LAPIC_LOCAL_IRQ: {
695 698 struct vm_lapic_irq vmirq;
696 699
697 700 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
698 701 error = EFAULT;
699 702 break;
700 703 }
701 704 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
702 705 vmirq.vector);
703 706 break;
704 707 }
705 708 case VM_LAPIC_MSI: {
706 709 struct vm_lapic_msi vmmsi;
707 710
708 711 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
709 712 error = EFAULT;
710 713 break;
711 714 }
712 715 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
713 716 break;
714 717 }
715 718
716 719 case VM_IOAPIC_ASSERT_IRQ: {
717 720 struct vm_ioapic_irq ioapic_irq;
718 721
719 722 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
720 723 error = EFAULT;
721 724 break;
722 725 }
723 726 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
724 727 break;
725 728 }
726 729 case VM_IOAPIC_DEASSERT_IRQ: {
727 730 struct vm_ioapic_irq ioapic_irq;
728 731
729 732 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
730 733 error = EFAULT;
731 734 break;
732 735 }
733 736 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
734 737 break;
735 738 }
736 739 case VM_IOAPIC_PULSE_IRQ: {
737 740 struct vm_ioapic_irq ioapic_irq;
738 741
739 742 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
740 743 error = EFAULT;
741 744 break;
742 745 }
743 746 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
744 747 break;
745 748 }
746 749 case VM_IOAPIC_PINCOUNT: {
747 750 int pincount;
748 751
749 752 pincount = vioapic_pincount(sc->vmm_vm);
750 753 if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
751 754 error = EFAULT;
752 755 break;
753 756 }
754 757 break;
755 758 }
756 759
757 760 case VM_ISA_ASSERT_IRQ: {
758 761 struct vm_isa_irq isa_irq;
759 762
760 763 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
761 764 error = EFAULT;
762 765 break;
763 766 }
764 767 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
765 768 if (error == 0 && isa_irq.ioapic_irq != -1) {
766 769 error = vioapic_assert_irq(sc->vmm_vm,
767 770 isa_irq.ioapic_irq);
768 771 }
769 772 break;
770 773 }
771 774 case VM_ISA_DEASSERT_IRQ: {
772 775 struct vm_isa_irq isa_irq;
773 776
774 777 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
775 778 error = EFAULT;
776 779 break;
777 780 }
778 781 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
779 782 if (error == 0 && isa_irq.ioapic_irq != -1) {
780 783 error = vioapic_deassert_irq(sc->vmm_vm,
781 784 isa_irq.ioapic_irq);
782 785 }
783 786 break;
784 787 }
785 788 case VM_ISA_PULSE_IRQ: {
786 789 struct vm_isa_irq isa_irq;
787 790
788 791 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
789 792 error = EFAULT;
790 793 break;
791 794 }
792 795 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
793 796 if (error == 0 && isa_irq.ioapic_irq != -1) {
794 797 error = vioapic_pulse_irq(sc->vmm_vm,
795 798 isa_irq.ioapic_irq);
796 799 }
797 800 break;
798 801 }
799 802 case VM_ISA_SET_IRQ_TRIGGER: {
800 803 struct vm_isa_irq_trigger isa_irq_trigger;
801 804
802 805 if (ddi_copyin(datap, &isa_irq_trigger,
803 806 sizeof (isa_irq_trigger), md)) {
804 807 error = EFAULT;
805 808 break;
806 809 }
807 810 error = vatpic_set_irq_trigger(sc->vmm_vm,
808 811 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
809 812 break;
810 813 }
811 814
812 815 case VM_MMAP_GETNEXT: {
813 816 struct vm_memmap mm;
814 817
815 818 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
816 819 error = EFAULT;
817 820 break;
818 821 }
819 822 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
820 823 &mm.segoff, &mm.len, &mm.prot, &mm.flags);
821 824 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
822 825 error = EFAULT;
823 826 break;
824 827 }
825 828 break;
826 829 }
827 830 case VM_MMAP_MEMSEG: {
828 831 struct vm_memmap mm;
829 832
830 833 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
831 834 error = EFAULT;
832 835 break;
833 836 }
834 837 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
835 838 mm.len, mm.prot, mm.flags);
836 839 break;
837 840 }
838 841 case VM_ALLOC_MEMSEG: {
839 842 struct vm_memseg vmseg;
840 843
841 844 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
842 845 error = EFAULT;
843 846 break;
844 847 }
845 848 error = vmmdev_alloc_memseg(sc, &vmseg);
846 849 break;
847 850 }
848 851 case VM_GET_MEMSEG: {
849 852 struct vm_memseg vmseg;
850 853
851 854 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
852 855 error = EFAULT;
853 856 break;
854 857 }
855 858 error = vmmdev_get_memseg(sc, &vmseg);
856 859 if (error == 0 &&
857 860 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
858 861 error = EFAULT;
859 862 break;
860 863 }
861 864 break;
862 865 }
863 866 case VM_GET_REGISTER: {
864 867 struct vm_register vmreg;
865 868
866 869 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
867 870 error = EFAULT;
868 871 break;
869 872 }
870 873 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
871 874 &vmreg.regval);
872 875 if (error == 0 &&
873 876 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
874 877 error = EFAULT;
875 878 break;
876 879 }
877 880 break;
878 881 }
879 882 case VM_SET_REGISTER: {
880 883 struct vm_register vmreg;
881 884
882 885 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
883 886 error = EFAULT;
884 887 break;
885 888 }
886 889 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
887 890 vmreg.regval);
888 891 break;
889 892 }
890 893 case VM_SET_SEGMENT_DESCRIPTOR: {
891 894 struct vm_seg_desc vmsegd;
892 895
893 896 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
894 897 error = EFAULT;
895 898 break;
896 899 }
897 900 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
898 901 &vmsegd.desc);
899 902 break;
900 903 }
901 904 case VM_GET_SEGMENT_DESCRIPTOR: {
902 905 struct vm_seg_desc vmsegd;
903 906
904 907 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
905 908 error = EFAULT;
906 909 break;
907 910 }
908 911 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
909 912 &vmsegd.desc);
910 913 if (error == 0 &&
911 914 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
912 915 error = EFAULT;
913 916 break;
914 917 }
915 918 break;
916 919 }
917 920 case VM_GET_REGISTER_SET: {
918 921 struct vm_register_set vrs;
919 922 int regnums[VM_REG_LAST];
920 923 uint64_t regvals[VM_REG_LAST];
921 924
922 925 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
923 926 error = EFAULT;
924 927 break;
925 928 }
926 929 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
927 930 error = EINVAL;
928 931 break;
929 932 }
930 933 if (ddi_copyin(vrs.regnums, regnums,
931 934 sizeof (int) * vrs.count, md)) {
932 935 error = EFAULT;
933 936 break;
934 937 }
935 938
936 939 error = 0;
937 940 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
938 941 if (regnums[i] < 0) {
939 942 error = EINVAL;
940 943 break;
941 944 }
942 945 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
 943 946 &regvals[i]);
944 947 }
945 948 if (error == 0 && ddi_copyout(regvals, vrs.regvals,
946 949 sizeof (uint64_t) * vrs.count, md)) {
947 950 error = EFAULT;
948 951 }
949 952 break;
950 953 }
951 954 case VM_SET_REGISTER_SET: {
952 955 struct vm_register_set vrs;
953 956 int regnums[VM_REG_LAST];
954 957 uint64_t regvals[VM_REG_LAST];
955 958
956 959 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
957 960 error = EFAULT;
958 961 break;
959 962 }
960 963 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
961 964 error = EINVAL;
962 965 break;
963 966 }
964 967 if (ddi_copyin(vrs.regnums, regnums,
965 968 sizeof (int) * vrs.count, md)) {
966 969 error = EFAULT;
967 970 break;
968 971 }
969 972 if (ddi_copyin(vrs.regvals, regvals,
970 973 sizeof (uint64_t) * vrs.count, md)) {
971 974 error = EFAULT;
972 975 break;
973 976 }
974 977
975 978 error = 0;
976 979 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
977 980 /*
978 981 * Setting registers in a set is not atomic, since a
979 982 * failure in the middle of the set will cause a
980 983 * bail-out and inconsistent register state. Callers
981 984 * should be wary of this.
982 985 */
983 986 if (regnums[i] < 0) {
984 987 error = EINVAL;
985 988 break;
986 989 }
987 990 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
988 991 regvals[i]);
989 992 }
990 993 break;
991 994 }
995 + case VM_RESET_CPU: {
996 + struct vm_vcpu_reset vvr;
997 +
998 + if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
999 + error = EFAULT;
1000 + break;
1001 + }
1002 + if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1003 + error = EINVAL;
1004 + }
1005 +
1006 + error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1007 + break;
1008 + }
1009 + case VM_GET_RUN_STATE: {
1010 + struct vm_run_state vrs;
1011 +
1012 + bzero(&vrs, sizeof (vrs));
1013 + error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1014 + &vrs.sipi_vector);
1015 + if (error == 0) {
1016 + if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1017 + error = EFAULT;
1018 + break;
1019 + }
1020 + }
1021 + break;
1022 + }
1023 + case VM_SET_RUN_STATE: {
1024 + struct vm_run_state vrs;
1025 +
1026 + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1027 + error = EFAULT;
1028 + break;
1029 + }
1030 + error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1031 + vrs.sipi_vector);
1032 + break;
1033 + }
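A hedged userspace sketch of how the new ioctls above might be exercised when emulating an INIT for an application processor. The 'vcpuid' field placement is assumed from the generic vCPU copyin at the top of this function, and 'vmfd'/'ap_vcpu' are illustrative names.

	static int
	example_send_init(int vmfd, int ap_vcpu)
	{
		struct vm_vcpu_reset vvr = { .vcpuid = ap_vcpu, .kind = VRK_INIT };
		struct vm_run_state vrs = { .vcpuid = ap_vcpu };

		/* Apply the INIT-style architectural reset to the target vCPU. */
		if (ioctl(vmfd, VM_RESET_CPU, &vvr) != 0)
			return (-1);

		/* Read back its run state (and any latched SIPI vector). */
		return (ioctl(vmfd, VM_GET_RUN_STATE, &vrs));
	}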
992 1034
993 1035 case VM_SET_KERNEMU_DEV:
994 1036 case VM_GET_KERNEMU_DEV: {
995 1037 struct vm_readwrite_kernemu_device kemu;
996 1038 size_t size = 0;
997 1039
998 1040 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
999 1041 error = EFAULT;
1000 1042 break;
1001 1043 }
1002 1044
1003 1045 if (kemu.access_width > 3) {
1004 1046 error = EINVAL;
1005 1047 break;
1006 1048 }
1007 1049 size = (1 << kemu.access_width);
1008 1050 ASSERT(size >= 1 && size <= 8);
1009 1051
1010 1052 if (cmd == VM_SET_KERNEMU_DEV) {
1011 1053 error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1012 1054 kemu.gpa, kemu.value, size);
1013 1055 } else {
1014 1056 error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1015 1057 kemu.gpa, &kemu.value, size);
1016 1058 }
1017 1059
1018 1060 if (error == 0) {
1019 1061 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1020 1062 error = EFAULT;
1021 1063 break;
1022 1064 }
1023 1065 }
1024 1066 break;
1025 1067 }
1026 1068
1027 1069 case VM_GET_CAPABILITY: {
1028 1070 struct vm_capability vmcap;
1029 1071
1030 1072 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1031 1073 error = EFAULT;
1032 1074 break;
1033 1075 }
1034 1076 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1035 1077 &vmcap.capval);
1036 1078 if (error == 0 &&
1037 1079 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1038 1080 error = EFAULT;
1039 1081 break;
1040 1082 }
1041 1083 break;
1042 1084 }
1043 1085 case VM_SET_CAPABILITY: {
1044 1086 struct vm_capability vmcap;
1045 1087
1046 1088 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1047 1089 error = EFAULT;
1048 1090 break;
1049 1091 }
1050 1092 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1051 1093 vmcap.capval);
1052 1094 break;
1053 1095 }
1054 1096 case VM_SET_X2APIC_STATE: {
1055 1097 struct vm_x2apic x2apic;
1056 1098
1057 1099 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1058 1100 error = EFAULT;
1059 1101 break;
1060 1102 }
1061 1103 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1062 1104 break;
1063 1105 }
1064 1106 case VM_GET_X2APIC_STATE: {
1065 1107 struct vm_x2apic x2apic;
1066 1108
1067 1109 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1068 1110 error = EFAULT;
1069 1111 break;
1070 1112 }
1071 1113 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1072 1114 &x2apic.state);
1073 1115 if (error == 0 &&
1074 1116 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1075 1117 error = EFAULT;
1076 1118 break;
1077 1119 }
1078 1120 break;
1079 1121 }
1080 1122 case VM_GET_GPA_PMAP: {
1081 1123 struct vm_gpa_pte gpapte;
1082 1124
1083 1125 if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
1084 1126 error = EFAULT;
1085 1127 break;
1086 1128 }
1087 1129 #ifdef __FreeBSD__
1088 1130 /* XXXJOY: add function? */
1089 1131 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
1090 1132 gpapte.gpa, gpapte.pte, &gpapte.ptenum);
1091 1133 #endif
1092 1134 error = 0;
1093 1135 break;
1094 1136 }
1095 1137 case VM_GET_HPET_CAPABILITIES: {
1096 1138 struct vm_hpet_cap hpetcap;
1097 1139
1098 1140 error = vhpet_getcap(&hpetcap);
1099 1141 if (error == 0 &&
1100 1142 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1101 1143 error = EFAULT;
1102 1144 break;
1103 1145 }
1104 1146 break;
1105 1147 }
1106 1148 case VM_GLA2GPA: {
1107 1149 struct vm_gla2gpa gg;
1108 1150
1109 1151 CTASSERT(PROT_READ == VM_PROT_READ);
1110 1152 CTASSERT(PROT_WRITE == VM_PROT_WRITE);
1111 1153 CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
1112 1154
1113 1155 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1114 1156 error = EFAULT;
1115 1157 break;
1116 1158 }
1117 1159 gg.vcpuid = vcpu;
1118 1160 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1119 1161 gg.prot, &gg.gpa, &gg.fault);
1120 1162 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1121 1163 error = EFAULT;
1122 1164 break;
1123 1165 }
1124 1166 break;
1125 1167 }
1126 1168 case VM_GLA2GPA_NOFAULT: {
1127 1169 struct vm_gla2gpa gg;
1128 1170
1129 1171 CTASSERT(PROT_READ == VM_PROT_READ);
1130 1172 CTASSERT(PROT_WRITE == VM_PROT_WRITE);
1131 1173 CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
1132 1174
1133 1175 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1134 1176 error = EFAULT;
1135 1177 break;
1136 1178 }
1137 1179 gg.vcpuid = vcpu;
1138 1180 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1139 1181 gg.gla, gg.prot, &gg.gpa, &gg.fault);
1140 1182 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1141 1183 error = EFAULT;
1142 1184 break;
1143 1185 }
1144 1186 break;
1145 1187 }
1146 1188
1147 1189 case VM_ACTIVATE_CPU:
1148 1190 error = vm_activate_cpu(sc->vmm_vm, vcpu);
1149 1191 break;
1150 1192
1151 1193 case VM_SUSPEND_CPU:
1152 1194 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1153 1195 error = EFAULT;
1154 1196 } else {
1155 1197 error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1156 1198 }
1157 1199 break;
1158 1200
1159 1201 case VM_RESUME_CPU:
1160 1202 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1161 1203 error = EFAULT;
1162 1204 } else {
1163 1205 error = vm_resume_cpu(sc->vmm_vm, vcpu);
1164 1206 }
1165 1207 break;
1166 1208
1167 1209 case VM_GET_CPUS: {
1168 1210 struct vm_cpuset vm_cpuset;
1169 1211 cpuset_t tempset;
1170 1212 void *srcp = &tempset;
1171 1213 int size;
1172 1214
1173 1215 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1174 1216 error = EFAULT;
1175 1217 break;
1176 1218 }
1177 1219
1178 1220 /* Be more generous about sizing since our cpuset_t is large. */
1179 1221 size = vm_cpuset.cpusetsize;
1180 1222 if (size <= 0 || size > sizeof (cpuset_t)) {
1181 1223 error = ERANGE;
1182 1224 }
1183 1225 /*
1184 1226 * If they want a ulong_t or less, make sure they receive the
1185 1227 * low bits with all the useful information.
1186 1228 */
1187 1229 if (size <= sizeof (tempset.cpub[0])) {
1188 1230 srcp = &tempset.cpub[0];
1189 1231 }
1190 1232
1191 1233 if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1192 1234 tempset = vm_active_cpus(sc->vmm_vm);
1193 1235 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1194 1236 tempset = vm_suspended_cpus(sc->vmm_vm);
1195 1237 } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1196 1238 tempset = vm_debug_cpus(sc->vmm_vm);
1197 1239 } else {
1198 1240 error = EINVAL;
1199 1241 }
1200 1242
1201 1243 ASSERT(size > 0 && size <= sizeof (tempset));
1202 1244 if (error == 0 &&
1203 1245 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1204 1246 error = EFAULT;
1205 1247 break;
1206 1248 }
1207 1249 break;
1208 1250 }
1209 1251 case VM_SET_INTINFO: {
1210 1252 struct vm_intinfo vmii;
1211 1253
1212 1254 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1213 1255 error = EFAULT;
1214 1256 break;
1215 1257 }
1216 1258 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1217 1259 break;
1218 1260 }
1219 1261 case VM_GET_INTINFO: {
1220 1262 struct vm_intinfo vmii;
1221 1263
1222 1264 vmii.vcpuid = vcpu;
1223 1265 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1224 1266 &vmii.info2);
1225 1267 if (error == 0 &&
1226 1268 ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1227 1269 error = EFAULT;
1228 1270 break;
1229 1271 }
1230 1272 break;
1231 1273 }
1232 1274 case VM_RTC_WRITE: {
1233 1275 struct vm_rtc_data rtcdata;
1234 1276
1235 1277 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1236 1278 error = EFAULT;
1237 1279 break;
1238 1280 }
1239 1281 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1240 1282 rtcdata.value);
1241 1283 break;
1242 1284 }
1243 1285 case VM_RTC_READ: {
1244 1286 struct vm_rtc_data rtcdata;
1245 1287
1246 1288 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1247 1289 error = EFAULT;
1248 1290 break;
1249 1291 }
1250 1292 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1251 1293 &rtcdata.value);
1252 1294 if (error == 0 &&
1253 1295 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1254 1296 error = EFAULT;
1255 1297 break;
1256 1298 }
1257 1299 break;
1258 1300 }
1259 1301 case VM_RTC_SETTIME: {
1260 1302 struct vm_rtc_time rtctime;
1261 1303
1262 1304 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1263 1305 error = EFAULT;
1264 1306 break;
1265 1307 }
1266 1308 error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1267 1309 break;
1268 1310 }
1269 1311 case VM_RTC_GETTIME: {
1270 1312 struct vm_rtc_time rtctime;
1271 1313
1272 1314 rtctime.secs = vrtc_get_time(sc->vmm_vm);
1273 1315 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1274 1316 error = EFAULT;
1275 1317 break;
1276 1318 }
1277 1319 break;
1278 1320 }
1279 1321
1280 1322 case VM_PMTMR_LOCATE: {
1281 1323 uint16_t port = arg;
1282 1324 error = vpmtmr_set_location(sc->vmm_vm, port);
1283 1325 break;
1284 1326 }
1285 1327
1286 1328 case VM_RESTART_INSTRUCTION:
1287 1329 error = vm_restart_instruction(sc->vmm_vm, vcpu);
1288 1330 break;
1289 1331
1290 1332 case VM_SET_TOPOLOGY: {
1291 1333 struct vm_cpu_topology topo;
1292 1334
1293 1335 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1294 1336 error = EFAULT;
1295 1337 break;
1296 1338 }
1297 1339 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1298 1340 topo.threads, topo.maxcpus);
1299 1341 break;
1300 1342 }
1301 1343 case VM_GET_TOPOLOGY: {
1302 1344 struct vm_cpu_topology topo;
1303 1345
1304 1346 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1305 1347 &topo.threads, &topo.maxcpus);
1306 1348 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1307 1349 error = EFAULT;
1308 1350 break;
1309 1351 }
1310 1352 break;
1311 1353 }
1312 1354
1313 1355 #ifndef __FreeBSD__
1314 1356 case VM_DEVMEM_GETOFFSET: {
1315 1357 struct vm_devmem_offset vdo;
1316 1358 list_t *dl = &sc->vmm_devmem_list;
1317 1359 vmm_devmem_entry_t *de = NULL;
1318 1360
1319 1361 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1320 1362 error = EFAULT;
1321 1363 break;
1322 1364 }
1323 1365
1324 1366 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
1325 1367 if (de->vde_segid == vdo.segid) {
1326 1368 break;
1327 1369 }
1328 1370 }
1329 1371 if (de != NULL) {
1330 1372 vdo.offset = de->vde_off;
1331 1373 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1332 1374 error = EFAULT;
1333 1375 }
1334 1376 } else {
1335 1377 error = ENOENT;
1336 1378 }
1337 1379 break;
1338 1380 }
1339 1381 case VM_WRLOCK_CYCLE: {
1340 1382 /*
1341 1383 * Present a test mechanism to acquire/release the write lock
1342 1384 * on the VM without any other effects.
1343 1385 */
1344 1386 break;
1345 1387 }
1346 1388 #endif
1347 1389 default:
1348 1390 error = ENOTTY;
1349 1391 break;
1350 1392 }
1351 1393
1352 1394 /* Release exclusion resources */
1353 1395 switch (lock_type) {
1354 1396 case LOCK_NONE:
1355 1397 break;
1356 1398 case LOCK_VCPU:
1357 1399 vcpu_unlock_one(sc, vcpu);
1358 1400 break;
1359 1401 case LOCK_READ_HOLD:
1360 1402 vmm_read_unlock(sc);
1361 1403 break;
1362 1404 case LOCK_WRITE_HOLD:
1363 1405 vmm_write_unlock(sc);
1364 1406 break;
1365 1407 default:
1366 1408 panic("unexpected lock type");
1367 1409 break;
1368 1410 }
1369 1411
1370 1412 return (error);
1371 1413 }
1372 1414
1373 1415 static vmm_softc_t *
1374 1416 vmm_lookup(const char *name)
1375 1417 {
1376 1418 list_t *vml = &vmm_list;
1377 1419 vmm_softc_t *sc;
1378 1420
1379 1421 ASSERT(MUTEX_HELD(&vmm_mtx));
1380 1422
1381 1423 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1382 1424 if (strcmp(sc->vmm_name, name) == 0) {
1383 1425 break;
1384 1426 }
1385 1427 }
1386 1428
1387 1429 return (sc);
1388 1430 }
1389 1431
1390 1432 /*
1391 1433 * Acquire an HMA registration if not already held.
1392 1434 */
1393 1435 static boolean_t
1394 1436 vmm_hma_acquire(void)
1395 1437 {
1396 1438 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1397 1439
1398 1440 mutex_enter(&vmmdev_mtx);
1399 1441
1400 1442 if (vmmdev_hma_reg == NULL) {
1401 1443 VERIFY3U(vmmdev_hma_ref, ==, 0);
1402 1444 vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1403 1445 if (vmmdev_hma_reg == NULL) {
1404 1446 cmn_err(CE_WARN, "%s HMA registration failed.",
1405 1447 vmmdev_hvm_name);
1406 1448 mutex_exit(&vmmdev_mtx);
1407 1449 return (B_FALSE);
1408 1450 }
1409 1451 }
1410 1452
1411 1453 vmmdev_hma_ref++;
1412 1454
1413 1455 mutex_exit(&vmmdev_mtx);
1414 1456
1415 1457 return (B_TRUE);
1416 1458 }
1417 1459
1418 1460 /*
1419 1461 * Release the HMA registration if held and there are no remaining VMs.
1420 1462 */
1421 1463 static void
1422 1464 vmm_hma_release(void)
1423 1465 {
1424 1466 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1425 1467
1426 1468 mutex_enter(&vmmdev_mtx);
1427 1469
1428 1470 VERIFY3U(vmmdev_hma_ref, !=, 0);
1429 1471
1430 1472 vmmdev_hma_ref--;
1431 1473
1432 1474 if (vmmdev_hma_ref == 0) {
1433 1475 VERIFY(vmmdev_hma_reg != NULL);
1434 1476 hma_unregister(vmmdev_hma_reg);
1435 1477 vmmdev_hma_reg = NULL;
1436 1478 }
1437 1479 mutex_exit(&vmmdev_mtx);
1438 1480 }
1439 1481
1440 1482 static int
1441 1483 vmmdev_do_vm_create(char *name, cred_t *cr)
1442 1484 {
1443 1485 vmm_softc_t *sc = NULL;
1444 1486 minor_t minor;
1445 1487 int error = ENOMEM;
1446 1488
1447 1489 if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
1448 1490 return (EINVAL);
1449 1491 }
1450 1492
1451 1493 if (!vmm_hma_acquire())
1452 1494 return (ENXIO);
1453 1495
1454 1496 mutex_enter(&vmm_mtx);
1455 1497
1456 1498 /* Look for duplicate names */
1457 1499 if (vmm_lookup(name) != NULL) {
1458 1500 mutex_exit(&vmm_mtx);
1459 1501 vmm_hma_release();
1460 1502 return (EEXIST);
1461 1503 }
1462 1504
1463 1505 /* Allow only one instance per non-global zone. */
1464 1506 if (!INGLOBALZONE(curproc)) {
1465 1507 for (sc = list_head(&vmm_list); sc != NULL;
1466 1508 sc = list_next(&vmm_list, sc)) {
1467 1509 if (sc->vmm_zone == curzone) {
1468 1510 mutex_exit(&vmm_mtx);
1469 1511 vmm_hma_release();
1470 1512 return (EINVAL);
1471 1513 }
1472 1514 }
1473 1515 }
1474 1516
1475 1517 minor = id_alloc(vmm_minors);
1476 1518 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1477 1519 goto fail;
1478 1520 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1479 1521 ddi_soft_state_free(vmm_statep, minor);
1480 1522 goto fail;
1481 1523 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1482 1524 DDI_PSEUDO, 0) != DDI_SUCCESS) {
1483 1525 goto fail;
1484 1526 }
1485 1527
1486 1528 error = vm_create(name, &sc->vmm_vm);
1487 1529 if (error == 0) {
1488 1530 /* Complete VM intialization and report success. */
1489 1531 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1490 1532 sc->vmm_minor = minor;
1491 1533 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1492 1534 offsetof(vmm_devmem_entry_t, vde_node));
1493 1535
1494 1536 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1495 1537 offsetof(vmm_hold_t, vmh_node));
1496 1538 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1497 1539
1498 1540 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1499 1541 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1500 1542 offsetof(vmm_lease_t, vml_node));
1501 1543 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1502 1544 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1503 1545
1504 1546 sc->vmm_zone = crgetzone(cr);
1505 1547 zone_hold(sc->vmm_zone);
1506 1548 vmm_zsd_add_vm(sc);
1507 1549
1508 1550 list_insert_tail(&vmm_list, sc);
1509 1551 mutex_exit(&vmm_mtx);
1510 1552 return (0);
1511 1553 }
1512 1554
1513 1555 ddi_remove_minor_node(vmmdev_dip, name);
1514 1556 fail:
1515 1557 id_free(vmm_minors, minor);
1516 1558 if (sc != NULL) {
1517 1559 ddi_soft_state_free(vmm_statep, minor);
1518 1560 }
1519 1561 mutex_exit(&vmm_mtx);
1520 1562 vmm_hma_release();
1521 1563
1522 1564 return (error);
1523 1565 }
1524 1566
1525 1567 /*
1526 1568 * Bhyve 'Driver' Interface
1527 1569 *
1528 1570 * While many devices are emulated in the bhyve userspace process, there are
1529 1571 * others with performance constraints which require that they run mostly or
1530 1572 * entirely in-kernel. For those not integrated directly into bhyve, an API is
1531 1573 * needed so they can query/manipulate the portions of VM state needed to
1532 1574 * fulfill their purpose.
1533 1575 *
1534 1576 * This includes:
1535 1577 * - Translating guest-physical addresses to host-virtual pointers
1536 1578 * - Injecting MSIs
1537 1579 * - Hooking IO port addresses
1538 1580 *
1539 1581 * The vmm_drv interface exists to provide that functionality to its consumers.
1540 1582 * (At this time, 'viona' is the only user)
1541 1583 */
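A sketch of the lifecycle a vmm_drv consumer would follow, using the hold/lease interfaces defined below. The 'fp'/'cr' arguments are assumed to come from the consumer's own ioctl context, the expire callback is consumer-defined (see the earlier lease sketch), and the GPA and MSI address values are purely illustrative.

	static int
	example_drv_attach(file_t *fp, cred_t *cr, void *arg)
	{
		vmm_hold_t *hold;
		vmm_lease_t *lease;
		void *kva;

		if (vmm_drv_hold(fp, cr, &hold) != 0)
			return (ENXIO);

		lease = vmm_drv_lease_sign(hold, example_lease_expired, arg);
		if (lease == NULL) {
			vmm_drv_rele(hold);
			return (EBUSY);
		}

		/* Translate a guest-physical address and inject an MSI. */
		kva = vmm_drv_gpa2kva(lease, 0x1000, PAGESIZE);
		(void) vmm_drv_msi(lease, 0xfee00000, 0);

		vmm_drv_lease_break(hold, lease);
		vmm_drv_rele(hold);
		return (kva != NULL ? 0 : EFAULT);
	}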
1542 1584 int
1543 1585 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1544 1586 {
1545 1587 vnode_t *vp = fp->f_vnode;
1546 1588 const dev_t dev = vp->v_rdev;
1547 1589 vmm_softc_t *sc;
1548 1590 vmm_hold_t *hold;
1549 1591 int err = 0;
1550 1592
1551 1593 if (vp->v_type != VCHR) {
1552 1594 return (ENXIO);
1553 1595 }
1554 1596 const major_t major = getmajor(dev);
1555 1597 const minor_t minor = getminor(dev);
1556 1598
1557 1599 mutex_enter(&vmmdev_mtx);
1558 1600 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1559 1601 mutex_exit(&vmmdev_mtx);
1560 1602 return (ENOENT);
1561 1603 }
1562 1604 mutex_enter(&vmm_mtx);
1563 1605 mutex_exit(&vmmdev_mtx);
1564 1606
1565 1607 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1566 1608 err = ENOENT;
1567 1609 goto out;
1568 1610 }
1569 1611 /* XXXJOY: check cred permissions against instance */
1570 1612
1571 1613 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1572 1614 err = EBUSY;
1573 1615 goto out;
1574 1616 }
1575 1617
1576 1618 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1577 1619 hold->vmh_sc = sc;
1578 1620 hold->vmh_release_req = B_FALSE;
1579 1621
1580 1622 list_insert_tail(&sc->vmm_holds, hold);
1581 1623 sc->vmm_flags |= VMM_HELD;
1582 1624 *holdp = hold;
1583 1625
1584 1626 out:
1585 1627 mutex_exit(&vmm_mtx);
1586 1628 return (err);
1587 1629 }
1588 1630
1589 1631 void
1590 1632 vmm_drv_rele(vmm_hold_t *hold)
1591 1633 {
1592 1634 vmm_softc_t *sc;
1593 1635
1594 1636 ASSERT(hold != NULL);
1595 1637 ASSERT(hold->vmh_sc != NULL);
1596 1638 VERIFY(hold->vmh_ioport_hook_cnt == 0);
1597 1639
1598 1640 mutex_enter(&vmm_mtx);
1599 1641 sc = hold->vmh_sc;
1600 1642 list_remove(&sc->vmm_holds, hold);
1601 1643 if (list_is_empty(&sc->vmm_holds)) {
1602 1644 sc->vmm_flags &= ~VMM_HELD;
1603 1645 cv_broadcast(&sc->vmm_cv);
1604 1646 }
1605 1647 mutex_exit(&vmm_mtx);
1606 1648 kmem_free(hold, sizeof (*hold));
1607 1649 }
1608 1650
1609 1651 boolean_t
1610 1652 vmm_drv_release_reqd(vmm_hold_t *hold)
1611 1653 {
1612 1654 ASSERT(hold != NULL);
1613 1655
1614 1656 return (hold->vmh_release_req);
1615 1657 }
1616 1658
1617 1659 vmm_lease_t *
1618 1660 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1619 1661 {
1620 1662 vmm_softc_t *sc = hold->vmh_sc;
1621 1663 vmm_lease_t *lease;
1622 1664
1623 1665 ASSERT3P(expiref, !=, NULL);
1624 1666
1625 1667 if (hold->vmh_release_req) {
1626 1668 return (NULL);
1627 1669 }
1628 1670
1629 1671 lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1630 1672 list_link_init(&lease->vml_node);
1631 1673 lease->vml_expire_func = expiref;
1632 1674 lease->vml_expire_arg = arg;
1633 1675 lease->vml_expired = B_FALSE;
1634 1676 lease->vml_hold = hold;
1635 1677 /* cache the VM pointer for one less pointer chase */
1636 1678 lease->vml_vm = sc->vmm_vm;
1637 1679
1638 1680 mutex_enter(&sc->vmm_lease_lock);
1639 1681 while (sc->vmm_lease_blocker != 0) {
1640 1682 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1641 1683 }
1642 1684 list_insert_tail(&sc->vmm_lease_list, lease);
1643 1685 vmm_read_lock(sc);
1644 1686 mutex_exit(&sc->vmm_lease_lock);
1645 1687
1646 1688 return (lease);
1647 1689 }
1648 1690
1649 1691 static void
1650 1692 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1651 1693 {
1652 1694 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1653 1695
1654 1696 list_remove(&sc->vmm_lease_list, lease);
1655 1697 vmm_read_unlock(sc);
1656 1698 kmem_free(lease, sizeof (*lease));
1657 1699 }
1658 1700
1659 1701 void
1660 1702 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1661 1703 {
1662 1704 vmm_softc_t *sc = hold->vmh_sc;
1663 1705
1664 1706 VERIFY3P(hold, ==, lease->vml_hold);
1665 1707
1666 1708 mutex_enter(&sc->vmm_lease_lock);
1667 1709 vmm_lease_break_locked(sc, lease);
1668 1710 mutex_exit(&sc->vmm_lease_lock);
1669 1711 }
1670 1712
1671 1713 boolean_t
1672 1714 vmm_drv_lease_expired(vmm_lease_t *lease)
1673 1715 {
1674 1716 return (lease->vml_expired);
1675 1717 }
1676 1718
1677 1719 void *
1678 1720 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1679 1721 {
1680 1722 ASSERT(lease != NULL);
1681 1723
1682 1724 return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
1683 1725 }
1684 1726
1685 1727 int
1686 1728 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1687 1729 {
1688 1730 ASSERT(lease != NULL);
1689 1731
1690 1732 return (lapic_intr_msi(lease->vml_vm, addr, msg));
1691 1733 }
1692 1734
1693 1735 int
1694 1736 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
1695 1737 void *arg, void **cookie)
1696 1738 {
1697 1739 vmm_softc_t *sc;
1698 1740 int err;
1699 1741
1700 1742 ASSERT(hold != NULL);
1701 1743 ASSERT(cookie != NULL);
1702 1744
1703 1745 sc = hold->vmh_sc;
1704 1746 mutex_enter(&vmm_mtx);
1705 1747 /* Confirm that hook installation is not blocked */
1706 1748 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1707 1749 mutex_exit(&vmm_mtx);
1708 1750 return (EBUSY);
1709 1751 }
1710 1752 /*
1711 1753 * Optimistically record an installed hook which will prevent a block
1712 1754 * from being asserted while the mutex is dropped.
1713 1755 */
1714 1756 hold->vmh_ioport_hook_cnt++;
1715 1757 mutex_exit(&vmm_mtx);
1716 1758
1717 1759 vmm_write_lock(sc);
1718 1760 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
1719 1761 arg, cookie);
1720 1762 vmm_write_unlock(sc);
1721 1763
1722 1764 if (err != 0) {
1723 1765 mutex_enter(&vmm_mtx);
1724 1766 /* Walk back optimism about the hook installation */
1725 1767 hold->vmh_ioport_hook_cnt--;
1726 1768 mutex_exit(&vmm_mtx);
1727 1769 }
1728 1770 return (err);
1729 1771 }
1730 1772
1731 1773 void
1732 1774 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1733 1775 {
1734 1776 vmm_softc_t *sc;
1735 1777
1736 1778 ASSERT(hold != NULL);
1737 1779 ASSERT(cookie != NULL);
1738 1780 ASSERT(hold->vmh_ioport_hook_cnt != 0);
1739 1781
1740 1782 sc = hold->vmh_sc;
1741 1783 vmm_write_lock(sc);
1742 1784 vm_ioport_unhook(sc->vmm_vm, cookie);
1743 1785 vmm_write_unlock(sc);
1744 1786
1745 1787 mutex_enter(&vmm_mtx);
1746 1788 hold->vmh_ioport_hook_cnt--;
1747 1789 mutex_exit(&vmm_mtx);
1748 1790 }
1749 1791
1750 1792 static int
1751 1793 vmm_drv_purge(vmm_softc_t *sc)
1752 1794 {
1753 1795 ASSERT(MUTEX_HELD(&vmm_mtx));
1754 1796
1755 1797 if ((sc->vmm_flags & VMM_HELD) != 0) {
1756 1798 vmm_hold_t *hold;
1757 1799
1758 1800 sc->vmm_flags |= VMM_CLEANUP;
1759 1801 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1760 1802 hold = list_next(&sc->vmm_holds, hold)) {
1761 1803 hold->vmh_release_req = B_TRUE;
1762 1804 }
1763 1805 while ((sc->vmm_flags & VMM_HELD) != 0) {
1764 1806 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1765 1807 return (EINTR);
1766 1808 }
1767 1809 }
1768 1810 sc->vmm_flags &= ~VMM_CLEANUP;
1769 1811 }
1770 1812
1771 1813 VERIFY(list_is_empty(&sc->vmm_holds));
1772 1814 sc->vmm_flags |= VMM_PURGED;
1773 1815 return (0);
1774 1816 }
1775 1817
1776 1818 static int
1777 1819 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1778 1820 {
1779 1821 int err = 0;
1780 1822
1781 1823 mutex_enter(&vmm_mtx);
1782 1824 if (!enable_block) {
1783 1825 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1784 1826
1785 1827 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1786 1828 goto done;
1787 1829 }
1788 1830
1789 1831 /* If any holds have hooks installed, the block is a failure */
1790 1832 if (!list_is_empty(&sc->vmm_holds)) {
1791 1833 vmm_hold_t *hold;
1792 1834
1793 1835 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1794 1836 hold = list_next(&sc->vmm_holds, hold)) {
1795 1837 if (hold->vmh_ioport_hook_cnt != 0) {
1796 1838 err = EBUSY;
1797 1839 goto done;
1798 1840 }
1799 1841 }
1800 1842 }
1801 1843 sc->vmm_flags |= VMM_BLOCK_HOOK;
1802 1844
1803 1845 done:
1804 1846 mutex_exit(&vmm_mtx);
1805 1847 return (err);
1806 1848 }
1807 1849
1808 1850 static int
1809 1851 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1810 1852 boolean_t *hma_release)
1811 1853 {
1812 1854 dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
1813 1855 minor_t minor;
1814 1856
1815 1857 ASSERT(MUTEX_HELD(&vmm_mtx));
1816 1858
1817 1859 *hma_release = B_FALSE;
1818 1860
1819 1861 if (clean_zsd) {
1820 1862 vmm_zsd_rem_vm(sc);
1821 1863 }
1822 1864
1823 1865 if (vmm_drv_purge(sc) != 0) {
1824 1866 return (EINTR);
1825 1867 }
1826 1868
1827 1869 /* Clean up devmem entries */
1828 1870 vmmdev_devmem_purge(sc);
1829 1871
1830 1872 list_remove(&vmm_list, sc);
1831 1873 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
1832 1874 minor = sc->vmm_minor;
1833 1875 zone_rele(sc->vmm_zone);
1834 1876 if (sc->vmm_is_open) {
1835 1877 list_insert_tail(&vmm_destroy_list, sc);
1836 1878 sc->vmm_flags |= VMM_DESTROY;
1837 1879 } else {
1838 1880 vm_destroy(sc->vmm_vm);
1839 1881 ddi_soft_state_free(vmm_statep, minor);
1840 1882 id_free(vmm_minors, minor);
1841 1883 *hma_release = B_TRUE;
1842 1884 }
1843 1885 (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
1844 1886
1845 1887 return (0);
1846 1888 }
1847 1889
1848 1890 int
1849 1891 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
1850 1892 {
1851 1893 boolean_t hma_release = B_FALSE;
1852 1894 int err;
1853 1895
1854 1896 mutex_enter(&vmm_mtx);
1855 1897 err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
1856 1898 mutex_exit(&vmm_mtx);
1857 1899
1858 1900 if (hma_release)
1859 1901 vmm_hma_release();
1860 1902
1861 1903 return (err);
1862 1904 }
1863 1905
1864 1906 /* ARGSUSED */
1865 1907 static int
1866 1908 vmmdev_do_vm_destroy(const char *name, cred_t *cr)
1867 1909 {
1868 1910 boolean_t hma_release = B_FALSE;
1869 1911 vmm_softc_t *sc;
1870 1912 int err;
1871 1913
1872 1914 if (crgetuid(cr) != 0)
1873 1915 return (EPERM);
1874 1916
1875 1917 mutex_enter(&vmm_mtx);
1876 1918
1877 1919 if ((sc = vmm_lookup(name)) == NULL) {
1878 1920 mutex_exit(&vmm_mtx);
1879 1921 return (ENOENT);
1880 1922 }
1881 1923 /*
1882 1924 * We don't check this in vmm_lookup() since that function is also used
1883 1925 * for validation during create and currently vmm names must be unique.
1884 1926 */
1885 1927 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
1886 1928 mutex_exit(&vmm_mtx);
1887 1929 return (EPERM);
1888 1930 }
1889 1931 err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
1890 1932
1891 1933 mutex_exit(&vmm_mtx);
1892 1934
1893 1935 if (hma_release)
1894 1936 vmm_hma_release();
1895 1937
1896 1938 return (err);
1897 1939 }
1898 1940
1899 1941 static int
1900 1942 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1901 1943 {
1902 1944 minor_t minor;
1903 1945 vmm_softc_t *sc;
1904 1946
1905 1947 minor = getminor(*devp);
1906 1948 if (minor == VMM_CTL_MINOR) {
1907 1949 /*
1908 1950 * Master control device must be opened exclusively.
1909 1951 */
1910 1952 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
1911 1953 return (EINVAL);
1912 1954 }
1913 1955
1914 1956 return (0);
1915 1957 }
1916 1958
1917 1959 mutex_enter(&vmm_mtx);
1918 1960 sc = ddi_get_soft_state(vmm_statep, minor);
1919 1961 if (sc == NULL) {
1920 1962 mutex_exit(&vmm_mtx);
1921 1963 return (ENXIO);
1922 1964 }
1923 1965
1924 1966 sc->vmm_is_open = B_TRUE;
1925 1967 mutex_exit(&vmm_mtx);
1926 1968
1927 1969 return (0);
1928 1970 }
1929 1971
1930 1972 static int
1931 1973 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
1932 1974 {
1933 1975 minor_t minor;
1934 1976 vmm_softc_t *sc;
1935 1977 boolean_t hma_release = B_FALSE;
1936 1978
1937 1979 minor = getminor(dev);
1938 1980 if (minor == VMM_CTL_MINOR)
1939 1981 return (0);
1940 1982
1941 1983 mutex_enter(&vmm_mtx);
1942 1984 sc = ddi_get_soft_state(vmm_statep, minor);
1943 1985 if (sc == NULL) {
1944 1986 mutex_exit(&vmm_mtx);
1945 1987 return (ENXIO);
1946 1988 }
1947 1989
1948 1990 VERIFY(sc->vmm_is_open);
1949 1991 sc->vmm_is_open = B_FALSE;
1950 1992
1951 1993 /*
1952 1994 * If this VM was destroyed while the vmm device was open, then
1953 1995 * clean it up now that it is closed.
1954 1996 */
1955 1997 if (sc->vmm_flags & VMM_DESTROY) {
1956 1998 list_remove(&vmm_destroy_list, sc);
1957 1999 vm_destroy(sc->vmm_vm);
1958 2000 ddi_soft_state_free(vmm_statep, minor);
1959 2001 id_free(vmm_minors, minor);
1960 2002 hma_release = B_TRUE;
1961 2003 }
1962 2004 mutex_exit(&vmm_mtx);
1963 2005
1964 2006 if (hma_release)
1965 2007 vmm_hma_release();
1966 2008
1967 2009 return (0);
1968 2010 }
1969 2011
1970 2012 static int
1971 2013 vmm_is_supported(intptr_t arg)
1972 2014 {
1973 2015 int r;
1974 2016 const char *msg;
1975 2017
1976 2018 if (vmm_is_intel()) {
1977 2019 r = vmx_x86_supported(&msg);
1978 2020 } else if (vmm_is_svm()) {
1979 2021 /*
1980 2022 * HMA already ensured that the features necessary for SVM
1981 2023 * operation were present and online during vmm_attach().
1982 2024 */
1983 2025 r = 0;
1984 2026 } else {
1985 2027 r = ENXIO;
1986 2028 msg = "Unsupported CPU vendor";
1987 2029 }
1988 2030
1989 2031 if (r != 0 && arg != (intptr_t)NULL) {
1990 2032 if (copyoutstr(msg, (char *)arg, strlen(msg), NULL) != 0)
1991 2033 return (EFAULT);
1992 2034 }
1993 2035 return (r);
1994 2036 }
1995 2037
1996 2038 static int
1997 2039 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1998 2040 int *rvalp)
1999 2041 {
2000 2042 vmm_softc_t *sc;
2001 2043 minor_t minor;
2002 2044
2003 2045 /* The structs in bhyve ioctls assume a 64-bit datamodel */
2004 2046 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2005 2047 return (ENOTSUP);
2006 2048 }
2007 2049
2008 2050 minor = getminor(dev);
2009 2051
2010 2052 if (minor == VMM_CTL_MINOR) {
2011 2053 void *argp = (void *)arg;
2012 2054 char name[VM_MAX_NAMELEN] = { 0 };
2013 2055 size_t len = 0;
2014 2056
2015 2057 if ((mode & FKIOCTL) != 0) {
2016 2058 len = strlcpy(name, argp, sizeof (name));
2017 2059 } else {
2018 2060 if (copyinstr(argp, name, sizeof (name), &len) != 0) {
2019 2061 return (EFAULT);
2020 2062 }
2021 2063 }
2022 2064 if (len >= VM_MAX_NAMELEN) {
2023 2065 return (ENAMETOOLONG);
2024 2066 }
2025 2067
2026 2068 switch (cmd) {
2027 2069 case VMM_CREATE_VM:
2028 2070 if ((mode & FWRITE) == 0)
2029 2071 return (EPERM);
2030 2072 return (vmmdev_do_vm_create(name, credp));
2031 2073 case VMM_DESTROY_VM:
2032 2074 if ((mode & FWRITE) == 0)
2033 2075 return (EPERM);
2034 2076 return (vmmdev_do_vm_destroy(name, credp));
2035 2077 case VMM_VM_SUPPORTED:
2036 2078 return (vmm_is_supported(arg));
2037 2079 default:
2038 2080 /* No other actions are legal on ctl device */
2039 2081 return (ENOTTY);
2040 2082 }
2041 2083 }
2042 2084
2043 2085 sc = ddi_get_soft_state(vmm_statep, minor);
2044 2086 ASSERT(sc);
2045 2087
2046 2088 if (sc->vmm_flags & VMM_DESTROY)
2047 2089 return (ENXIO);
2048 2090
2049 2091 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2050 2092 }
2051 2093
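/*
 * A minimal userland sketch of exercising the control-device ioctls
 * handled above.  The /dev/vmmctl path and the error handling are
 * assumptions; the command values come from <sys/vmm_dev.h>, the name
 * string itself is the ioctl argument (it is copied in with copyinstr),
 * the node must be opened exclusively with write access, and sufficient
 * privilege is required (the destroy path returns EPERM otherwise).
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/vmm_dev.h>
#include <fcntl.h>
#include <unistd.h>

static int
sample_create_vm(const char *name)
{
	int ctlfd, err;

	/* O_EXCL satisfies the FEXCL check in vmm_open(). */
	if ((ctlfd = open("/dev/vmmctl", O_RDWR | O_EXCL)) < 0)
		return (-1);
	err = ioctl(ctlfd, VMM_CREATE_VM, name);
	(void) close(ctlfd);
	return (err);
}
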
2052 2094 static int
2053 2095 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2054 2096 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2055 2097 {
2056 2098 vmm_softc_t *sc;
2057 2099 const minor_t minor = getminor(dev);
2058 2100 struct vm *vm;
2059 2101 int err;
2060 2102 vm_object_t vmo = NULL;
2061 2103 struct vmspace *vms;
2062 2104
2063 2105 if (minor == VMM_CTL_MINOR) {
2064 2106 return (ENODEV);
2065 2107 }
2066 2108 if (off < 0 || (off + len) <= 0) {
2067 2109 return (EINVAL);
2068 2110 }
2069 2111 if ((prot & PROT_USER) == 0) {
2070 2112 return (EACCES);
2071 2113 }
2072 2114
2073 2115 sc = ddi_get_soft_state(vmm_statep, minor);
2074 2116 ASSERT(sc);
2075 2117
2076 2118 if (sc->vmm_flags & VMM_DESTROY)
2077 2119 return (ENXIO);
2078 2120
2079 2121 /* Grab read lock on the VM to prevent any changes to the memory map */
2080 2122 vmm_read_lock(sc);
2081 2123
2082 2124 vm = sc->vmm_vm;
2083 2125 vms = vm_get_vmspace(vm);
2084 2126 if (off >= VM_DEVMEM_START) {
2085 2127 int segid;
2086 2128 off_t map_off = 0;
2087 2129
2088 2130 /* Mapping a devmem "device" */
2089 2131 if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) {
2090 2132 err = ENODEV;
2091 2133 goto out;
2092 2134 }
2093 2135 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
2094 2136 if (err != 0) {
2095 2137 goto out;
2096 2138 }
2097 2139 err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot,
2098 2140 flags);
2099 2141 } else {
2100 2142 /* Mapping a part of the guest physical space */
2101 2143 err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
2102 2144 flags);
2103 2145 }
2104 2146
2105 2147
2106 2148 out:
2107 2149 vmm_read_unlock(sc);
2108 2150 return (err);
2109 2151 }
2110 2152
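/*
 * A minimal userland sketch of the mapping path above: offsets below
 * VM_DEVMEM_START select guest-physical space, while devmem segments
 * live at their registered offsets above it.  It assumes vmfd is an
 * open descriptor for a /dev/vmm/<name> node and that guest memory has
 * already been configured for the requested range; the PROT_USER check
 * is satisfied implicitly for an ordinary userland mmap(2).
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <stddef.h>

static void *
sample_map_guest_ram(int vmfd, off_t gpa, size_t len)
{
	void *base;

	/* The file offset is interpreted as a guest-physical address. */
	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, vmfd, gpa);
	return (base == MAP_FAILED ? NULL : base);
}
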
2111 2153 static sdev_plugin_validate_t
2112 2154 vmm_sdev_validate(sdev_ctx_t ctx)
2113 2155 {
2114 2156 const char *name = sdev_ctx_name(ctx);
2115 2157 vmm_softc_t *sc;
2116 2158 sdev_plugin_validate_t ret;
2117 2159 minor_t minor;
2118 2160
2119 2161 if (sdev_ctx_vtype(ctx) != VCHR)
2120 2162 return (SDEV_VTOR_INVALID);
2121 2163
2122 2164 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2123 2165
2124 2166 mutex_enter(&vmm_mtx);
2125 2167 if ((sc = vmm_lookup(name)) == NULL)
2126 2168 ret = SDEV_VTOR_INVALID;
2127 2169 else if (sc->vmm_minor != minor)
2128 2170 ret = SDEV_VTOR_STALE;
2129 2171 else
2130 2172 ret = SDEV_VTOR_VALID;
2131 2173 mutex_exit(&vmm_mtx);
2132 2174
2133 2175 return (ret);
2134 2176 }
2135 2177
2136 2178 static int
2137 2179 vmm_sdev_filldir(sdev_ctx_t ctx)
2138 2180 {
2139 2181 vmm_softc_t *sc;
2140 2182 int ret;
2141 2183
2142 2184 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2143 2185 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2144 2186 sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2145 2187 return (EINVAL);
2146 2188 }
2147 2189
2148 2190 mutex_enter(&vmm_mtx);
2149 2191 ASSERT(vmmdev_dip != NULL);
2150 2192 for (sc = list_head(&vmm_list); sc != NULL;
2151 2193 sc = list_next(&vmm_list, sc)) {
2152 2194 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2153 2195 ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2154 2196 S_IFCHR | 0600,
2155 2197 makedevice(ddi_driver_major(vmmdev_dip),
2156 2198 sc->vmm_minor));
2157 2199 } else {
2158 2200 continue;
2159 2201 }
2160 2202 if (ret != 0 && ret != EEXIST)
2161 2203 goto out;
2162 2204 }
2163 2205
2164 2206 ret = 0;
2165 2207
2166 2208 out:
2167 2209 mutex_exit(&vmm_mtx);
2168 2210 return (ret);
2169 2211 }
2170 2212
2171 2213 /* ARGSUSED */
2172 2214 static void
2173 2215 vmm_sdev_inactive(sdev_ctx_t ctx)
2174 2216 {
2175 2217 }
2176 2218
2177 2219 static sdev_plugin_ops_t vmm_sdev_ops = {
2178 2220 .spo_version = SDEV_PLUGIN_VERSION,
2179 2221 .spo_flags = SDEV_PLUGIN_SUBDIR,
2180 2222 .spo_validate = vmm_sdev_validate,
2181 2223 .spo_filldir = vmm_sdev_filldir,
2182 2224 .spo_inactive = vmm_sdev_inactive
2183 2225 };
2184 2226
2185 2227 /* ARGSUSED */
2186 2228 static int
2187 2229 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2188 2230 {
2189 2231 int error;
2190 2232
2191 2233 switch (cmd) {
2192 2234 case DDI_INFO_DEVT2DEVINFO:
2193 2235 *result = (void *)vmmdev_dip;
2194 2236 error = DDI_SUCCESS;
2195 2237 break;
2196 2238 case DDI_INFO_DEVT2INSTANCE:
2197 2239 *result = (void *)0;
2198 2240 error = DDI_SUCCESS;
2199 2241 break;
2200 2242 default:
2201 2243 error = DDI_FAILURE;
2202 2244 break;
2203 2245 }
2204 2246 return (error);
2205 2247 }
2206 2248
2207 2249 static int
2208 2250 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2209 2251 {
2210 2252 sdev_plugin_hdl_t sph;
2211 2253 hma_reg_t *reg = NULL;
2212 2254 boolean_t vmm_loaded = B_FALSE;
2213 2255
2214 2256 if (cmd != DDI_ATTACH) {
2215 2257 return (DDI_FAILURE);
2216 2258 }
2217 2259
2218 2260 mutex_enter(&vmmdev_mtx);
2219 2261 /* Ensure we are not already attached. */
2220 2262 if (vmmdev_dip != NULL) {
2221 2263 mutex_exit(&vmmdev_mtx);
2222 2264 return (DDI_FAILURE);
2223 2265 }
2224 2266
2225 2267 vmm_sol_glue_init();
2226 2268 vmm_arena_init();
2227 2269
2228 2270 /*
2229 2271 * Perform temporary HMA registration to determine if the system
2230 2272 * is capable.
2231 2273 */
2232 2274 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2233 2275 goto fail;
2234 2276 } else if (vmm_mod_load() != 0) {
2235 2277 goto fail;
2236 2278 }
2237 2279 vmm_loaded = B_TRUE;
2238 2280 hma_unregister(reg);
2239 2281 reg = NULL;
2240 2282
2241 2283 /* Create control node. Other nodes will be created on demand. */
2242 2284 if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2243 2285 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2244 2286 goto fail;
2245 2287 }
2246 2288
2247 2289 if ((sph = sdev_plugin_register("vmm", &vmm_sdev_ops, NULL)) ==
2248 2290 (sdev_plugin_hdl_t)NULL) {
2249 2291 ddi_remove_minor_node(dip, NULL);
2250 2292 goto fail;
2251 2293 }
2252 2294
2253 2295 ddi_report_dev(dip);
2254 2296 vmmdev_sdev_hdl = sph;
2255 2297 vmmdev_dip = dip;
2256 2298 mutex_exit(&vmmdev_mtx);
2257 2299 return (DDI_SUCCESS);
2258 2300
2259 2301 fail:
2260 2302 if (vmm_loaded) {
2261 2303 VERIFY0(vmm_mod_unload());
2262 2304 }
2263 2305 if (reg != NULL) {
2264 2306 hma_unregister(reg);
2265 2307 }
2266 2308 vmm_arena_fini();
2267 2309 vmm_sol_glue_cleanup();
2268 2310 mutex_exit(&vmmdev_mtx);
2269 2311 return (DDI_FAILURE);
2270 2312 }
2271 2313
2272 2314 static int
2273 2315 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2274 2316 {
2275 2317 if (cmd != DDI_DETACH) {
2276 2318 return (DDI_FAILURE);
2277 2319 }
2278 2320
2279 2321 /*
2280 2322 * Ensure that all resources have been cleaned up.
2281 2323 *
2282 2324 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2283 2325 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2284 2326 * devinfo locked as iommu_cleanup() tries to recursively lock each
2285 2327 * devinfo, including our own, while holding vmmdev_mtx.
2286 2328 */
2287 2329 if (mutex_tryenter(&vmmdev_mtx) == 0)
2288 2330 return (DDI_FAILURE);
2289 2331
2290 2332 mutex_enter(&vmm_mtx);
2291 2333 if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2292 2334 mutex_exit(&vmm_mtx);
2293 2335 mutex_exit(&vmmdev_mtx);
2294 2336 return (DDI_FAILURE);
2295 2337 }
2296 2338 mutex_exit(&vmm_mtx);
2297 2339
2298 2340 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2299 2341 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2300 2342 mutex_exit(&vmmdev_mtx);
2301 2343 return (DDI_FAILURE);
2302 2344 }
2303 2345 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2304 2346
2305 2347 /* Remove the control node. */
2306 2348 ddi_remove_minor_node(dip, "ctl");
2307 2349 vmmdev_dip = NULL;
2308 2350
2309 2351 VERIFY0(vmm_mod_unload());
2310 2352 VERIFY3U(vmmdev_hma_reg, ==, NULL);
2311 2353 vmm_arena_fini();
2312 2354 vmm_sol_glue_cleanup();
2313 2355
2314 2356 mutex_exit(&vmmdev_mtx);
2315 2357
2316 2358 return (DDI_SUCCESS);
2317 2359 }
2318 2360
2319 2361 static struct cb_ops vmm_cb_ops = {
2320 2362 vmm_open,
2321 2363 vmm_close,
2322 2364 nodev, /* strategy */
2323 2365 nodev, /* print */
2324 2366 nodev, /* dump */
2325 2367 nodev, /* read */
2326 2368 nodev, /* write */
2327 2369 vmm_ioctl,
2328 2370 nodev, /* devmap */
2329 2371 nodev, /* mmap */
2330 2372 vmm_segmap,
2331 2373 nochpoll, /* poll */
2332 2374 ddi_prop_op,
2333 2375 NULL,
2334 2376 D_NEW | D_MP | D_DEVMAP
2335 2377 };
2336 2378
2337 2379 static struct dev_ops vmm_ops = {
2338 2380 DEVO_REV,
2339 2381 0,
2340 2382 vmm_info,
2341 2383 nulldev, /* identify */
2342 2384 nulldev, /* probe */
2343 2385 vmm_attach,
2344 2386 vmm_detach,
2345 2387 nodev, /* reset */
2346 2388 &vmm_cb_ops,
2347 2389 (struct bus_ops *)NULL
2348 2390 };
2349 2391
2350 2392 static struct modldrv modldrv = {
2351 2393 &mod_driverops,
2352 2394 "bhyve vmm",
2353 2395 &vmm_ops
2354 2396 };
2355 2397
2356 2398 static struct modlinkage modlinkage = {
2357 2399 MODREV_1,
2358 2400 &modldrv,
2359 2401 NULL
2360 2402 };
2361 2403
2362 2404 int
2363 2405 _init(void)
2364 2406 {
2365 2407 int error;
2366 2408
2367 2409 sysinit();
2368 2410
2369 2411 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2370 2412 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2371 2413 list_create(&vmm_list, sizeof (vmm_softc_t),
2372 2414 offsetof(vmm_softc_t, vmm_node));
2373 2415 list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2374 2416 offsetof(vmm_softc_t, vmm_node));
2375 2417 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2376 2418
2377 2419 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2378 2420 if (error) {
2379 2421 return (error);
2380 2422 }
2381 2423
2382 2424 vmm_zsd_init();
2383 2425
2384 2426 error = mod_install(&modlinkage);
2385 2427 if (error) {
2386 2428 ddi_soft_state_fini(&vmm_statep);
2387 2429 vmm_zsd_fini();
2388 2430 }
2389 2431
2390 2432 return (error);
2391 2433 }
2392 2434
2393 2435 int
2394 2436 _fini(void)
2395 2437 {
2396 2438 int error;
2397 2439
2398 2440 error = mod_remove(&modlinkage);
2399 2441 if (error) {
2400 2442 return (error);
2401 2443 }
2402 2444
2403 2445 vmm_zsd_fini();
2404 2446
2405 2447 ddi_soft_state_fini(&vmm_statep);
2406 2448
2407 2449 return (0);
2408 2450 }
2409 2451
2410 2452 int
2411 2453 _info(struct modinfo *modinfop)
2412 2454 {
2413 2455 return (mod_info(&modlinkage, modinfop));
2414 2456 }