Revert "OS-8005 bhyve memory pressure needs to target ARC better (#354)"
This reverts commit a6033573eedd94118d2b9e65f45deca0bf4b42f7.
--- old/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
+++ new/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 12
13 13 /*
14 14 * Copyright 2015 Pluribus Networks Inc.
15 - * Copyright 2020 Joyent, Inc.
15 + * Copyright 2019 Joyent, Inc.
16 16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17 17 * Copyright 2021 Oxide Computer Company
18 18 */
19 19
20 20 #include <sys/types.h>
21 21 #include <sys/conf.h>
22 22 #include <sys/cpuvar.h>
23 23 #include <sys/ioccom.h>
24 24 #include <sys/stat.h>
25 25 #include <sys/vmsystm.h>
26 26 #include <sys/ddi.h>
27 27 #include <sys/mkdev.h>
28 28 #include <sys/sunddi.h>
29 29 #include <sys/fs/dv_node.h>
30 30 #include <sys/cpuset.h>
31 31 #include <sys/id_space.h>
32 32 #include <sys/fs/sdev_plugin.h>
33 33 #include <sys/smt.h>
34 34 #include <sys/kstat.h>
35 35
36 36 #include <sys/kernel.h>
37 37 #include <sys/hma.h>
38 38 #include <sys/x86_archext.h>
39 39 #include <x86/apicreg.h>
40 40
41 41 #include <sys/vmm.h>
42 42 #include <sys/vmm_kernel.h>
43 43 #include <sys/vmm_instruction_emul.h>
44 44 #include <sys/vmm_dev.h>
45 45 #include <sys/vmm_impl.h>
46 46 #include <sys/vmm_drv.h>
47 47 #include <sys/vmm_vm.h>
48 48
49 49 #include <vm/seg_dev.h>
50 50
51 51 #include "io/ppt.h"
52 52 #include "io/vatpic.h"
53 53 #include "io/vioapic.h"
54 54 #include "io/vrtc.h"
55 55 #include "io/vhpet.h"
56 56 #include "io/vpmtmr.h"
57 57 #include "vmm_lapic.h"
58 58 #include "vmm_stat.h"
59 59 #include "vmm_util.h"
60 60
61 61 /*
62 62 * Locking details:
63 63 *
64 64 * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is
65 65 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
66 66 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
67 67 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
68 68 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
69 69 */
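As a minimal sketch of that ordering rule (example_locked_update() is hypothetical and not part of this change), a path that needs both the driver-wide and the instance-list state would look like:

	static void
	example_locked_update(void)
	{
		mutex_enter(&vmmdev_mtx);	/* driver-wide (HMA, sdev) state */
		mutex_enter(&vmm_mtx);		/* vmm_softc_t list state */

		/* ... inspect or modify vmmdev_* and vmm_* data ... */

		mutex_exit(&vmm_mtx);
		mutex_exit(&vmmdev_mtx);
	}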
70 70
71 71 static kmutex_t vmmdev_mtx;
72 72 static dev_info_t *vmmdev_dip;
73 73 static hma_reg_t *vmmdev_hma_reg;
74 74 static uint_t vmmdev_hma_ref;
75 75 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
76 76
77 77 static kmutex_t vmm_mtx;
78 78 static list_t vmm_list;
79 79 static list_t vmm_destroy_list;
80 80 static id_space_t *vmm_minors;
81 81 static void *vmm_statep;
82 82
83 83 static const char *vmmdev_hvm_name = "bhyve";
84 84
85 85 /* For sdev plugin (/dev) */
86 86 #define VMM_SDEV_ROOT "/dev/vmm"
87 87
88 88 /* From uts/i86pc/io/vmm/intel/vmx.c */
89 89 extern int vmx_x86_supported(const char **);
90 90
91 91 /* Holds and hooks from drivers external to vmm */
92 92 struct vmm_hold {
93 93 list_node_t vmh_node;
94 94 vmm_softc_t *vmh_sc;
95 95 boolean_t vmh_release_req;
96 96 uint_t vmh_ioport_hook_cnt;
97 97 };
98 98
99 99 struct vmm_lease {
100 100 list_node_t vml_node;
101 101 struct vm *vml_vm;
102 102 boolean_t vml_expired;
103 103 boolean_t (*vml_expire_func)(void *);
104 104 void *vml_expire_arg;
105 105 list_node_t vml_expire_node;
106 106 struct vmm_hold *vml_hold;
107 107 };
108 108
109 109 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
110 110 static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);
111 111 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
112 112 static void vmm_kstat_init(vmm_softc_t *);
113 113 static void vmm_kstat_fini(vmm_softc_t *);
114 114
115 115 static int
116 116 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
117 117 {
118 118 int error;
119 119 bool sysmem;
120 120
121 121 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
122 122 NULL);
123 123 if (error || mseg->len == 0)
124 124 return (error);
125 125
126 126 if (!sysmem) {
127 127 vmm_devmem_entry_t *de;
128 128 list_t *dl = &sc->vmm_devmem_list;
129 129
130 130 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
131 131 if (de->vde_segid == mseg->segid) {
132 132 break;
133 133 }
134 134 }
135 135 if (de != NULL) {
136 136 (void) strlcpy(mseg->name, de->vde_name,
137 137 sizeof (mseg->name));
138 138 }
139 139 } else {
140 140 bzero(mseg->name, sizeof (mseg->name));
141 141 }
142 142
143 143 return (error);
144 144 }
145 145
146 146 /*
147 147 * The 'devmem' hack:
148 148 *
149 149 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
150 150 * in the vm which appear with their own name related to the vm under /dev.
151 151 * Since this would be a hassle from an sdev perspective and would require a
152 152 * new cdev interface (or complicate the existing one), we choose to implement
153 153 * this in a different manner. When 'devmem' mappings are created, an
154 154 * identifying off_t is communicated back out to userspace. That off_t,
155 155 * residing above the normal guest memory space, can be used to mmap the
156 156 * 'devmem' mapping from the already-open vm device.
157 157 */
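A hedged userspace sketch of that flow (the helper name, header set, and segment id are illustrative; VM_ALLOC_MEMSEG and VM_DEVMEM_GETOFFSET are the ioctls handled later in this file):

	#include <sys/types.h>
	#include <sys/mman.h>
	#include <sys/vmm.h>
	#include <sys/vmm_dev.h>
	#include <string.h>
	#include <unistd.h>
	#include <err.h>

	/*
	 * Illustrative only: 'vmfd' is an already-open /dev/vmm/<name>
	 * descriptor and 'segid' is whatever id the consumer chose for its
	 * devmem segment.
	 */
	static void *
	example_map_devmem(int vmfd, int segid, size_t len)
	{
		struct vm_memseg mseg = { 0 };
		struct vm_devmem_offset vdo = { 0 };

		mseg.segid = segid;
		mseg.len = len;
		(void) strlcpy(mseg.name, "example", sizeof (mseg.name));
		if (ioctl(vmfd, VM_ALLOC_MEMSEG, &mseg) != 0)
			err(1, "VM_ALLOC_MEMSEG");

		vdo.segid = segid;
		if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) != 0)
			err(1, "VM_DEVMEM_GETOFFSET");

		/* The offset handed back sits above guest memory (>= VM_DEVMEM_START). */
		return (mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
		    vmfd, vdo.offset));
	}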
158 158
159 159 static int
160 160 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
161 161 {
162 162 off_t map_offset;
163 163 vmm_devmem_entry_t *entry;
164 164
165 165 if (list_is_empty(&sc->vmm_devmem_list)) {
166 166 map_offset = VM_DEVMEM_START;
167 167 } else {
168 168 entry = list_tail(&sc->vmm_devmem_list);
169 169 map_offset = entry->vde_off + entry->vde_len;
170 170 if (map_offset < entry->vde_off) {
171 171 /* Do not tolerate overflow */
172 172 return (ERANGE);
173 173 }
174 174 /*
175 175 * XXXJOY: We could choose to search the list for duplicate
176 176 * names and toss an error. Since we're using the offset
177 177 * method for now, it does not make much of a difference.
178 178 */
179 179 }
180 180
181 181 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
182 182 entry->vde_segid = mseg->segid;
183 183 entry->vde_len = mseg->len;
184 184 entry->vde_off = map_offset;
185 185 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
186 186 list_insert_tail(&sc->vmm_devmem_list, entry);
187 187
188 188 return (0);
189 189 }
190 190
191 191 static boolean_t
192 192 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
193 193 off_t *map_offp)
194 194 {
195 195 list_t *dl = &sc->vmm_devmem_list;
196 196 vmm_devmem_entry_t *de = NULL;
197 197 const off_t map_end = off + len;
198 198
199 199 VERIFY(off >= VM_DEVMEM_START);
200 200
201 201 if (map_end < off) {
202 202 /* No match on overflow */
203 203 return (B_FALSE);
204 204 }
205 205
206 206 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
207 207 const off_t item_end = de->vde_off + de->vde_len;
208 208
209 209 if (de->vde_off <= off && item_end >= map_end) {
210 210 *segidp = de->vde_segid;
211 211 *map_offp = off - de->vde_off;
212 212 return (B_TRUE);
213 213 }
214 214 }
215 215 return (B_FALSE);
216 216 }
217 217
218 218 static void
219 219 vmmdev_devmem_purge(vmm_softc_t *sc)
220 220 {
221 221 vmm_devmem_entry_t *entry;
222 222
223 223 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
224 224 kmem_free(entry, sizeof (*entry));
225 225 }
226 226 }
227 227
228 228 static int
229 229 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
230 230 {
231 231 int error;
232 232 bool sysmem = true;
233 233
234 234 if (VM_MEMSEG_NAME(mseg)) {
235 235 sysmem = false;
236 236 }
237 237 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
238 238
239 239 if (error == 0 && VM_MEMSEG_NAME(mseg)) {
240 240 /*
241 241 * Rather than create a whole fresh device from which userspace
242 242 * can mmap this segment, instead make it available at an
243 243 * offset above where the main guest memory resides.
244 244 */
245 245 error = vmmdev_devmem_create(sc, mseg, mseg->name);
246 246 if (error != 0) {
247 247 vm_free_memseg(sc->vmm_vm, mseg->segid);
248 248 }
249 249 }
250 250 return (error);
251 251 }
252 252
253 253 /*
254 254 * Resource Locking and Exclusion
255 255 *
256 256 * Much of bhyve depends on key portions of VM state, such as the guest memory
257 257 * map, to remain unchanged while the guest is running. As ported from
258 258 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
259 259 * access to the instance vCPUs. Threads acting on a single vCPU, like those
260 260 * performing the work of actually running the guest in VMX/SVM, would lock
261 261 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
262 262 * state, all of the vCPUs would be first locked, ensuring that the
263 263 * operation(s) could complete without any other threads stumbling into
264 264 * intermediate states.
265 265 *
266 266 * This approach is largely effective for bhyve. Common operations, such as
267 267 * running the vCPUs, steer clear of lock contention. The model begins to
268 268 * break down for operations which do not occur in the context of a specific
269 269 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
270 270 * thread in the bhyve process. In order to properly protect those vCPU-less
271 271 * operations from encountering invalid states, additional locking is required.
272 272 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
273 273 * It does mean that class of operations will be serialized on locking the
274 274 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
275 275 * undue contention on the VM_MAXCPU-1 vCPU.
276 276 *
277 277 * In order to address the shortcomings of this model, the concept of a
278 278 * read/write lock has been added to bhyve. Operations which change
279 279 * fundamental aspects of a VM (such as the memory map) must acquire the write
280 280 * lock, which also implies locking all of the vCPUs and waiting for all read
281 281 * lock holders to release. While it increases the cost and waiting time for
282 282 * those few operations, it allows most hot-path operations on the VM (which
283 283 * depend on its configuration remaining stable) to occur with minimal locking.
284 284 *
285 285 * Consumers of the Driver API (see below) are a special case when it comes to
286 286 * this locking, since they may hold a read lock via the drv_lease mechanism
287 287 * for an extended period of time. Rather than forcing those consumers to
288 288 * continuously poll for a write lock attempt, the lease system forces them to
289 289 * provide a release callback to trigger their clean-up (and potential later
290 290 * reacquisition) of the read lock.
291 291 */
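The lease contract described in the last paragraph can be sketched as follows; example_lease_expired_cb() and the esc consumer state are hypothetical, while vmm_drv_lease_sign() and vmm_drv_lease_break() are defined later in this file:

	static boolean_t
	example_lease_expired_cb(void *arg)
	{
		example_softc_t *esc = arg;	/* hypothetical consumer state */

		/* Tell the consumer's worker to stop touching lease-backed state. */
		esc->esc_lease_gone = B_TRUE;

		/*
		 * B_FALSE: the consumer will break the lease itself (via
		 * vmm_drv_lease_break) once it has quiesced.  B_TRUE would
		 * permit vmm to break the lease synchronously from here.
		 */
		return (B_FALSE);
	}

A consumer installs the callback with vmm_drv_lease_sign(hold, example_lease_expired_cb, esc) and, after quiescing, breaks and later re-signs the lease to reacquire the read lock.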
292 292
293 293 static void
294 294 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
295 295 {
296 296 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
297 297
298 298 /*
299 299 * Since this state transition is utilizing from_idle=true, it should
300 300 * not fail, but rather block until it can be successful.
301 301 */
302 302 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
303 303 }
304 304
305 305 static void
306 306 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
307 307 {
308 308 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
309 309
310 310 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
311 311 vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
312 312 }
313 313
314 314 static void
315 315 vmm_read_lock(vmm_softc_t *sc)
316 316 {
317 317 rw_enter(&sc->vmm_rwlock, RW_READER);
318 318 }
319 319
320 320 static void
321 321 vmm_read_unlock(vmm_softc_t *sc)
322 322 {
323 323 rw_exit(&sc->vmm_rwlock);
324 324 }
325 325
326 326 static void
327 327 vmm_write_lock(vmm_softc_t *sc)
328 328 {
329 329 int maxcpus;
330 330
331 331 /* First lock all the vCPUs */
332 332 maxcpus = vm_get_maxcpus(sc->vmm_vm);
333 333 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
334 334 vcpu_lock_one(sc, vcpu);
335 335 }
336 336
337 337 mutex_enter(&sc->vmm_lease_lock);
338 338 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
339 339 sc->vmm_lease_blocker++;
340 340 if (sc->vmm_lease_blocker == 1) {
341 341 list_t *list = &sc->vmm_lease_list;
342 342 vmm_lease_t *lease = list_head(list);
343 343
344 344 while (lease != NULL) {
345 345 boolean_t sync_break = B_FALSE;
346 346
347 347 if (!lease->vml_expired) {
348 348 void *arg = lease->vml_expire_arg;
349 349 lease->vml_expired = B_TRUE;
350 350 sync_break = lease->vml_expire_func(arg);
351 351 }
352 352
353 353 if (sync_break) {
354 354 vmm_lease_t *next;
355 355
356 356 /*
357 357 * These leases which are synchronously broken
358 358 * result in vmm_read_unlock() calls from a
359 359 * different thread than the corresponding
360 360 * vmm_read_lock(). This is acceptable, given
361 361 * that the rwlock underpinning the whole
362 362 * mechanism tolerates the behavior. This
363 363 * flexibility is _only_ afforded to VM read
364 364 * lock (RW_READER) holders.
365 365 */
366 366 next = list_next(list, lease);
367 367 vmm_lease_break_locked(sc, lease);
368 368 lease = next;
369 369 } else {
370 370 lease = list_next(list, lease);
371 371 }
372 372 }
373 373 }
374 374 mutex_exit(&sc->vmm_lease_lock);
375 375
376 376 rw_enter(&sc->vmm_rwlock, RW_WRITER);
377 377 /*
378 378 * For now, the 'maxcpus' value for an instance is fixed at the
379 379 * compile-time constant of VM_MAXCPU at creation. If this changes in
380 380 * the future, allowing for dynamic vCPU resource sizing, acquisition
381 381 * of the write lock will need to be wary of such changes.
382 382 */
383 383 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
384 384 }
385 385
386 386 static void
387 387 vmm_write_unlock(vmm_softc_t *sc)
388 388 {
389 389 int maxcpus;
390 390
391 391 mutex_enter(&sc->vmm_lease_lock);
392 392 VERIFY3U(sc->vmm_lease_blocker, !=, 0);
393 393 sc->vmm_lease_blocker--;
394 394 if (sc->vmm_lease_blocker == 0) {
395 395 cv_broadcast(&sc->vmm_lease_cv);
396 396 }
397 397 mutex_exit(&sc->vmm_lease_lock);
398 398
399 399 /*
400 400 * The VM write lock _must_ be released from the same thread it was
401 401 * acquired in, unlike the read lock.
402 402 */
403 403 VERIFY(rw_write_held(&sc->vmm_rwlock));
404 404 rw_exit(&sc->vmm_rwlock);
405 405
406 406 /* Unlock all the vCPUs */
407 407 maxcpus = vm_get_maxcpus(sc->vmm_vm);
408 408 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
409 409 vcpu_unlock_one(sc, vcpu);
410 410 }
411 411 }
412 412
413 413 static int
414 414 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
415 415 cred_t *credp, int *rvalp)
416 416 {
417 417 int error = 0, vcpu = -1;
418 418 void *datap = (void *)arg;
419 419 enum vm_lock_type {
420 420 LOCK_NONE = 0,
421 421 LOCK_VCPU,
422 422 LOCK_READ_HOLD,
423 423 LOCK_WRITE_HOLD
424 424 } lock_type = LOCK_NONE;
425 425
426 426 /* Acquire any exclusion resources needed for the operation. */
427 427 switch (cmd) {
428 428 case VM_RUN:
429 429 case VM_GET_REGISTER:
430 430 case VM_SET_REGISTER:
431 431 case VM_GET_SEGMENT_DESCRIPTOR:
432 432 case VM_SET_SEGMENT_DESCRIPTOR:
433 433 case VM_GET_REGISTER_SET:
434 434 case VM_SET_REGISTER_SET:
435 435 case VM_INJECT_EXCEPTION:
436 436 case VM_GET_CAPABILITY:
437 437 case VM_SET_CAPABILITY:
438 438 case VM_PPTDEV_MSI:
439 439 case VM_PPTDEV_MSIX:
440 440 case VM_SET_X2APIC_STATE:
441 441 case VM_GLA2GPA:
442 442 case VM_GLA2GPA_NOFAULT:
443 443 case VM_ACTIVATE_CPU:
444 444 case VM_SET_INTINFO:
445 445 case VM_GET_INTINFO:
446 446 case VM_RESTART_INSTRUCTION:
447 447 case VM_SET_KERNEMU_DEV:
448 448 case VM_GET_KERNEMU_DEV:
449 449 case VM_RESET_CPU:
450 450 case VM_GET_RUN_STATE:
451 451 case VM_SET_RUN_STATE:
452 452 /*
453 453 * Copy in the ID of the vCPU chosen for this operation.
454 454 * Since a nefarious caller could update their struct between
455 455 * this locking and when the rest of the ioctl data is copied
456 456 * in, it is _critical_ that this local 'vcpu' variable be used
457 457 * rather than the in-struct one when performing the ioctl.
458 458 */
459 459 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
460 460 return (EFAULT);
461 461 }
462 462 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) {
463 463 return (EINVAL);
464 464 }
465 465 vcpu_lock_one(sc, vcpu);
466 466 lock_type = LOCK_VCPU;
467 467 break;
468 468
469 469 case VM_REINIT:
470 470 case VM_BIND_PPTDEV:
471 471 case VM_UNBIND_PPTDEV:
472 472 case VM_MAP_PPTDEV_MMIO:
473 473 case VM_UNMAP_PPTDEV_MMIO:
474 474 case VM_ALLOC_MEMSEG:
475 475 case VM_MMAP_MEMSEG:
476 476 case VM_MUNMAP_MEMSEG:
477 477 case VM_WRLOCK_CYCLE:
478 478 case VM_PMTMR_LOCATE:
479 - case VM_ARC_RESV:
480 479 vmm_write_lock(sc);
481 480 lock_type = LOCK_WRITE_HOLD;
482 481 break;
483 482
484 483 case VM_GET_GPA_PMAP:
485 484 case VM_GET_MEMSEG:
486 485 case VM_MMAP_GETNEXT:
487 486 case VM_LAPIC_IRQ:
488 487 case VM_INJECT_NMI:
489 488 case VM_IOAPIC_ASSERT_IRQ:
490 489 case VM_IOAPIC_DEASSERT_IRQ:
491 490 case VM_IOAPIC_PULSE_IRQ:
492 491 case VM_LAPIC_MSI:
493 492 case VM_LAPIC_LOCAL_IRQ:
494 493 case VM_GET_X2APIC_STATE:
495 494 case VM_RTC_READ:
496 495 case VM_RTC_WRITE:
497 496 case VM_RTC_SETTIME:
498 497 case VM_RTC_GETTIME:
499 498 case VM_PPTDEV_DISABLE_MSIX:
500 499 case VM_DEVMEM_GETOFFSET:
501 500 vmm_read_lock(sc);
502 501 lock_type = LOCK_READ_HOLD;
503 502 break;
504 503
505 504 case VM_IOAPIC_PINCOUNT:
506 505 default:
507 506 break;
508 507 }
509 508
510 509 /* Execute the primary logic for the ioctl. */
511 510 switch (cmd) {
512 511 case VM_RUN: {
513 512 struct vm_entry entry;
514 513
515 514 if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
516 515 error = EFAULT;
517 516 break;
518 517 }
519 518
520 519 if (!(curthread->t_schedflag & TS_VCPU))
521 520 smt_mark_as_vcpu();
522 521
523 522 error = vm_run(sc->vmm_vm, vcpu, &entry);
524 523
525 524 /*
526 525 * Unexpected states in vm_run() are expressed through positive
527 526 * errno-oriented return values. VM states which expect further
528 527 * processing in userspace (necessary context via exitinfo) are
529 528 * expressed through negative return values. For the time being
530 529 * a return value of 0 is not expected from vm_run().
531 530 */
532 531 ASSERT(error != 0);
533 532 if (error < 0) {
534 533 const struct vm_exit *vme;
535 534 void *outp = entry.exit_data;
536 535
537 536 error = 0;
538 537 vme = vm_exitinfo(sc->vmm_vm, vcpu);
539 538 if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
540 539 error = EFAULT;
541 540 }
542 541 }
543 542 break;
544 543 }
545 544 case VM_SUSPEND: {
546 545 struct vm_suspend vmsuspend;
547 546
548 547 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
549 548 error = EFAULT;
550 549 break;
551 550 }
552 551 error = vm_suspend(sc->vmm_vm, vmsuspend.how);
553 552 break;
554 553 }
555 554 case VM_REINIT:
556 555 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
557 556 /*
558 557 * The VM instance should be free of driver-attached
559 558 * hooks during the reinitialization process.
560 559 */
561 560 break;
562 561 }
563 562 error = vm_reinit(sc->vmm_vm);
564 563 (void) vmm_drv_block_hook(sc, B_FALSE);
565 564 break;
566 565 case VM_STAT_DESC: {
567 566 struct vm_stat_desc statdesc;
568 567
569 568 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
570 569 error = EFAULT;
571 570 break;
572 571 }
573 572 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
574 573 sizeof (statdesc.desc));
575 574 if (error == 0 &&
576 575 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
577 576 error = EFAULT;
578 577 break;
579 578 }
580 579 break;
581 580 }
582 581 case VM_STATS_IOC: {
583 582 struct vm_stats vmstats;
584 583
585 584 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
586 585 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
587 586 error = EFAULT;
588 587 break;
589 588 }
590 589 hrt2tv(gethrtime(), &vmstats.tv);
591 590 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
592 591 &vmstats.num_entries, vmstats.statbuf);
593 592 if (error == 0 &&
594 593 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
595 594 error = EFAULT;
596 595 break;
597 596 }
598 597 break;
599 598 }
600 599
601 600 case VM_PPTDEV_MSI: {
602 601 struct vm_pptdev_msi pptmsi;
603 602
604 603 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
605 604 error = EFAULT;
606 605 break;
607 606 }
608 607 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
609 608 pptmsi.addr, pptmsi.msg, pptmsi.numvec);
610 609 break;
611 610 }
612 611 case VM_PPTDEV_MSIX: {
613 612 struct vm_pptdev_msix pptmsix;
614 613
615 614 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
616 615 error = EFAULT;
617 616 break;
618 617 }
619 618 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
620 619 pptmsix.idx, pptmsix.addr, pptmsix.msg,
621 620 pptmsix.vector_control);
622 621 break;
623 622 }
624 623 case VM_PPTDEV_DISABLE_MSIX: {
625 624 struct vm_pptdev pptdev;
626 625
627 626 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
628 627 error = EFAULT;
629 628 break;
630 629 }
631 630 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
632 631 break;
633 632 }
634 633 case VM_MAP_PPTDEV_MMIO: {
635 634 struct vm_pptdev_mmio pptmmio;
636 635
637 636 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
638 637 error = EFAULT;
639 638 break;
640 639 }
641 640 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
642 641 pptmmio.len, pptmmio.hpa);
643 642 break;
644 643 }
645 644 case VM_UNMAP_PPTDEV_MMIO: {
646 645 struct vm_pptdev_mmio pptmmio;
647 646
648 647 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
649 648 error = EFAULT;
650 649 break;
651 650 }
652 651 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
653 652 pptmmio.len);
654 653 break;
655 654 }
656 655 case VM_BIND_PPTDEV: {
657 656 struct vm_pptdev pptdev;
658 657
659 658 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
660 659 error = EFAULT;
661 660 break;
662 661 }
663 662 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
664 663 break;
665 664 }
666 665 case VM_UNBIND_PPTDEV: {
667 666 struct vm_pptdev pptdev;
668 667
669 668 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
670 669 error = EFAULT;
671 670 break;
672 671 }
673 672 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
674 673 break;
675 674 }
676 675 case VM_GET_PPTDEV_LIMITS: {
677 676 struct vm_pptdev_limits pptlimits;
678 677
679 678 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
680 679 error = EFAULT;
681 680 break;
682 681 }
683 682 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
684 683 &pptlimits.msi_limit, &pptlimits.msix_limit);
685 684 if (error == 0 &&
686 685 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
687 686 error = EFAULT;
688 687 break;
689 688 }
690 689 break;
691 690 }
692 691 case VM_INJECT_EXCEPTION: {
693 692 struct vm_exception vmexc;
694 693 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
695 694 error = EFAULT;
696 695 break;
697 696 }
698 697 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
699 698 vmexc.error_code_valid, vmexc.error_code,
700 699 vmexc.restart_instruction);
701 700 break;
702 701 }
703 702 case VM_INJECT_NMI: {
704 703 struct vm_nmi vmnmi;
705 704
706 705 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
707 706 error = EFAULT;
708 707 break;
709 708 }
710 709 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
711 710 break;
712 711 }
713 712 case VM_LAPIC_IRQ: {
714 713 struct vm_lapic_irq vmirq;
715 714
716 715 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
717 716 error = EFAULT;
718 717 break;
719 718 }
720 719 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
721 720 break;
722 721 }
723 722 case VM_LAPIC_LOCAL_IRQ: {
724 723 struct vm_lapic_irq vmirq;
725 724
726 725 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
727 726 error = EFAULT;
728 727 break;
729 728 }
730 729 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
731 730 vmirq.vector);
732 731 break;
733 732 }
734 733 case VM_LAPIC_MSI: {
735 734 struct vm_lapic_msi vmmsi;
736 735
737 736 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
738 737 error = EFAULT;
739 738 break;
740 739 }
741 740 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
742 741 break;
743 742 }
744 743
745 744 case VM_IOAPIC_ASSERT_IRQ: {
746 745 struct vm_ioapic_irq ioapic_irq;
747 746
748 747 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
749 748 error = EFAULT;
750 749 break;
751 750 }
752 751 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
753 752 break;
754 753 }
755 754 case VM_IOAPIC_DEASSERT_IRQ: {
756 755 struct vm_ioapic_irq ioapic_irq;
757 756
758 757 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
759 758 error = EFAULT;
760 759 break;
761 760 }
762 761 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
763 762 break;
764 763 }
765 764 case VM_IOAPIC_PULSE_IRQ: {
766 765 struct vm_ioapic_irq ioapic_irq;
767 766
768 767 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
769 768 error = EFAULT;
770 769 break;
771 770 }
772 771 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
773 772 break;
774 773 }
775 774 case VM_IOAPIC_PINCOUNT: {
776 775 int pincount;
777 776
778 777 pincount = vioapic_pincount(sc->vmm_vm);
779 778 if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
780 779 error = EFAULT;
781 780 break;
782 781 }
783 782 break;
784 783 }
785 784
786 785 case VM_ISA_ASSERT_IRQ: {
787 786 struct vm_isa_irq isa_irq;
788 787
789 788 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
790 789 error = EFAULT;
791 790 break;
792 791 }
793 792 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
794 793 if (error == 0 && isa_irq.ioapic_irq != -1) {
795 794 error = vioapic_assert_irq(sc->vmm_vm,
796 795 isa_irq.ioapic_irq);
797 796 }
798 797 break;
799 798 }
800 799 case VM_ISA_DEASSERT_IRQ: {
801 800 struct vm_isa_irq isa_irq;
802 801
803 802 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
804 803 error = EFAULT;
805 804 break;
806 805 }
807 806 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
808 807 if (error == 0 && isa_irq.ioapic_irq != -1) {
809 808 error = vioapic_deassert_irq(sc->vmm_vm,
810 809 isa_irq.ioapic_irq);
811 810 }
812 811 break;
813 812 }
814 813 case VM_ISA_PULSE_IRQ: {
815 814 struct vm_isa_irq isa_irq;
816 815
817 816 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
818 817 error = EFAULT;
819 818 break;
820 819 }
821 820 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
822 821 if (error == 0 && isa_irq.ioapic_irq != -1) {
823 822 error = vioapic_pulse_irq(sc->vmm_vm,
824 823 isa_irq.ioapic_irq);
825 824 }
826 825 break;
827 826 }
828 827 case VM_ISA_SET_IRQ_TRIGGER: {
829 828 struct vm_isa_irq_trigger isa_irq_trigger;
830 829
831 830 if (ddi_copyin(datap, &isa_irq_trigger,
832 831 sizeof (isa_irq_trigger), md)) {
833 832 error = EFAULT;
834 833 break;
835 834 }
836 835 error = vatpic_set_irq_trigger(sc->vmm_vm,
837 836 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
838 837 break;
839 838 }
840 839
841 840 case VM_MMAP_GETNEXT: {
842 841 struct vm_memmap mm;
843 842
844 843 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
845 844 error = EFAULT;
846 845 break;
847 846 }
848 847 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
849 848 &mm.segoff, &mm.len, &mm.prot, &mm.flags);
850 849 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
851 850 error = EFAULT;
852 851 break;
853 852 }
854 853 break;
855 854 }
856 855 case VM_MMAP_MEMSEG: {
857 856 struct vm_memmap mm;
858 857
859 858 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
860 859 error = EFAULT;
861 860 break;
862 861 }
863 862 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
864 863 mm.len, mm.prot, mm.flags);
865 864 break;
866 865 }
867 866 case VM_MUNMAP_MEMSEG: {
868 867 struct vm_munmap mu;
869 868
870 869 if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
871 870 error = EFAULT;
872 871 break;
873 872 }
874 873 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
875 874 break;
876 875 }
877 876 case VM_ALLOC_MEMSEG: {
878 877 struct vm_memseg vmseg;
879 878
880 879 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
881 880 error = EFAULT;
882 881 break;
883 882 }
884 883 error = vmmdev_alloc_memseg(sc, &vmseg);
885 884 break;
886 885 }
887 886 case VM_GET_MEMSEG: {
888 887 struct vm_memseg vmseg;
889 888
890 889 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
891 890 error = EFAULT;
892 891 break;
893 892 }
894 893 error = vmmdev_get_memseg(sc, &vmseg);
895 894 if (error == 0 &&
896 895 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
897 896 error = EFAULT;
898 897 break;
899 898 }
900 899 break;
901 900 }
902 901 case VM_GET_REGISTER: {
903 902 struct vm_register vmreg;
904 903
905 904 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
906 905 error = EFAULT;
907 906 break;
908 907 }
909 908 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
910 909 &vmreg.regval);
911 910 if (error == 0 &&
912 911 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
913 912 error = EFAULT;
914 913 break;
915 914 }
916 915 break;
917 916 }
918 917 case VM_SET_REGISTER: {
919 918 struct vm_register vmreg;
920 919
921 920 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
922 921 error = EFAULT;
923 922 break;
924 923 }
925 924 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
926 925 vmreg.regval);
927 926 break;
928 927 }
929 928 case VM_SET_SEGMENT_DESCRIPTOR: {
930 929 struct vm_seg_desc vmsegd;
931 930
932 931 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
933 932 error = EFAULT;
934 933 break;
935 934 }
936 935 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
937 936 &vmsegd.desc);
938 937 break;
939 938 }
940 939 case VM_GET_SEGMENT_DESCRIPTOR: {
941 940 struct vm_seg_desc vmsegd;
942 941
943 942 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
944 943 error = EFAULT;
945 944 break;
946 945 }
947 946 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
948 947 &vmsegd.desc);
949 948 if (error == 0 &&
950 949 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
951 950 error = EFAULT;
952 951 break;
953 952 }
954 953 break;
955 954 }
956 955 case VM_GET_REGISTER_SET: {
957 956 struct vm_register_set vrs;
958 957 int regnums[VM_REG_LAST];
959 958 uint64_t regvals[VM_REG_LAST];
960 959
961 960 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
962 961 error = EFAULT;
963 962 break;
964 963 }
965 964 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
966 965 error = EINVAL;
967 966 break;
968 967 }
969 968 if (ddi_copyin(vrs.regnums, regnums,
970 969 sizeof (int) * vrs.count, md)) {
971 970 error = EFAULT;
972 971 break;
973 972 }
974 973
975 974 error = 0;
976 975 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
977 976 if (regnums[i] < 0) {
978 977 error = EINVAL;
979 978 break;
980 979 }
981 980 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
 982  981 			    &regvals[i]);
983 982 }
984 983 if (error == 0 && ddi_copyout(regvals, vrs.regvals,
985 984 sizeof (uint64_t) * vrs.count, md)) {
986 985 error = EFAULT;
987 986 }
988 987 break;
989 988 }
990 989 case VM_SET_REGISTER_SET: {
991 990 struct vm_register_set vrs;
992 991 int regnums[VM_REG_LAST];
993 992 uint64_t regvals[VM_REG_LAST];
994 993
995 994 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
996 995 error = EFAULT;
997 996 break;
998 997 }
999 998 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1000 999 error = EINVAL;
1001 1000 break;
1002 1001 }
1003 1002 if (ddi_copyin(vrs.regnums, regnums,
1004 1003 sizeof (int) * vrs.count, md)) {
1005 1004 error = EFAULT;
1006 1005 break;
1007 1006 }
1008 1007 if (ddi_copyin(vrs.regvals, regvals,
1009 1008 sizeof (uint64_t) * vrs.count, md)) {
1010 1009 error = EFAULT;
1011 1010 break;
1012 1011 }
1013 1012
1014 1013 error = 0;
1015 1014 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1016 1015 /*
1017 1016 * Setting registers in a set is not atomic, since a
1018 1017 * failure in the middle of the set will cause a
1019 1018 * bail-out and inconsistent register state. Callers
1020 1019 * should be wary of this.
1021 1020 */
1022 1021 if (regnums[i] < 0) {
1023 1022 error = EINVAL;
1024 1023 break;
1025 1024 }
1026 1025 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1027 1026 regvals[i]);
1028 1027 }
1029 1028 break;
1030 1029 }
1031 1030 case VM_RESET_CPU: {
1032 1031 struct vm_vcpu_reset vvr;
1033 1032
1034 1033 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1035 1034 error = EFAULT;
1036 1035 break;
1037 1036 }
1038 1037 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1039 1038 error = EINVAL;
1040 1039 }
1041 1040
1042 1041 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1043 1042 break;
1044 1043 }
1045 1044 case VM_GET_RUN_STATE: {
1046 1045 struct vm_run_state vrs;
1047 1046
1048 1047 bzero(&vrs, sizeof (vrs));
1049 1048 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1050 1049 &vrs.sipi_vector);
1051 1050 if (error == 0) {
1052 1051 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1053 1052 error = EFAULT;
1054 1053 break;
1055 1054 }
1056 1055 }
1057 1056 break;
1058 1057 }
1059 1058 case VM_SET_RUN_STATE: {
1060 1059 struct vm_run_state vrs;
1061 1060
1062 1061 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1063 1062 error = EFAULT;
1064 1063 break;
1065 1064 }
1066 1065 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1067 1066 vrs.sipi_vector);
1068 1067 break;
1069 1068 }
1070 1069
1071 1070 case VM_SET_KERNEMU_DEV:
1072 1071 case VM_GET_KERNEMU_DEV: {
1073 1072 struct vm_readwrite_kernemu_device kemu;
1074 1073 size_t size = 0;
1075 1074
1076 1075 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1077 1076 error = EFAULT;
1078 1077 break;
1079 1078 }
1080 1079
1081 1080 if (kemu.access_width > 3) {
1082 1081 error = EINVAL;
1083 1082 break;
1084 1083 }
1085 1084 size = (1 << kemu.access_width);
1086 1085 ASSERT(size >= 1 && size <= 8);
1087 1086
1088 1087 if (cmd == VM_SET_KERNEMU_DEV) {
1089 1088 error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1090 1089 kemu.gpa, kemu.value, size);
1091 1090 } else {
1092 1091 error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1093 1092 kemu.gpa, &kemu.value, size);
1094 1093 }
1095 1094
1096 1095 if (error == 0) {
1097 1096 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1098 1097 error = EFAULT;
1099 1098 break;
1100 1099 }
1101 1100 }
1102 1101 break;
1103 1102 }
1104 1103
1105 1104 case VM_GET_CAPABILITY: {
1106 1105 struct vm_capability vmcap;
1107 1106
1108 1107 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1109 1108 error = EFAULT;
1110 1109 break;
1111 1110 }
1112 1111 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1113 1112 &vmcap.capval);
1114 1113 if (error == 0 &&
1115 1114 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1116 1115 error = EFAULT;
1117 1116 break;
1118 1117 }
1119 1118 break;
1120 1119 }
1121 1120 case VM_SET_CAPABILITY: {
1122 1121 struct vm_capability vmcap;
1123 1122
1124 1123 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1125 1124 error = EFAULT;
1126 1125 break;
1127 1126 }
1128 1127 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1129 1128 vmcap.capval);
1130 1129 break;
1131 1130 }
1132 1131 case VM_SET_X2APIC_STATE: {
1133 1132 struct vm_x2apic x2apic;
1134 1133
1135 1134 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1136 1135 error = EFAULT;
1137 1136 break;
1138 1137 }
1139 1138 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1140 1139 break;
1141 1140 }
1142 1141 case VM_GET_X2APIC_STATE: {
1143 1142 struct vm_x2apic x2apic;
1144 1143
1145 1144 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1146 1145 error = EFAULT;
1147 1146 break;
1148 1147 }
1149 1148 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1150 1149 &x2apic.state);
1151 1150 if (error == 0 &&
1152 1151 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1153 1152 error = EFAULT;
1154 1153 break;
1155 1154 }
1156 1155 break;
1157 1156 }
1158 1157 case VM_GET_GPA_PMAP: {
1159 1158 struct vm_gpa_pte gpapte;
1160 1159
1161 1160 if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
1162 1161 error = EFAULT;
1163 1162 break;
1164 1163 }
1165 1164 #ifdef __FreeBSD__
1166 1165 /* XXXJOY: add function? */
1167 1166 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
1168 1167 gpapte.gpa, gpapte.pte, &gpapte.ptenum);
1169 1168 #endif
1170 1169 error = 0;
1171 1170 break;
1172 1171 }
1173 1172 case VM_GET_HPET_CAPABILITIES: {
1174 1173 struct vm_hpet_cap hpetcap;
1175 1174
1176 1175 error = vhpet_getcap(&hpetcap);
1177 1176 if (error == 0 &&
1178 1177 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1179 1178 error = EFAULT;
1180 1179 break;
1181 1180 }
1182 1181 break;
1183 1182 }
1184 1183 case VM_GLA2GPA: {
1185 1184 struct vm_gla2gpa gg;
1186 1185
1187 1186 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1188 1187 error = EFAULT;
1189 1188 break;
1190 1189 }
1191 1190 gg.vcpuid = vcpu;
1192 1191 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1193 1192 gg.prot, &gg.gpa, &gg.fault);
1194 1193 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1195 1194 error = EFAULT;
1196 1195 break;
1197 1196 }
1198 1197 break;
1199 1198 }
1200 1199 case VM_GLA2GPA_NOFAULT: {
1201 1200 struct vm_gla2gpa gg;
1202 1201
1203 1202 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1204 1203 error = EFAULT;
1205 1204 break;
1206 1205 }
1207 1206 gg.vcpuid = vcpu;
1208 1207 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1209 1208 gg.gla, gg.prot, &gg.gpa, &gg.fault);
1210 1209 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1211 1210 error = EFAULT;
1212 1211 break;
1213 1212 }
1214 1213 break;
1215 1214 }
1216 1215
1217 1216 case VM_ACTIVATE_CPU:
1218 1217 error = vm_activate_cpu(sc->vmm_vm, vcpu);
1219 1218 break;
1220 1219
1221 1220 case VM_SUSPEND_CPU:
1222 1221 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1223 1222 error = EFAULT;
1224 1223 } else {
1225 1224 error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1226 1225 }
1227 1226 break;
1228 1227
1229 1228 case VM_RESUME_CPU:
1230 1229 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1231 1230 error = EFAULT;
1232 1231 } else {
1233 1232 error = vm_resume_cpu(sc->vmm_vm, vcpu);
1234 1233 }
1235 1234 break;
1236 1235
1237 1236 case VM_GET_CPUS: {
1238 1237 struct vm_cpuset vm_cpuset;
1239 1238 cpuset_t tempset;
1240 1239 void *srcp = &tempset;
1241 1240 int size;
1242 1241
1243 1242 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1244 1243 error = EFAULT;
1245 1244 break;
1246 1245 }
1247 1246
1248 1247 /* Be more generous about sizing since our cpuset_t is large. */
1249 1248 size = vm_cpuset.cpusetsize;
1250 1249 if (size <= 0 || size > sizeof (cpuset_t)) {
1251 1250 error = ERANGE;
1252 1251 }
1253 1252 /*
1254 1253 * If they want a ulong_t or less, make sure they receive the
1255 1254 * low bits with all the useful information.
1256 1255 */
1257 1256 if (size <= sizeof (tempset.cpub[0])) {
1258 1257 srcp = &tempset.cpub[0];
1259 1258 }
1260 1259
1261 1260 if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1262 1261 tempset = vm_active_cpus(sc->vmm_vm);
1263 1262 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1264 1263 tempset = vm_suspended_cpus(sc->vmm_vm);
1265 1264 } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1266 1265 tempset = vm_debug_cpus(sc->vmm_vm);
1267 1266 } else {
1268 1267 error = EINVAL;
1269 1268 }
1270 1269
1271 1270 ASSERT(size > 0 && size <= sizeof (tempset));
1272 1271 if (error == 0 &&
1273 1272 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1274 1273 error = EFAULT;
1275 1274 break;
1276 1275 }
1277 1276 break;
1278 1277 }
1279 1278 case VM_SET_INTINFO: {
1280 1279 struct vm_intinfo vmii;
1281 1280
1282 1281 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1283 1282 error = EFAULT;
1284 1283 break;
1285 1284 }
1286 1285 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1287 1286 break;
1288 1287 }
1289 1288 case VM_GET_INTINFO: {
1290 1289 struct vm_intinfo vmii;
1291 1290
1292 1291 vmii.vcpuid = vcpu;
1293 1292 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1294 1293 &vmii.info2);
1295 1294 if (error == 0 &&
1296 1295 ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1297 1296 error = EFAULT;
1298 1297 break;
1299 1298 }
1300 1299 break;
1301 1300 }
1302 1301 case VM_RTC_WRITE: {
1303 1302 struct vm_rtc_data rtcdata;
1304 1303
1305 1304 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1306 1305 error = EFAULT;
1307 1306 break;
1308 1307 }
1309 1308 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1310 1309 rtcdata.value);
1311 1310 break;
1312 1311 }
1313 1312 case VM_RTC_READ: {
1314 1313 struct vm_rtc_data rtcdata;
1315 1314
1316 1315 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1317 1316 error = EFAULT;
1318 1317 break;
1319 1318 }
1320 1319 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1321 1320 &rtcdata.value);
1322 1321 if (error == 0 &&
1323 1322 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1324 1323 error = EFAULT;
1325 1324 break;
1326 1325 }
1327 1326 break;
1328 1327 }
1329 1328 case VM_RTC_SETTIME: {
1330 1329 struct vm_rtc_time rtctime;
1331 1330
1332 1331 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1333 1332 error = EFAULT;
1334 1333 break;
1335 1334 }
1336 1335 error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1337 1336 break;
1338 1337 }
1339 1338 case VM_RTC_GETTIME: {
1340 1339 struct vm_rtc_time rtctime;
1341 1340
1342 1341 rtctime.secs = vrtc_get_time(sc->vmm_vm);
1343 1342 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1344 1343 error = EFAULT;
1345 1344 break;
1346 1345 }
1347 1346 break;
1348 1347 }
1349 1348
1350 1349 case VM_PMTMR_LOCATE: {
1351 1350 uint16_t port = arg;
1352 1351 error = vpmtmr_set_location(sc->vmm_vm, port);
1353 1352 break;
1354 1353 }
1355 1354
1356 1355 case VM_RESTART_INSTRUCTION:
1357 1356 error = vm_restart_instruction(sc->vmm_vm, vcpu);
1358 1357 break;
1359 1358
1360 1359 case VM_SET_TOPOLOGY: {
1361 1360 struct vm_cpu_topology topo;
1362 1361
1363 1362 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1364 1363 error = EFAULT;
1365 1364 break;
1366 1365 }
1367 1366 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1368 1367 topo.threads, topo.maxcpus);
1369 1368 break;
1370 1369 }
1371 1370 case VM_GET_TOPOLOGY: {
1372 1371 struct vm_cpu_topology topo;
1373 1372
1374 1373 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1375 1374 &topo.threads, &topo.maxcpus);
1376 1375 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1377 1376 error = EFAULT;
1378 1377 break;
1379 1378 }
1380 1379 break;
1381 1380 }
1382 1381
1383 1382 case VM_DEVMEM_GETOFFSET: {
1384 1383 struct vm_devmem_offset vdo;
1385 1384 list_t *dl = &sc->vmm_devmem_list;
1386 1385 vmm_devmem_entry_t *de = NULL;
1387 1386
1388 1387 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1389 1388 error = EFAULT;
1390 1389 break;
1391 1390 }
1392 1391
1393 1392 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
1394 1393 if (de->vde_segid == vdo.segid) {
1395 1394 break;
1396 1395 }
1397 1396 }
1398 1397 if (de != NULL) {
1399 1398 vdo.offset = de->vde_off;
1400 1399 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1401 1400 error = EFAULT;
1402 1401 }
1403 1402 } else {
1404 1403 error = ENOENT;
1405 1404 }
1406 1405 break;
1407 1406 }
1408 1407 case VM_WRLOCK_CYCLE: {
1409 1408 /*
1410 1409 * Present a test mechanism to acquire/release the write lock
1411 1410 * on the VM without any other effects.
1412 1411 */
1413 1412 break;
1414 1413 }
1415 - case VM_ARC_RESV:
1416 - error = vm_arc_resv(sc->vmm_vm, (uint64_t)arg);
1417 - break;
1414 +
1418 1415 default:
1419 1416 error = ENOTTY;
1420 1417 break;
1421 1418 }
1422 1419
1423 1420 /* Release exclusion resources */
1424 1421 switch (lock_type) {
1425 1422 case LOCK_NONE:
1426 1423 break;
1427 1424 case LOCK_VCPU:
1428 1425 vcpu_unlock_one(sc, vcpu);
1429 1426 break;
1430 1427 case LOCK_READ_HOLD:
1431 1428 vmm_read_unlock(sc);
1432 1429 break;
1433 1430 case LOCK_WRITE_HOLD:
1434 1431 vmm_write_unlock(sc);
1435 1432 break;
1436 1433 default:
1437 1434 panic("unexpected lock type");
1438 1435 break;
1439 1436 }
1440 1437
1441 1438 return (error);
1442 1439 }
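For reference, a minimal userspace sketch of a vCPU-scoped ioctl built on the convention above (the helper name and header paths are assumptions; the struct layout follows the bhyve vm_register definition, with the vCPU id as the leading int that the kernel copies in and locks first):

	#include <sys/types.h>
	#include <sys/vmm.h>
	#include <sys/vmm_dev.h>
	#include <unistd.h>
	#include <err.h>

	/* Illustrative only: read %rip of vCPU 0 via an open /dev/vmm/<name> fd. */
	static uint64_t
	example_read_rip(int vmfd)
	{
		struct vm_register vmreg = { 0 };

		vmreg.cpuid = 0;	/* must stay the first member */
		vmreg.regnum = VM_REG_GUEST_RIP;
		if (ioctl(vmfd, VM_GET_REGISTER, &vmreg) != 0)
			err(1, "VM_GET_REGISTER");
		return (vmreg.regval);
	}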
1443 1440
1444 1441 static vmm_softc_t *
1445 1442 vmm_lookup(const char *name)
1446 1443 {
1447 1444 list_t *vml = &vmm_list;
1448 1445 vmm_softc_t *sc;
1449 1446
1450 1447 ASSERT(MUTEX_HELD(&vmm_mtx));
1451 1448
1452 1449 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1453 1450 if (strcmp(sc->vmm_name, name) == 0) {
1454 1451 break;
1455 1452 }
1456 1453 }
1457 1454
1458 1455 return (sc);
1459 1456 }
1460 1457
1461 1458 /*
1462 1459 * Acquire an HMA registration if not already held.
1463 1460 */
1464 1461 static boolean_t
1465 1462 vmm_hma_acquire(void)
1466 1463 {
1467 1464 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1468 1465
1469 1466 mutex_enter(&vmmdev_mtx);
1470 1467
1471 1468 if (vmmdev_hma_reg == NULL) {
1472 1469 VERIFY3U(vmmdev_hma_ref, ==, 0);
1473 1470 vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1474 1471 if (vmmdev_hma_reg == NULL) {
1475 1472 cmn_err(CE_WARN, "%s HMA registration failed.",
1476 1473 vmmdev_hvm_name);
1477 1474 mutex_exit(&vmmdev_mtx);
1478 1475 return (B_FALSE);
1479 1476 }
1480 1477 }
1481 1478
1482 1479 vmmdev_hma_ref++;
1483 1480
1484 1481 mutex_exit(&vmmdev_mtx);
1485 1482
1486 1483 return (B_TRUE);
1487 1484 }
1488 1485
1489 1486 /*
1490 1487 * Release the HMA registration if held and there are no remaining VMs.
1491 1488 */
1492 1489 static void
1493 1490 vmm_hma_release(void)
1494 1491 {
1495 1492 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1496 1493
1497 1494 mutex_enter(&vmmdev_mtx);
1498 1495
1499 1496 VERIFY3U(vmmdev_hma_ref, !=, 0);
1500 1497
1501 1498 vmmdev_hma_ref--;
1502 1499
1503 1500 if (vmmdev_hma_ref == 0) {
1504 1501 VERIFY(vmmdev_hma_reg != NULL);
1505 1502 hma_unregister(vmmdev_hma_reg);
1506 1503 vmmdev_hma_reg = NULL;
1507 1504 }
1508 1505 mutex_exit(&vmmdev_mtx);
1509 1506 }
1510 1507
1511 1508 static int
1512 1509 vmmdev_do_vm_create(char *name, cred_t *cr)
1513 1510 {
1514 1511 vmm_softc_t *sc = NULL;
1515 1512 minor_t minor;
1516 1513 int error = ENOMEM;
1517 1514
1518 1515 if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
1519 1516 return (EINVAL);
1520 1517 }
1521 1518
1522 1519 if (!vmm_hma_acquire())
1523 1520 return (ENXIO);
1524 1521
1525 1522 mutex_enter(&vmm_mtx);
1526 1523
1527 1524 /* Look for duplicate names */
1528 1525 if (vmm_lookup(name) != NULL) {
1529 1526 mutex_exit(&vmm_mtx);
1530 1527 vmm_hma_release();
1531 1528 return (EEXIST);
1532 1529 }
1533 1530
1534 1531 /* Allow only one instance per non-global zone. */
1535 1532 if (!INGLOBALZONE(curproc)) {
1536 1533 for (sc = list_head(&vmm_list); sc != NULL;
1537 1534 sc = list_next(&vmm_list, sc)) {
1538 1535 if (sc->vmm_zone == curzone) {
1539 1536 mutex_exit(&vmm_mtx);
1540 1537 vmm_hma_release();
1541 1538 return (EINVAL);
1542 1539 }
1543 1540 }
1544 1541 }
1545 1542
1546 1543 minor = id_alloc(vmm_minors);
1547 1544 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1548 1545 goto fail;
1549 1546 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1550 1547 ddi_soft_state_free(vmm_statep, minor);
1551 1548 goto fail;
1552 1549 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1553 1550 DDI_PSEUDO, 0) != DDI_SUCCESS) {
1554 1551 goto fail;
1555 1552 }
1556 1553
1557 1554 if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1558 1555 goto fail;
1559 1556 }
1560 1557
1561 1558 error = vm_create(name, &sc->vmm_vm);
1562 1559 if (error == 0) {
1563 1560 		/* Complete VM initialization and report success. */
1564 1561 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1565 1562 sc->vmm_minor = minor;
1566 1563 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1567 1564 offsetof(vmm_devmem_entry_t, vde_node));
1568 1565
1569 1566 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1570 1567 offsetof(vmm_hold_t, vmh_node));
1571 1568 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1572 1569
1573 1570 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1574 1571 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1575 1572 offsetof(vmm_lease_t, vml_node));
1576 1573 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1577 1574 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1578 1575
1579 1576 sc->vmm_zone = crgetzone(cr);
1580 1577 zone_hold(sc->vmm_zone);
1581 1578 vmm_zsd_add_vm(sc);
1582 1579 vmm_kstat_init(sc);
1583 1580
1584 1581 list_insert_tail(&vmm_list, sc);
1585 1582 mutex_exit(&vmm_mtx);
1586 1583 return (0);
1587 1584 }
1588 1585
1589 1586 vmm_kstat_fini(sc);
1590 1587 ddi_remove_minor_node(vmmdev_dip, name);
1591 1588 fail:
1592 1589 id_free(vmm_minors, minor);
1593 1590 if (sc != NULL) {
1594 1591 ddi_soft_state_free(vmm_statep, minor);
1595 1592 }
1596 1593 mutex_exit(&vmm_mtx);
1597 1594 vmm_hma_release();
1598 1595
1599 1596 return (error);
1600 1597 }
1601 1598
1602 1599 /*
1603 1600 * Bhyve 'Driver' Interface
1604 1601 *
1605 1602 * While many devices are emulated in the bhyve userspace process, there are
1606 1603 * others with performance constraints which require that they run mostly or
1607 1604 * entirely in-kernel. For those not integrated directly into bhyve, an API is
1608 1605 * needed so they can query/manipulate the portions of VM state needed to
1609 1606 * fulfill their purpose.
1610 1607 *
1611 1608 * This includes:
1612 1609 * - Translating guest-physical addresses to host-virtual pointers
1613 1610 * - Injecting MSIs
1614 1611 * - Hooking IO port addresses
1615 1612 *
1616 1613 * The vmm_drv interface exists to provide that functionality to its consumers.
1617 1614 * (At this time, 'viona' is the only user)
1618 1615 */
1619 1616 int
1620 1617 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1621 1618 {
1622 1619 vnode_t *vp = fp->f_vnode;
1623 1620 const dev_t dev = vp->v_rdev;
1624 1621 vmm_softc_t *sc;
1625 1622 vmm_hold_t *hold;
1626 1623 int err = 0;
1627 1624
1628 1625 if (vp->v_type != VCHR) {
1629 1626 return (ENXIO);
1630 1627 }
1631 1628 const major_t major = getmajor(dev);
1632 1629 const minor_t minor = getminor(dev);
1633 1630
1634 1631 mutex_enter(&vmmdev_mtx);
1635 1632 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1636 1633 mutex_exit(&vmmdev_mtx);
1637 1634 return (ENOENT);
1638 1635 }
1639 1636 mutex_enter(&vmm_mtx);
1640 1637 mutex_exit(&vmmdev_mtx);
1641 1638
1642 1639 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1643 1640 err = ENOENT;
1644 1641 goto out;
1645 1642 }
1646 1643 /* XXXJOY: check cred permissions against instance */
1647 1644
1648 1645 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1649 1646 err = EBUSY;
1650 1647 goto out;
1651 1648 }
1652 1649
1653 1650 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1654 1651 hold->vmh_sc = sc;
1655 1652 hold->vmh_release_req = B_FALSE;
1656 1653
1657 1654 list_insert_tail(&sc->vmm_holds, hold);
1658 1655 sc->vmm_flags |= VMM_HELD;
1659 1656 *holdp = hold;
1660 1657
1661 1658 out:
1662 1659 mutex_exit(&vmm_mtx);
1663 1660 return (err);
1664 1661 }
1665 1662
1666 1663 void
1667 1664 vmm_drv_rele(vmm_hold_t *hold)
1668 1665 {
1669 1666 vmm_softc_t *sc;
1670 1667
1671 1668 ASSERT(hold != NULL);
1672 1669 ASSERT(hold->vmh_sc != NULL);
1673 1670 VERIFY(hold->vmh_ioport_hook_cnt == 0);
1674 1671
1675 1672 mutex_enter(&vmm_mtx);
1676 1673 sc = hold->vmh_sc;
1677 1674 list_remove(&sc->vmm_holds, hold);
1678 1675 if (list_is_empty(&sc->vmm_holds)) {
1679 1676 sc->vmm_flags &= ~VMM_HELD;
1680 1677 cv_broadcast(&sc->vmm_cv);
1681 1678 }
1682 1679 mutex_exit(&vmm_mtx);
1683 1680 kmem_free(hold, sizeof (*hold));
1684 1681 }
1685 1682
1686 1683 boolean_t
1687 1684 vmm_drv_release_reqd(vmm_hold_t *hold)
1688 1685 {
1689 1686 ASSERT(hold != NULL);
1690 1687
1691 1688 return (hold->vmh_release_req);
1692 1689 }
1693 1690
1694 1691 vmm_lease_t *
1695 1692 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1696 1693 {
1697 1694 vmm_softc_t *sc = hold->vmh_sc;
1698 1695 vmm_lease_t *lease;
1699 1696
1700 1697 ASSERT3P(expiref, !=, NULL);
1701 1698
1702 1699 if (hold->vmh_release_req) {
1703 1700 return (NULL);
1704 1701 }
1705 1702
1706 1703 lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1707 1704 list_link_init(&lease->vml_node);
1708 1705 lease->vml_expire_func = expiref;
1709 1706 lease->vml_expire_arg = arg;
1710 1707 lease->vml_expired = B_FALSE;
1711 1708 lease->vml_hold = hold;
1712 1709 /* cache the VM pointer for one less pointer chase */
1713 1710 lease->vml_vm = sc->vmm_vm;
1714 1711
1715 1712 mutex_enter(&sc->vmm_lease_lock);
1716 1713 while (sc->vmm_lease_blocker != 0) {
1717 1714 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1718 1715 }
1719 1716 list_insert_tail(&sc->vmm_lease_list, lease);
1720 1717 vmm_read_lock(sc);
1721 1718 mutex_exit(&sc->vmm_lease_lock);
1722 1719
1723 1720 return (lease);
1724 1721 }
1725 1722
1726 1723 static void
1727 1724 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1728 1725 {
1729 1726 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1730 1727
1731 1728 list_remove(&sc->vmm_lease_list, lease);
1732 1729 vmm_read_unlock(sc);
1733 1730 kmem_free(lease, sizeof (*lease));
1734 1731 }
1735 1732
1736 1733 void
1737 1734 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1738 1735 {
1739 1736 vmm_softc_t *sc = hold->vmh_sc;
1740 1737
1741 1738 VERIFY3P(hold, ==, lease->vml_hold);
1742 1739
1743 1740 mutex_enter(&sc->vmm_lease_lock);
1744 1741 vmm_lease_break_locked(sc, lease);
1745 1742 mutex_exit(&sc->vmm_lease_lock);
1746 1743 }
1747 1744
1748 1745 boolean_t
1749 1746 vmm_drv_lease_expired(vmm_lease_t *lease)
1750 1747 {
1751 1748 return (lease->vml_expired);
1752 1749 }
1753 1750
1754 1751 void *
1755 1752 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1756 1753 {
1757 1754 ASSERT(lease != NULL);
1758 1755
1759 1756 return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
1760 1757 }
1761 1758
1762 1759 int
1763 1760 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1764 1761 {
1765 1762 ASSERT(lease != NULL);
1766 1763
1767 1764 return (lapic_intr_msi(lease->vml_vm, addr, msg));
1768 1765 }
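Putting the driver API together, a hypothetical viona-style consumer might drive it roughly as follows (example_attach_vm(), example_lease_expired_cb(), example_softc_t, and the esc fields are illustrative; the vmm_drv_* calls are the ones defined in this file):

	/* Hypothetical in-kernel consumer flow; error handling abbreviated. */
	static int
	example_attach_vm(file_t *fp, cred_t *credp, example_softc_t *esc)
	{
		vmm_hold_t *hold;
		vmm_lease_t *lease;
		int err;

		if ((err = vmm_drv_hold(fp, credp, &hold)) != 0)
			return (err);

		/* The callback fires when a writer needs the VM write lock. */
		lease = vmm_drv_lease_sign(hold, example_lease_expired_cb, esc);
		if (lease == NULL) {
			vmm_drv_rele(hold);
			return (EBUSY);
		}

		/* Translate a guest-physical ring and post an MSI while leased. */
		esc->esc_ring = vmm_drv_gpa2kva(lease, esc->esc_ring_gpa,
		    esc->esc_ring_len);
		(void) vmm_drv_msi(lease, esc->esc_msi_addr, esc->esc_msi_msg);

		/* Teardown: drop the lease, then the hold. */
		vmm_drv_lease_break(hold, lease);
		vmm_drv_rele(hold);
		return (0);
	}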
1769 1766
1770 1767 int
1771 1768 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
1772 1769 void *arg, void **cookie)
1773 1770 {
1774 1771 vmm_softc_t *sc;
1775 1772 int err;
1776 1773
1777 1774 ASSERT(hold != NULL);
1778 1775 ASSERT(cookie != NULL);
1779 1776
1780 1777 sc = hold->vmh_sc;
1781 1778 mutex_enter(&vmm_mtx);
1782 1779 /* Confirm that hook installation is not blocked */
1783 1780 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1784 1781 mutex_exit(&vmm_mtx);
1785 1782 return (EBUSY);
1786 1783 }
1787 1784 /*
1788 1785 * Optimistically record an installed hook which will prevent a block
1789 1786 * from being asserted while the mutex is dropped.
1790 1787 */
1791 1788 hold->vmh_ioport_hook_cnt++;
1792 1789 mutex_exit(&vmm_mtx);
1793 1790
1794 1791 vmm_write_lock(sc);
1795 1792 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
1796 1793 arg, cookie);
1797 1794 vmm_write_unlock(sc);
1798 1795
1799 1796 if (err != 0) {
1800 1797 mutex_enter(&vmm_mtx);
1801 1798 /* Walk back optimism about the hook installation */
1802 1799 hold->vmh_ioport_hook_cnt--;
1803 1800 mutex_exit(&vmm_mtx);
1804 1801 }
1805 1802 return (err);
1806 1803 }
1807 1804
1808 1805 void
1809 1806 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1810 1807 {
1811 1808 vmm_softc_t *sc;
1812 1809
1813 1810 ASSERT(hold != NULL);
1814 1811 ASSERT(cookie != NULL);
1815 1812 ASSERT(hold->vmh_ioport_hook_cnt != 0);
1816 1813
1817 1814 sc = hold->vmh_sc;
1818 1815 vmm_write_lock(sc);
1819 1816 vm_ioport_unhook(sc->vmm_vm, cookie);
1820 1817 vmm_write_unlock(sc);
1821 1818
1822 1819 mutex_enter(&vmm_mtx);
1823 1820 hold->vmh_ioport_hook_cnt--;
1824 1821 mutex_exit(&vmm_mtx);
1825 1822 }
1826 1823
1827 1824 static int
1828 1825 vmm_drv_purge(vmm_softc_t *sc)
1829 1826 {
1830 1827 ASSERT(MUTEX_HELD(&vmm_mtx));
1831 1828
1832 1829 if ((sc->vmm_flags & VMM_HELD) != 0) {
1833 1830 vmm_hold_t *hold;
1834 1831
1835 1832 sc->vmm_flags |= VMM_CLEANUP;
1836 1833 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1837 1834 hold = list_next(&sc->vmm_holds, hold)) {
1838 1835 hold->vmh_release_req = B_TRUE;
1839 1836 }
1840 1837 while ((sc->vmm_flags & VMM_HELD) != 0) {
1841 1838 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1842 1839 return (EINTR);
1843 1840 }
1844 1841 }
1845 1842 sc->vmm_flags &= ~VMM_CLEANUP;
1846 1843 }
1847 1844
1848 1845 VERIFY(list_is_empty(&sc->vmm_holds));
1849 1846 sc->vmm_flags |= VMM_PURGED;
1850 1847 return (0);
1851 1848 }
1852 1849
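vmm_drv_purge() flags every hold with vmh_release_req and then sleeps on vmm_cv until the VMM_HELD flag clears, returning EINTR if the interruptible wait is broken by a signal. The following is a small userland analogue of that drain-and-wait pattern with POSIX condition variables (the signal-interruptible behaviour of cv_wait_sig() is omitted); res_t, res_purge() and res_release() are illustrative names only.

	#include <pthread.h>
	#include <stdbool.h>

	typedef struct res {
		pthread_mutex_t	lock;
		pthread_cond_t	cv;
		unsigned int	holds;		/* outstanding references */
		bool		release_req;	/* holders asked to let go */
	} res_t;

	/* Ask holders to release and wait until the last one has done so. */
	void
	res_purge(res_t *r)
	{
		pthread_mutex_lock(&r->lock);
		r->release_req = true;
		while (r->holds != 0)
			pthread_cond_wait(&r->cv, &r->lock);
		r->release_req = false;
		pthread_mutex_unlock(&r->lock);
	}

	/* Called by a holder when it drops its reference. */
	void
	res_release(res_t *r)
	{
		pthread_mutex_lock(&r->lock);
		if (--r->holds == 0)
			pthread_cond_broadcast(&r->cv);
		pthread_mutex_unlock(&r->lock);
	}
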
1853 1850 static int
1854 1851 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1855 1852 {
1856 1853 int err = 0;
1857 1854
1858 1855 mutex_enter(&vmm_mtx);
1859 1856 if (!enable_block) {
1860 1857 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1861 1858
1862 1859 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1863 1860 goto done;
1864 1861 }
1865 1862
1866 1863 /* If any holds have hooks installed, the block is a failure */
1867 1864 if (!list_is_empty(&sc->vmm_holds)) {
1868 1865 vmm_hold_t *hold;
1869 1866
1870 1867 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1871 1868 hold = list_next(&sc->vmm_holds, hold)) {
1872 1869 if (hold->vmh_ioport_hook_cnt != 0) {
1873 1870 err = EBUSY;
1874 1871 goto done;
1875 1872 }
1876 1873 }
1877 1874 }
1878 1875 sc->vmm_flags |= VMM_BLOCK_HOOK;
1879 1876
1880 1877 done:
1881 1878 mutex_exit(&vmm_mtx);
1882 1879 return (err);
1883 1880 }
1884 1881
1885 1882 static int
1886 1883 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1887 1884 boolean_t *hma_release)
1888 1885 {
1889 1886 dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
1890 1887 minor_t minor;
1891 1888
1892 1889 ASSERT(MUTEX_HELD(&vmm_mtx));
1893 1890
1894 1891 *hma_release = B_FALSE;
1895 1892
1896 1893 if (vmm_drv_purge(sc) != 0) {
1897 1894 return (EINTR);
1898 1895 }
1899 1896
1900 1897 if (clean_zsd) {
1901 1898 vmm_zsd_rem_vm(sc);
1902 1899 }
1903 1900
1904 1901 /* Clean up devmem entries */
1905 1902 vmmdev_devmem_purge(sc);
1906 1903
1907 1904 list_remove(&vmm_list, sc);
1908 1905 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
1909 1906 minor = sc->vmm_minor;
1910 1907 zone_rele(sc->vmm_zone);
1911 1908 if (sc->vmm_is_open) {
1912 1909 list_insert_tail(&vmm_destroy_list, sc);
1913 1910 sc->vmm_flags |= VMM_DESTROY;
1914 1911 } else {
1915 1912 vm_destroy(sc->vmm_vm);
1916 1913 vmm_kstat_fini(sc);
1917 1914 ddi_soft_state_free(vmm_statep, minor);
1918 1915 id_free(vmm_minors, minor);
1919 1916 *hma_release = B_TRUE;
1920 1917 }
1921 1918 (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
1922 1919
1923 1920 return (0);
1924 1921 }
1925 1922
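vmm_do_vm_destroy_locked() either frees the instance immediately or, when a descriptor is still open, moves it to vmm_destroy_list and sets VMM_DESTROY so the final vmm_close() completes the teardown. A compact userland sketch of that destroy-now-or-defer-to-last-close shape follows; obj_t and its helpers are invented for illustration.

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdlib.h>

	typedef struct obj {
		pthread_mutex_t	lock;
		bool		is_open;	/* a descriptor still references us */
		bool		destroy_pending;
	} obj_t;

	static void
	obj_free(obj_t *o)
	{
		pthread_mutex_destroy(&o->lock);
		free(o);
	}

	/* Request destruction: finish now, or defer to the final close. */
	void
	obj_destroy(obj_t *o)
	{
		pthread_mutex_lock(&o->lock);
		if (o->is_open) {
			o->destroy_pending = true;
			pthread_mutex_unlock(&o->lock);
			return;
		}
		pthread_mutex_unlock(&o->lock);
		obj_free(o);
	}

	/* Close path: complete a deferred destroy once the last user is gone. */
	void
	obj_close(obj_t *o)
	{
		bool do_free;

		pthread_mutex_lock(&o->lock);
		o->is_open = false;
		do_free = o->destroy_pending;
		pthread_mutex_unlock(&o->lock);

		if (do_free)
			obj_free(o);
	}
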
1926 1923 int
1927 1924 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
1928 1925 {
1929 1926 boolean_t hma_release = B_FALSE;
1930 1927 int err;
1931 1928
1932 1929 mutex_enter(&vmm_mtx);
1933 1930 err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
1934 1931 mutex_exit(&vmm_mtx);
1935 1932
1936 1933 if (hma_release)
1937 1934 vmm_hma_release();
1938 1935
1939 1936 return (err);
1940 1937 }
1941 1938
1942 1939 /* ARGSUSED */
1943 1940 static int
1944 1941 vmmdev_do_vm_destroy(const char *name, cred_t *cr)
1945 1942 {
1946 1943 boolean_t hma_release = B_FALSE;
1947 1944 vmm_softc_t *sc;
1948 1945 int err;
1949 1946
1950 1947 if (crgetuid(cr) != 0)
1951 1948 return (EPERM);
1952 1949
1953 1950 mutex_enter(&vmm_mtx);
1954 1951
1955 1952 if ((sc = vmm_lookup(name)) == NULL) {
1956 1953 mutex_exit(&vmm_mtx);
1957 1954 return (ENOENT);
1958 1955 }
1959 1956 /*
1960 1957 * We don't check this in vmm_lookup() since that function is also used
1961 1958 * for validation during create and currently vmm names must be unique.
1962 1959 */
1963 1960 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
1964 1961 mutex_exit(&vmm_mtx);
1965 1962 return (EPERM);
1966 1963 }
1967 1964 err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
1968 1965
1969 1966 mutex_exit(&vmm_mtx);
1970 1967
1971 1968 if (hma_release)
1972 1969 vmm_hma_release();
1973 1970
1974 1971 return (err);
1975 1972 }
1976 1973
1977 1974 #define VCPU_NAME_BUFLEN 32
1978 1975
1979 1976 static int
1980 1977 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
1981 1978 {
1982 1979 zoneid_t zid = crgetzoneid(cr);
1983 1980 int instance = minor;
1984 1981 kstat_t *ksp;
1985 1982
1986 1983 ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
1987 1984
1988 1985 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
1989 1986 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
1990 1987 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
1991 1988
1992 1989 if (ksp == NULL) {
1993 1990 return (-1);
1994 1991 }
1995 1992 sc->vmm_kstat_vm = ksp;
1996 1993
1997 1994 for (uint_t i = 0; i < VM_MAXCPU; i++) {
1998 1995 char namebuf[VCPU_NAME_BUFLEN];
1999 1996
2000 1997 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2001 1998
2002 1999 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2003 2000 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2004 2001 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2005 2002 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2006 2003 0, zid);
2007 2004 if (ksp == NULL) {
2008 2005 goto fail;
2009 2006 }
2010 2007
2011 2008 sc->vmm_kstat_vcpu[i] = ksp;
2012 2009 }
2013 2010
2014 2011 /*
2015 2012 * If this instance is associated with a non-global zone, make its
2016 2013 * kstats visible from the GZ.
2017 2014 */
2018 2015 if (zid != GLOBAL_ZONEID) {
2019 2016 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2020 2017 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2021 2018 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2022 2019 }
2023 2020 }
2024 2021
2025 2022 return (0);
2026 2023
2027 2024 fail:
2028 2025 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2029 2026 if (sc->vmm_kstat_vcpu[i] != NULL) {
2030 2027 kstat_delete(sc->vmm_kstat_vcpu[i]);
2031 2028 sc->vmm_kstat_vcpu[i] = NULL;
2032 2029 } else {
2033 2030 break;
2034 2031 }
2035 2032 }
2036 2033 kstat_delete(sc->vmm_kstat_vm);
2037 2034 sc->vmm_kstat_vm = NULL;
2038 2035 return (-1);
2039 2036 }
2040 2037
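vmm_kstat_alloc() creates one kstat per possible vcpu and, if any kstat_create_zone() call fails, deletes the ones already created before reporting failure. The same unwind-on-partial-failure shape in miniature, with malloc()/free() standing in for kstat creation and deletion; NITEMS and alloc_all() are illustrative.

	#include <stdlib.h>

	#define	NITEMS	8	/* stand-in for VM_MAXCPU */

	int
	alloc_all(void *items[NITEMS], size_t sz)
	{
		unsigned int i;

		for (i = 0; i < NITEMS; i++) {
			items[i] = malloc(sz);
			if (items[i] == NULL)
				break;
		}
		if (i == NITEMS)
			return (0);

		/* Unwind: free only the slots that were successfully filled. */
		while (i-- > 0) {
			free(items[i]);
			items[i] = NULL;
		}
		return (-1);
	}
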
2041 2038 static void
2042 2039 vmm_kstat_init(vmm_softc_t *sc)
2043 2040 {
2044 2041 kstat_t *ksp;
2045 2042
2046 2043 ASSERT3P(sc->vmm_vm, !=, NULL);
2047 2044 ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2048 2045
2049 2046 ksp = sc->vmm_kstat_vm;
2050 2047 vmm_kstats_t *vk = ksp->ks_data;
2051 2048 ksp->ks_private = sc->vmm_vm;
2052 2049 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2053 2050 kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2054 2051
2055 2052 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2056 2053 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2057 2054
2058 2055 ksp = sc->vmm_kstat_vcpu[i];
2059 2056 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2060 2057
2061 2058 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2062 2059 vvk->vvk_vcpu.value.ui32 = i;
2063 2060 kstat_named_init(&vvk->vvk_time_init, "time_init",
2064 2061 KSTAT_DATA_UINT64);
2065 2062 kstat_named_init(&vvk->vvk_time_run, "time_run",
2066 2063 KSTAT_DATA_UINT64);
2067 2064 kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2068 2065 KSTAT_DATA_UINT64);
2069 2066 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2070 2067 KSTAT_DATA_UINT64);
2071 2068 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2072 2069 KSTAT_DATA_UINT64);
2073 2070 kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2074 2071 KSTAT_DATA_UINT64);
2075 2072 ksp->ks_private = sc->vmm_vm;
2076 2073 ksp->ks_update = vmm_kstat_update_vcpu;
2077 2074 }
2078 2075
2079 2076 kstat_install(sc->vmm_kstat_vm);
2080 2077 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2081 2078 kstat_install(sc->vmm_kstat_vcpu[i]);
2082 2079 }
2083 2080 }
2084 2081
2085 2082 static void
2086 2083 vmm_kstat_fini(vmm_softc_t *sc)
2087 2084 {
2088 2085 ASSERT(sc->vmm_kstat_vm != NULL);
2089 2086
2090 2087 kstat_delete(sc->vmm_kstat_vm);
2091 2088 sc->vmm_kstat_vm = NULL;
2092 2089
2093 2090 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2094 2091 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2095 2092
2096 2093 kstat_delete(sc->vmm_kstat_vcpu[i]);
2097 2094 sc->vmm_kstat_vcpu[i] = NULL;
2098 2095 }
2099 2096 }
2100 2097
2101 2098 static int
2102 2099 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2103 2100 {
2104 2101 minor_t minor;
2105 2102 vmm_softc_t *sc;
2106 2103
2107 2104 minor = getminor(*devp);
2108 2105 if (minor == VMM_CTL_MINOR) {
2109 2106 /*
2110 2107 * Master control device must be opened exclusively.
2111 2108 */
2112 2109 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2113 2110 return (EINVAL);
2114 2111 }
2115 2112
2116 2113 return (0);
2117 2114 }
2118 2115
2119 2116 mutex_enter(&vmm_mtx);
2120 2117 sc = ddi_get_soft_state(vmm_statep, minor);
2121 2118 if (sc == NULL) {
2122 2119 mutex_exit(&vmm_mtx);
2123 2120 return (ENXIO);
2124 2121 }
2125 2122
2126 2123 sc->vmm_is_open = B_TRUE;
2127 2124 mutex_exit(&vmm_mtx);
2128 2125
2129 2126 return (0);
2130 2127 }
2131 2128
2132 2129 static int
2133 2130 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2134 2131 {
2135 2132 minor_t minor;
2136 2133 vmm_softc_t *sc;
2137 2134 boolean_t hma_release = B_FALSE;
2138 2135
2139 2136 minor = getminor(dev);
2140 2137 if (minor == VMM_CTL_MINOR)
2141 2138 return (0);
2142 2139
2143 2140 mutex_enter(&vmm_mtx);
2144 2141 sc = ddi_get_soft_state(vmm_statep, minor);
2145 2142 if (sc == NULL) {
2146 2143 mutex_exit(&vmm_mtx);
2147 2144 return (ENXIO);
2148 2145 }
2149 2146
2150 2147 VERIFY(sc->vmm_is_open);
2151 2148 sc->vmm_is_open = B_FALSE;
2152 2149
2153 2150 /*
2154 2151 * If this VM was destroyed while the vmm device was open, then
2155 2152 * clean it up now that it is closed.
2156 2153 */
2157 2154 if (sc->vmm_flags & VMM_DESTROY) {
2158 2155 list_remove(&vmm_destroy_list, sc);
2159 2156 vm_destroy(sc->vmm_vm);
2160 2157 ddi_soft_state_free(vmm_statep, minor);
2161 2158 id_free(vmm_minors, minor);
2162 2159 hma_release = B_TRUE;
2163 2160 }
2164 2161 mutex_exit(&vmm_mtx);
2165 2162
2166 2163 if (hma_release)
2167 2164 vmm_hma_release();
2168 2165
2169 2166 return (0);
2170 2167 }
2171 2168
2172 2169 static int
2173 2170 vmm_is_supported(intptr_t arg)
2174 2171 {
2175 2172 int r;
2176 2173 const char *msg;
2177 2174
2178 2175 if (vmm_is_intel()) {
2179 2176 r = vmx_x86_supported(&msg);
2180 2177 } else if (vmm_is_svm()) {
2181 2178 /*
2182 2179 * HMA already ensured that the features necessary for SVM
2183 2180 * operation were present and online during vmm_attach().
2184 2181 */
2185 2182 r = 0;
2186 2183 } else {
2187 2184 r = ENXIO;
2188 2185 msg = "Unsupported CPU vendor";
2189 2186 }
2190 2187
2191 2188 if (r != 0 && arg != (intptr_t)NULL) {
2192 - if (copyoutstr(msg, (char *)arg, strlen(msg), NULL) != 0)
2189 + if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2193 2190 return (EFAULT);
2194 2191 }
2195 2192 return (r);
2196 2193 }
2197 2194
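The one-byte change in the hunk above matters because the length handed to copyoutstr() must leave room for the terminating NUL, which strlen() does not count, so the old call could not copy out the full, terminated message. A loose userland illustration of the same accounting, with memcpy() standing in for the copy-out:

	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		const char *msg = "Unsupported CPU vendor";
		char out[64];

		memset(out, 'X', sizeof (out));

		/* Too short: strlen() excludes the NUL, so out[] is unterminated. */
		memcpy(out, msg, strlen(msg));

		/* Correct: one extra byte carries the terminator across. */
		memcpy(out, msg, strlen(msg) + 1);

		(void) printf("%s\n", out);
		return (0);
	}
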
2198 2195 static int
2199 2196 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2200 2197 int *rvalp)
2201 2198 {
2202 2199 vmm_softc_t *sc;
2203 2200 minor_t minor;
2204 2201
2205 2202 /* The structs in bhyve ioctls assume a 64-bit datamodel */
2206 2203 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2207 2204 return (ENOTSUP);
2208 2205 }
2209 2206
2210 2207 minor = getminor(dev);
2211 2208
2212 2209 if (minor == VMM_CTL_MINOR) {
2213 2210 void *argp = (void *)arg;
2214 2211 char name[VM_MAX_NAMELEN] = { 0 };
2215 2212 size_t len = 0;
2216 2213
2217 2214 if ((mode & FKIOCTL) != 0) {
2218 2215 len = strlcpy(name, argp, sizeof (name));
2219 2216 } else {
2220 2217 if (copyinstr(argp, name, sizeof (name), &len) != 0) {
2221 2218 return (EFAULT);
2222 2219 }
2223 2220 }
2224 2221 if (len >= VM_MAX_NAMELEN) {
2225 2222 return (ENAMETOOLONG);
2226 2223 }
2227 2224
2228 2225 switch (cmd) {
2229 2226 case VMM_CREATE_VM:
2230 2227 if ((mode & FWRITE) == 0)
2231 2228 return (EPERM);
2232 2229 return (vmmdev_do_vm_create(name, credp));
2233 2230 case VMM_DESTROY_VM:
2234 2231 if ((mode & FWRITE) == 0)
2235 2232 return (EPERM);
2236 2233 return (vmmdev_do_vm_destroy(name, credp));
2237 2234 case VMM_VM_SUPPORTED:
2238 2235 return (vmm_is_supported(arg));
2239 2236 default:
2240 2237 /* No other actions are legal on ctl device */
2241 2238 return (ENOTTY);
2242 2239 }
2243 2240 }
2244 2241
2245 2242 sc = ddi_get_soft_state(vmm_statep, minor);
2246 2243 ASSERT(sc);
2247 2244
2248 2245 if (sc->vmm_flags & VMM_DESTROY)
2249 2246 return (ENXIO);
2250 2247
2251 2248 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2252 2249 }
2253 2250
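The control-device path above copies the VM name into a fixed VM_MAX_NAMELEN buffer (strlcpy() for in-kernel callers, copyinstr() otherwise) and rejects names that would not fit with ENAMETOOLONG rather than truncating them silently. A tiny userland sketch of that bounded copy with truncation detection; NAME_BUFLEN and copy_name() are invented names.

	#include <errno.h>
	#include <stdio.h>

	#define	NAME_BUFLEN	64	/* stand-in for VM_MAX_NAMELEN */

	int
	copy_name(char dst[NAME_BUFLEN], const char *src)
	{
		/* snprintf() reports the length src needed, excluding the NUL. */
		int n = snprintf(dst, NAME_BUFLEN, "%s", src);

		if (n < 0)
			return (EINVAL);
		if (n >= NAME_BUFLEN)
			return (ENAMETOOLONG);
		return (0);
	}
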
2254 2251 static int
2255 2252 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2256 2253 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2257 2254 {
2258 2255 vmm_softc_t *sc;
2259 2256 const minor_t minor = getminor(dev);
2260 2257 struct vm *vm;
2261 2258 int err;
2262 2259 vm_object_t vmo = NULL;
2263 2260 struct vmspace *vms;
2264 2261
2265 2262 if (minor == VMM_CTL_MINOR) {
2266 2263 return (ENODEV);
2267 2264 }
2268 2265 if (off < 0 || (off + len) <= 0) {
2269 2266 return (EINVAL);
2270 2267 }
2271 2268 if ((prot & PROT_USER) == 0) {
2272 2269 return (EACCES);
2273 2270 }
2274 2271
2275 2272 sc = ddi_get_soft_state(vmm_statep, minor);
2276 2273 ASSERT(sc);
2277 2274
2278 2275 if (sc->vmm_flags & VMM_DESTROY)
2279 2276 return (ENXIO);
2280 2277
2281 2278 /* Grab read lock on the VM to prevent any changes to the memory map */
2282 2279 vmm_read_lock(sc);
2283 2280
2284 2281 vm = sc->vmm_vm;
2285 2282 vms = vm_get_vmspace(vm);
2286 2283 if (off >= VM_DEVMEM_START) {
2287 2284 int segid;
2288 2285 off_t map_off = 0;
2289 2286
2290 2287 /* Mapping a devmem "device" */
2291 2288 if (!vmmdev_devmem_segid(sc, off, len, &segid, &map_off)) {
2292 2289 err = ENODEV;
2293 2290 goto out;
2294 2291 }
2295 2292 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
2296 2293 if (err != 0) {
2297 2294 goto out;
2298 2295 }
2299 2296 err = vm_segmap_obj(vmo, map_off, len, as, addrp, prot, maxprot,
2300 2297 flags);
2301 2298 } else {
2302 2299 /* Mapping a part of the guest physical space */
2303 2300 err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
2304 2301 flags);
2305 2302 }
2306 2303
2307 2304
2308 2305 out:
2309 2306 vmm_read_unlock(sc);
2310 2307 return (err);
2311 2308 }
2312 2309
2313 2310 static sdev_plugin_validate_t
2314 2311 vmm_sdev_validate(sdev_ctx_t ctx)
2315 2312 {
2316 2313 const char *name = sdev_ctx_name(ctx);
2317 2314 vmm_softc_t *sc;
2318 2315 sdev_plugin_validate_t ret;
2319 2316 minor_t minor;
2320 2317
2321 2318 if (sdev_ctx_vtype(ctx) != VCHR)
2322 2319 return (SDEV_VTOR_INVALID);
2323 2320
2324 2321 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2325 2322
2326 2323 mutex_enter(&vmm_mtx);
2327 2324 if ((sc = vmm_lookup(name)) == NULL)
2328 2325 ret = SDEV_VTOR_INVALID;
2329 2326 else if (sc->vmm_minor != minor)
2330 2327 ret = SDEV_VTOR_STALE;
2331 2328 else
2332 2329 ret = SDEV_VTOR_VALID;
2333 2330 mutex_exit(&vmm_mtx);
2334 2331
2335 2332 return (ret);
2336 2333 }
2337 2334
2338 2335 static int
2339 2336 vmm_sdev_filldir(sdev_ctx_t ctx)
2340 2337 {
2341 2338 vmm_softc_t *sc;
2342 2339 int ret;
2343 2340
2344 2341 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2345 2342 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2346 2343 sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2347 2344 return (EINVAL);
2348 2345 }
2349 2346
2350 2347 mutex_enter(&vmm_mtx);
2351 2348 ASSERT(vmmdev_dip != NULL);
2352 2349 for (sc = list_head(&vmm_list); sc != NULL;
2353 2350 sc = list_next(&vmm_list, sc)) {
2354 2351 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2355 2352 ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2356 2353 S_IFCHR | 0600,
2357 2354 makedevice(ddi_driver_major(vmmdev_dip),
2358 2355 sc->vmm_minor));
2359 2356 } else {
2360 2357 continue;
2361 2358 }
2362 2359 if (ret != 0 && ret != EEXIST)
2363 2360 goto out;
2364 2361 }
2365 2362
2366 2363 ret = 0;
2367 2364
2368 2365 out:
2369 2366 mutex_exit(&vmm_mtx);
2370 2367 return (ret);
2371 2368 }
2372 2369
2373 2370 /* ARGSUSED */
2374 2371 static void
2375 2372 vmm_sdev_inactive(sdev_ctx_t ctx)
2376 2373 {
2377 2374 }
2378 2375
2379 2376 static sdev_plugin_ops_t vmm_sdev_ops = {
2380 2377 .spo_version = SDEV_PLUGIN_VERSION,
2381 2378 .spo_flags = SDEV_PLUGIN_SUBDIR,
2382 2379 .spo_validate = vmm_sdev_validate,
2383 2380 .spo_filldir = vmm_sdev_filldir,
2384 2381 .spo_inactive = vmm_sdev_inactive
2385 2382 };
2386 2383
2387 2384 /* ARGSUSED */
2388 2385 static int
2389 2386 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2390 2387 {
2391 2388 int error;
2392 2389
2393 2390 switch (cmd) {
2394 2391 case DDI_INFO_DEVT2DEVINFO:
2395 2392 *result = (void *)vmmdev_dip;
2396 2393 error = DDI_SUCCESS;
2397 2394 break;
2398 2395 case DDI_INFO_DEVT2INSTANCE:
2399 2396 *result = (void *)0;
2400 2397 error = DDI_SUCCESS;
2401 2398 break;
2402 2399 default:
2403 2400 error = DDI_FAILURE;
2404 2401 break;
2405 2402 }
2406 2403 return (error);
2407 2404 }
2408 2405
2409 2406 static int
2410 2407 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2411 2408 {
2412 2409 sdev_plugin_hdl_t sph;
2413 2410 hma_reg_t *reg = NULL;
2414 2411 boolean_t vmm_loaded = B_FALSE;
2415 2412
2416 2413 if (cmd != DDI_ATTACH) {
2417 2414 return (DDI_FAILURE);
2418 2415 }
2419 2416
2420 2417 mutex_enter(&vmmdev_mtx);
2421 2418 /* Ensure we are not already attached. */
2422 2419 if (vmmdev_dip != NULL) {
2423 2420 mutex_exit(&vmmdev_mtx);
2424 2421 return (DDI_FAILURE);
2425 2422 }
2426 2423
2427 2424 vmm_sol_glue_init();
2428 2425 vmm_arena_init();
2429 2426
2430 2427 /*
2431 2428 * Perform temporary HMA registration to determine if the system
2432 2429 * is capable.
2433 2430 */
2434 2431 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2435 2432 goto fail;
2436 2433 } else if (vmm_mod_load() != 0) {
2437 2434 goto fail;
2438 2435 }
2439 2436 vmm_loaded = B_TRUE;
2440 2437 hma_unregister(reg);
2441 2438 reg = NULL;
2442 2439
2443 2440 /* Create control node. Other nodes will be created on demand. */
2444 2441 if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2445 2442 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2446 2443 goto fail;
2447 2444 }
2448 2445
2449 2446 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
2450 2447 if (sph == (sdev_plugin_hdl_t)NULL) {
2451 2448 ddi_remove_minor_node(dip, NULL);
2452 2449 goto fail;
2453 2450 }
2454 2451
2455 2452 ddi_report_dev(dip);
2456 2453 vmmdev_sdev_hdl = sph;
2457 2454 vmmdev_dip = dip;
2458 2455 mutex_exit(&vmmdev_mtx);
2459 2456 return (DDI_SUCCESS);
2460 2457
2461 2458 fail:
2462 2459 if (vmm_loaded) {
2463 2460 VERIFY0(vmm_mod_unload());
2464 2461 }
2465 2462 if (reg != NULL) {
2466 2463 hma_unregister(reg);
2467 2464 }
2468 2465 vmm_arena_fini();
2469 2466 vmm_sol_glue_cleanup();
2470 2467 mutex_exit(&vmmdev_mtx);
2471 2468 return (DDI_FAILURE);
2472 2469 }
2473 2470
2474 2471 static int
2475 2472 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2476 2473 {
2477 2474 if (cmd != DDI_DETACH) {
2478 2475 return (DDI_FAILURE);
2479 2476 }
2480 2477
2481 2478 /*
2482 2479 * Ensure that all resources have been cleaned up.
2483 2480 *
2484 2481 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2485 2482 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2486 2483 * devinfo locked as iommu_cleanup() tries to recursively lock each
2487 2484 * devinfo, including our own, while holding vmmdev_mtx.
2488 2485 */
2489 2486 if (mutex_tryenter(&vmmdev_mtx) == 0)
2490 2487 return (DDI_FAILURE);
2491 2488
2492 2489 mutex_enter(&vmm_mtx);
2493 2490 if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2494 2491 mutex_exit(&vmm_mtx);
2495 2492 mutex_exit(&vmmdev_mtx);
2496 2493 return (DDI_FAILURE);
2497 2494 }
2498 2495 mutex_exit(&vmm_mtx);
2499 2496
2500 2497 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2501 2498 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2502 2499 mutex_exit(&vmmdev_mtx);
2503 2500 return (DDI_FAILURE);
2504 2501 }
2505 2502 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2506 2503
2507 2504 /* Remove the control node. */
2508 2505 ddi_remove_minor_node(dip, "ctl");
2509 2506 vmmdev_dip = NULL;
2510 2507
2511 2508 VERIFY0(vmm_mod_unload());
2512 2509 VERIFY3U(vmmdev_hma_reg, ==, NULL);
2513 2510 vmm_arena_fini();
2514 2511 vmm_sol_glue_cleanup();
2515 2512
2516 2513 mutex_exit(&vmmdev_mtx);
2517 2514
2518 2515 return (DDI_SUCCESS);
2519 2516 }
2520 2517
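vmm_detach() cannot block waiting for vmmdev_mtx while its devinfo node is locked, since iommu_cleanup() takes the locks in the opposite order, so it uses mutex_tryenter() and simply fails the detach when the mutex is busy. A userland sketch of the same back-off-instead-of-blocking idea with pthread_mutex_trylock(); big_lock and teardown_try() are illustrative.

	#include <errno.h>
	#include <pthread.h>

	static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

	int
	teardown_try(void)
	{
		/*
		 * A holder of big_lock may be waiting on a resource we already
		 * own; blocking here could deadlock, so back off and let the
		 * caller retry the teardown later.
		 */
		if (pthread_mutex_trylock(&big_lock) != 0)
			return (EBUSY);

		/* ... perform the teardown work ... */

		pthread_mutex_unlock(&big_lock);
		return (0);
	}
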
2521 2518 static struct cb_ops vmm_cb_ops = {
2522 2519 vmm_open,
2523 2520 vmm_close,
2524 2521 nodev, /* strategy */
2525 2522 nodev, /* print */
2526 2523 nodev, /* dump */
2527 2524 nodev, /* read */
2528 2525 nodev, /* write */
2529 2526 vmm_ioctl,
2530 2527 nodev, /* devmap */
2531 2528 nodev, /* mmap */
2532 2529 vmm_segmap,
2533 2530 nochpoll, /* poll */
2534 2531 ddi_prop_op,
2535 2532 NULL,
2536 2533 D_NEW | D_MP | D_DEVMAP
2537 2534 };
2538 2535
2539 2536 static struct dev_ops vmm_ops = {
2540 2537 DEVO_REV,
2541 2538 0,
2542 2539 vmm_info,
2543 2540 nulldev, /* identify */
2544 2541 nulldev, /* probe */
2545 2542 vmm_attach,
2546 2543 vmm_detach,
2547 2544 nodev, /* reset */
2548 2545 &vmm_cb_ops,
2549 2546 (struct bus_ops *)NULL
2550 2547 };
2551 2548
2552 2549 static struct modldrv modldrv = {
2553 2550 &mod_driverops,
2554 2551 "bhyve vmm",
2555 2552 &vmm_ops
2556 2553 };
2557 2554
2558 2555 static struct modlinkage modlinkage = {
2559 2556 MODREV_1,
2560 2557 &modldrv,
2561 2558 NULL
2562 2559 };
2563 2560
2564 2561 int
2565 2562 _init(void)
2566 2563 {
2567 2564 int error;
2568 2565
2569 2566 sysinit();
2570 2567
2571 2568 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2572 2569 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2573 2570 list_create(&vmm_list, sizeof (vmm_softc_t),
2574 2571 offsetof(vmm_softc_t, vmm_node));
2575 2572 list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2576 2573 offsetof(vmm_softc_t, vmm_node));
2577 2574 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2578 2575
2579 2576 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2580 2577 if (error) {
2581 2578 return (error);
2582 2579 }
2583 2580
2584 2581 vmm_zsd_init();
2585 2582
2586 2583 error = mod_install(&modlinkage);
2587 2584 if (error) {
2588 2585 ddi_soft_state_fini(&vmm_statep);
2589 2586 vmm_zsd_fini();
2590 2587 }
2591 2588
2592 2589 return (error);
2593 2590 }
2594 2591
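_init() builds up several pieces of global state and, if the final mod_install() fails, tears back down the pieces that had already been set up. A compact userland sketch of that initialize-or-roll-back shape; the subsystem names below are invented for illustration.

	#include <stdbool.h>

	static bool state_a_up, state_b_up;

	static int setup_a(void) { state_a_up = true; return (0); }
	static int setup_b(void) { state_b_up = true; return (0); }
	static void fini_a(void) { state_a_up = false; }
	static void fini_b(void) { state_b_up = false; }
	static int register_driver(void) { return (0); }	/* may fail */

	int
	module_init(void)
	{
		int err;

		if ((err = setup_a()) != 0)
			return (err);
		if ((err = setup_b()) != 0) {
			fini_a();
			return (err);
		}
		if ((err = register_driver()) != 0) {
			/* Roll back in reverse order of setup. */
			fini_b();
			fini_a();
			return (err);
		}
		return (0);
	}
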
2595 2592 int
2596 2593 _fini(void)
2597 2594 {
2598 2595 int error;
2599 2596
2600 2597 error = mod_remove(&modlinkage);
2601 2598 if (error) {
2602 2599 return (error);
2603 2600 }
2604 2601
2605 2602 vmm_zsd_fini();
2606 2603
2607 2604 ddi_soft_state_fini(&vmm_statep);
2608 2605
2609 2606 return (0);
2610 2607 }
2611 2608
2612 2609 int
2613 2610 _info(struct modinfo *modinfop)
2614 2611 {
2615 2612 return (mod_info(&modlinkage, modinfop));
2616 2613 }