13902 Fix for 13717 may break 8-disk raidz2
13915 installctx() blocking allocate causes problems
Portions contributed by: Jerry Jelinek <gjelinek@gmail.com>
Change-Id: I934d69946cec42630fc541fa8c7385b862b69ca2
--- old/usr/src/uts/i86pc/io/vmm/vmm.c
+++ new/usr/src/uts/i86pc/io/vmm/vmm.c
1 1 /*-
2 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 3 *
4 4 * Copyright (c) 2011 NetApp, Inc.
5 5 * All rights reserved.
6 6 *
7 7 * Redistribution and use in source and binary forms, with or without
8 8 * modification, are permitted provided that the following conditions
9 9 * are met:
10 10 * 1. Redistributions of source code must retain the above copyright
11 11 * notice, this list of conditions and the following disclaimer.
12 12 * 2. Redistributions in binary form must reproduce the above copyright
13 13 * notice, this list of conditions and the following disclaimer in the
14 14 * documentation and/or other materials provided with the distribution.
15 15 *
16 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 26 * SUCH DAMAGE.
27 27 *
28 28 * $FreeBSD$
29 29 */
30 30 /*
31 31 * This file and its contents are supplied under the terms of the
32 32 * Common Development and Distribution License ("CDDL"), version 1.0.
33 33 * You may only use this file in accordance with the terms of version
34 34 * 1.0 of the CDDL.
35 35 *
36 36 * A full copy of the text of the CDDL should have accompanied this
37 37 * source. A copy of the CDDL is also available via the Internet at
38 38 * http://www.illumos.org/license/CDDL.
39 39 *
40 40 * Copyright 2015 Pluribus Networks Inc.
41 41 * Copyright 2018 Joyent, Inc.
42 42 * Copyright 2021 Oxide Computer Company
43 43 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
44 44 */
45 45
46 46 #include <sys/cdefs.h>
47 47 __FBSDID("$FreeBSD$");
48 48
49 49 #include <sys/param.h>
50 50 #include <sys/systm.h>
51 51 #include <sys/kernel.h>
52 52 #include <sys/module.h>
53 53 #include <sys/sysctl.h>
54 54 #include <sys/malloc.h>
55 55 #include <sys/pcpu.h>
56 56 #include <sys/lock.h>
57 57 #include <sys/mutex.h>
58 58 #include <sys/proc.h>
59 59 #include <sys/rwlock.h>
60 60 #include <sys/sched.h>
61 61 #include <sys/smp.h>
62 62 #include <sys/systm.h>
63 63
64 64 #include <machine/pcb.h>
65 65 #include <machine/smp.h>
66 66 #include <machine/md_var.h>
67 67 #include <x86/psl.h>
68 68 #include <x86/apicreg.h>
69 69
70 70 #include <machine/specialreg.h>
71 71 #include <machine/vmm.h>
72 72 #include <machine/vmm_dev.h>
73 73 #include <machine/vmparam.h>
74 74 #include <sys/vmm_instruction_emul.h>
75 75 #include <sys/vmm_vm.h>
76 76
77 77 #include "vmm_ioport.h"
78 78 #include "vmm_ktr.h"
79 79 #include "vmm_host.h"
80 80 #include "vmm_mem.h"
81 81 #include "vmm_util.h"
82 82 #include "vatpic.h"
83 83 #include "vatpit.h"
84 84 #include "vhpet.h"
85 85 #include "vioapic.h"
86 86 #include "vlapic.h"
87 87 #include "vpmtmr.h"
88 88 #include "vrtc.h"
89 89 #include "vmm_stat.h"
90 90 #include "vmm_lapic.h"
91 91
92 92 #include "io/ppt.h"
93 93 #include "io/iommu.h"
94 94
95 95 struct vlapic;
96 96
97 97 /*
98 98 * Initialization:
99 99 * (a) allocated when vcpu is created
100 100 * (i) initialized when vcpu is created and when it is reinitialized
101 101 * (o) initialized the first time the vcpu is created
102 102 * (x) initialized before use
103 103 */
104 104 struct vcpu {
105 105 /* (o) protects state, run_state, hostcpu, sipi_vector */
106 106 struct mtx mtx;
107 107
108 108 enum vcpu_state state; /* (o) vcpu state */
109 109 enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */
110 110 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */
111 111 kcondvar_t state_cv; /* (o) IDLE-transition cv */
112 112 int hostcpu; /* (o) vcpu's current host cpu */
113 113 int lastloccpu; /* (o) last host cpu localized to */
114 114 int reqidle; /* (i) request vcpu to idle */
115 115 struct vlapic *vlapic; /* (i) APIC device model */
116 116 enum x2apic_state x2apic_state; /* (i) APIC mode */
117 117 uint64_t exitintinfo; /* (i) events pending at VM exit */
118 118 int nmi_pending; /* (i) NMI pending */
119 119 int extint_pending; /* (i) INTR pending */
120 120 int exception_pending; /* (i) exception pending */
121 121 int exc_vector; /* (x) exception collateral */
122 122 int exc_errcode_valid;
123 123 uint32_t exc_errcode;
124 124 uint8_t sipi_vector; /* (i) SIPI vector */
125 125 struct savefpu *guestfpu; /* (a,i) guest fpu state */
126 126 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
127 127 void *stats; /* (a,i) statistics */
128 128 struct vm_exit exitinfo; /* (x) exit reason and collateral */
129 129 uint64_t nextrip; /* (x) next instruction to execute */
130 130 struct vie *vie_ctx; /* (x) instruction emulation context */
131 131 uint64_t tsc_offset; /* (x) offset from host TSC */
132 132
133 133 enum vcpu_ustate ustate; /* (i) microstate for the vcpu */
134 134 hrtime_t ustate_when; /* (i) time of last ustate change */
135 135 uint64_t ustate_total[VU_MAX]; /* (o) total time spent in ustates */
136 136 };
137 137
138 138 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
139 139 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
140 140 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
141 141 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
142 142 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
143 143
144 144 struct mem_seg {
145 145 size_t len;
146 146 bool sysmem;
147 147 struct vm_object *object;
148 148 };
149 149 #define VM_MAX_MEMSEGS 4
150 150
151 151 struct mem_map {
152 152 vm_paddr_t gpa;
153 153 size_t len;
154 154 vm_ooffset_t segoff;
155 155 int segid;
156 156 int prot;
157 157 int flags;
158 158 };
159 159 #define VM_MAX_MEMMAPS 8
160 160
161 161 /*
162 162 * Initialization:
163 163 * (o) initialized the first time the VM is created
164 164 * (i) initialized when VM is created and when it is reinitialized
165 165 * (x) initialized before use
166 166 */
167 167 struct vm {
168 168 void *cookie; /* (i) cpu-specific data */
169 169 void *iommu; /* (x) iommu-specific data */
170 170 struct vhpet *vhpet; /* (i) virtual HPET */
171 171 struct vioapic *vioapic; /* (i) virtual ioapic */
172 172 struct vatpic *vatpic; /* (i) virtual atpic */
173 173 struct vatpit *vatpit; /* (i) virtual atpit */
174 174 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
175 175 struct vrtc *vrtc; /* (o) virtual RTC */
176 176 volatile cpuset_t active_cpus; /* (i) active vcpus */
177 177 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */
178 178 int suspend; /* (i) stop VM execution */
179 179 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
180 180 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
181 181 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
182 182 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
183 183 struct vmspace *vmspace; /* (o) guest's address space */
184 184 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
185 185 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
186 186 /* The following describe the vm cpu topology */
187 187 uint16_t sockets; /* (o) num of sockets */
188 188 uint16_t cores; /* (o) num of cores/socket */
189 189 uint16_t threads; /* (o) num of threads/core */
190 190 uint16_t maxcpus; /* (o) max pluggable cpus */
191 191 uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */
192 192
193 193 struct ioport_config ioports; /* (o) ioport handling */
194 194 };
195 195
196 196 static int vmm_initialized;
197 197
198 198
199 199 static void
200 200 nullop_panic(void)
201 201 {
202 202 panic("null vmm operation call");
203 203 }
204 204
205 205 /* Do not allow use of an un-set `ops` to do anything but panic */
206 206 static struct vmm_ops vmm_ops_null = {
207 207 .init = (vmm_init_func_t)nullop_panic,
208 208 .cleanup = (vmm_cleanup_func_t)nullop_panic,
209 209 .resume = (vmm_resume_func_t)nullop_panic,
210 210 .vminit = (vmi_init_func_t)nullop_panic,
211 211 .vmrun = (vmi_run_func_t)nullop_panic,
212 212 .vmcleanup = (vmi_cleanup_func_t)nullop_panic,
213 213 .vmgetreg = (vmi_get_register_t)nullop_panic,
214 214 .vmsetreg = (vmi_set_register_t)nullop_panic,
215 215 .vmgetdesc = (vmi_get_desc_t)nullop_panic,
216 216 .vmsetdesc = (vmi_set_desc_t)nullop_panic,
217 217 .vmgetcap = (vmi_get_cap_t)nullop_panic,
218 218 .vmsetcap = (vmi_set_cap_t)nullop_panic,
219 219 .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic,
220 220 .vmspace_free = (vmi_vmspace_free)nullop_panic,
221 221 .vlapic_init = (vmi_vlapic_init)nullop_panic,
222 222 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
223 223 .vmsavectx = (vmi_savectx)nullop_panic,
224 224 .vmrestorectx = (vmi_restorectx)nullop_panic,
225 225 };
226 226
227 227 static struct vmm_ops *ops = &vmm_ops_null;
228 228
229 229 #define VMM_INIT(num) ((*ops->init)(num))
230 230 #define VMM_CLEANUP() ((*ops->cleanup)())
231 231 #define VMM_RESUME() ((*ops->resume)())
232 232
233 233 #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
234 234 #define VMRUN(vmi, vcpu, rip, pmap) \
235 235 ((*ops->vmrun)(vmi, vcpu, rip, pmap))
236 236 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
237 237 #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
238 238 #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
239 239
240 240 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
241 241 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
242 242 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
243 243 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
244 244 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv))
245 245 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val))
246 246 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu))
247 247 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic))
248 248
249 249 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
250 250 #define fpu_stop_emulating() clts()
251 251
252 252 SDT_PROVIDER_DEFINE(vmm);
253 253
254 254 static MALLOC_DEFINE(M_VM, "vm", "vm");
255 255
256 256 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
257 257 NULL);
258 258
259 259 /*
260 260 * Halt the guest if all vcpus are executing a HLT instruction with
261 261 * interrupts disabled.
262 262 */
263 263 static int halt_detection_enabled = 1;
264 264
265 265 /* IPI vector used for vcpu notifications */
266 266 static int vmm_ipinum;
267 267
268 268 /* Trap into hypervisor on all guest exceptions and reflect them back */
269 269 static int trace_guest_exceptions;
270 270
271 271 static void vm_free_memmap(struct vm *vm, int ident);
272 272 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
273 273 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
274 274 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
275 275 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
276 276
277 277 /* Flags for vtc_status */
278 278 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */
279 279 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */
280 280
281 281 typedef struct vm_thread_ctx {
282 282 struct vm *vtc_vm;
283 283 int vtc_vcpuid;
284 284 uint_t vtc_status;
285 285 enum vcpu_ustate vtc_ustate;
286 286 } vm_thread_ctx_t;
287 287
288 288 #ifdef KTR
289 289 static const char *
290 290 vcpu_state2str(enum vcpu_state state)
291 291 {
292 292
293 293 switch (state) {
294 294 case VCPU_IDLE:
295 295 return ("idle");
296 296 case VCPU_FROZEN:
297 297 return ("frozen");
298 298 case VCPU_RUNNING:
299 299 return ("running");
300 300 case VCPU_SLEEPING:
301 301 return ("sleeping");
302 302 default:
303 303 return ("unknown");
304 304 }
305 305 }
306 306 #endif
307 307
308 308 static void
309 309 vcpu_cleanup(struct vm *vm, int i, bool destroy)
310 310 {
311 311 struct vcpu *vcpu = &vm->vcpu[i];
312 312
313 313 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
314 314 if (destroy) {
315 315 vmm_stat_free(vcpu->stats);
316 316 fpu_save_area_free(vcpu->guestfpu);
317 317 vie_free(vcpu->vie_ctx);
318 318 vcpu->vie_ctx = NULL;
319 319 }
320 320 }
321 321
322 322 static void
323 323 vcpu_init(struct vm *vm, int vcpu_id, bool create)
324 324 {
325 325 struct vcpu *vcpu;
326 326
327 327 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
328 328 ("vcpu_init: invalid vcpu %d", vcpu_id));
329 329
330 330 vcpu = &vm->vcpu[vcpu_id];
331 331
332 332 if (create) {
333 333 vcpu_lock_init(vcpu);
334 334 vcpu->state = VCPU_IDLE;
335 335 vcpu->hostcpu = NOCPU;
336 336 vcpu->lastloccpu = NOCPU;
337 337 vcpu->guestfpu = fpu_save_area_alloc();
338 338 vcpu->stats = vmm_stat_alloc();
339 339 vcpu->vie_ctx = vie_alloc();
340 340
341 341 vcpu->ustate = VU_INIT;
342 342 vcpu->ustate_when = gethrtime();
343 343 } else {
344 344 vie_reset(vcpu->vie_ctx);
345 345 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
346 346 if (vcpu->ustate != VU_INIT) {
347 347 vcpu_ustate_change(vm, vcpu_id, VU_INIT);
348 348 }
349 349 }
350 350
351 351 vcpu->run_state = VRS_HALT;
352 352 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
353 353 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
354 354 vcpu->reqidle = 0;
355 355 vcpu->exitintinfo = 0;
356 356 vcpu->nmi_pending = 0;
357 357 vcpu->extint_pending = 0;
358 358 vcpu->exception_pending = 0;
359 359 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
360 360 fpu_save_area_reset(vcpu->guestfpu);
361 361 vmm_stat_init(vcpu->stats);
362 362 vcpu->tsc_offset = 0;
363 363 }
364 364
365 365 int
366 366 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
367 367 {
368 368
369 369 return (trace_guest_exceptions);
370 370 }
371 371
372 372 struct vm_exit *
373 373 vm_exitinfo(struct vm *vm, int cpuid)
374 374 {
375 375 struct vcpu *vcpu;
376 376
377 377 if (cpuid < 0 || cpuid >= vm->maxcpus)
378 378 panic("vm_exitinfo: invalid cpuid %d", cpuid);
379 379
380 380 vcpu = &vm->vcpu[cpuid];
381 381
382 382 return (&vcpu->exitinfo);
383 383 }
384 384
385 385 struct vie *
386 386 vm_vie_ctx(struct vm *vm, int cpuid)
387 387 {
388 388 if (cpuid < 0 || cpuid >= vm->maxcpus)
389 389 panic("vm_vie_ctx: invalid cpuid %d", cpuid);
390 390
391 391 return (vm->vcpu[cpuid].vie_ctx);
392 392 }
393 393
394 394 static int
395 395 vmm_init(void)
396 396 {
397 397 int error;
398 398
399 399 vmm_host_state_init();
400 400
401 401 /* We use cpu_poke() for IPIs */
402 402 vmm_ipinum = 0;
403 403
404 404 error = vmm_mem_init();
405 405 if (error)
406 406 return (error);
407 407
408 408 if (vmm_is_intel())
409 409 ops = &vmm_ops_intel;
410 410 else if (vmm_is_svm())
411 411 ops = &vmm_ops_amd;
412 412 else
413 413 return (ENXIO);
414 414
415 415 return (VMM_INIT(vmm_ipinum));
416 416 }
417 417
418 418 int
419 419 vmm_mod_load()
420 420 {
421 421 int error;
422 422
423 423 VERIFY(vmm_initialized == 0);
424 424
425 425 error = vmm_init();
426 426 if (error == 0)
427 427 vmm_initialized = 1;
428 428
429 429 return (error);
430 430 }
431 431
432 432 int
433 433 vmm_mod_unload()
434 434 {
435 435 int error;
436 436
437 437 VERIFY(vmm_initialized == 1);
438 438
439 439 iommu_cleanup();
440 440 error = VMM_CLEANUP();
441 441 if (error)
442 442 return (error);
443 443 vmm_initialized = 0;
444 444
445 445 return (0);
446 446 }
447 447
448 448 static void
449 449 vm_init(struct vm *vm, bool create)
450 450 {
451 451 int i;
452 452
453 453 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
454 454 vm->iommu = NULL;
455 455 vm->vioapic = vioapic_init(vm);
456 456 vm->vhpet = vhpet_init(vm);
457 457 vm->vatpic = vatpic_init(vm);
458 458 vm->vatpit = vatpit_init(vm);
459 459 vm->vpmtmr = vpmtmr_init(vm);
460 460 if (create)
461 461 vm->vrtc = vrtc_init(vm);
462 462
463 463 vm_inout_init(vm, &vm->ioports);
464 464
465 465 CPU_ZERO(&vm->active_cpus);
466 466 CPU_ZERO(&vm->debug_cpus);
467 467
468 468 vm->suspend = 0;
469 469 CPU_ZERO(&vm->suspended_cpus);
470 470
471 471 for (i = 0; i < vm->maxcpus; i++)
472 472 vcpu_init(vm, i, create);
473 473
474 474 /*
475 475 * Configure the VM-wide TSC offset so that the call to vm_init()
476 476 * represents the boot time (when the TSC(s) read 0). Each vCPU will
477 477 * have its own offset from this, which is altered if/when the guest
478 478 * writes to MSR_TSC.
479 479 *
480 480 * The TSC offsetting math is all unsigned, using overflow for negative
481 481 * offsets. A reading of the TSC is negated to form the boot offset.
482 482 */
483 483 vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
484 484 }
485 485
486 486 /*
487 487 * The default CPU topology is a single thread per package.
488 488 */
489 489 uint_t cores_per_package = 1;
490 490 uint_t threads_per_core = 1;
491 491
492 492 int
493 493 vm_create(const char *name, struct vm **retvm)
494 494 {
495 495 struct vm *vm;
496 496 struct vmspace *vmspace;
497 497
498 498 /*
499 499 * If vmm.ko could not be successfully initialized then don't attempt
500 500 * to create the virtual machine.
501 501 */
502 502 if (!vmm_initialized)
503 503 return (ENXIO);
504 504
505 505 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
506 506 return (EINVAL);
507 507
508 508 vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
509 509 if (vmspace == NULL)
510 510 return (ENOMEM);
511 511
512 512 vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
513 513 strcpy(vm->name, name);
514 514 vm->vmspace = vmspace;
515 515
516 516 vm->sockets = 1;
517 517 vm->cores = cores_per_package; /* XXX backwards compatibility */
518 518 vm->threads = threads_per_core; /* XXX backwards compatibility */
519 519 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
520 520
521 521 vm_init(vm, true);
522 522
523 523 *retvm = vm;
524 524 return (0);
525 525 }
526 526
527 527 void
528 528 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
529 529 uint16_t *threads, uint16_t *maxcpus)
530 530 {
531 531 *sockets = vm->sockets;
532 532 *cores = vm->cores;
533 533 *threads = vm->threads;
534 534 *maxcpus = vm->maxcpus;
535 535 }
536 536
537 537 uint16_t
538 538 vm_get_maxcpus(struct vm *vm)
539 539 {
540 540 return (vm->maxcpus);
541 541 }
542 542
543 543 int
544 544 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
545 545 uint16_t threads, uint16_t maxcpus)
546 546 {
547 547 if (maxcpus != 0)
548 548 return (EINVAL); /* XXX remove when supported */
549 549 if ((sockets * cores * threads) > vm->maxcpus)
550 550 return (EINVAL);
551 551 /* XXX need to check sockets * cores * threads == vCPU, how? */
552 552 vm->sockets = sockets;
553 553 vm->cores = cores;
554 554 vm->threads = threads;
555 555 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
556 556 return (0);
557 557 }
558 558
559 559 static void
560 560 vm_cleanup(struct vm *vm, bool destroy)
561 561 {
562 562 struct mem_map *mm;
563 563 int i;
564 564
565 565 ppt_unassign_all(vm);
566 566
567 567 if (vm->iommu != NULL)
568 568 iommu_destroy_domain(vm->iommu);
569 569
570 570 /*
571 571 * Devices which attach their own ioport hooks should be cleaned up
572 572 * first so they can tear down those registrations.
573 573 */
574 574 vpmtmr_cleanup(vm->vpmtmr);
575 575
576 576 vm_inout_cleanup(vm, &vm->ioports);
577 577
578 578 if (destroy)
579 579 vrtc_cleanup(vm->vrtc);
580 580 else
581 581 vrtc_reset(vm->vrtc);
582 582
583 583 vatpit_cleanup(vm->vatpit);
584 584 vhpet_cleanup(vm->vhpet);
585 585 vatpic_cleanup(vm->vatpic);
586 586 vioapic_cleanup(vm->vioapic);
587 587
588 588 for (i = 0; i < vm->maxcpus; i++)
589 589 vcpu_cleanup(vm, i, destroy);
590 590
591 591 VMCLEANUP(vm->cookie);
592 592
593 593 /*
594 594 * System memory is removed from the guest address space only when
595 595 * the VM is destroyed. This is because the mapping remains the same
596 596 * across VM reset.
597 597 *
598 598 * Device memory can be relocated by the guest (e.g. using PCI BARs)
599 599 * so those mappings are removed on a VM reset.
600 600 */
601 601 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
602 602 mm = &vm->mem_maps[i];
603 603 if (destroy || !sysmem_mapping(vm, mm)) {
604 604 vm_free_memmap(vm, i);
605 605 } else {
606 606 /*
607 607 * We need to reset the IOMMU flag so this mapping can
608 608 * be reused when a VM is rebooted. Since the IOMMU
609 609 * domain has already been destroyed we can just reset
610 610 * the flag here.
611 611 */
612 612 mm->flags &= ~VM_MEMMAP_F_IOMMU;
613 613 }
614 614 }
615 615
616 616 if (destroy) {
617 617 for (i = 0; i < VM_MAX_MEMSEGS; i++)
618 618 vm_free_memseg(vm, i);
619 619
620 620 VMSPACE_FREE(vm->vmspace);
621 621 vm->vmspace = NULL;
622 622 }
623 623 }
624 624
625 625 void
626 626 vm_destroy(struct vm *vm)
627 627 {
628 628 vm_cleanup(vm, true);
629 629 free(vm, M_VM);
630 630 }
631 631
632 632 int
633 633 vm_reinit(struct vm *vm)
634 634 {
635 635 int error;
636 636
637 637 /*
638 638 * A virtual machine can be reset only if all vcpus are suspended.
639 639 */
640 640 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
641 641 vm_cleanup(vm, false);
642 642 vm_init(vm, false);
643 643 error = 0;
644 644 } else {
645 645 error = EBUSY;
646 646 }
647 647
648 648 return (error);
649 649 }
650 650
651 651 const char *
652 652 vm_name(struct vm *vm)
653 653 {
654 654 return (vm->name);
655 655 }
656 656
657 657 int
658 658 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
659 659 {
660 660 vm_object_t obj;
661 661
662 662 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
663 663 return (ENOMEM);
664 664 else
665 665 return (0);
666 666 }
667 667
668 668 int
669 669 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
670 670 {
671 671 return (vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len));
672 672 }
673 673
674 674 /*
675 675 * Return 'true' if 'gpa' is allocated in the guest address space.
676 676 *
677 677 * This function is called in the context of a running vcpu which acts as
678 678 * an implicit lock on 'vm->mem_maps[]'.
679 679 */
680 680 bool
681 681 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
682 682 {
683 683 struct mem_map *mm;
684 684 int i;
685 685
686 686 #ifdef INVARIANTS
687 687 int hostcpu, state;
688 688 state = vcpu_get_state(vm, vcpuid, &hostcpu);
689 689 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
690 690 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
691 691 #endif
692 692
693 693 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
694 694 mm = &vm->mem_maps[i];
695 695 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
696 696 return (true); /* 'gpa' is sysmem or devmem */
697 697 }
698 698
699 699 if (ppt_is_mmio(vm, gpa))
700 700 return (true); /* 'gpa' is pci passthru mmio */
701 701
702 702 return (false);
703 703 }
704 704
705 705 int
706 706 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
707 707 {
708 708 struct mem_seg *seg;
709 709 vm_object_t obj;
710 710
711 711 #ifndef __FreeBSD__
712 712 extern pgcnt_t get_max_page_get(void);
713 713 #endif
714 714
715 715 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
716 716 return (EINVAL);
717 717
718 718 if (len == 0 || (len & PAGE_MASK))
719 719 return (EINVAL);
720 720
721 721 #ifndef __FreeBSD__
722 722 if (len > ptob(get_max_page_get()))
723 723 return (EINVAL);
724 724 #endif
725 725
726 726 seg = &vm->mem_segs[ident];
727 727 if (seg->object != NULL) {
728 728 if (seg->len == len && seg->sysmem == sysmem)
729 729 return (EEXIST);
730 730 else
731 731 return (EINVAL);
732 732 }
733 733
734 734 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
735 735 if (obj == NULL)
736 736 return (ENOMEM);
737 737
738 738 seg->len = len;
739 739 seg->object = obj;
740 740 seg->sysmem = sysmem;
741 741 return (0);
742 742 }
743 743
744 744 int
745 745 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
746 746 vm_object_t *objptr)
747 747 {
748 748 struct mem_seg *seg;
749 749
750 750 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
751 751 return (EINVAL);
752 752
753 753 seg = &vm->mem_segs[ident];
754 754 if (len)
755 755 *len = seg->len;
756 756 if (sysmem)
757 757 *sysmem = seg->sysmem;
758 758 if (objptr)
759 759 *objptr = seg->object;
760 760 return (0);
761 761 }
762 762
763 763 void
764 764 vm_free_memseg(struct vm *vm, int ident)
765 765 {
766 766 struct mem_seg *seg;
767 767
768 768 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
769 769 ("%s: invalid memseg ident %d", __func__, ident));
770 770
771 771 seg = &vm->mem_segs[ident];
772 772 if (seg->object != NULL) {
773 773 vm_object_deallocate(seg->object);
774 774 bzero(seg, sizeof (struct mem_seg));
775 775 }
776 776 }
777 777
778 778 int
779 779 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
780 780 size_t len, int prot, int flags)
781 781 {
782 782 struct mem_seg *seg;
783 783 struct mem_map *m, *map;
784 784 vm_ooffset_t last;
785 785 int i, error;
786 786
787 787 if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
788 788 return (EINVAL);
789 789
790 790 if (flags & ~VM_MEMMAP_F_WIRED)
791 791 return (EINVAL);
792 792
793 793 if (segid < 0 || segid >= VM_MAX_MEMSEGS)
794 794 return (EINVAL);
795 795
796 796 seg = &vm->mem_segs[segid];
797 797 if (seg->object == NULL)
798 798 return (EINVAL);
799 799
800 800 last = first + len;
801 801 if (first < 0 || first >= last || last > seg->len)
802 802 return (EINVAL);
803 803
804 804 if ((gpa | first | last) & PAGE_MASK)
805 805 return (EINVAL);
806 806
807 807 map = NULL;
808 808 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
809 809 m = &vm->mem_maps[i];
810 810 if (m->len == 0) {
811 811 map = m;
812 812 break;
813 813 }
814 814 }
815 815
816 816 if (map == NULL)
817 817 return (ENOSPC);
818 818
819 819 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
820 820 len, 0, VMFS_NO_SPACE, prot, prot, 0);
821 821 if (error != 0)
822 822 return (EFAULT);
823 823
824 824 vm_object_reference(seg->object);
825 825
826 826 if ((flags & VM_MEMMAP_F_WIRED) != 0) {
827 827 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
828 828 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
829 829 if (error != 0) {
830 830 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
831 831 return (EFAULT);
832 832 }
833 833 }
834 834
835 835 map->gpa = gpa;
836 836 map->len = len;
837 837 map->segoff = first;
838 838 map->segid = segid;
839 839 map->prot = prot;
840 840 map->flags = flags;
841 841 return (0);
842 842 }
843 843
844 844 int
845 845 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
846 846 {
847 847 struct mem_map *m;
848 848 int i;
849 849
850 850 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
851 851 m = &vm->mem_maps[i];
852 852 if (m->gpa == gpa && m->len == len &&
853 853 (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
854 854 vm_free_memmap(vm, i);
855 855 return (0);
856 856 }
857 857 }
858 858
859 859 return (EINVAL);
860 860 }
861 861
862 862 int
863 863 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
864 864 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
865 865 {
866 866 struct mem_map *mm, *mmnext;
867 867 int i;
868 868
869 869 mmnext = NULL;
870 870 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
871 871 mm = &vm->mem_maps[i];
872 872 if (mm->len == 0 || mm->gpa < *gpa)
873 873 continue;
874 874 if (mmnext == NULL || mm->gpa < mmnext->gpa)
875 875 mmnext = mm;
876 876 }
877 877
878 878 if (mmnext != NULL) {
879 879 *gpa = mmnext->gpa;
880 880 if (segid)
881 881 *segid = mmnext->segid;
882 882 if (segoff)
883 883 *segoff = mmnext->segoff;
884 884 if (len)
885 885 *len = mmnext->len;
886 886 if (prot)
887 887 *prot = mmnext->prot;
888 888 if (flags)
889 889 *flags = mmnext->flags;
890 890 return (0);
891 891 } else {
892 892 return (ENOENT);
893 893 }
894 894 }
895 895
896 896 static void
897 897 vm_free_memmap(struct vm *vm, int ident)
898 898 {
899 899 struct mem_map *mm;
900 900 int error;
901 901
902 902 mm = &vm->mem_maps[ident];
903 903 if (mm->len) {
904 904 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
905 905 mm->gpa + mm->len);
906 906 KASSERT(error == 0, ("%s: vm_map_remove error %d",
907 907 __func__, error));
908 908 bzero(mm, sizeof (struct mem_map));
909 909 }
910 910 }
911 911
912 912 static __inline bool
913 913 sysmem_mapping(struct vm *vm, struct mem_map *mm)
914 914 {
915 915
916 916 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
917 917 return (true);
918 918 else
919 919 return (false);
920 920 }
921 921
922 922 vm_paddr_t
923 923 vmm_sysmem_maxaddr(struct vm *vm)
924 924 {
925 925 struct mem_map *mm;
926 926 vm_paddr_t maxaddr;
927 927 int i;
928 928
929 929 maxaddr = 0;
930 930 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
931 931 mm = &vm->mem_maps[i];
932 932 if (sysmem_mapping(vm, mm)) {
933 933 if (maxaddr < mm->gpa + mm->len)
934 934 maxaddr = mm->gpa + mm->len;
935 935 }
936 936 }
937 937 return (maxaddr);
938 938 }
939 939
940 940 static void
941 941 vm_iommu_modify(struct vm *vm, bool map)
942 942 {
943 943 int i, sz;
944 944 vm_paddr_t gpa, hpa;
945 945 struct mem_map *mm;
946 946 #ifdef __FreeBSD__
947 947 void *vp, *cookie, *host_domain;
948 948 #else
949 949 void *vp, *cookie, *host_domain __unused;
950 950 #endif
951 951
952 952 sz = PAGE_SIZE;
953 953 host_domain = iommu_host_domain();
954 954
955 955 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
956 956 mm = &vm->mem_maps[i];
957 957 if (!sysmem_mapping(vm, mm))
958 958 continue;
959 959
960 960 if (map) {
961 961 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
962 962 ("iommu map found invalid memmap %lx/%lx/%x",
963 963 mm->gpa, mm->len, mm->flags));
964 964 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
965 965 continue;
966 966 mm->flags |= VM_MEMMAP_F_IOMMU;
967 967 } else {
968 968 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
969 969 continue;
970 970 mm->flags &= ~VM_MEMMAP_F_IOMMU;
971 971 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
972 972 ("iommu unmap found invalid memmap %lx/%lx/%x",
973 973 mm->gpa, mm->len, mm->flags));
974 974 }
975 975
976 976 gpa = mm->gpa;
977 977 while (gpa < mm->gpa + mm->len) {
978 978 vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, PROT_WRITE,
979 979 &cookie);
980 980 KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
981 981 vm_name(vm), gpa));
982 982
983 983 vm_gpa_release(cookie);
984 984
985 985 hpa = DMAP_TO_PHYS((uintptr_t)vp);
986 986 if (map) {
987 987 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
988 988 #ifdef __FreeBSD__
989 989 iommu_remove_mapping(host_domain, hpa, sz);
990 990 #endif
991 991 } else {
992 992 iommu_remove_mapping(vm->iommu, gpa, sz);
993 993 #ifdef __FreeBSD__
994 994 iommu_create_mapping(host_domain, hpa, hpa, sz);
995 995 #endif
996 996 }
997 997
998 998 gpa += PAGE_SIZE;
999 999 }
1000 1000 }
1001 1001
1002 1002 /*
1003 1003 * Invalidate the cached translations associated with the domain
1004 1004 * from which pages were removed.
1005 1005 */
1006 1006 #ifdef __FreeBSD__
1007 1007 if (map)
1008 1008 iommu_invalidate_tlb(host_domain);
1009 1009 else
1010 1010 iommu_invalidate_tlb(vm->iommu);
1011 1011 #else
1012 1012 iommu_invalidate_tlb(vm->iommu);
1013 1013 #endif
1014 1014 }
1015 1015
1016 1016 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false)
1017 1017 #define vm_iommu_map(vm) vm_iommu_modify((vm), true)
1018 1018
1019 1019 int
1020 1020 vm_unassign_pptdev(struct vm *vm, int pptfd)
1021 1021 {
1022 1022 int error;
1023 1023
1024 1024 error = ppt_unassign_device(vm, pptfd);
1025 1025 if (error)
1026 1026 return (error);
1027 1027
1028 1028 if (ppt_assigned_devices(vm) == 0)
1029 1029 vm_iommu_unmap(vm);
1030 1030
1031 1031 return (0);
1032 1032 }
1033 1033
1034 1034 int
1035 1035 vm_assign_pptdev(struct vm *vm, int pptfd)
1036 1036 {
1037 1037 int error;
1038 1038 vm_paddr_t maxaddr;
1039 1039
1040 1040 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1041 1041 if (ppt_assigned_devices(vm) == 0) {
1042 1042 KASSERT(vm->iommu == NULL,
1043 1043 ("vm_assign_pptdev: iommu must be NULL"));
1044 1044 maxaddr = vmm_sysmem_maxaddr(vm);
1045 1045 vm->iommu = iommu_create_domain(maxaddr);
1046 1046 if (vm->iommu == NULL)
1047 1047 return (ENXIO);
1048 1048 vm_iommu_map(vm);
1049 1049 }
1050 1050
1051 1051 error = ppt_assign_device(vm, pptfd);
1052 1052 return (error);
1053 1053 }
1054 1054
1055 1055 void *
1056 1056 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
1057 1057 void **cookie)
1058 1058 {
1059 1059 int i, count, pageoff;
1060 1060 struct mem_map *mm;
1061 1061 vm_page_t m;
1062 1062 #ifdef INVARIANTS
1063 1063 /*
1064 1064 * All vcpus are frozen by ioctls that modify the memory map
1065 1065 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
1066 1066 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
1067 1067 */
1068 1068 int state;
1069 1069 KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
1070 1070 __func__, vcpuid));
1071 1071 for (i = 0; i < vm->maxcpus; i++) {
1072 1072 if (vcpuid != -1 && vcpuid != i)
1073 1073 continue;
1074 1074 state = vcpu_get_state(vm, i, NULL);
1075 1075 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1076 1076 __func__, state));
1077 1077 }
1078 1078 #endif
1079 1079 pageoff = gpa & PAGE_MASK;
1080 1080 if (len > PAGE_SIZE - pageoff)
1081 1081 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1082 1082
1083 1083 count = 0;
1084 1084 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1085 1085 mm = &vm->mem_maps[i];
1086 1086 if (mm->len == 0) {
1087 1087 continue;
1088 1088 }
1089 1089 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1090 1090 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1091 1091 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1092 1092 break;
1093 1093 }
1094 1094 }
1095 1095
1096 1096 if (count == 1) {
1097 1097 *cookie = m;
1098 1098 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1099 1099 } else {
1100 1100 *cookie = NULL;
1101 1101 return (NULL);
1102 1102 }
1103 1103 }
1104 1104
1105 1105 void
1106 1106 vm_gpa_release(void *cookie)
1107 1107 {
1108 1108 vm_page_t m = cookie;
1109 1109
1110 1110 vm_page_unwire(m, PQ_ACTIVE);
1111 1111 }
1112 1112
1113 1113 int
1114 1114 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1115 1115 {
1116 1116
1117 1117 if (vcpu < 0 || vcpu >= vm->maxcpus)
1118 1118 return (EINVAL);
1119 1119
1120 1120 if (reg >= VM_REG_LAST)
1121 1121 return (EINVAL);
1122 1122
1123 1123 return (VMGETREG(vm->cookie, vcpu, reg, retval));
1124 1124 }
1125 1125
1126 1126 int
1127 1127 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1128 1128 {
1129 1129 struct vcpu *vcpu;
1130 1130 int error;
1131 1131
1132 1132 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1133 1133 return (EINVAL);
1134 1134
1135 1135 if (reg >= VM_REG_LAST)
1136 1136 return (EINVAL);
1137 1137
1138 1138 error = VMSETREG(vm->cookie, vcpuid, reg, val);
1139 1139 if (error || reg != VM_REG_GUEST_RIP)
1140 1140 return (error);
1141 1141
1142 1142 /* Set 'nextrip' to match the value of %rip */
1143 1143 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1144 1144 vcpu = &vm->vcpu[vcpuid];
1145 1145 vcpu->nextrip = val;
1146 1146 return (0);
1147 1147 }
1148 1148
1149 1149 static bool
1150 1150 is_descriptor_table(int reg)
1151 1151 {
1152 1152 switch (reg) {
1153 1153 case VM_REG_GUEST_IDTR:
1154 1154 case VM_REG_GUEST_GDTR:
1155 1155 return (true);
1156 1156 default:
1157 1157 return (false);
1158 1158 }
1159 1159 }
1160 1160
1161 1161 static bool
1162 1162 is_segment_register(int reg)
1163 1163 {
1164 1164 switch (reg) {
1165 1165 case VM_REG_GUEST_ES:
1166 1166 case VM_REG_GUEST_CS:
1167 1167 case VM_REG_GUEST_SS:
1168 1168 case VM_REG_GUEST_DS:
1169 1169 case VM_REG_GUEST_FS:
1170 1170 case VM_REG_GUEST_GS:
1171 1171 case VM_REG_GUEST_TR:
1172 1172 case VM_REG_GUEST_LDTR:
1173 1173 return (true);
1174 1174 default:
1175 1175 return (false);
1176 1176 }
1177 1177 }
1178 1178
1179 1179 int
1180 1180 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1181 1181 {
1182 1182
1183 1183 if (vcpu < 0 || vcpu >= vm->maxcpus)
1184 1184 return (EINVAL);
1185 1185
1186 1186 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1187 1187 return (EINVAL);
1188 1188
1189 1189 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1190 1190 }
1191 1191
1192 1192 int
1193 1193 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1194 1194 {
1195 1195 if (vcpu < 0 || vcpu >= vm->maxcpus)
1196 1196 return (EINVAL);
1197 1197
1198 1198 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1199 1199 return (EINVAL);
1200 1200
1201 1201 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1202 1202 }
1203 1203
1204 1204 int
1205 1205 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1206 1206 {
1207 1207 struct vcpu *vcpu;
1208 1208
1209 1209 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1210 1210 return (EINVAL);
1211 1211 }
1212 1212
1213 1213 vcpu = &vm->vcpu[vcpuid];
1214 1214
1215 1215 vcpu_lock(vcpu);
1216 1216 *state = vcpu->run_state;
1217 1217 *sipi_vec = vcpu->sipi_vector;
1218 1218 vcpu_unlock(vcpu);
1219 1219
1220 1220 return (0);
1221 1221 }
1222 1222
1223 1223 int
1224 1224 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1225 1225 {
1226 1226 struct vcpu *vcpu;
1227 1227
1228 1228 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1229 1229 return (EINVAL);
1230 1230 }
1231 1231 if (!VRS_IS_VALID(state)) {
1232 1232 return (EINVAL);
1233 1233 }
1234 1234
1235 1235 vcpu = &vm->vcpu[vcpuid];
1236 1236
1237 1237 vcpu_lock(vcpu);
1238 1238 vcpu->run_state = state;
1239 1239 vcpu->sipi_vector = sipi_vec;
1240 1240 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1241 1241 vcpu_unlock(vcpu);
1242 1242
1243 1243 return (0);
1244 1244 }
1245 1245
1246 1246
1247 1247 static void
1248 1248 restore_guest_fpustate(struct vcpu *vcpu)
1249 1249 {
1250 1250
1251 1251 /* flush host state to the pcb */
1252 1252 fpuexit(curthread);
1253 1253
1254 1254 /* restore guest FPU state */
1255 1255 fpu_stop_emulating();
1256 1256 fpurestore(vcpu->guestfpu);
1257 1257
1258 1258 /* restore guest XCR0 if XSAVE is enabled in the host */
1259 1259 if (rcr4() & CR4_XSAVE)
1260 1260 load_xcr(0, vcpu->guest_xcr0);
1261 1261
1262 1262 /*
1263 1263 * The FPU is now "dirty" with the guest's state so turn on emulation
1264 1264 * to trap any access to the FPU by the host.
1265 1265 */
1266 1266 fpu_start_emulating();
1267 1267 }
1268 1268
1269 1269 static void
1270 1270 save_guest_fpustate(struct vcpu *vcpu)
1271 1271 {
1272 1272
1273 1273 if ((rcr0() & CR0_TS) == 0)
1274 1274 panic("fpu emulation not enabled in host!");
1275 1275
1276 1276 /* save guest XCR0 and restore host XCR0 */
1277 1277 if (rcr4() & CR4_XSAVE) {
1278 1278 vcpu->guest_xcr0 = rxcr(0);
1279 1279 load_xcr(0, vmm_get_host_xcr0());
1280 1280 }
1281 1281
1282 1282 /* save guest FPU state */
1283 1283 fpu_stop_emulating();
1284 1284 fpusave(vcpu->guestfpu);
1285 1285 /*
1286 1286 * When the host state has been restored, we should not re-enable
1287 1287 * CR0.TS on illumos for eager FPU.
1288 1288 */
1289 1289 }
1290 1290
1291 1291 static int
1292 1292 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1293 1293 bool from_idle)
1294 1294 {
1295 1295 struct vcpu *vcpu;
1296 1296 int error;
1297 1297
1298 1298 vcpu = &vm->vcpu[vcpuid];
1299 1299 vcpu_assert_locked(vcpu);
1300 1300
1301 1301 /*
1302 1302 * State transitions from the vmmdev_ioctl() must always begin from
1303 1303 * the VCPU_IDLE state. This guarantees that there is only a single
1304 1304 * ioctl() operating on a vcpu at any point.
1305 1305 */
1306 1306 if (from_idle) {
1307 1307 while (vcpu->state != VCPU_IDLE) {
1308 1308 vcpu->reqidle = 1;
1309 1309 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1310 1310 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1311 1311 "idle requested", vcpu_state2str(vcpu->state));
1312 1312 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1313 1313 }
1314 1314 } else {
1315 1315 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1316 1316 "vcpu idle state"));
1317 1317 }
1318 1318
1319 1319 if (vcpu->state == VCPU_RUNNING) {
1320 1320 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1321 1321 "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1322 1322 } else {
1323 1323 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1324 1324 "vcpu that is not running", vcpu->hostcpu));
1325 1325 }
1326 1326
1327 1327 /*
1328 1328 * The following state transitions are allowed:
1329 1329 * IDLE -> FROZEN -> IDLE
1330 1330 * FROZEN -> RUNNING -> FROZEN
1331 1331 * FROZEN -> SLEEPING -> FROZEN
1332 1332 */
1333 1333 switch (vcpu->state) {
1334 1334 case VCPU_IDLE:
1335 1335 case VCPU_RUNNING:
1336 1336 case VCPU_SLEEPING:
1337 1337 error = (newstate != VCPU_FROZEN);
1338 1338 break;
1339 1339 case VCPU_FROZEN:
1340 1340 error = (newstate == VCPU_FROZEN);
1341 1341 break;
1342 1342 default:
1343 1343 error = 1;
1344 1344 break;
1345 1345 }
1346 1346
1347 1347 if (error)
1348 1348 return (EBUSY);
1349 1349
1350 1350 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1351 1351 vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1352 1352
1353 1353 vcpu->state = newstate;
1354 1354 if (newstate == VCPU_RUNNING)
1355 1355 vcpu->hostcpu = curcpu;
1356 1356 else
1357 1357 vcpu->hostcpu = NOCPU;
1358 1358
1359 1359 if (newstate == VCPU_IDLE) {
1360 1360 cv_broadcast(&vcpu->state_cv);
1361 1361 }
1362 1362
1363 1363 return (0);
1364 1364 }
1365 1365
1366 1366 static void
1367 1367 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1368 1368 {
1369 1369 int error;
1370 1370
1371 1371 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1372 1372 panic("Error %d setting state to %d\n", error, newstate);
1373 1373 }
1374 1374
1375 1375 static void
1376 1376 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1377 1377 {
1378 1378 int error;
1379 1379
1380 1380 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1381 1381 panic("Error %d setting state to %d", error, newstate);
1382 1382 }
1383 1383
1384 1384 /*
1385 1385 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1386 1386 */
1387 1387 static int
1388 1388 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1389 1389 {
1390 1390 struct vcpu *vcpu;
1391 1391 int vcpu_halted, vm_halted;
1392 1392 bool userspace_exit = false;
1393 1393
1394 1394 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1395 1395
1396 1396 vcpu = &vm->vcpu[vcpuid];
1397 1397 vcpu_halted = 0;
1398 1398 vm_halted = 0;
1399 1399
1400 1400 vcpu_lock(vcpu);
1401 1401 while (1) {
1402 1402 /*
1403 1403 * Do a final check for pending interrupts (including NMI and
1404 1404 * INIT) before putting this thread to sleep.
1405 1405 */
1406 1406 if (vm_nmi_pending(vm, vcpuid))
1407 1407 break;
1408 1408 if (vcpu_run_state_pending(vm, vcpuid))
1409 1409 break;
1410 1410 if (!intr_disabled) {
1411 1411 if (vm_extint_pending(vm, vcpuid) ||
1412 1412 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1413 1413 break;
1414 1414 }
1415 1415 }
1416 1416
1417 1417 /*
1418 1418 * Also check for software events which would cause a wake-up.
1419 1419 * This will set the appropriate exitcode directly, rather than
1420 1420 * requiring a trip through VM_RUN().
1421 1421 */
1422 1422 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1423 1423 userspace_exit = true;
1424 1424 break;
1425 1425 }
1426 1426
1427 1427 /*
1428 1428 * Some Linux guests implement "halt" by having all vcpus
1429 1429 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1430 1430 * track of the vcpus that have entered this state. When all
1431 1431 * vcpus enter the halted state the virtual machine is halted.
1432 1432 */
1433 1433 if (intr_disabled) {
1434 1434 if (!vcpu_halted && halt_detection_enabled) {
1435 1435 vcpu_halted = 1;
1436 1436 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1437 1437 }
1438 1438 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1439 1439 vm_halted = 1;
1440 1440 break;
1441 1441 }
1442 1442 }
1443 1443
1444 1444 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1445 1445 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1446 1446 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1447 1447 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1448 1448 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1449 1449 }
1450 1450
1451 1451 if (vcpu_halted)
1452 1452 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1453 1453
1454 1454 vcpu_unlock(vcpu);
1455 1455
1456 1456 if (vm_halted)
1457 1457 vm_suspend(vm, VM_SUSPEND_HALT);
1458 1458
1459 1459 return (userspace_exit ? -1 : 0);
1460 1460 }
1461 1461
1462 1462 static int
1463 1463 vm_handle_paging(struct vm *vm, int vcpuid)
1464 1464 {
1465 1465 int rv, ftype;
1466 1466 struct vm_map *map;
1467 1467 struct vcpu *vcpu;
1468 1468 struct vm_exit *vme;
1469 1469
1470 1470 vcpu = &vm->vcpu[vcpuid];
1471 1471 vme = &vcpu->exitinfo;
1472 1472
1473 1473 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1474 1474 __func__, vme->inst_length));
1475 1475
1476 1476 ftype = vme->u.paging.fault_type;
1477 1477 KASSERT(ftype == PROT_READ ||
1478 1478 ftype == PROT_WRITE || ftype == PROT_EXEC,
1479 1479 ("vm_handle_paging: invalid fault_type %d", ftype));
1480 1480
1481 1481 if (ftype == PROT_READ || ftype == PROT_WRITE) {
1482 1482 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1483 1483 vme->u.paging.gpa, ftype);
1484 1484 if (rv == 0) {
1485 1485 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
1486 1486 ftype == PROT_READ ? "accessed" : "dirty",
1487 1487 vme->u.paging.gpa);
1488 1488 goto done;
1489 1489 }
1490 1490 }
1491 1491
1492 1492 map = &vm->vmspace->vm_map;
1493 1493 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1494 1494
1495 1495 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1496 1496 "ftype = %d", rv, vme->u.paging.gpa, ftype);
1497 1497
1498 1498 if (rv != 0)
1499 1499 return (EFAULT);
1500 1500 done:
1501 1501 return (0);
1502 1502 }
1503 1503
1504 1504 int
1505 1505 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1506 1506 int rsize)
1507 1507 {
1508 1508 int err = ESRCH;
1509 1509
1510 1510 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1511 1511 err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1512 1512 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1513 1513 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1514 1514 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1515 1515 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1516 1516 }
1517 1517
1518 1518 return (err);
1519 1519 }
1520 1520
1521 1521 int
1522 1522 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1523 1523 int wsize)
1524 1524 {
1525 1525 int err = ESRCH;
1526 1526
1527 1527 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1528 1528 err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1529 1529 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1530 1530 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1531 1531 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1532 1532 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1533 1533 }
1534 1534
1535 1535 return (err);
1536 1536 }
1537 1537
1538 1538 static int
1539 1539 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1540 1540 {
1541 1541 struct vie *vie;
1542 1542 struct vcpu *vcpu;
1543 1543 struct vm_exit *vme;
1544 1544 uint64_t inst_addr;
1545 1545 int error, fault, cs_d;
1546 1546
1547 1547 vcpu = &vm->vcpu[vcpuid];
1548 1548 vme = &vcpu->exitinfo;
1549 1549 vie = vcpu->vie_ctx;
1550 1550
1551 1551 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1552 1552 __func__, vme->inst_length));
1553 1553
1554 1554 inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1555 1555 cs_d = vme->u.mmio_emul.cs_d;
1556 1556
1557 1557 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1558 1558 vme->u.mmio_emul.gpa);
1559 1559
1560 1560 /* Fetch the faulting instruction */
1561 1561 if (vie_needs_fetch(vie)) {
1562 1562 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1563 1563 &fault);
1564 1564 if (error != 0) {
1565 1565 return (error);
1566 1566 } else if (fault) {
1567 1567 /*
1568 1568 * If a fault during instruction fetch was encountered,
1569 1569 * it will have asserted that the appropriate exception
1570 1570 * be injected at next entry.
1571 1571 * No further work is required.
1572 1572 */
1573 1573 return (0);
1574 1574 }
1575 1575 }
1576 1576
1577 1577 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1578 1578 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1579 1579 inst_addr);
1580 1580 /* Dump (unrecognized) instruction bytes in userspace */
1581 1581 vie_fallback_exitinfo(vie, vme);
1582 1582 return (-1);
1583 1583 }
1584 1584 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1585 1585 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1586 1586 /* Decoded GLA does not match GLA from VM exit state */
1587 1587 vie_fallback_exitinfo(vie, vme);
1588 1588 return (-1);
1589 1589 }
1590 1590
1591 1591 repeat:
1592 1592 error = vie_emulate_mmio(vie, vm, vcpuid);
1593 1593 if (error < 0) {
1594 1594 /*
1595 1595 * MMIO not handled by any of the in-kernel-emulated devices, so
1596 1596 * make a trip out to userspace for it.
1597 1597 */
1598 1598 vie_exitinfo(vie, vme);
1599 1599 } else if (error == EAGAIN) {
1600 1600 /*
1601 1601 * Continue emulating the rep-prefixed instruction, which has
1602 1602 * not completed its iterations.
1603 1603 *
1604 1604 * In case this can be emulated in-kernel and has a high
1605 1605 * repetition count (causing a tight spin), it should be
1606 1606 * deferential to yield conditions.
1607 1607 */
1608 1608 if (!vcpu_should_yield(vm, vcpuid)) {
1609 1609 goto repeat;
1610 1610 } else {
1611 1611 /*
1612 1612 * Defer to the contending load by making a trip to
1613 1613 * userspace with a no-op (BOGUS) exit reason.
1614 1614 */
1615 1615 vie_reset(vie);
1616 1616 vme->exitcode = VM_EXITCODE_BOGUS;
1617 1617 return (-1);
1618 1618 }
1619 1619 } else if (error == 0) {
1620 1620 /* Update %rip now that instruction has been emulated */
1621 1621 vie_advance_pc(vie, &vcpu->nextrip);
1622 1622 }
1623 1623 return (error);
1624 1624 }
1625 1625
1626 1626 static int
1627 1627 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1628 1628 {
1629 1629 struct vcpu *vcpu;
1630 1630 struct vie *vie;
1631 1631 int err;
1632 1632
1633 1633 vcpu = &vm->vcpu[vcpuid];
1634 1634 vie = vcpu->vie_ctx;
1635 1635
1636 1636 repeat:
1637 1637 err = vie_emulate_inout(vie, vm, vcpuid);
1638 1638
1639 1639 if (err < 0) {
1640 1640 /*
1641 1641 * In/out not handled by any of the in-kernel-emulated devices,
1642 1642 * so make a trip out to userspace for it.
1643 1643 */
1644 1644 vie_exitinfo(vie, vme);
1645 1645 return (err);
1646 1646 } else if (err == EAGAIN) {
1647 1647 /*
1648 1648 * Continue emulating the rep-prefixed ins/outs, which has not
1649 1649 * completed its iterations.
1650 1650 *
1651 1651 * In case this can be emulated in-kernel and has a high
1652 1652 * repetition count (causing a tight spin), it should be
1653 1653 * deferential to yield conditions.
1654 1654 */
1655 1655 if (!vcpu_should_yield(vm, vcpuid)) {
1656 1656 goto repeat;
1657 1657 } else {
1658 1658 /*
1659 1659 * Defer to the contending load by making a trip to
1660 1660 * userspace with a no-op (BOGUS) exit reason.
1661 1661 */
1662 1662 vie_reset(vie);
1663 1663 vme->exitcode = VM_EXITCODE_BOGUS;
1664 1664 return (-1);
1665 1665 }
1666 1666 } else if (err != 0) {
1667 1667 /* Emulation failure. Bail all the way out to userspace. */
1668 1668 vme->exitcode = VM_EXITCODE_INST_EMUL;
1669 1669 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1670 1670 return (-1);
1671 1671 }
1672 1672
1673 1673 vie_advance_pc(vie, &vcpu->nextrip);
1674 1674 return (0);
1675 1675 }
1676 1676
1677 1677 static int
1678 1678 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1679 1679 {
1680 1680 struct vie *vie;
1681 1681 struct vcpu *vcpu;
1682 1682 struct vm_exit *vme;
1683 1683 uint64_t cs_base;
1684 1684 int error, fault, cs_d;
1685 1685
1686 1686 vcpu = &vm->vcpu[vcpuid];
1687 1687 vme = &vcpu->exitinfo;
1688 1688 vie = vcpu->vie_ctx;
1689 1689
1690 1690 vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1691 1691
1692 1692 /* Fetch the faulting instruction */
1693 1693 ASSERT(vie_needs_fetch(vie));
1694 1694 error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1695 1695 &fault);
1696 1696 if (error != 0) {
1697 1697 return (error);
1698 1698 } else if (fault) {
1699 1699 /*
1700 1700 * If a fault during instruction fetch was encountered, it will
1701 1701 * have asserted that the appropriate exception be injected at
1702 1702 * next entry. No further work is required.
1703 1703 */
1704 1704 return (0);
1705 1705 }
1706 1706
1707 1707 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1708 1708 /* Dump (unrecognized) instruction bytes in userspace */
1709 1709 vie_fallback_exitinfo(vie, vme);
1710 1710 return (-1);
1711 1711 }
1712 1712
1713 1713 error = vie_emulate_other(vie, vm, vcpuid);
1714 1714 if (error != 0) {
1715 1715 /*
1716 1716 * Instruction emulation was unable to complete successfully, so
1717 1717 * kick it out to userspace for handling.
1718 1718 */
1719 1719 vie_fallback_exitinfo(vie, vme);
1720 1720 } else {
1721 1721 /* Update %rip now that instruction has been emulated */
1722 1722 vie_advance_pc(vie, &vcpu->nextrip);
1723 1723 }
1724 1724 return (error);
1725 1725 }
1726 1726
1727 1727 static int
1728 1728 vm_handle_suspend(struct vm *vm, int vcpuid)
1729 1729 {
1730 1730 int i;
1731 1731 struct vcpu *vcpu;
1732 1732
1733 1733 vcpu = &vm->vcpu[vcpuid];
1734 1734
1735 1735 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1736 1736
1737 1737 /*
1738 1738 * Wait until all 'active_cpus' have suspended themselves.
1739 1739 */
1740 1740 vcpu_lock(vcpu);
1741 1741 vcpu_ustate_change(vm, vcpuid, VU_INIT);
1742 1742 while (1) {
1743 1743 int rc;
1744 1744
1745 1745 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1746 1746 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1747 1747 break;
1748 1748 }
1749 1749
1750 1750 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1751 1751 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1752 1752 TR_CLOCK_TICK);
1753 1753 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1754 1754
1755 1755 /*
1756 1756 * If the userspace process driving the instance is killed, any
1757 1757 * vCPUs yet to be marked suspended (because they are not
1758 1758 * VM_RUN-ing in the kernel presently) will never reach that
1759 1759 * state.
1760 1760 *
1761 1761 * To avoid vm_handle_suspend() getting stuck in the kernel
1762 1762 * waiting for those vCPUs, offer a bail-out even though it
1763 1763 * means returning without all vCPUs in a suspended state.
1764 1764 */
1765 1765 if (rc <= 0) {
1766 1766 if ((curproc->p_flag & SEXITING) != 0) {
1767 1767 break;
1768 1768 }
1769 1769 }
1770 1770 }
1771 1771 vcpu_unlock(vcpu);
1772 1772
1773 1773 /*
1774 1774 * Wakeup the other sleeping vcpus and return to userspace.
1775 1775 */
1776 1776 for (i = 0; i < vm->maxcpus; i++) {
1777 1777 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1778 1778 vcpu_notify_event(vm, i);
1779 1779 }
1780 1780 }
1781 1781
1782 1782 return (-1);
1783 1783 }
1784 1784
1785 1785 static int
1786 1786 vm_handle_reqidle(struct vm *vm, int vcpuid)
1787 1787 {
1788 1788 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1789 1789
1790 1790 vcpu_lock(vcpu);
1791 1791 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1792 1792 vcpu->reqidle = 0;
1793 1793 vcpu_unlock(vcpu);
1794 1794 return (-1);
1795 1795 }
1796 1796
1797 1797 static int
1798 1798 vm_handle_run_state(struct vm *vm, int vcpuid)
1799 1799 {
1800 1800 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1801 1801 bool handled = false;
1802 1802
1803 1803 vcpu_lock(vcpu);
1804 1804 while (1) {
1805 1805 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1806 1806 vcpu_unlock(vcpu);
1807 1807 VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1808 1808 vcpu_lock(vcpu);
1809 1809
1810 1810 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1811 1811 vcpu->run_state |= VRS_INIT;
1812 1812 }
1813 1813
1814 1814 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1815 1815 (VRS_INIT | VRS_PEND_SIPI)) {
1816 1816 const uint8_t vector = vcpu->sipi_vector;
1817 1817
1818 1818 vcpu_unlock(vcpu);
1819 1819 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1820 1820 vcpu_lock(vcpu);
1821 1821
1822 1822 vcpu->run_state &= ~VRS_PEND_SIPI;
1823 1823 vcpu->run_state |= VRS_RUN;
1824 1824 }
1825 1825
1826 1826 /*
1827 1827 * If the vCPU is now in the running state, there is no need to
1828 1828 * wait for anything prior to re-entry.
1829 1829 */
1830 1830 if ((vcpu->run_state & VRS_RUN) != 0) {
1831 1831 handled = true;
1832 1832 break;
1833 1833 }
1834 1834
1835 1835 /*
1836 1836 * Also check for software events which would cause a wake-up.
1837 1837 * This will set the appropriate exitcode directly, rather than
1838 1838 * requiring a trip through VM_RUN().
1839 1839 */
1840 1840 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1841 1841 break;
1842 1842 }
1843 1843
1844 1844 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1845 1845 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1846 1846 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1847 1847 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1848 1848 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1849 1849 }
1850 1850 vcpu_unlock(vcpu);
1851 1851
1852 1852 return (handled ? 0 : -1);
1853 1853 }
1854 1854
1855 1855 static int
1856 1856 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1857 1857 {
1858 1858 const uint32_t code = vme->u.msr.code;
1859 1859 uint64_t val = 0;
1860 1860
1861 1861 switch (code) {
1862 1862 case MSR_MCG_CAP:
1863 1863 case MSR_MCG_STATUS:
1864 1864 val = 0;
1865 1865 break;
1866 1866
1867 1867 case MSR_MTRRcap:
1868 1868 case MSR_MTRRdefType:
1869 1869 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
1870 1870 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1871 1871 case MSR_MTRR64kBase:
1872 1872 val = 0;
1873 1873 break;
1874 1874
1875 1875 case MSR_TSC:
1876 1876 /*
1877 1877 * In all likelihood, this should always be handled in guest
1878 1878 * context by VMX/SVM rather than taking an exit. (Both VMX and
1879 1879 * SVM pass through read-only access to MSR_TSC to the guest.)
1880 1880 *
1881 1881 * No physical offset is requested of vcpu_tsc_offset() since
1882 1882 * rdtsc_offset() takes care of that instead.
1883 1883 */
1884 1884 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
1885 1885 break;
1886 1886
1887 1887 default:
1888 1888 /*
1889 1889 * Anything not handled at this point will be kicked out to
1890 1890 * userspace for attempted processing there.
1891 1891 */
1892 1892 return (-1);
1893 1893 }
1894 1894
1895 1895 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
1896 1896 val & 0xffffffff));
1897 1897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
1898 1898 val >> 32));
1899 1899 return (0);
1900 1900 }
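
The pair of vm_set_register() calls above follow the usual RDMSR convention of returning the 64-bit value through EDX:EAX. A minimal standalone sketch of that split (the value is made up; this is illustration only, not part of the change under review):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t val = 0x0123456789abcdefULL;   /* value an MSR handler produced */

        /* RDMSR convention: low 32 bits go to %eax, high 32 bits to %edx */
        uint32_t eax = (uint32_t)(val & 0xffffffff);
        uint32_t edx = (uint32_t)(val >> 32);

        (void) printf("eax=%08x edx=%08x\n", (unsigned)eax, (unsigned)edx);
        return (0);
    }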
1901 1901
1902 1902 static int
1903 1903 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1904 1904 {
1905 1905 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1906 1906 const uint32_t code = vme->u.msr.code;
1907 1907 const uint64_t val = vme->u.msr.wval;
1908 1908
1909 1909 switch (code) {
1910 1910 case MSR_MCG_CAP:
1911 1911 case MSR_MCG_STATUS:
1912 1912 /* Ignore writes */
1913 1913 break;
1914 1914
1915 1915 case MSR_MTRRcap:
1916 1916 vm_inject_gp(vm, vcpuid);
1917 1917 break;
1918 1918 case MSR_MTRRdefType:
1919 1919 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
1920 1920 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1921 1921 case MSR_MTRR64kBase:
1922 1922 /* Ignore writes */
1923 1923 break;
1924 1924
1925 1925 case MSR_TSC:
1926 1926 /*
1927 1927 * The effect of writing the TSC MSR is that a subsequent read
1928 1928 * of the TSC would report that value written (plus any time
1929 1929 * elapsed between the write and the read). The guest TSC value
1930 1930 * is calculated from a global offset for the guest (which
1931 1931 * effectively makes its TSC read 0 at guest boot) and a
1932 1932 * per-vCPU offset to handle these writes to the MSR.
1933 1933 *
1934 1934 * To calculate that per-vCPU offset, we can work backwards from
1935 1935 * the guest value at the time of write:
1936 1936 *
1937 1937 * value = host TSC + VM boot offset + vCPU offset
1938 1938 *
1939 1939 * so therefore:
1940 1940 *
1941 1941 * value - host TSC - VM boot offset = vCPU offset
1942 1942 */
1943 1943 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
1944 1944 break;
1945 1945
1946 1946 default:
1947 1947 /*
1948 1948 * Anything not handled at this point will be kicked out to
1949 1949 * userspace for attempted processing there.
1950 1950 */
1951 1951 return (-1);
1952 1952 }
1953 1953
1954 1954 return (0);
1955 1955 }
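
The MSR_TSC comment above derives the per-vCPU offset by rearranging "value = host TSC + VM boot offset + vCPU offset". A hedged, standalone sketch of that arithmetic with purely illustrative numbers (all values invented for the example):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Hypothetical values, chosen only to show the arithmetic. */
        uint64_t host_tsc = 1000000;                    /* stands in for rdtsc_offset() */
        uint64_t boot_offset = (uint64_t)-1000000;      /* makes the guest TSC read 0 at boot */
        uint64_t wval = 5000;                           /* value the guest wrote to the TSC MSR */

        /* value = host TSC + boot offset + vCPU offset  =>  solve for the vCPU offset */
        uint64_t vcpu_offset = wval - boot_offset - host_tsc;

        /* A subsequent guest read then observes (approximately) the written value. */
        uint64_t guest_tsc = host_tsc + boot_offset + vcpu_offset;
        (void) printf("guest TSC after write: %llu\n",
            (unsigned long long)guest_tsc);
        return (0);
    }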
1956 1956
1957 1957 int
1958 1958 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1959 1959 {
1960 1960 int i;
1961 1961
1962 1962 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1963 1963 return (EINVAL);
1964 1964
1965 1965 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1966 1966 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1967 1967 vm->suspend, how);
1968 1968 return (EALREADY);
1969 1969 }
1970 1970
1971 1971 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1972 1972
1973 1973 /*
1974 1974 * Notify all active vcpus that they are now suspended.
1975 1975 */
1976 1976 for (i = 0; i < vm->maxcpus; i++) {
1977 1977 if (CPU_ISSET(i, &vm->active_cpus))
1978 1978 vcpu_notify_event(vm, i);
1979 1979 }
1980 1980
1981 1981 return (0);
1982 1982 }
1983 1983
1984 1984 void
1985 1985 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1986 1986 {
1987 1987 struct vm_exit *vmexit;
1988 1988
1989 1989 vmexit = vm_exitinfo(vm, vcpuid);
1990 1990 vmexit->rip = rip;
1991 1991 vmexit->inst_length = 0;
1992 1992 vmexit->exitcode = VM_EXITCODE_RUN_STATE;
1993 1993 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
1994 1994 }
1995 1995
1996 1996 /*
1997 1997 * Some vmm resources, such as the lapic, may have CPU-specific allocations
1998 1998 * which would benefit from migration onto the host CPU that is processing
1999 1999 * the vcpu state.
2000 2000 */
2001 2001 static void
2002 2002 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2003 2003 {
2004 2004 /*
2005 2005 * Localizing cyclic resources requires acquisition of cpu_lock, and
2006 2006 * doing so with kpreempt disabled is a recipe for deadlock disaster.
2007 2007 */
2008 2008 VERIFY(curthread->t_preempt == 0);
2009 2009
2010 2010 /*
2011 2011 * Do not bother with localization if this vCPU is about to return to
2012 2012 * the host CPU it was last localized to.
2013 2013 */
2014 2014 if (vcpu->lastloccpu == curcpu)
2015 2015 return;
2016 2016
2017 2017 /*
2018 2018 * Localize system-wide resources to the primary boot vCPU. While any
2019 2019 * of the other vCPUs may access them, it keeps the potential interrupt
2020 2020 * footprint constrained to CPUs involved with this instance.
2021 2021 */
2022 2022 if (vcpu == &vm->vcpu[0]) {
2023 2023 vhpet_localize_resources(vm->vhpet);
2024 2024 vrtc_localize_resources(vm->vrtc);
2025 2025 vatpit_localize_resources(vm->vatpit);
2026 2026 }
2027 2027
2028 2028 vlapic_localize_resources(vcpu->vlapic);
2029 2029
2030 2030 vcpu->lastloccpu = curcpu;
2031 2031 }
2032 2032
2033 2033 static void
2034 2034 vmm_savectx(void *arg)
2035 2035 {
2036 2036 vm_thread_ctx_t *vtc = arg;
2037 2037 struct vm *vm = vtc->vtc_vm;
2038 2038 const int vcpuid = vtc->vtc_vcpuid;
2039 2039
2040 2040 if (ops->vmsavectx != NULL) {
2041 2041 ops->vmsavectx(vm->cookie, vcpuid);
2042 2042 }
2043 2043
2044 2044 /*
2045 2045 * Account for going off-cpu, unless the vCPU is idled, in which case
2046 2046 * being off-cpu is the whole point.
2047 2047 */
2048 2048 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2049 2049 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2050 2050 vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2051 2051 }
2052 2052
2053 2053 /*
2054 2054 * If the CPU holds the restored guest FPU state, save it and restore
2055 2055 * the host FPU state before this thread goes off-cpu.
2056 2056 */
2057 2057 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2058 2058 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2059 2059
2060 2060 save_guest_fpustate(vcpu);
2061 2061 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2062 2062 }
2063 2063 }
2064 2064
2065 2065 static void
2066 2066 vmm_restorectx(void *arg)
2067 2067 {
2068 2068 vm_thread_ctx_t *vtc = arg;
2069 2069 struct vm *vm = vtc->vtc_vm;
2070 2070 const int vcpuid = vtc->vtc_vcpuid;
2071 2071
2072 2072 /* Complete microstate accounting for vCPU being off-cpu */
2073 2073 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2074 2074 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2075 2075 }
2076 2076
2077 2077 /*
2078 2078 * When coming back on-cpu, only restore the guest FPU status if the
2079 2079 * thread is in a context marked as requiring it. This should be rare,
2080 2080 * occurring only when a future logic error results in a voluntary
2081 2081 * sleep during the VMRUN critical section.
2082 2082 *
2083 2083 * The common case will result in elision of the guest FPU state
2084 2084 * restoration, deferring that action until it is clearly necessary
2085 2085 * during vm_run.
2086 2086 */
2087 2087 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2088 2088 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2089 2089 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2090 2090
2091 2091 restore_guest_fpustate(vcpu);
2092 2092 vtc->vtc_status |= VTCS_FPU_RESTORED;
2093 2093 }
2094 2094
2095 2095 if (ops->vmrestorectx != NULL) {
2096 2096 ops->vmrestorectx(vm->cookie, vcpuid);
2097 2097 }
2098 2098
2099 2099 }
2100 2100
2101 2101 /*
2102 2102 * If we're in removectx(), we might still have state to tidy up.
2103 2103 */
2104 2104 static void
2105 2105 vmm_freectx(void *arg, int isexec)
2106 2106 {
2107 2107 vmm_savectx(arg);
2108 2108 }
2109 2109
2110 2110 static int
2111 2111 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2112 2112 struct vm_exit *vme)
2113 2113 {
2114 2114 struct vcpu *vcpu;
2115 2115 struct vie *vie;
2116 2116 int err;
2117 2117
2118 2118 vcpu = &vm->vcpu[vcpuid];
2119 2119 vie = vcpu->vie_ctx;
2120 2120 err = 0;
2121 2121
2122 2122 switch (entry->cmd) {
2123 2123 case VEC_DEFAULT:
2124 2124 return (0);
2125 2125 case VEC_DISCARD_INSTR:
2126 2126 vie_reset(vie);
2127 2127 return (0);
2128 2128 case VEC_FULFILL_MMIO:
2129 2129 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2130 2130 if (err == 0) {
2131 2131 err = vie_emulate_mmio(vie, vm, vcpuid);
2132 2132 if (err == 0) {
2133 2133 vie_advance_pc(vie, &vcpu->nextrip);
2134 2134 } else if (err < 0) {
2135 2135 vie_exitinfo(vie, vme);
2136 2136 } else if (err == EAGAIN) {
2137 2137 /*
2138 2138 * Clear the instruction emulation state in
2139 2139 * order to re-enter VM context and continue
2140 2140 * this 'rep <instruction>'
2141 2141 */
2142 2142 vie_reset(vie);
2143 2143 err = 0;
2144 2144 }
2145 2145 }
2146 2146 break;
2147 2147 case VEC_FULFILL_INOUT:
2148 2148 err = vie_fulfill_inout(vie, &entry->u.inout);
2149 2149 if (err == 0) {
2150 2150 err = vie_emulate_inout(vie, vm, vcpuid);
2151 2151 if (err == 0) {
2152 2152 vie_advance_pc(vie, &vcpu->nextrip);
2153 2153 } else if (err < 0) {
2154 2154 vie_exitinfo(vie, vme);
2155 2155 } else if (err == EAGAIN) {
2156 2156 /*
2157 2157 * Clear the instruction emulation state in
2158 2158 * order to re-enter VM context and continue
2159 2159 * this 'rep ins/outs'
2160 2160 */
2161 2161 vie_reset(vie);
2162 2162 err = 0;
2163 2163 }
2164 2164 }
2165 2165 break;
2166 2166 default:
2167 2167 return (EINVAL);
2168 2168 }
2169 2169 return (err);
2170 2170 }
2171 2171
2172 2172 static int
2173 2173 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2174 2174 {
2175 2175 struct vie *vie;
2176 2176
2177 2177 vie = vm->vcpu[vcpuid].vie_ctx;
2178 2178
2179 2179 if (vie_pending(vie)) {
2180 2180 /*
2181 2181 * Userspace has not fulfilled the pending needs of the
2182 2182 * instruction emulation, so bail back out.
2183 2183 */
2184 2184 vie_exitinfo(vie, vme);
2185 2185 return (-1);
2186 2186 }
2187 2187
2188 2188 return (0);
2189 2189 }
2190 2190
2191 2191 int
2192 2192 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2193 2193 {
2194 2194 int error;
2195 2195 struct vcpu *vcpu;
2196 2196 struct vm_exit *vme;
2197 2197 bool intr_disabled;
2198 2198 pmap_t pmap;
2199 2199 vm_thread_ctx_t vtc;
2200 2200 int affinity_type = CPU_CURRENT;
2201 2201
2202 2202 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2203 2203 return (EINVAL);
2204 2204 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2205 2205 return (EINVAL);
2206 2206 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2207 2207 return (EINVAL);
2208 2208
2209 2209 pmap = vmspace_pmap(vm->vmspace);
2210 2210 vcpu = &vm->vcpu[vcpuid];
2211 2211 vme = &vcpu->exitinfo;
2212 2212
2213 2213 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2214 2214
2215 2215 vtc.vtc_vm = vm;
2216 2216 vtc.vtc_vcpuid = vcpuid;
2217 2217 vtc.vtc_status = 0;
2218 2218 installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2219 - NULL, vmm_freectx);
2219 + NULL, vmm_freectx, NULL);
2220 2220
2221 2221 error = vm_entry_actions(vm, vcpuid, entry, vme);
2222 2222 if (error != 0) {
2223 2223 goto exit;
2224 2224 }
2225 2225
2226 2226 restart:
2227 2227 error = vm_loop_checks(vm, vcpuid, vme);
2228 2228 if (error != 0) {
2229 2229 goto exit;
2230 2230 }
2231 2231
2232 2232 thread_affinity_set(curthread, affinity_type);
2233 2233 /*
2234 2234 * Resource localization should happen after the CPU affinity for the
2235 2235 * thread has been set to ensure that access from restricted contexts,
2236 2236 * such as VMX-accelerated APIC operations, can occur without inducing
2237 2237 * cyclic cross-calls.
2238 2238 *
2239 2239 * This must be done prior to disabling kpreempt via critical_enter().
2240 2240 */
2241 2241 vm_localize_resources(vm, vcpu);
2242 2242 affinity_type = CPU_CURRENT;
2243 2243 critical_enter();
2244 2244
2245 2245 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2246 2246 ("vm_run: absurd pm_active"));
2247 2247
2248 2248 /* Force a trip through update_sregs to reload %fs/%gs and friends */
2249 2249 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2250 2250
2251 2251 if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2252 2252 restore_guest_fpustate(vcpu);
2253 2253 vtc.vtc_status |= VTCS_FPU_RESTORED;
2254 2254 }
2255 2255 vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2256 2256
2257 2257 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2258 2258 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2259 2259 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2260 2260
2261 2261 /*
2262 2262 * Once clear of the delicate contexts comprising the VM_RUN handler,
2263 2263 * thread CPU affinity can be loosened while other processing occurs.
2264 2264 */
2265 2265 vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2266 2266 thread_affinity_clear(curthread);
2267 2267 critical_exit();
2268 2268
2269 2269 if (error != 0) {
2270 2270 /* Communicate out any error from VMRUN() above */
2271 2271 goto exit;
2272 2272 }
2273 2273
2274 2274 vcpu->nextrip = vme->rip + vme->inst_length;
2275 2275 switch (vme->exitcode) {
2276 2276 case VM_EXITCODE_REQIDLE:
2277 2277 error = vm_handle_reqidle(vm, vcpuid);
2278 2278 break;
2279 2279 case VM_EXITCODE_RUN_STATE:
2280 2280 error = vm_handle_run_state(vm, vcpuid);
2281 2281 break;
2282 2282 case VM_EXITCODE_SUSPENDED:
2283 2283 error = vm_handle_suspend(vm, vcpuid);
2284 2284 break;
2285 2285 case VM_EXITCODE_IOAPIC_EOI:
2286 2286 vioapic_process_eoi(vm, vcpuid,
2287 2287 vme->u.ioapic_eoi.vector);
2288 2288 break;
2289 2289 case VM_EXITCODE_HLT:
2290 2290 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2291 2291 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2292 2292 break;
2293 2293 case VM_EXITCODE_PAGING:
2294 2294 error = vm_handle_paging(vm, vcpuid);
2295 2295 break;
2296 2296 case VM_EXITCODE_MMIO_EMUL:
2297 2297 error = vm_handle_mmio_emul(vm, vcpuid);
2298 2298 break;
2299 2299 case VM_EXITCODE_INOUT:
2300 2300 error = vm_handle_inout(vm, vcpuid, vme);
2301 2301 break;
2302 2302 case VM_EXITCODE_INST_EMUL:
2303 2303 error = vm_handle_inst_emul(vm, vcpuid);
2304 2304 break;
2305 2305 case VM_EXITCODE_MONITOR:
2306 2306 case VM_EXITCODE_MWAIT:
2307 2307 case VM_EXITCODE_VMINSN:
2308 2308 vm_inject_ud(vm, vcpuid);
2309 2309 break;
2310 2310 case VM_EXITCODE_RDMSR:
2311 2311 error = vm_handle_rdmsr(vm, vcpuid, vme);
2312 2312 break;
2313 2313 case VM_EXITCODE_WRMSR:
2314 2314 error = vm_handle_wrmsr(vm, vcpuid, vme);
2315 2315 break;
2316 2316 case VM_EXITCODE_HT:
2317 2317 affinity_type = CPU_BEST;
2318 2318 break;
2319 2319 case VM_EXITCODE_MTRAP:
2320 2320 vm_suspend_cpu(vm, vcpuid);
2321 2321 error = -1;
2322 2322 break;
2323 2323 default:
2324 2324 /* handled in userland */
2325 2325 error = -1;
2326 2326 break;
2327 2327 }
2328 2328
2329 2329 if (error == 0) {
2330 2330 /* VM exit conditions handled in-kernel, continue running */
2331 2331 goto restart;
2332 2332 }
2333 2333
2334 2334 exit:
2335 2335 removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2336 2336 NULL, vmm_freectx);
2337 2337
2338 2338 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2339 2339
2340 2340 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2341 2341 return (error);
2342 2342 }
2343 2343
2344 2344 int
2345 2345 vm_restart_instruction(void *arg, int vcpuid)
2346 2346 {
2347 2347 struct vm *vm;
2348 2348 struct vcpu *vcpu;
2349 2349 enum vcpu_state state;
2350 2350 uint64_t rip;
2351 2351 int error;
2352 2352
2353 2353 vm = arg;
2354 2354 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2355 2355 return (EINVAL);
2356 2356
2357 2357 vcpu = &vm->vcpu[vcpuid];
2358 2358 state = vcpu_get_state(vm, vcpuid, NULL);
2359 2359 if (state == VCPU_RUNNING) {
2360 2360 /*
2361 2361 * When a vcpu is "running" the next instruction is determined
2362 2362 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2363 2363 * Thus setting 'inst_length' to zero will cause the current
2364 2364 * instruction to be restarted.
2365 2365 */
2366 2366 vcpu->exitinfo.inst_length = 0;
2367 2367 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2368 2368 "setting inst_length to zero", vcpu->exitinfo.rip);
2369 2369 } else if (state == VCPU_FROZEN) {
2370 2370 /*
2371 2371 * When a vcpu is "frozen" it is outside the critical section
2372 2372 * around VMRUN() and 'nextrip' points to the next instruction.
2373 2373 * Thus instruction restart is achieved by setting 'nextrip'
2374 2374 * to the vcpu's %rip.
2375 2375 */
2376 2376 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2377 2377 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2378 2378 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2379 2379 "nextrip from %lx to %lx", vcpu->nextrip, rip);
2380 2380 vcpu->nextrip = rip;
2381 2381 } else {
2382 2382 panic("%s: invalid state %d", __func__, state);
2383 2383 }
2384 2384 return (0);
2385 2385 }
2386 2386
2387 2387 int
2388 2388 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2389 2389 {
2390 2390 struct vcpu *vcpu;
2391 2391 int type, vector;
2392 2392
2393 2393 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2394 2394 return (EINVAL);
2395 2395
2396 2396 vcpu = &vm->vcpu[vcpuid];
2397 2397
2398 2398 if (info & VM_INTINFO_VALID) {
2399 2399 type = info & VM_INTINFO_TYPE;
2400 2400 vector = info & 0xff;
2401 2401 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2402 2402 return (EINVAL);
2403 2403 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2404 2404 return (EINVAL);
2405 2405 if (info & VM_INTINFO_RSVD)
2406 2406 return (EINVAL);
2407 2407 } else {
2408 2408 info = 0;
2409 2409 }
2410 2410 VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2411 2411 vcpu->exitintinfo = info;
2412 2412 return (0);
2413 2413 }
2414 2414
2415 2415 enum exc_class {
2416 2416 EXC_BENIGN,
2417 2417 EXC_CONTRIBUTORY,
2418 2418 EXC_PAGEFAULT
2419 2419 };
2420 2420
2421 2421 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */
2422 2422
2423 2423 static enum exc_class
2424 2424 exception_class(uint64_t info)
2425 2425 {
2426 2426 int type, vector;
2427 2427
2428 2428 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2429 2429 type = info & VM_INTINFO_TYPE;
2430 2430 vector = info & 0xff;
2431 2431
2432 2432 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2433 2433 switch (type) {
2434 2434 case VM_INTINFO_HWINTR:
2435 2435 case VM_INTINFO_SWINTR:
2436 2436 case VM_INTINFO_NMI:
2437 2437 return (EXC_BENIGN);
2438 2438 default:
2439 2439 /*
2440 2440 * Hardware exception.
2441 2441 *
2442 2442 * SVM and VT-x use identical type values to represent NMI,
2443 2443 * hardware interrupt and software interrupt.
2444 2444 *
2445 2445 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2446 2446 * for exceptions except #BP and #OF. #BP and #OF use a type
2447 2447 * value of '5' or '6'. Therefore we don't check for explicit
2448 2448 * values of 'type' to classify 'intinfo' into a hardware
2449 2449 * exception.
2450 2450 */
2451 2451 break;
2452 2452 }
2453 2453
2454 2454 switch (vector) {
2455 2455 case IDT_PF:
2456 2456 case IDT_VE:
2457 2457 return (EXC_PAGEFAULT);
2458 2458 case IDT_DE:
2459 2459 case IDT_TS:
2460 2460 case IDT_NP:
2461 2461 case IDT_SS:
2462 2462 case IDT_GP:
2463 2463 return (EXC_CONTRIBUTORY);
2464 2464 default:
2465 2465 return (EXC_BENIGN);
2466 2466 }
2467 2467 }
2468 2468
2469 2469 static int
2470 2470 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2471 2471 uint64_t *retinfo)
2472 2472 {
2473 2473 enum exc_class exc1, exc2;
2474 2474 int type1, vector1;
2475 2475
2476 2476 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2477 2477 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2478 2478
2479 2479 /*
2480 2480 * If an exception occurs while attempting to call the double-fault
2481 2481 * handler the processor enters shutdown mode (aka triple fault).
2482 2482 */
2483 2483 type1 = info1 & VM_INTINFO_TYPE;
2484 2484 vector1 = info1 & 0xff;
2485 2485 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2486 2486 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2487 2487 info1, info2);
2488 2488 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2489 2489 *retinfo = 0;
2490 2490 return (0);
2491 2491 }
2492 2492
2493 2493 /*
2494 2494 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2495 2495 */
2496 2496 exc1 = exception_class(info1);
2497 2497 exc2 = exception_class(info2);
2498 2498 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2499 2499 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2500 2500 /* Convert nested fault into a double fault. */
2501 2501 *retinfo = IDT_DF;
2502 2502 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2503 2503 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2504 2504 } else {
2505 2505 /* Handle exceptions serially */
2506 2506 *retinfo = info2;
2507 2507 }
2508 2508 return (1);
2509 2509 }
2510 2510
2511 2511 static uint64_t
2512 2512 vcpu_exception_intinfo(struct vcpu *vcpu)
2513 2513 {
2514 2514 uint64_t info = 0;
2515 2515
2516 2516 if (vcpu->exception_pending) {
2517 2517 info = vcpu->exc_vector & 0xff;
2518 2518 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2519 2519 if (vcpu->exc_errcode_valid) {
2520 2520 info |= VM_INTINFO_DEL_ERRCODE;
2521 2521 info |= (uint64_t)vcpu->exc_errcode << 32;
2522 2522 }
2523 2523 }
2524 2524 return (info);
2525 2525 }
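
For reference, the intinfo packing used here keeps the vector in the low byte and the error code in the upper 32 bits, with validity/type flags in between. The sketch below uses placeholder flag values; the real bit positions come from the vmm headers and are assumptions here:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative placeholders; the real definitions live in the vmm headers. */
    #define XINFO_VALID             (1ULL << 31)
    #define XINFO_HWEXCEPTION       (3ULL << 8)
    #define XINFO_DEL_ERRCODE       (1ULL << 11)

    int
    main(void)
    {
        uint8_t vector = 13;        /* e.g. #GP */
        uint32_t errcode = 0;

        uint64_t info = vector & 0xff;
        info |= XINFO_VALID | XINFO_HWEXCEPTION;
        info |= XINFO_DEL_ERRCODE;
        info |= (uint64_t)errcode << 32;    /* error code occupies the upper 32 bits */

        (void) printf("intinfo=%#llx\n", (unsigned long long)info);
        return (0);
    }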
2526 2526
2527 2527 int
2528 2528 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2529 2529 {
2530 2530 struct vcpu *vcpu;
2531 2531 uint64_t info1, info2;
2532 2532 int valid;
2533 2533
2534 2534 KASSERT(vcpuid >= 0 &&
2535 2535 vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2536 2536
2537 2537 vcpu = &vm->vcpu[vcpuid];
2538 2538
2539 2539 info1 = vcpu->exitintinfo;
2540 2540 vcpu->exitintinfo = 0;
2541 2541
2542 2542 info2 = 0;
2543 2543 if (vcpu->exception_pending) {
2544 2544 info2 = vcpu_exception_intinfo(vcpu);
2545 2545 vcpu->exception_pending = 0;
2546 2546 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2547 2547 vcpu->exc_vector, info2);
2548 2548 }
2549 2549
2550 2550 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2551 2551 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2552 2552 } else if (info1 & VM_INTINFO_VALID) {
2553 2553 *retinfo = info1;
2554 2554 valid = 1;
2555 2555 } else if (info2 & VM_INTINFO_VALID) {
2556 2556 *retinfo = info2;
2557 2557 valid = 1;
2558 2558 } else {
2559 2559 valid = 0;
2560 2560 }
2561 2561
2562 2562 if (valid) {
2563 2563 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2564 2564 "retinfo(%lx)", __func__, info1, info2, *retinfo);
2565 2565 }
2566 2566
2567 2567 return (valid);
2568 2568 }
2569 2569
2570 2570 int
2571 2571 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2572 2572 {
2573 2573 struct vcpu *vcpu;
2574 2574
2575 2575 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2576 2576 return (EINVAL);
2577 2577
2578 2578 vcpu = &vm->vcpu[vcpuid];
2579 2579 *info1 = vcpu->exitintinfo;
2580 2580 *info2 = vcpu_exception_intinfo(vcpu);
2581 2581 return (0);
2582 2582 }
2583 2583
2584 2584 int
2585 2585 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2586 2586 uint32_t errcode, int restart_instruction)
2587 2587 {
2588 2588 struct vcpu *vcpu;
2589 2589 uint64_t regval;
2590 2590 int error;
2591 2591
2592 2592 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2593 2593 return (EINVAL);
2594 2594
2595 2595 if (vector < 0 || vector >= 32)
2596 2596 return (EINVAL);
2597 2597
2598 2598 /*
2599 2599 * NMIs (which bear an exception vector of 2) are to be injected via
2600 2600 * their own specialized path using vm_inject_nmi().
2601 2601 */
2602 2602 if (vector == 2) {
2603 2603 return (EINVAL);
2604 2604 }
2605 2605
2606 2606 /*
2607 2607 * A double fault exception should never be injected directly into
2608 2608 * the guest. It is a derived exception that results from specific
2609 2609 * combinations of nested faults.
2610 2610 */
2611 2611 if (vector == IDT_DF)
2612 2612 return (EINVAL);
2613 2613
2614 2614 vcpu = &vm->vcpu[vcpuid];
2615 2615
2616 2616 if (vcpu->exception_pending) {
2617 2617 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2618 2618 "pending exception %d", vector, vcpu->exc_vector);
2619 2619 return (EBUSY);
2620 2620 }
2621 2621
2622 2622 if (errcode_valid) {
2623 2623 /*
2624 2624 * Exceptions don't deliver an error code in real mode.
2625 2625 */
2626 2626 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2627 2627 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2628 2628 if (!(regval & CR0_PE))
2629 2629 errcode_valid = 0;
2630 2630 }
2631 2631
2632 2632 /*
2633 2633 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2634 2634 *
2635 2635 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2636 2636 * one instruction or incurs an exception.
2637 2637 */
2638 2638 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2639 2639 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2640 2640 __func__, error));
2641 2641
2642 2642 if (restart_instruction)
2643 2643 vm_restart_instruction(vm, vcpuid);
2644 2644
2645 2645 vcpu->exception_pending = 1;
2646 2646 vcpu->exc_vector = vector;
2647 2647 vcpu->exc_errcode = errcode;
2648 2648 vcpu->exc_errcode_valid = errcode_valid;
2649 2649 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2650 2650 return (0);
2651 2651 }
2652 2652
2653 2653 void
2654 2654 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2655 2655 int errcode)
2656 2656 {
2657 2657 int error;
2658 2658
2659 2659 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2660 2660 errcode, 1);
2661 2661 KASSERT(error == 0, ("vm_inject_exception error %d", error));
2662 2662 }
2663 2663
2664 2664 void
2665 2665 vm_inject_ud(struct vm *vm, int vcpuid)
2666 2666 {
2667 2667 vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2668 2668 }
2669 2669
2670 2670 void
2671 2671 vm_inject_gp(struct vm *vm, int vcpuid)
2672 2672 {
2673 2673 vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2674 2674 }
2675 2675
2676 2676 void
2677 2677 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2678 2678 {
2679 2679 vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2680 2680 }
2681 2681
2682 2682 void
2683 2683 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2684 2684 {
2685 2685 vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2686 2686 }
2687 2687
2688 2688 void
2689 2689 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2690 2690 {
2691 2691 int error;
2692 2692
2693 2693 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2694 2694 error_code, cr2);
2695 2695
2696 2696 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2697 2697 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2698 2698
2699 2699 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2700 2700 }
2701 2701
2702 2702 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2703 2703
2704 2704 int
2705 2705 vm_inject_nmi(struct vm *vm, int vcpuid)
2706 2706 {
2707 2707 struct vcpu *vcpu;
2708 2708
2709 2709 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2710 2710 return (EINVAL);
2711 2711
2712 2712 vcpu = &vm->vcpu[vcpuid];
2713 2713
2714 2714 vcpu->nmi_pending = 1;
2715 2715 vcpu_notify_event(vm, vcpuid);
2716 2716 return (0);
2717 2717 }
2718 2718
2719 2719 int
2720 2720 vm_nmi_pending(struct vm *vm, int vcpuid)
2721 2721 {
2722 2722 struct vcpu *vcpu;
2723 2723
2724 2724 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2725 2725 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2726 2726
2727 2727 vcpu = &vm->vcpu[vcpuid];
2728 2728
2729 2729 return (vcpu->nmi_pending);
2730 2730 }
2731 2731
2732 2732 void
2733 2733 vm_nmi_clear(struct vm *vm, int vcpuid)
2734 2734 {
2735 2735 struct vcpu *vcpu;
2736 2736
2737 2737 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2738 2738 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2739 2739
2740 2740 vcpu = &vm->vcpu[vcpuid];
2741 2741
2742 2742 if (vcpu->nmi_pending == 0)
2743 2743 panic("vm_nmi_clear: inconsistent nmi_pending state");
2744 2744
2745 2745 vcpu->nmi_pending = 0;
2746 2746 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2747 2747 }
2748 2748
2749 2749 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2750 2750
2751 2751 int
2752 2752 vm_inject_extint(struct vm *vm, int vcpuid)
2753 2753 {
2754 2754 struct vcpu *vcpu;
2755 2755
2756 2756 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2757 2757 return (EINVAL);
2758 2758
2759 2759 vcpu = &vm->vcpu[vcpuid];
2760 2760
2761 2761 vcpu->extint_pending = 1;
2762 2762 vcpu_notify_event(vm, vcpuid);
2763 2763 return (0);
2764 2764 }
2765 2765
2766 2766 int
2767 2767 vm_extint_pending(struct vm *vm, int vcpuid)
2768 2768 {
2769 2769 struct vcpu *vcpu;
2770 2770
2771 2771 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2772 2772 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2773 2773
2774 2774 vcpu = &vm->vcpu[vcpuid];
2775 2775
2776 2776 return (vcpu->extint_pending);
2777 2777 }
2778 2778
2779 2779 void
2780 2780 vm_extint_clear(struct vm *vm, int vcpuid)
2781 2781 {
2782 2782 struct vcpu *vcpu;
2783 2783
2784 2784 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2785 2785 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2786 2786
2787 2787 vcpu = &vm->vcpu[vcpuid];
2788 2788
2789 2789 if (vcpu->extint_pending == 0)
2790 2790 panic("vm_extint_clear: inconsistent extint_pending state");
2791 2791
2792 2792 vcpu->extint_pending = 0;
2793 2793 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2794 2794 }
2795 2795
2796 2796 int
2797 2797 vm_inject_init(struct vm *vm, int vcpuid)
2798 2798 {
2799 2799 struct vcpu *vcpu;
2800 2800
2801 2801 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2802 2802 return (EINVAL);
2803 2803
2804 2804 vcpu = &vm->vcpu[vcpuid];
2805 2805 vcpu_lock(vcpu);
2806 2806 vcpu->run_state |= VRS_PEND_INIT;
2807 2807 /*
2808 2808 * As part of queuing the INIT request, clear any pending SIPI. It
2809 2809 * would not otherwise survive across the reset of the vCPU when it
2810 2810 * undergoes the requested INIT. We would not want it to linger when it
2811 2811 * could be mistaken for a subsequent (after the INIT) SIPI request.
2812 2812 */
2813 2813 vcpu->run_state &= ~VRS_PEND_SIPI;
2814 2814 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2815 2815
2816 2816 vcpu_unlock(vcpu);
2817 2817 return (0);
2818 2818 }
2819 2819
2820 2820 int
2821 2821 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2822 2822 {
2823 2823 struct vcpu *vcpu;
2824 2824
2825 2825 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2826 2826 return (EINVAL);
2827 2827
2828 2828 vcpu = &vm->vcpu[vcpuid];
2829 2829 vcpu_lock(vcpu);
2830 2830 vcpu->run_state |= VRS_PEND_SIPI;
2831 2831 vcpu->sipi_vector = vector;
2832 2832 /* SIPI is only actionable if the CPU is waiting in INIT state */
2833 2833 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2834 2834 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2835 2835 }
2836 2836 vcpu_unlock(vcpu);
2837 2837 return (0);
2838 2838 }
2839 2839
2840 2840 bool
2841 2841 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2842 2842 {
2843 2843 struct vcpu *vcpu;
2844 2844
2845 2845 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2846 2846 vcpu = &vm->vcpu[vcpuid];
2847 2847
2848 2848 /* Of interest: vCPU not in running state or with pending INIT */
2849 2849 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2850 2850 }
2851 2851
2852 2852 int
2853 2853 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2854 2854 {
2855 2855 struct seg_desc desc;
2856 2856 const enum vm_reg_name clear_regs[] = {
2857 2857 VM_REG_GUEST_CR2,
2858 2858 VM_REG_GUEST_CR3,
2859 2859 VM_REG_GUEST_CR4,
2860 2860 VM_REG_GUEST_RAX,
2861 2861 VM_REG_GUEST_RBX,
2862 2862 VM_REG_GUEST_RCX,
2863 2863 VM_REG_GUEST_RSI,
2864 2864 VM_REG_GUEST_RDI,
2865 2865 VM_REG_GUEST_RBP,
2866 2866 VM_REG_GUEST_RSP,
2867 2867 VM_REG_GUEST_R8,
2868 2868 VM_REG_GUEST_R9,
2869 2869 VM_REG_GUEST_R10,
2870 2870 VM_REG_GUEST_R11,
2871 2871 VM_REG_GUEST_R12,
2872 2872 VM_REG_GUEST_R13,
2873 2873 VM_REG_GUEST_R14,
2874 2874 VM_REG_GUEST_R15,
2875 2875 VM_REG_GUEST_DR0,
2876 2876 VM_REG_GUEST_DR1,
2877 2877 VM_REG_GUEST_DR2,
2878 2878 VM_REG_GUEST_DR3,
2879 2879 VM_REG_GUEST_EFER,
2880 2880 };
2881 2881 const enum vm_reg_name data_segs[] = {
2882 2882 VM_REG_GUEST_SS,
2883 2883 VM_REG_GUEST_DS,
2884 2884 VM_REG_GUEST_ES,
2885 2885 VM_REG_GUEST_FS,
2886 2886 VM_REG_GUEST_GS,
2887 2887 };
2888 2888 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2889 2889
2890 2890 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2891 2891 return (EINVAL);
2892 2892
2893 2893 for (uint_t i = 0; i < nitems(clear_regs); i++) {
2894 2894 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2895 2895 }
2896 2896
2897 2897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2898 2898 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2899 2899 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2900 2900
2901 2901 /*
2902 2902 * The prescribed contents of %rdx differ slightly between the Intel and
2903 2903 * AMD architectural definitions. The former expects the Extended Model
2904 2904 * in bits 16-19 where the latter expects all the Family, Model, and
2905 2905 * Stepping to be there. Common boot ROMs appear to disregard this
2906 2906 * anyway, so we stick with a compromise value similar to what is
2907 2907 * spelled out in the Intel SDM.
2908 2908 */
2909 2909 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2910 2910
2911 2911 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2912 2912 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2913 2913
2914 2914 /* CS: Present, R/W, Accessed */
2915 2915 desc.access = 0x0093;
2916 2916 desc.base = 0xffff0000;
2917 2917 desc.limit = 0xffff;
2918 2918 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2919 2919 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2920 2920
2921 2921 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2922 2922 desc.access = 0x0093;
2923 2923 desc.base = 0;
2924 2924 desc.limit = 0xffff;
2925 2925 for (uint_t i = 0; i < nitems(data_segs); i++) {
2926 2926 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2927 2927 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2928 2928 }
2929 2929
2930 2930 /* GDTR, IDTR */
2931 2931 desc.base = 0;
2932 2932 desc.limit = 0xffff;
2933 2933 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2934 2934 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2935 2935
2936 2936 /* LDTR: Present, LDT */
2937 2937 desc.access = 0x0082;
2938 2938 desc.base = 0;
2939 2939 desc.limit = 0xffff;
2940 2940 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2941 2941 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2942 2942
2943 2943 /* TR: Present, 32-bit TSS */
2944 2944 desc.access = 0x008b;
2945 2945 desc.base = 0;
2946 2946 desc.limit = 0xffff;
2947 2947 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2948 2948 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2949 2949
2950 2950 vlapic_reset(vm_lapic(vm, vcpuid));
2951 2951
2952 2952 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2953 2953
2954 2954 vcpu->exitintinfo = 0;
2955 2955 vcpu->exception_pending = 0;
2956 2956 vcpu->nmi_pending = 0;
2957 2957 vcpu->extint_pending = 0;
2958 2958
2959 2959 /*
2960 2960 * A CPU reset caused by power-on or system reset clears more state than
2961 2961 * one which is triggered from an INIT IPI.
2962 2962 */
2963 2963 if (!init_only) {
2964 2964 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2965 2965 fpu_save_area_reset(vcpu->guestfpu);
2966 2966
2967 2967 /* XXX: clear MSRs and other pieces */
2968 2968 }
2969 2969
2970 2970 return (0);
2971 2971 }
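
The CS base (0xffff0000) and %rip (0xfff0) programmed above combine into the conventional x86 reset vector. A tiny sketch of that address arithmetic, for illustration only:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Values programmed by the reset path above */
        uint64_t cs_base = 0xffff0000;
        uint64_t rip = 0xfff0;

        /* In real mode the linear fetch address is simply segment base + %rip */
        (void) printf("first fetch at %#llx\n",
            (unsigned long long)(cs_base + rip));   /* prints 0xfffffff0 */
        return (0);
    }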
2972 2972
2973 2973 static int
2974 2974 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2975 2975 {
2976 2976 struct seg_desc desc;
2977 2977
2978 2978 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2979 2979 return (EINVAL);
2980 2980
2981 2981 /* CS: Present, R/W, Accessed */
2982 2982 desc.access = 0x0093;
2983 2983 desc.base = (uint64_t)vector << 12;
2984 2984 desc.limit = 0xffff;
2985 2985 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2986 2986 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2987 2987 (uint64_t)vector << 8));
2988 2988
2989 2989 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2990 2990
2991 2991 return (0);
2992 2992 }
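
The shifts above encode the usual SIPI start-up convention: the CS base is the vector shifted left by 12 and the selector by 8, so with %rip of zero the AP begins executing at vector * 4 KiB. An illustrative sketch (the vector value is arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint8_t vector = 0x9a;      /* hypothetical SIPI vector */

        uint64_t cs_base = (uint64_t)vector << 12;  /* segment base */
        uint64_t cs_sel = (uint64_t)vector << 8;    /* real-mode selector */
        uint64_t rip = 0;

        (void) printf("selector=%#llx start=%#llx\n",
            (unsigned long long)cs_sel,
            (unsigned long long)(cs_base + rip));   /* 0x9a00 and 0x9a000 */
        return (0);
    }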
2993 2993
2994 2994 int
2995 2995 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2996 2996 {
2997 2997 if (vcpu < 0 || vcpu >= vm->maxcpus)
2998 2998 return (EINVAL);
2999 2999
3000 3000 if (type < 0 || type >= VM_CAP_MAX)
3001 3001 return (EINVAL);
3002 3002
3003 3003 return (VMGETCAP(vm->cookie, vcpu, type, retval));
3004 3004 }
3005 3005
3006 3006 int
3007 3007 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3008 3008 {
3009 3009 if (vcpu < 0 || vcpu >= vm->maxcpus)
3010 3010 return (EINVAL);
3011 3011
3012 3012 if (type < 0 || type >= VM_CAP_MAX)
3013 3013 return (EINVAL);
3014 3014
3015 3015 return (VMSETCAP(vm->cookie, vcpu, type, val));
3016 3016 }
3017 3017
3018 3018 struct vlapic *
3019 3019 vm_lapic(struct vm *vm, int cpu)
3020 3020 {
3021 3021 return (vm->vcpu[cpu].vlapic);
3022 3022 }
3023 3023
3024 3024 struct vioapic *
3025 3025 vm_ioapic(struct vm *vm)
3026 3026 {
3027 3027
3028 3028 return (vm->vioapic);
3029 3029 }
3030 3030
3031 3031 struct vhpet *
3032 3032 vm_hpet(struct vm *vm)
3033 3033 {
3034 3034
3035 3035 return (vm->vhpet);
3036 3036 }
3037 3037
3038 3038 void *
3039 3039 vm_iommu_domain(struct vm *vm)
3040 3040 {
3041 3041
3042 3042 return (vm->iommu);
3043 3043 }
3044 3044
3045 3045 int
3046 3046 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3047 3047 bool from_idle)
3048 3048 {
3049 3049 int error;
3050 3050 struct vcpu *vcpu;
3051 3051
3052 3052 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3053 3053 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3054 3054
3055 3055 vcpu = &vm->vcpu[vcpuid];
3056 3056
3057 3057 vcpu_lock(vcpu);
3058 3058 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3059 3059 vcpu_unlock(vcpu);
3060 3060
3061 3061 return (error);
3062 3062 }
3063 3063
3064 3064 enum vcpu_state
3065 3065 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3066 3066 {
3067 3067 struct vcpu *vcpu;
3068 3068 enum vcpu_state state;
3069 3069
3070 3070 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3071 3071 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3072 3072
3073 3073 vcpu = &vm->vcpu[vcpuid];
3074 3074
3075 3075 vcpu_lock(vcpu);
3076 3076 state = vcpu->state;
3077 3077 if (hostcpu != NULL)
3078 3078 *hostcpu = vcpu->hostcpu;
3079 3079 vcpu_unlock(vcpu);
3080 3080
3081 3081 return (state);
3082 3082 }
3083 3083
3084 3084 uint64_t
3085 3085 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3086 3086 {
3087 3087 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3088 3088
3089 3089 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3090 3090
3091 3091 if (phys_adj) {
3092 3092 /* Include any offset for the current physical CPU too */
3093 3093 extern hrtime_t tsc_gethrtime_tick_delta(void);
3094 3094 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3095 3095 }
3096 3096
3097 3097 return (vcpu_off);
3098 3098 }
3099 3099
3100 3100 int
3101 3101 vm_activate_cpu(struct vm *vm, int vcpuid)
3102 3102 {
3103 3103
3104 3104 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3105 3105 return (EINVAL);
3106 3106
3107 3107 if (CPU_ISSET(vcpuid, &vm->active_cpus))
3108 3108 return (EBUSY);
3109 3109
3110 3110 VCPU_CTR0(vm, vcpuid, "activated");
3111 3111 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3112 3112 return (0);
3113 3113 }
3114 3114
3115 3115 int
3116 3116 vm_suspend_cpu(struct vm *vm, int vcpuid)
3117 3117 {
3118 3118 int i;
3119 3119
3120 3120 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3121 3121 return (EINVAL);
3122 3122
3123 3123 if (vcpuid == -1) {
3124 3124 vm->debug_cpus = vm->active_cpus;
3125 3125 for (i = 0; i < vm->maxcpus; i++) {
3126 3126 if (CPU_ISSET(i, &vm->active_cpus))
3127 3127 vcpu_notify_event(vm, i);
3128 3128 }
3129 3129 } else {
3130 3130 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3131 3131 return (EINVAL);
3132 3132
3133 3133 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3134 3134 vcpu_notify_event(vm, vcpuid);
3135 3135 }
3136 3136 return (0);
3137 3137 }
3138 3138
3139 3139 int
3140 3140 vm_resume_cpu(struct vm *vm, int vcpuid)
3141 3141 {
3142 3142
3143 3143 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3144 3144 return (EINVAL);
3145 3145
3146 3146 if (vcpuid == -1) {
3147 3147 CPU_ZERO(&vm->debug_cpus);
3148 3148 } else {
3149 3149 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3150 3150 return (EINVAL);
3151 3151
3152 3152 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3153 3153 }
3154 3154 return (0);
3155 3155 }
3156 3156
3157 3157 static bool
3158 3158 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3159 3159 uint64_t entry_rip)
3160 3160 {
3161 3161 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3162 3162 struct vm_exit *vme = &vcpu->exitinfo;
3163 3163 bool bail = false;
3164 3164
3165 3165 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3166 3166
3167 3167 if (vm->suspend) {
3168 3168 if (on_entry) {
3169 3169 VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3170 3170 vm->suspend < VM_SUSPEND_LAST);
3171 3171
3172 3172 vme->exitcode = VM_EXITCODE_SUSPENDED;
3173 3173 vme->u.suspended.how = vm->suspend;
3174 3174 } else {
3175 3175 /*
3176 3176 * Handling VM suspend is complicated, so if that
3177 3177 * condition is detected outside of VM-entry itself,
3178 3178 * just emit a BOGUS exitcode so we take a lap to pick
3179 3179 * up the event during an entry and are directed into
3180 3180 * the vm_handle_suspend() logic.
3181 3181 */
3182 3182 vme->exitcode = VM_EXITCODE_BOGUS;
3183 3183 }
3184 3184 bail = true;
3185 3185 }
3186 3186 if (vcpu->reqidle) {
3187 3187 vme->exitcode = VM_EXITCODE_REQIDLE;
3188 3188 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3189 3189
3190 3190 if (!on_entry) {
3191 3191 /*
3192 3192 * A reqidle request detected outside of VM-entry can be
3193 3193 * handled directly by clearing the request (and taking
3194 3194 * a lap to userspace).
3195 3195 */
3196 3196 vcpu_assert_locked(vcpu);
3197 3197 vcpu->reqidle = 0;
3198 3198 }
3199 3199 bail = true;
3200 3200 }
3201 3201 if (vcpu_should_yield(vm, vcpuid)) {
3202 3202 vme->exitcode = VM_EXITCODE_BOGUS;
3203 3203 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3204 3204 bail = true;
3205 3205 }
3206 3206 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3207 3207 vme->exitcode = VM_EXITCODE_DEBUG;
3208 3208 bail = true;
3209 3209 }
3210 3210
3211 3211 if (bail) {
3212 3212 if (on_entry) {
3213 3213 /*
3214 3214 * If bailing out during VM-entry, the current %rip must
3215 3215 * be recorded in the exitinfo.
3216 3216 */
3217 3217 vme->rip = entry_rip;
3218 3218 }
3219 3219 vme->inst_length = 0;
3220 3220 }
3221 3221 return (bail);
3222 3222 }
3223 3223
3224 3224 static bool
3225 3225 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3226 3226 {
3227 3227 /*
3228 3228 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3229 3229 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3230 3230 * structure, and we would only modify the exitcode.
3231 3231 */
3232 3232 return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3233 3233 }
3234 3234
3235 3235 bool
3236 3236 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3237 3237 {
3238 3238 /*
3239 3239 * Bail-out checks done as part of VM entry require an updated %rip to
3240 3240 * populate the vm_exit struct if any of the conditions of interest are
3241 3241 * matched in the check.
3242 3242 */
3243 3243 return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3244 3244 }
3245 3245
3246 3246 cpuset_t
3247 3247 vm_active_cpus(struct vm *vm)
3248 3248 {
3249 3249
3250 3250 return (vm->active_cpus);
3251 3251 }
3252 3252
3253 3253 cpuset_t
3254 3254 vm_debug_cpus(struct vm *vm)
3255 3255 {
3256 3256
3257 3257 return (vm->debug_cpus);
3258 3258 }
3259 3259
3260 3260 cpuset_t
3261 3261 vm_suspended_cpus(struct vm *vm)
3262 3262 {
3263 3263
3264 3264 return (vm->suspended_cpus);
3265 3265 }
3266 3266
3267 3267 void *
3268 3268 vcpu_stats(struct vm *vm, int vcpuid)
3269 3269 {
3270 3270
3271 3271 return (vm->vcpu[vcpuid].stats);
3272 3272 }
3273 3273
3274 3274 int
3275 3275 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3276 3276 {
3277 3277 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3278 3278 return (EINVAL);
3279 3279
3280 3280 *state = vm->vcpu[vcpuid].x2apic_state;
3281 3281
3282 3282 return (0);
3283 3283 }
3284 3284
3285 3285 int
3286 3286 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3287 3287 {
3288 3288 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3289 3289 return (EINVAL);
3290 3290
3291 3291 if (state >= X2APIC_STATE_LAST)
3292 3292 return (EINVAL);
3293 3293
3294 3294 vm->vcpu[vcpuid].x2apic_state = state;
3295 3295
3296 3296 vlapic_set_x2apic_state(vm, vcpuid, state);
3297 3297
3298 3298 return (0);
3299 3299 }
3300 3300
3301 3301 /*
3302 3302 * This function is called to ensure that a vcpu "sees" a pending event
3303 3303 * as soon as possible:
3304 3304 * - If the vcpu thread is sleeping then it is woken up.
3305 3305 * - If the vcpu is running on a different host_cpu then an IPI will be directed
3306 3306 * to the host_cpu to cause the vcpu to trap into the hypervisor.
3307 3307 */
3308 3308 static void
3309 3309 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3310 3310 {
3311 3311 int hostcpu;
3312 3312
3313 3313 ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3314 3314
3315 3315 hostcpu = vcpu->hostcpu;
3316 3316 if (vcpu->state == VCPU_RUNNING) {
3317 3317 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3318 3318 if (hostcpu != curcpu) {
3319 3319 if (ntype == VCPU_NOTIFY_APIC) {
3320 3320 vlapic_post_intr(vcpu->vlapic, hostcpu,
3321 3321 vmm_ipinum);
3322 3322 } else {
3323 3323 ipi_cpu(hostcpu, vmm_ipinum);
3324 3324 }
3325 3325 } else {
3326 3326 /*
3327 3327 * If the 'vcpu' is running on 'curcpu' then it must
3328 3328 * be sending a notification to itself (e.g. SELF_IPI).
3329 3329 * The pending event will be picked up when the vcpu
3330 3330 * transitions back to guest context.
3331 3331 */
3332 3332 }
3333 3333 } else {
3334 3334 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3335 3335 "with hostcpu %d", vcpu->state, hostcpu));
3336 3336 if (vcpu->state == VCPU_SLEEPING) {
3337 3337 cv_signal(&vcpu->vcpu_cv);
3338 3338 }
3339 3339 }
3340 3340 }
3341 3341
3342 3342 void
3343 3343 vcpu_notify_event(struct vm *vm, int vcpuid)
3344 3344 {
3345 3345 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3346 3346
3347 3347 vcpu_lock(vcpu);
3348 3348 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3349 3349 vcpu_unlock(vcpu);
3350 3350 }
3351 3351
3352 3352 void
3353 3353 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3354 3354 {
3355 3355 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3356 3356
3357 3357 if (ntype == VCPU_NOTIFY_NONE) {
3358 3358 return;
3359 3359 }
3360 3360
3361 3361 vcpu_lock(vcpu);
3362 3362 vcpu_notify_event_locked(vcpu, ntype);
3363 3363 vcpu_unlock(vcpu);
3364 3364 }
3365 3365
3366 3366 void
3367 3367 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3368 3368 {
3369 3369 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3370 3370 hrtime_t now = gethrtime();
3371 3371
3372 3372 ASSERT3U(ustate, !=, vcpu->ustate);
3373 3373 ASSERT3S(ustate, <, VU_MAX);
3374 3374 ASSERT3S(ustate, >=, VU_INIT);
3375 3375
3376 3376 hrtime_t delta = now - vcpu->ustate_when;
3377 3377 vcpu->ustate_total[vcpu->ustate] += delta;
3378 3378
3379 3379 membar_producer();
3380 3380
3381 3381 vcpu->ustate_when = now;
3382 3382 vcpu->ustate = ustate;
3383 3383 }
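
The microstate update charges the time elapsed since the previous transition to the state being left, then records the new state and timestamp. A rough user-space analogue, substituting clock_gettime() for gethrtime(); names and states here are invented for illustration:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    enum ustate { U_IDLE, U_RUN, U_MAX };

    static uint64_t total[U_MAX];
    static enum ustate cur = U_IDLE;
    static uint64_t when;

    static uint64_t
    now_ns(void)
    {
        struct timespec ts;
        (void) clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec);
    }

    static void
    ustate_change(enum ustate next)
    {
        uint64_t now = now_ns();

        total[cur] += now - when;   /* charge elapsed time to the state being left */
        when = now;
        cur = next;
    }

    int
    main(void)
    {
        when = now_ns();
        ustate_change(U_RUN);
        ustate_change(U_IDLE);
        (void) printf("idle=%llu run=%llu\n",
            (unsigned long long)total[U_IDLE], (unsigned long long)total[U_RUN]);
        return (0);
    }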
3384 3384
3385 3385 struct vmspace *
3386 3386 vm_get_vmspace(struct vm *vm)
3387 3387 {
3388 3388
3389 3389 return (vm->vmspace);
3390 3390 }
3391 3391
3392 3392 int
3393 3393 vm_apicid2vcpuid(struct vm *vm, int apicid)
3394 3394 {
3395 3395 /*
3396 3396 * XXX apic id is assumed to be numerically identical to vcpu id
3397 3397 */
3398 3398 return (apicid);
3399 3399 }
3400 3400
3401 3401 struct vatpic *
3402 3402 vm_atpic(struct vm *vm)
3403 3403 {
3404 3404 return (vm->vatpic);
3405 3405 }
3406 3406
3407 3407 struct vatpit *
3408 3408 vm_atpit(struct vm *vm)
3409 3409 {
3410 3410 return (vm->vatpit);
3411 3411 }
3412 3412
3413 3413 struct vpmtmr *
3414 3414 vm_pmtmr(struct vm *vm)
3415 3415 {
3416 3416
3417 3417 return (vm->vpmtmr);
3418 3418 }
3419 3419
3420 3420 struct vrtc *
3421 3421 vm_rtc(struct vm *vm)
3422 3422 {
3423 3423
3424 3424 return (vm->vrtc);
3425 3425 }
3426 3426
3427 3427 enum vm_reg_name
3428 3428 vm_segment_name(int seg)
3429 3429 {
3430 3430 static enum vm_reg_name seg_names[] = {
3431 3431 VM_REG_GUEST_ES,
3432 3432 VM_REG_GUEST_CS,
3433 3433 VM_REG_GUEST_SS,
3434 3434 VM_REG_GUEST_DS,
3435 3435 VM_REG_GUEST_FS,
3436 3436 VM_REG_GUEST_GS
3437 3437 };
3438 3438
3439 3439 KASSERT(seg >= 0 && seg < nitems(seg_names),
3440 3440 ("%s: invalid segment encoding %d", __func__, seg));
3441 3441 return (seg_names[seg]);
3442 3442 }
3443 3443
3444 3444 void
3445 3445 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3446 3446 int num_copyinfo)
3447 3447 {
3448 3448 int idx;
3449 3449
3450 3450 for (idx = 0; idx < num_copyinfo; idx++) {
3451 3451 if (copyinfo[idx].cookie != NULL)
3452 3452 vm_gpa_release(copyinfo[idx].cookie);
3453 3453 }
3454 3454 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3455 3455 }
3456 3456
3457 3457 int
3458 3458 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3459 3459 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3460 3460 int num_copyinfo, int *fault)
3461 3461 {
3462 3462 int error, idx, nused;
3463 3463 size_t n, off, remaining;
3464 3464 void *hva, *cookie;
3465 3465 uint64_t gpa;
3466 3466
3467 3467 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3468 3468
3469 3469 nused = 0;
3470 3470 remaining = len;
3471 3471 while (remaining > 0) {
3472 3472 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3473 3473 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3474 3474 if (error || *fault)
3475 3475 return (error);
3476 3476 off = gpa & PAGE_MASK;
3477 3477 n = min(remaining, PAGE_SIZE - off);
3478 3478 copyinfo[nused].gpa = gpa;
3479 3479 copyinfo[nused].len = n;
3480 3480 remaining -= n;
3481 3481 gla += n;
3482 3482 nused++;
3483 3483 }
3484 3484
3485 3485 for (idx = 0; idx < nused; idx++) {
3486 3486 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3487 3487 copyinfo[idx].len, prot, &cookie);
3488 3488 if (hva == NULL)
3489 3489 break;
3490 3490 copyinfo[idx].hva = hva;
3491 3491 copyinfo[idx].cookie = cookie;
3492 3492 }
3493 3493
3494 3494 if (idx != nused) {
3495 3495 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3496 3496 return (EFAULT);
3497 3497 } else {
3498 3498 *fault = 0;
3499 3499 return (0);
3500 3500 }
3501 3501 }
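
The setup loop above carves the guest-linear range into page-bounded chunks: the first chunk is limited by the offset within its page, later ones by the page size. A standalone sketch of just that chunking, assuming a 4 KiB page and, unlike the real code (which re-translates the advanced GLA each iteration), a physically contiguous range:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE   4096
    #define PAGE_MASK   (PAGE_SIZE - 1)

    static size_t
    min_sz(size_t a, size_t b)
    {
        return (a < b ? a : b);
    }

    int
    main(void)
    {
        uint64_t gpa = 0x10ff8;     /* hypothetical translated address */
        size_t remaining = 0x2010;  /* hypothetical copy length */

        while (remaining > 0) {
            size_t off = gpa & PAGE_MASK;
            size_t n = min_sz(remaining, PAGE_SIZE - off);

            (void) printf("chunk gpa=%#llx len=%#zx\n",
                (unsigned long long)gpa, n);
            remaining -= n;
            gpa += n;   /* contiguity assumed here purely for illustration */
        }
        return (0);
    }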
3502 3502
3503 3503 void
3504 3504 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3505 3505 size_t len)
3506 3506 {
3507 3507 char *dst;
3508 3508 int idx;
3509 3509
3510 3510 dst = kaddr;
3511 3511 idx = 0;
3512 3512 while (len > 0) {
3513 3513 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3514 3514 len -= copyinfo[idx].len;
3515 3515 dst += copyinfo[idx].len;
3516 3516 idx++;
3517 3517 }
3518 3518 }
3519 3519
3520 3520 void
3521 3521 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3522 3522 struct vm_copyinfo *copyinfo, size_t len)
3523 3523 {
3524 3524 const char *src;
3525 3525 int idx;
3526 3526
3527 3527 src = kaddr;
3528 3528 idx = 0;
3529 3529 while (len > 0) {
3530 3530 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3531 3531 len -= copyinfo[idx].len;
3532 3532 src += copyinfo[idx].len;
3533 3533 idx++;
3534 3534 }
3535 3535 }
3536 3536
3537 3537 /*
3538 3538 * Return the amount of in-use and wired memory for the VM. Since
3539 3539 * these are global stats, only return the values for vCPU 0.
3540 3540 */
3541 3541 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3542 3542 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3543 3543
3544 3544 static void
3545 3545 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3546 3546 {
3547 3547
3548 3548 if (vcpu == 0) {
3549 3549 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3550 3550 PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3551 3551 }
3552 3552 }
3553 3553
3554 3554 static void
3555 3555 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3556 3556 {
3557 3557
3558 3558 if (vcpu == 0) {
3559 3559 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3560 3560 PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3561 3561 }
3562 3562 }
3563 3563
3564 3564 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3565 3565 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
3566 3566
3567 3567 int
3568 3568 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3569 3569 uint8_t bytes, uint32_t *val)
3570 3570 {
3571 3571 return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3572 3572 }
3573 3573
3574 3574 /*
3575 3575 * bhyve-internal interfaces to attach or detach IO port handlers.
3576 3576 * Must be called with VM write lock held for safety.
3577 3577 */
3578 3578 int
3579 3579 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3580 3580 void **cookie)
3581 3581 {
3582 3582 int err;
3583 3583 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3584 3584 if (err == 0) {
3585 3585 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3586 3586 }
3587 3587 return (err);
3588 3588 }
3589 3589 int
3590 3590 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3591 3591 void **old_arg)
3592 3592 {
3593 3593 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3594 3594 int err;
3595 3595
3596 3596 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3597 3597 if (err == 0) {
3598 3598 *cookie = NULL;
3599 3599 }
3600 3600 return (err);
3601 3601 }
3602 3602
3603 3603 /*
3604 3604 * External driver interfaces to attach or detach IO port handlers.
3605 3605 * Must be called with VM write lock held for safety.
3606 3606 */
3607 3607 int
3608 3608 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3609 3609 void *arg, void **cookie)
3610 3610 {
3611 3611 int err;
3612 3612
3613 3613 if (port == 0) {
3614 3614 return (EINVAL);
3615 3615 }
3616 3616
3617 3617 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3618 3618 if (err == 0) {
3619 3619 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3620 3620 }
3621 3621 return (err);
3622 3622 }
3623 3623 void
3624 3624 vm_ioport_unhook(struct vm *vm, void **cookie)
3625 3625 {
3626 3626 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3627 3627 ioport_handler_t old_func;
3628 3628 void *old_arg;
3629 3629 int err;
3630 3630
3631 3631 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3632 3632
3633 3633 /* ioport-hook-using drivers are expected to be well-behaved */
3634 3634 VERIFY0(err);
3635 3635 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3636 3636
3637 3637 *cookie = NULL;
3638 3638 }
3639 3639
3640 3640 int
3641 3641 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3642 3642 {
3643 3643 struct vm *vm = ksp->ks_private;
3644 3644 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3645 3645 const int vcpuid = vvk->vvk_vcpu.value.ui32;
3646 3646 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3647 3647
3648 3648 ASSERT3U(vcpuid, <, VM_MAXCPU);
3649 3649
3650 3650 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3651 3651 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3652 3652 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3653 3653 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3654 3654 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3655 3655 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3656 3656
3657 3657 return (0);
3658 3658 }