Revert "OS-8005 bhyve memory pressure needs to target ARC better (#354)"
This reverts commit a6033573eedd94118d2b9e65f45deca0bf4b42f7.
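For context, the change being reverted wired bhyve guest memory accounting into the ZFS ARC through arc_virt_machine_reserve() and arc_virt_machine_release(), with the per-VM tally kept in the arc_resv field that the hunks below remove. The following is a minimal, self-contained C sketch of that reserve/release bookkeeping pattern only; the pool counter, struct vm_model, and the reserve call site are illustrative assumptions, since this diff shows just the extern declarations, the arc_resv field, and the release in vm_cleanup().

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical model of the ARC page pool (not the real ZFS ARC). */
static size_t arc_pages_avail = 1024;

/* Reserve 'npages' from the pool; fail with ENOMEM if they are not available. */
static int
arc_virt_machine_reserve(size_t npages)
{
	if (npages > arc_pages_avail)
		return (ENOMEM);
	arc_pages_avail -= npages;
	return (0);
}

/* Return a prior reservation to the pool. */
static void
arc_virt_machine_release(size_t npages)
{
	arc_pages_avail += npages;
}

struct vm_model {
	size_t arc_resv;	/* # of pages taken from ARC */
};

int
main(void)
{
	struct vm_model vm = { 0 };
	size_t guest_pages = 256;

	/* On guest memory setup: charge the reservation (call site assumed). */
	if (arc_virt_machine_reserve(guest_pages) != 0) {
		fprintf(stderr, "ARC reservation failed\n");
		return (1);
	}
	vm.arc_resv = guest_pages;

	/* On VM destroy (cf. vm_cleanup() below): release it. */
	arc_virt_machine_release(vm.arc_resv);
	vm.arc_resv = 0;

	printf("pages available: %zu\n", arc_pages_avail);
	return (0);
}

The point of the pattern is symmetry: pages are charged when guest memory is established and released unconditionally when the VM is destroyed, so a reservation cannot outlive its VM. The revert removes exactly that bookkeeping from vmm.c.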
--- old/usr/src/uts/i86pc/io/vmm/vmm.c
+++ new/usr/src/uts/i86pc/io/vmm/vmm.c
1 1 /*-
2 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 3 *
4 4 * Copyright (c) 2011 NetApp, Inc.
5 5 * All rights reserved.
6 6 *
7 7 * Redistribution and use in source and binary forms, with or without
8 8 * modification, are permitted provided that the following conditions
9 9 * are met:
10 10 * 1. Redistributions of source code must retain the above copyright
11 11 * notice, this list of conditions and the following disclaimer.
12 12 * 2. Redistributions in binary form must reproduce the above copyright
13 13 * notice, this list of conditions and the following disclaimer in the
14 14 * documentation and/or other materials provided with the distribution.
15 15 *
16 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 26 * SUCH DAMAGE.
27 27 *
28 28 * $FreeBSD$
29 29 */
30 30 /*
31 31 * This file and its contents are supplied under the terms of the
32 32 * Common Development and Distribution License ("CDDL"), version 1.0.
33 33 * You may only use this file in accordance with the terms of version
34 34 * 1.0 of the CDDL.
35 35 *
36 36 * A full copy of the text of the CDDL should have accompanied this
37 37 * source. A copy of the CDDL is also available via the Internet at
38 38 * http://www.illumos.org/license/CDDL.
39 39 *
40 40 * Copyright 2015 Pluribus Networks Inc.
41 - * Copyright 2021 Joyent, Inc.
41 + * Copyright 2018 Joyent, Inc.
42 42 * Copyright 2021 Oxide Computer Company
43 43 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
44 44 */
45 45
46 46 #include <sys/cdefs.h>
47 47 __FBSDID("$FreeBSD$");
48 48
49 49 #include <sys/param.h>
50 50 #include <sys/systm.h>
51 51 #include <sys/kernel.h>
52 52 #include <sys/module.h>
53 53 #include <sys/sysctl.h>
54 54 #include <sys/malloc.h>
55 55 #include <sys/pcpu.h>
56 56 #include <sys/lock.h>
57 57 #include <sys/mutex.h>
58 58 #include <sys/proc.h>
59 59 #include <sys/rwlock.h>
60 60 #include <sys/sched.h>
61 61 #include <sys/smp.h>
62 62 #include <sys/systm.h>
63 63
64 64 #include <machine/pcb.h>
65 65 #include <machine/smp.h>
66 66 #include <machine/md_var.h>
67 67 #include <x86/psl.h>
68 68 #include <x86/apicreg.h>
69 69
70 70 #include <machine/specialreg.h>
71 71 #include <machine/vmm.h>
72 72 #include <machine/vmm_dev.h>
73 73 #include <machine/vmparam.h>
74 74 #include <sys/vmm_instruction_emul.h>
75 75 #include <sys/vmm_vm.h>
76 76
77 77 #include "vmm_ioport.h"
78 78 #include "vmm_ktr.h"
79 79 #include "vmm_host.h"
80 80 #include "vmm_mem.h"
81 81 #include "vmm_util.h"
82 82 #include "vatpic.h"
83 83 #include "vatpit.h"
84 84 #include "vhpet.h"
85 85 #include "vioapic.h"
86 86 #include "vlapic.h"
87 87 #include "vpmtmr.h"
88 88 #include "vrtc.h"
89 89 #include "vmm_stat.h"
90 90 #include "vmm_lapic.h"
91 91
92 92 #include "io/ppt.h"
93 93 #include "io/iommu.h"
94 94
95 95 struct vlapic;
96 96
97 97 /*
98 98 * Initialization:
99 99 * (a) allocated when vcpu is created
100 100 * (i) initialized when vcpu is created and when it is reinitialized
101 101 * (o) initialized the first time the vcpu is created
102 102 * (x) initialized before use
103 103 */
104 104 struct vcpu {
105 105 /* (o) protects state, run_state, hostcpu, sipi_vector */
106 106 struct mtx mtx;
107 107
108 108 enum vcpu_state state; /* (o) vcpu state */
109 109 enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */
110 110 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */
111 111 kcondvar_t state_cv; /* (o) IDLE-transition cv */
112 112 int hostcpu; /* (o) vcpu's current host cpu */
113 113 int lastloccpu; /* (o) last host cpu localized to */
114 114 int reqidle; /* (i) request vcpu to idle */
115 115 struct vlapic *vlapic; /* (i) APIC device model */
116 116 enum x2apic_state x2apic_state; /* (i) APIC mode */
117 117 uint64_t exitintinfo; /* (i) events pending at VM exit */
118 118 int nmi_pending; /* (i) NMI pending */
119 119 int extint_pending; /* (i) INTR pending */
120 120 int exception_pending; /* (i) exception pending */
121 121 int exc_vector; /* (x) exception collateral */
122 122 int exc_errcode_valid;
123 123 uint32_t exc_errcode;
124 124 uint8_t sipi_vector; /* (i) SIPI vector */
125 125 struct savefpu *guestfpu; /* (a,i) guest fpu state */
126 126 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
127 127 void *stats; /* (a,i) statistics */
128 128 struct vm_exit exitinfo; /* (x) exit reason and collateral */
129 129 uint64_t nextrip; /* (x) next instruction to execute */
130 130 struct vie *vie_ctx; /* (x) instruction emulation context */
131 131 uint64_t tsc_offset; /* (x) offset from host TSC */
132 132
133 133 enum vcpu_ustate ustate; /* (i) microstate for the vcpu */
134 134 hrtime_t ustate_when; /* (i) time of last ustate change */
135 135 uint64_t ustate_total[VU_MAX]; /* (o) total time spent in ustates */
136 136 };
137 137
138 138 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
139 139 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
140 140 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
141 141 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
142 142 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
143 143
144 144 struct mem_seg {
145 145 size_t len;
146 146 bool sysmem;
147 147 struct vm_object *object;
148 148 };
149 149 #define VM_MAX_MEMSEGS 4
150 150
151 151 struct mem_map {
152 152 vm_paddr_t gpa;
153 153 size_t len;
154 154 vm_ooffset_t segoff;
155 155 int segid;
156 156 int prot;
157 157 int flags;
158 158 };
159 159 #define VM_MAX_MEMMAPS 8
160 160
161 161 /*
162 162 * Initialization:
163 163 * (o) initialized the first time the VM is created
164 164 * (i) initialized when VM is created and when it is reinitialized
165 165 * (x) initialized before use
166 166 */
167 167 struct vm {
168 168 void *cookie; /* (i) cpu-specific data */
169 169 void *iommu; /* (x) iommu-specific data */
170 170 struct vhpet *vhpet; /* (i) virtual HPET */
171 171 struct vioapic *vioapic; /* (i) virtual ioapic */
172 172 struct vatpic *vatpic; /* (i) virtual atpic */
173 173 struct vatpit *vatpit; /* (i) virtual atpit */
174 174 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
175 175 struct vrtc *vrtc; /* (o) virtual RTC */
176 176 volatile cpuset_t active_cpus; /* (i) active vcpus */
177 177 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */
178 178 int suspend; /* (i) stop VM execution */
179 179 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
180 180 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
181 181 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
182 182 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
183 183 struct vmspace *vmspace; /* (o) guest's address space */
184 184 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
185 185 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
186 186 /* The following describe the vm cpu topology */
187 187 uint16_t sockets; /* (o) num of sockets */
188 188 uint16_t cores; /* (o) num of cores/socket */
189 189 uint16_t threads; /* (o) num of threads/core */
190 190 uint16_t maxcpus; /* (o) max pluggable cpus */
191 191 uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */
192 - size_t arc_resv; /* # of pages take from ARC */
193 192
194 193 struct ioport_config ioports; /* (o) ioport handling */
195 194 };
196 195
197 196 static int vmm_initialized;
198 197
199 198
200 199 static void
201 200 nullop_panic(void)
202 201 {
203 202 panic("null vmm operation call");
204 203 }
205 204
206 205 /* Do not allow use of an un-set `ops` to do anything but panic */
207 206 static struct vmm_ops vmm_ops_null = {
208 207 .init = (vmm_init_func_t)nullop_panic,
209 208 .cleanup = (vmm_cleanup_func_t)nullop_panic,
210 209 .resume = (vmm_resume_func_t)nullop_panic,
211 210 .vminit = (vmi_init_func_t)nullop_panic,
212 211 .vmrun = (vmi_run_func_t)nullop_panic,
213 212 .vmcleanup = (vmi_cleanup_func_t)nullop_panic,
214 213 .vmgetreg = (vmi_get_register_t)nullop_panic,
215 214 .vmsetreg = (vmi_set_register_t)nullop_panic,
216 215 .vmgetdesc = (vmi_get_desc_t)nullop_panic,
217 216 .vmsetdesc = (vmi_set_desc_t)nullop_panic,
218 217 .vmgetcap = (vmi_get_cap_t)nullop_panic,
219 218 .vmsetcap = (vmi_set_cap_t)nullop_panic,
220 219 .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic,
221 220 .vmspace_free = (vmi_vmspace_free)nullop_panic,
222 221 .vlapic_init = (vmi_vlapic_init)nullop_panic,
223 222 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
224 223 .vmsavectx = (vmi_savectx)nullop_panic,
225 224 .vmrestorectx = (vmi_restorectx)nullop_panic,
226 225 };
227 226
228 227 static struct vmm_ops *ops = &vmm_ops_null;
229 228
230 229 #define VMM_INIT(num) ((*ops->init)(num))
231 230 #define VMM_CLEANUP() ((*ops->cleanup)())
232 231 #define VMM_RESUME() ((*ops->resume)())
233 232
234 233 #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
235 234 #define VMRUN(vmi, vcpu, rip, pmap) \
236 235 ((*ops->vmrun)(vmi, vcpu, rip, pmap))
237 236 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
238 237 #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
239 238 #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
240 239
241 240 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
242 241 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
243 242 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
244 243 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
245 244 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv))
246 245 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val))
247 246 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu))
248 247 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic))
249 248
250 249 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
251 250 #define fpu_stop_emulating() clts()
252 251
253 252 SDT_PROVIDER_DEFINE(vmm);
254 253
255 254 static MALLOC_DEFINE(M_VM, "vm", "vm");
256 255
257 256 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
258 257 NULL);
259 258
260 259 /*
261 260 * Halt the guest if all vcpus are executing a HLT instruction with
262 261 * interrupts disabled.
263 262 */
264 263 static int halt_detection_enabled = 1;
265 264
266 265 /* IPI vector used for vcpu notifications */
267 266 static int vmm_ipinum;
268 267
269 268 /* Trap into hypervisor on all guest exceptions and reflect them back */
270 269 static int trace_guest_exceptions;
271 270
272 271 static void vm_free_memmap(struct vm *vm, int ident);
273 272 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
274 273 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
275 274 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
276 275 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
277 276
278 -extern int arc_virt_machine_reserve(size_t);
279 -extern void arc_virt_machine_release(size_t);
280 -
281 277 /* Flags for vtc_status */
282 278 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */
283 279 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */
284 280
285 281 typedef struct vm_thread_ctx {
286 282 struct vm *vtc_vm;
287 283 int vtc_vcpuid;
288 284 uint_t vtc_status;
289 285 enum vcpu_ustate vtc_ustate;
290 286 } vm_thread_ctx_t;
291 287
292 288 #ifdef KTR
293 289 static const char *
294 290 vcpu_state2str(enum vcpu_state state)
295 291 {
296 292
297 293 switch (state) {
298 294 case VCPU_IDLE:
299 295 return ("idle");
300 296 case VCPU_FROZEN:
301 297 return ("frozen");
302 298 case VCPU_RUNNING:
303 299 return ("running");
304 300 case VCPU_SLEEPING:
305 301 return ("sleeping");
306 302 default:
307 303 return ("unknown");
308 304 }
309 305 }
310 306 #endif
311 307
312 308 static void
313 309 vcpu_cleanup(struct vm *vm, int i, bool destroy)
314 310 {
315 311 struct vcpu *vcpu = &vm->vcpu[i];
316 312
317 313 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
318 314 if (destroy) {
319 315 vmm_stat_free(vcpu->stats);
320 316 fpu_save_area_free(vcpu->guestfpu);
321 317 vie_free(vcpu->vie_ctx);
322 318 vcpu->vie_ctx = NULL;
323 319 }
324 320 }
325 321
326 322 static void
327 323 vcpu_init(struct vm *vm, int vcpu_id, bool create)
328 324 {
329 325 struct vcpu *vcpu;
330 326
331 327 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
332 328 ("vcpu_init: invalid vcpu %d", vcpu_id));
333 329
334 330 vcpu = &vm->vcpu[vcpu_id];
335 331
336 332 if (create) {
337 333 vcpu_lock_init(vcpu);
338 334 vcpu->state = VCPU_IDLE;
339 335 vcpu->hostcpu = NOCPU;
340 336 vcpu->lastloccpu = NOCPU;
341 337 vcpu->guestfpu = fpu_save_area_alloc();
342 338 vcpu->stats = vmm_stat_alloc();
343 339 vcpu->vie_ctx = vie_alloc();
344 340
345 341 vcpu->ustate = VU_INIT;
346 342 vcpu->ustate_when = gethrtime();
347 343 } else {
348 344 vie_reset(vcpu->vie_ctx);
349 345 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
350 346 if (vcpu->ustate != VU_INIT) {
351 347 vcpu_ustate_change(vm, vcpu_id, VU_INIT);
352 348 }
353 349 }
354 350
355 351 vcpu->run_state = VRS_HALT;
356 352 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
357 353 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
358 354 vcpu->reqidle = 0;
359 355 vcpu->exitintinfo = 0;
360 356 vcpu->nmi_pending = 0;
361 357 vcpu->extint_pending = 0;
362 358 vcpu->exception_pending = 0;
363 359 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
364 360 fpu_save_area_reset(vcpu->guestfpu);
365 361 vmm_stat_init(vcpu->stats);
366 362 vcpu->tsc_offset = 0;
367 363 }
368 364
369 365 int
370 366 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
371 367 {
372 368
373 369 return (trace_guest_exceptions);
374 370 }
375 371
376 372 struct vm_exit *
377 373 vm_exitinfo(struct vm *vm, int cpuid)
378 374 {
379 375 struct vcpu *vcpu;
380 376
381 377 if (cpuid < 0 || cpuid >= vm->maxcpus)
382 378 panic("vm_exitinfo: invalid cpuid %d", cpuid);
383 379
384 380 vcpu = &vm->vcpu[cpuid];
385 381
386 382 return (&vcpu->exitinfo);
387 383 }
388 384
389 385 struct vie *
390 386 vm_vie_ctx(struct vm *vm, int cpuid)
391 387 {
392 388 if (cpuid < 0 || cpuid >= vm->maxcpus)
393 389 panic("vm_vie_ctx: invalid cpuid %d", cpuid);
394 390
395 391 return (vm->vcpu[cpuid].vie_ctx);
396 392 }
397 393
398 394 static int
399 395 vmm_init(void)
400 396 {
401 397 int error;
402 398
403 399 vmm_host_state_init();
404 400
405 401 /* We use cpu_poke() for IPIs */
406 402 vmm_ipinum = 0;
407 403
408 404 error = vmm_mem_init();
409 405 if (error)
410 406 return (error);
411 407
412 408 if (vmm_is_intel())
413 409 ops = &vmm_ops_intel;
414 410 else if (vmm_is_svm())
415 411 ops = &vmm_ops_amd;
416 412 else
417 413 return (ENXIO);
418 414
419 415 return (VMM_INIT(vmm_ipinum));
420 416 }
421 417
422 418 int
423 419 vmm_mod_load()
424 420 {
425 421 int error;
426 422
427 423 VERIFY(vmm_initialized == 0);
428 424
429 425 error = vmm_init();
430 426 if (error == 0)
431 427 vmm_initialized = 1;
432 428
433 429 return (error);
434 430 }
435 431
436 432 int
437 433 vmm_mod_unload()
438 434 {
439 435 int error;
440 436
441 437 VERIFY(vmm_initialized == 1);
442 438
443 439 iommu_cleanup();
444 440 error = VMM_CLEANUP();
445 441 if (error)
446 442 return (error);
447 443 vmm_initialized = 0;
448 444
449 445 return (0);
450 446 }
451 447
452 448 static void
453 449 vm_init(struct vm *vm, bool create)
454 450 {
455 451 int i;
456 452
457 453 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
458 454 vm->iommu = NULL;
459 455 vm->vioapic = vioapic_init(vm);
460 456 vm->vhpet = vhpet_init(vm);
461 457 vm->vatpic = vatpic_init(vm);
462 458 vm->vatpit = vatpit_init(vm);
463 459 vm->vpmtmr = vpmtmr_init(vm);
464 460 if (create)
465 461 vm->vrtc = vrtc_init(vm);
466 462
467 463 vm_inout_init(vm, &vm->ioports);
468 464
469 465 CPU_ZERO(&vm->active_cpus);
470 466 CPU_ZERO(&vm->debug_cpus);
471 467
472 468 vm->suspend = 0;
473 469 CPU_ZERO(&vm->suspended_cpus);
474 470
475 471 for (i = 0; i < vm->maxcpus; i++)
476 472 vcpu_init(vm, i, create);
477 473
478 474 /*
479 475 * Configure the VM-wide TSC offset so that the call to vm_init()
480 476 * represents the boot time (when the TSC(s) read 0). Each vCPU will
481 477 * have its own offset from this, which is altered if/when the guest
482 478 * writes to MSR_TSC.
483 479 *
484 480 * The TSC offsetting math is all unsigned, using overflow for negative
485 481 * offsets. A reading of the TSC is negated to form the boot offset.
486 482 */
487 483 vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
488 484 }
489 485
490 486 /*
491 487 * The default CPU topology is a single thread per package.
492 488 */
493 489 uint_t cores_per_package = 1;
494 490 uint_t threads_per_core = 1;
495 491
496 492 int
497 493 vm_create(const char *name, struct vm **retvm)
498 494 {
499 495 struct vm *vm;
500 496 struct vmspace *vmspace;
501 497
502 498 /*
503 499 * If vmm.ko could not be successfully initialized then don't attempt
504 500 * to create the virtual machine.
505 501 */
506 502 if (!vmm_initialized)
507 503 return (ENXIO);
508 504
509 505 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
510 506 return (EINVAL);
511 507
512 508 vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
513 509 if (vmspace == NULL)
514 510 return (ENOMEM);
515 511
516 512 vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
517 513 strcpy(vm->name, name);
518 514 vm->vmspace = vmspace;
519 515
520 516 vm->sockets = 1;
521 517 vm->cores = cores_per_package; /* XXX backwards compatibility */
522 518 vm->threads = threads_per_core; /* XXX backwards compatibility */
523 519 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
524 520
525 521 vm_init(vm, true);
526 522
527 523 *retvm = vm;
528 524 return (0);
529 525 }
530 526
531 527 void
532 528 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
533 529 uint16_t *threads, uint16_t *maxcpus)
534 530 {
535 531 *sockets = vm->sockets;
536 532 *cores = vm->cores;
537 533 *threads = vm->threads;
538 534 *maxcpus = vm->maxcpus;
539 535 }
540 536
541 537 uint16_t
542 538 vm_get_maxcpus(struct vm *vm)
543 539 {
544 540 return (vm->maxcpus);
545 541 }
546 542
547 543 int
548 544 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
549 545 uint16_t threads, uint16_t maxcpus)
550 546 {
551 547 if (maxcpus != 0)
552 548 return (EINVAL); /* XXX remove when supported */
553 549 if ((sockets * cores * threads) > vm->maxcpus)
554 550 return (EINVAL);
555 551 /* XXX need to check sockets * cores * threads == vCPU, how? */
556 552 vm->sockets = sockets;
557 553 vm->cores = cores;
558 554 vm->threads = threads;
559 555 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
560 556 return (0);
561 557 }
562 558
563 559 static void
564 560 vm_cleanup(struct vm *vm, bool destroy)
565 561 {
566 562 struct mem_map *mm;
567 563 int i;
568 564
569 565 ppt_unassign_all(vm);
570 566
571 567 if (vm->iommu != NULL)
572 568 iommu_destroy_domain(vm->iommu);
573 569
574 570 /*
575 571 * Devices which attach their own ioport hooks should be cleaned up
576 572 * first so they can tear down those registrations.
577 573 */
578 574 vpmtmr_cleanup(vm->vpmtmr);
579 575
580 576 vm_inout_cleanup(vm, &vm->ioports);
581 577
582 578 if (destroy)
583 579 vrtc_cleanup(vm->vrtc);
584 580 else
585 581 vrtc_reset(vm->vrtc);
586 582
587 583 vatpit_cleanup(vm->vatpit);
588 584 vhpet_cleanup(vm->vhpet);
589 585 vatpic_cleanup(vm->vatpic);
590 586 vioapic_cleanup(vm->vioapic);
591 587
592 588 for (i = 0; i < vm->maxcpus; i++)
593 589 vcpu_cleanup(vm, i, destroy);
594 590
595 591 VMCLEANUP(vm->cookie);
596 592
597 593 /*
598 594 * System memory is removed from the guest address space only when
599 595 * the VM is destroyed. This is because the mapping remains the same
600 596 * across VM reset.
601 597 *
602 598 * Device memory can be relocated by the guest (e.g. using PCI BARs)
603 599 * so those mappings are removed on a VM reset.
604 600 */
605 601 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
606 602 mm = &vm->mem_maps[i];
607 603 if (destroy || !sysmem_mapping(vm, mm)) {
608 604 vm_free_memmap(vm, i);
609 605 } else {
610 606 /*
611 607 * We need to reset the IOMMU flag so this mapping can
612 608 * be reused when a VM is rebooted. Since the IOMMU
613 609 * domain has already been destroyed we can just reset
614 610 * the flag here.
615 611 */
616 612 mm->flags &= ~VM_MEMMAP_F_IOMMU;
617 613 }
618 614 }
619 615
620 616 if (destroy) {
621 617 for (i = 0; i < VM_MAX_MEMSEGS; i++)
622 618 vm_free_memseg(vm, i);
623 619
624 620 VMSPACE_FREE(vm->vmspace);
625 621 vm->vmspace = NULL;
626 -
627 - arc_virt_machine_release(vm->arc_resv);
628 - vm->arc_resv = 0;
629 622 }
630 623 }
631 624
632 625 void
633 626 vm_destroy(struct vm *vm)
634 627 {
635 628 vm_cleanup(vm, true);
636 629 free(vm, M_VM);
637 630 }
638 631
639 632 int
640 633 vm_reinit(struct vm *vm)
641 634 {
642 635 int error;
643 636
644 637 /*
645 638 * A virtual machine can be reset only if all vcpus are suspended.
646 639 */
647 640 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
648 641 vm_cleanup(vm, false);
649 642 vm_init(vm, false);
650 643 error = 0;
651 644 } else {
652 645 error = EBUSY;
653 646 }
654 647
655 648 return (error);
656 649 }
657 650
658 651 const char *
659 652 vm_name(struct vm *vm)
660 653 {
661 654 return (vm->name);
662 655 }
663 656
664 657 int
665 658 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
666 659 {
667 660 vm_object_t obj;
668 661
669 662 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
670 663 return (ENOMEM);
671 664 else
672 665 return (0);
673 666 }
674 667
675 668 int
676 669 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
677 670 {
678 671 return (vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len));
679 672 }
680 673
681 674 /*
682 675 * Return 'true' if 'gpa' is allocated in the guest address space.
683 676 *
684 677 * This function is called in the context of a running vcpu which acts as
685 678 * an implicit lock on 'vm->mem_maps[]'.
686 679 */
687 680 bool
688 681 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
689 682 {
690 683 struct mem_map *mm;
691 684 int i;
692 685
693 686 #ifdef INVARIANTS
694 687 int hostcpu, state;
695 688 state = vcpu_get_state(vm, vcpuid, &hostcpu);
696 689 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
697 690 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
698 691 #endif
699 692
700 693 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
701 694 mm = &vm->mem_maps[i];
702 695 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
703 696 return (true); /* 'gpa' is sysmem or devmem */
704 697 }
705 698
706 699 if (ppt_is_mmio(vm, gpa))
707 700 return (true); /* 'gpa' is pci passthru mmio */
708 701
709 702 return (false);
710 703 }
711 704
712 705 int
713 706 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
714 707 {
715 708 struct mem_seg *seg;
716 709 vm_object_t obj;
717 710
718 711 #ifndef __FreeBSD__
719 712 extern pgcnt_t get_max_page_get(void);
720 713 #endif
721 714
722 715 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
723 716 return (EINVAL);
724 717
725 718 if (len == 0 || (len & PAGE_MASK))
726 719 return (EINVAL);
727 720
728 721 #ifndef __FreeBSD__
729 722 if (len > ptob(get_max_page_get()))
730 723 return (EINVAL);
731 724 #endif
732 725
733 726 seg = &vm->mem_segs[ident];
734 727 if (seg->object != NULL) {
735 728 if (seg->len == len && seg->sysmem == sysmem)
736 729 return (EEXIST);
737 730 else
738 731 return (EINVAL);
739 732 }
740 733
741 734 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
742 735 if (obj == NULL)
743 736 return (ENOMEM);
744 737
745 738 seg->len = len;
746 739 seg->object = obj;
747 740 seg->sysmem = sysmem;
748 741 return (0);
749 742 }
750 743
751 744 int
752 745 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
753 746 vm_object_t *objptr)
754 747 {
755 748 struct mem_seg *seg;
756 749
757 750 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
758 751 return (EINVAL);
759 752
760 753 seg = &vm->mem_segs[ident];
761 754 if (len)
762 755 *len = seg->len;
763 756 if (sysmem)
764 757 *sysmem = seg->sysmem;
765 758 if (objptr)
766 759 *objptr = seg->object;
767 760 return (0);
768 761 }
769 762
770 763 void
771 764 vm_free_memseg(struct vm *vm, int ident)
772 765 {
773 766 struct mem_seg *seg;
774 767
775 768 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
776 769 ("%s: invalid memseg ident %d", __func__, ident));
777 770
778 771 seg = &vm->mem_segs[ident];
779 772 if (seg->object != NULL) {
780 773 vm_object_deallocate(seg->object);
781 774 bzero(seg, sizeof (struct mem_seg));
782 775 }
783 776 }
784 777
785 778 int
786 779 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
787 780 size_t len, int prot, int flags)
788 781 {
789 782 struct mem_seg *seg;
790 783 struct mem_map *m, *map;
791 784 vm_ooffset_t last;
792 785 int i, error;
793 786
794 787 if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
795 788 return (EINVAL);
796 789
797 790 if (flags & ~VM_MEMMAP_F_WIRED)
798 791 return (EINVAL);
799 792
800 793 if (segid < 0 || segid >= VM_MAX_MEMSEGS)
801 794 return (EINVAL);
802 795
803 796 seg = &vm->mem_segs[segid];
804 797 if (seg->object == NULL)
805 798 return (EINVAL);
806 799
807 800 last = first + len;
808 801 if (first < 0 || first >= last || last > seg->len)
809 802 return (EINVAL);
810 803
811 804 if ((gpa | first | last) & PAGE_MASK)
812 805 return (EINVAL);
813 806
814 807 map = NULL;
815 808 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
816 809 m = &vm->mem_maps[i];
817 810 if (m->len == 0) {
818 811 map = m;
819 812 break;
820 813 }
821 814 }
822 815
823 816 if (map == NULL)
824 817 return (ENOSPC);
825 818
826 819 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
827 820 len, 0, VMFS_NO_SPACE, prot, prot, 0);
828 821 if (error != 0)
829 822 return (EFAULT);
830 823
831 824 vm_object_reference(seg->object);
832 825
833 826 if ((flags & VM_MEMMAP_F_WIRED) != 0) {
834 827 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
835 828 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
836 829 if (error != 0) {
837 830 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
838 831 return (EFAULT);
839 832 }
840 833 }
841 834
842 835 map->gpa = gpa;
843 836 map->len = len;
844 837 map->segoff = first;
845 838 map->segid = segid;
846 839 map->prot = prot;
847 840 map->flags = flags;
848 841 return (0);
849 842 }
850 843
851 844 int
852 845 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
853 846 {
854 847 struct mem_map *m;
855 848 int i;
856 849
857 850 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
858 851 m = &vm->mem_maps[i];
859 852 if (m->gpa == gpa && m->len == len &&
860 853 (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
861 854 vm_free_memmap(vm, i);
862 855 return (0);
863 856 }
864 857 }
865 858
866 859 return (EINVAL);
867 860 }
868 861
869 862 int
870 863 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
871 864 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
872 865 {
873 866 struct mem_map *mm, *mmnext;
874 867 int i;
875 868
876 869 mmnext = NULL;
877 870 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
878 871 mm = &vm->mem_maps[i];
879 872 if (mm->len == 0 || mm->gpa < *gpa)
880 873 continue;
881 874 if (mmnext == NULL || mm->gpa < mmnext->gpa)
882 875 mmnext = mm;
883 876 }
884 877
885 878 if (mmnext != NULL) {
886 879 *gpa = mmnext->gpa;
887 880 if (segid)
888 881 *segid = mmnext->segid;
889 882 if (segoff)
890 883 *segoff = mmnext->segoff;
891 884 if (len)
892 885 *len = mmnext->len;
893 886 if (prot)
894 887 *prot = mmnext->prot;
895 888 if (flags)
896 889 *flags = mmnext->flags;
897 890 return (0);
898 891 } else {
899 892 return (ENOENT);
900 893 }
901 894 }
902 895
903 896 static void
904 897 vm_free_memmap(struct vm *vm, int ident)
905 898 {
906 899 struct mem_map *mm;
907 900 int error;
908 901
909 902 mm = &vm->mem_maps[ident];
910 903 if (mm->len) {
911 904 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
912 905 mm->gpa + mm->len);
913 906 KASSERT(error == 0, ("%s: vm_map_remove error %d",
914 907 __func__, error));
915 908 bzero(mm, sizeof (struct mem_map));
916 909 }
917 910 }
918 911
919 912 static __inline bool
920 913 sysmem_mapping(struct vm *vm, struct mem_map *mm)
921 914 {
922 915
923 916 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
924 917 return (true);
925 918 else
926 919 return (false);
927 920 }
928 921
929 922 vm_paddr_t
930 923 vmm_sysmem_maxaddr(struct vm *vm)
931 924 {
932 925 struct mem_map *mm;
933 926 vm_paddr_t maxaddr;
934 927 int i;
935 928
936 929 maxaddr = 0;
937 930 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
938 931 mm = &vm->mem_maps[i];
939 932 if (sysmem_mapping(vm, mm)) {
940 933 if (maxaddr < mm->gpa + mm->len)
941 934 maxaddr = mm->gpa + mm->len;
942 935 }
943 936 }
944 937 return (maxaddr);
945 938 }
946 939
947 940 static void
948 941 vm_iommu_modify(struct vm *vm, bool map)
949 942 {
950 943 int i, sz;
951 944 vm_paddr_t gpa, hpa;
952 945 struct mem_map *mm;
953 946 #ifdef __FreeBSD__
954 947 void *vp, *cookie, *host_domain;
955 948 #else
956 949 void *vp, *cookie, *host_domain __unused;
957 950 #endif
958 951
959 952 sz = PAGE_SIZE;
960 953 host_domain = iommu_host_domain();
961 954
962 955 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
963 956 mm = &vm->mem_maps[i];
964 957 if (!sysmem_mapping(vm, mm))
965 958 continue;
966 959
967 960 if (map) {
968 961 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
969 962 ("iommu map found invalid memmap %lx/%lx/%x",
970 963 mm->gpa, mm->len, mm->flags));
971 964 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
972 965 continue;
973 966 mm->flags |= VM_MEMMAP_F_IOMMU;
974 967 } else {
975 968 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
976 969 continue;
977 970 mm->flags &= ~VM_MEMMAP_F_IOMMU;
978 971 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
979 972 ("iommu unmap found invalid memmap %lx/%lx/%x",
980 973 mm->gpa, mm->len, mm->flags));
981 974 }
982 975
983 976 gpa = mm->gpa;
984 977 while (gpa < mm->gpa + mm->len) {
985 978 vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, PROT_WRITE,
986 979 &cookie);
987 980 KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
988 981 vm_name(vm), gpa));
989 982
990 983 vm_gpa_release(cookie);
991 984
992 985 hpa = DMAP_TO_PHYS((uintptr_t)vp);
993 986 if (map) {
994 987 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
995 988 #ifdef __FreeBSD__
996 989 iommu_remove_mapping(host_domain, hpa, sz);
997 990 #endif
998 991 } else {
999 992 iommu_remove_mapping(vm->iommu, gpa, sz);
1000 993 #ifdef __FreeBSD__
1001 994 iommu_create_mapping(host_domain, hpa, hpa, sz);
1002 995 #endif
1003 996 }
1004 997
1005 998 gpa += PAGE_SIZE;
1006 999 }
1007 1000 }
1008 1001
1009 1002 /*
1010 1003 * Invalidate the cached translations associated with the domain
1011 1004 * from which pages were removed.
1012 1005 */
1013 1006 #ifdef __FreeBSD__
1014 1007 if (map)
1015 1008 iommu_invalidate_tlb(host_domain);
1016 1009 else
1017 1010 iommu_invalidate_tlb(vm->iommu);
1018 1011 #else
1019 1012 iommu_invalidate_tlb(vm->iommu);
1020 1013 #endif
1021 1014 }
1022 1015
1023 1016 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false)
1024 1017 #define vm_iommu_map(vm) vm_iommu_modify((vm), true)
1025 1018
1026 1019 int
1027 1020 vm_unassign_pptdev(struct vm *vm, int pptfd)
1028 1021 {
1029 1022 int error;
1030 1023
1031 1024 error = ppt_unassign_device(vm, pptfd);
1032 1025 if (error)
1033 1026 return (error);
1034 1027
1035 1028 if (ppt_assigned_devices(vm) == 0)
1036 1029 vm_iommu_unmap(vm);
1037 1030
1038 1031 return (0);
1039 1032 }
1040 1033
1041 1034 int
1042 1035 vm_assign_pptdev(struct vm *vm, int pptfd)
1043 1036 {
1044 1037 int error;
1045 1038 vm_paddr_t maxaddr;
1046 1039
1047 1040 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1048 1041 if (ppt_assigned_devices(vm) == 0) {
1049 1042 KASSERT(vm->iommu == NULL,
1050 1043 ("vm_assign_pptdev: iommu must be NULL"));
1051 1044 maxaddr = vmm_sysmem_maxaddr(vm);
1052 1045 vm->iommu = iommu_create_domain(maxaddr);
1053 1046 if (vm->iommu == NULL)
1054 1047 return (ENXIO);
1055 1048 vm_iommu_map(vm);
1056 1049 }
1057 1050
1058 1051 error = ppt_assign_device(vm, pptfd);
1059 1052 return (error);
1060 1053 }
1061 1054
1062 1055 void *
1063 1056 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
1064 1057 void **cookie)
1065 1058 {
1066 1059 int i, count, pageoff;
1067 1060 struct mem_map *mm;
1068 1061 vm_page_t m;
1069 1062 #ifdef INVARIANTS
1070 1063 /*
1071 1064 * All vcpus are frozen by ioctls that modify the memory map
1072 1065 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
1073 1066 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
1074 1067 */
1075 1068 int state;
1076 1069 KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
1077 1070 __func__, vcpuid));
1078 1071 for (i = 0; i < vm->maxcpus; i++) {
1079 1072 if (vcpuid != -1 && vcpuid != i)
1080 1073 continue;
1081 1074 state = vcpu_get_state(vm, i, NULL);
1082 1075 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1083 1076 __func__, state));
1084 1077 }
1085 1078 #endif
1086 1079 pageoff = gpa & PAGE_MASK;
1087 1080 if (len > PAGE_SIZE - pageoff)
1088 1081 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1089 1082
1090 1083 count = 0;
1091 1084 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1092 1085 mm = &vm->mem_maps[i];
1093 1086 if (mm->len == 0) {
1094 1087 continue;
1095 1088 }
1096 1089 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1097 1090 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1098 1091 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1099 1092 break;
1100 1093 }
1101 1094 }
1102 1095
1103 1096 if (count == 1) {
1104 1097 *cookie = m;
1105 1098 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1106 1099 } else {
1107 1100 *cookie = NULL;
1108 1101 return (NULL);
1109 1102 }
1110 1103 }
1111 1104
1112 1105 void
1113 1106 vm_gpa_release(void *cookie)
1114 1107 {
1115 1108 vm_page_t m = cookie;
1116 1109
1117 1110 vm_page_unwire(m, PQ_ACTIVE);
1118 1111 }
1119 1112
1120 1113 int
1121 1114 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1122 1115 {
1123 1116
1124 1117 if (vcpu < 0 || vcpu >= vm->maxcpus)
1125 1118 return (EINVAL);
1126 1119
1127 1120 if (reg >= VM_REG_LAST)
1128 1121 return (EINVAL);
1129 1122
1130 1123 return (VMGETREG(vm->cookie, vcpu, reg, retval));
1131 1124 }
1132 1125
1133 1126 int
1134 1127 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1135 1128 {
1136 1129 struct vcpu *vcpu;
1137 1130 int error;
1138 1131
1139 1132 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1140 1133 return (EINVAL);
1141 1134
1142 1135 if (reg >= VM_REG_LAST)
1143 1136 return (EINVAL);
1144 1137
1145 1138 error = VMSETREG(vm->cookie, vcpuid, reg, val);
1146 1139 if (error || reg != VM_REG_GUEST_RIP)
1147 1140 return (error);
1148 1141
1149 1142 /* Set 'nextrip' to match the value of %rip */
1150 1143 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1151 1144 vcpu = &vm->vcpu[vcpuid];
1152 1145 vcpu->nextrip = val;
1153 1146 return (0);
1154 1147 }
1155 1148
1156 1149 static bool
1157 1150 is_descriptor_table(int reg)
1158 1151 {
1159 1152 switch (reg) {
1160 1153 case VM_REG_GUEST_IDTR:
1161 1154 case VM_REG_GUEST_GDTR:
1162 1155 return (true);
1163 1156 default:
1164 1157 return (false);
1165 1158 }
1166 1159 }
1167 1160
1168 1161 static bool
1169 1162 is_segment_register(int reg)
1170 1163 {
1171 1164 switch (reg) {
1172 1165 case VM_REG_GUEST_ES:
1173 1166 case VM_REG_GUEST_CS:
1174 1167 case VM_REG_GUEST_SS:
1175 1168 case VM_REG_GUEST_DS:
1176 1169 case VM_REG_GUEST_FS:
1177 1170 case VM_REG_GUEST_GS:
1178 1171 case VM_REG_GUEST_TR:
1179 1172 case VM_REG_GUEST_LDTR:
1180 1173 return (true);
1181 1174 default:
1182 1175 return (false);
1183 1176 }
1184 1177 }
1185 1178
1186 1179 int
1187 1180 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1188 1181 {
1189 1182
1190 1183 if (vcpu < 0 || vcpu >= vm->maxcpus)
1191 1184 return (EINVAL);
1192 1185
1193 1186 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1194 1187 return (EINVAL);
1195 1188
1196 1189 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1197 1190 }
1198 1191
1199 1192 int
1200 1193 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1201 1194 {
1202 1195 if (vcpu < 0 || vcpu >= vm->maxcpus)
1203 1196 return (EINVAL);
1204 1197
1205 1198 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1206 1199 return (EINVAL);
1207 1200
1208 1201 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1209 1202 }
1210 1203
1211 1204 int
1212 1205 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1213 1206 {
1214 1207 struct vcpu *vcpu;
1215 1208
1216 1209 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1217 1210 return (EINVAL);
1218 1211 }
1219 1212
1220 1213 vcpu = &vm->vcpu[vcpuid];
1221 1214
1222 1215 vcpu_lock(vcpu);
1223 1216 *state = vcpu->run_state;
1224 1217 *sipi_vec = vcpu->sipi_vector;
1225 1218 vcpu_unlock(vcpu);
1226 1219
1227 1220 return (0);
1228 1221 }
1229 1222
1230 1223 int
1231 1224 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1232 1225 {
1233 1226 struct vcpu *vcpu;
1234 1227
1235 1228 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1236 1229 return (EINVAL);
1237 1230 }
1238 1231 if (!VRS_IS_VALID(state)) {
1239 1232 return (EINVAL);
1240 1233 }
1241 1234
1242 1235 vcpu = &vm->vcpu[vcpuid];
1243 1236
1244 1237 vcpu_lock(vcpu);
1245 1238 vcpu->run_state = state;
1246 1239 vcpu->sipi_vector = sipi_vec;
1247 1240 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1248 1241 vcpu_unlock(vcpu);
1249 1242
1250 1243 return (0);
1251 1244 }
1252 1245
1253 1246
1254 1247 static void
1255 1248 restore_guest_fpustate(struct vcpu *vcpu)
1256 1249 {
1257 1250
1258 1251 /* flush host state to the pcb */
1259 1252 fpuexit(curthread);
1260 1253
1261 1254 /* restore guest FPU state */
1262 1255 fpu_stop_emulating();
1263 1256 fpurestore(vcpu->guestfpu);
1264 1257
1265 1258 /* restore guest XCR0 if XSAVE is enabled in the host */
1266 1259 if (rcr4() & CR4_XSAVE)
1267 1260 load_xcr(0, vcpu->guest_xcr0);
1268 1261
1269 1262 /*
1270 1263 * The FPU is now "dirty" with the guest's state so turn on emulation
1271 1264 * to trap any access to the FPU by the host.
1272 1265 */
1273 1266 fpu_start_emulating();
1274 1267 }
1275 1268
1276 1269 static void
1277 1270 save_guest_fpustate(struct vcpu *vcpu)
1278 1271 {
1279 1272
1280 1273 if ((rcr0() & CR0_TS) == 0)
1281 1274 panic("fpu emulation not enabled in host!");
1282 1275
1283 1276 /* save guest XCR0 and restore host XCR0 */
1284 1277 if (rcr4() & CR4_XSAVE) {
1285 1278 vcpu->guest_xcr0 = rxcr(0);
1286 1279 load_xcr(0, vmm_get_host_xcr0());
1287 1280 }
1288 1281
1289 1282 /* save guest FPU state */
1290 1283 fpu_stop_emulating();
1291 1284 fpusave(vcpu->guestfpu);
1292 1285 /*
1293 1286 * When the host state has been restored, we should not re-enable
1294 1287 * CR0.TS on illumos for eager FPU.
1295 1288 */
1296 1289 }
1297 1290
1298 1291 static int
1299 1292 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1300 1293 bool from_idle)
1301 1294 {
1302 1295 struct vcpu *vcpu;
1303 1296 int error;
1304 1297
1305 1298 vcpu = &vm->vcpu[vcpuid];
1306 1299 vcpu_assert_locked(vcpu);
1307 1300
1308 1301 /*
1309 1302 * State transitions from the vmmdev_ioctl() must always begin from
1310 1303 * the VCPU_IDLE state. This guarantees that there is only a single
1311 1304 * ioctl() operating on a vcpu at any point.
1312 1305 */
1313 1306 if (from_idle) {
1314 1307 while (vcpu->state != VCPU_IDLE) {
1315 1308 vcpu->reqidle = 1;
1316 1309 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1317 1310 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1318 1311 "idle requested", vcpu_state2str(vcpu->state));
1319 1312 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1320 1313 }
1321 1314 } else {
1322 1315 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1323 1316 "vcpu idle state"));
1324 1317 }
1325 1318
1326 1319 if (vcpu->state == VCPU_RUNNING) {
1327 1320 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1328 1321 "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1329 1322 } else {
1330 1323 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1331 1324 "vcpu that is not running", vcpu->hostcpu));
1332 1325 }
1333 1326
1334 1327 /*
1335 1328 * The following state transitions are allowed:
1336 1329 * IDLE -> FROZEN -> IDLE
1337 1330 * FROZEN -> RUNNING -> FROZEN
1338 1331 * FROZEN -> SLEEPING -> FROZEN
1339 1332 */
1340 1333 switch (vcpu->state) {
1341 1334 case VCPU_IDLE:
1342 1335 case VCPU_RUNNING:
1343 1336 case VCPU_SLEEPING:
1344 1337 error = (newstate != VCPU_FROZEN);
1345 1338 break;
1346 1339 case VCPU_FROZEN:
1347 1340 error = (newstate == VCPU_FROZEN);
1348 1341 break;
1349 1342 default:
1350 1343 error = 1;
1351 1344 break;
1352 1345 }
1353 1346
1354 1347 if (error)
1355 1348 return (EBUSY);
1356 1349
1357 1350 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1358 1351 vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1359 1352
1360 1353 vcpu->state = newstate;
1361 1354 if (newstate == VCPU_RUNNING)
1362 1355 vcpu->hostcpu = curcpu;
1363 1356 else
1364 1357 vcpu->hostcpu = NOCPU;
1365 1358
1366 1359 if (newstate == VCPU_IDLE) {
1367 1360 cv_broadcast(&vcpu->state_cv);
1368 1361 }
1369 1362
1370 1363 return (0);
1371 1364 }
1372 1365
1373 1366 static void
1374 1367 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1375 1368 {
1376 1369 int error;
1377 1370
1378 1371 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1379 1372 panic("Error %d setting state to %d\n", error, newstate);
1380 1373 }
1381 1374
1382 1375 static void
1383 1376 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1384 1377 {
1385 1378 int error;
1386 1379
1387 1380 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1388 1381 panic("Error %d setting state to %d", error, newstate);
1389 1382 }
1390 1383
1391 1384 /*
1392 1385 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1393 1386 */
1394 1387 static int
1395 1388 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1396 1389 {
1397 1390 struct vcpu *vcpu;
1398 1391 int vcpu_halted, vm_halted;
1399 1392 bool userspace_exit = false;
1400 1393
1401 1394 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1402 1395
1403 1396 vcpu = &vm->vcpu[vcpuid];
1404 1397 vcpu_halted = 0;
1405 1398 vm_halted = 0;
1406 1399
1407 1400 vcpu_lock(vcpu);
1408 1401 while (1) {
1409 1402 /*
1410 1403 * Do a final check for pending interrupts (including NMI and
1411 1404 * INIT) before putting this thread to sleep.
1412 1405 */
1413 1406 if (vm_nmi_pending(vm, vcpuid))
1414 1407 break;
1415 1408 if (vcpu_run_state_pending(vm, vcpuid))
1416 1409 break;
1417 1410 if (!intr_disabled) {
1418 1411 if (vm_extint_pending(vm, vcpuid) ||
1419 1412 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1420 1413 break;
1421 1414 }
1422 1415 }
1423 1416
1424 1417 /*
1425 1418 * Also check for software events which would cause a wake-up.
1426 1419 * This will set the appropriate exitcode directly, rather than
1427 1420 * requiring a trip through VM_RUN().
1428 1421 */
1429 1422 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1430 1423 userspace_exit = true;
1431 1424 break;
1432 1425 }
1433 1426
1434 1427 /*
1435 1428 * Some Linux guests implement "halt" by having all vcpus
1436 1429 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1437 1430 * track of the vcpus that have entered this state. When all
1438 1431 * vcpus enter the halted state the virtual machine is halted.
1439 1432 */
1440 1433 if (intr_disabled) {
1441 1434 if (!vcpu_halted && halt_detection_enabled) {
1442 1435 vcpu_halted = 1;
1443 1436 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1444 1437 }
1445 1438 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1446 1439 vm_halted = 1;
1447 1440 break;
1448 1441 }
1449 1442 }
1450 1443
1451 1444 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1452 1445 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1453 1446 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1454 1447 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1455 1448 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1456 1449 }
1457 1450
1458 1451 if (vcpu_halted)
1459 1452 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1460 1453
1461 1454 vcpu_unlock(vcpu);
1462 1455
1463 1456 if (vm_halted)
1464 1457 vm_suspend(vm, VM_SUSPEND_HALT);
1465 1458
1466 1459 return (userspace_exit ? -1 : 0);
1467 1460 }
1468 1461
1469 1462 static int
1470 1463 vm_handle_paging(struct vm *vm, int vcpuid)
1471 1464 {
1472 1465 int rv, ftype;
1473 1466 struct vm_map *map;
1474 1467 struct vcpu *vcpu;
1475 1468 struct vm_exit *vme;
1476 1469
1477 1470 vcpu = &vm->vcpu[vcpuid];
1478 1471 vme = &vcpu->exitinfo;
1479 1472
1480 1473 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1481 1474 __func__, vme->inst_length));
1482 1475
1483 1476 ftype = vme->u.paging.fault_type;
1484 1477 KASSERT(ftype == PROT_READ ||
1485 1478 ftype == PROT_WRITE || ftype == PROT_EXEC,
1486 1479 ("vm_handle_paging: invalid fault_type %d", ftype));
1487 1480
1488 1481 if (ftype == PROT_READ || ftype == PROT_WRITE) {
1489 1482 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1490 1483 vme->u.paging.gpa, ftype);
1491 1484 if (rv == 0) {
1492 1485 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
1493 1486 ftype == PROT_READ ? "accessed" : "dirty",
1494 1487 vme->u.paging.gpa);
1495 1488 goto done;
1496 1489 }
1497 1490 }
1498 1491
1499 1492 map = &vm->vmspace->vm_map;
1500 1493 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1501 1494
1502 1495 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1503 1496 "ftype = %d", rv, vme->u.paging.gpa, ftype);
1504 1497
1505 1498 if (rv != 0)
1506 1499 return (EFAULT);
1507 1500 done:
1508 1501 return (0);
1509 1502 }
1510 1503
1511 1504 int
1512 1505 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1513 1506 int rsize)
1514 1507 {
1515 1508 int err = ESRCH;
1516 1509
1517 1510 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1518 1511 err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1519 1512 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1520 1513 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1521 1514 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1522 1515 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1523 1516 }
1524 1517
1525 1518 return (err);
1526 1519 }
1527 1520
1528 1521 int
1529 1522 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1530 1523 int wsize)
1531 1524 {
1532 1525 int err = ESRCH;
1533 1526
1534 1527 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1535 1528 err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1536 1529 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1537 1530 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1538 1531 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1539 1532 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1540 1533 }
1541 1534
1542 1535 return (err);
1543 1536 }
1544 1537
1545 1538 static int
1546 1539 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1547 1540 {
1548 1541 struct vie *vie;
1549 1542 struct vcpu *vcpu;
1550 1543 struct vm_exit *vme;
1551 1544 uint64_t inst_addr;
1552 1545 int error, fault, cs_d;
1553 1546
1554 1547 vcpu = &vm->vcpu[vcpuid];
1555 1548 vme = &vcpu->exitinfo;
1556 1549 vie = vcpu->vie_ctx;
1557 1550
1558 1551 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1559 1552 __func__, vme->inst_length));
1560 1553
1561 1554 inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1562 1555 cs_d = vme->u.mmio_emul.cs_d;
1563 1556
1564 1557 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1565 1558 vme->u.mmio_emul.gpa);
1566 1559
1567 1560 /* Fetch the faulting instruction */
1568 1561 if (vie_needs_fetch(vie)) {
1569 1562 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1570 1563 &fault);
1571 1564 if (error != 0) {
1572 1565 return (error);
1573 1566 } else if (fault) {
1574 1567 /*
1575 1568 * If a fault during instruction fetch was encountered,
1576 1569 * it will have asserted that the appropriate exception
1577 1570 * be injected at next entry.
1578 1571 * No further work is required.
1579 1572 */
1580 1573 return (0);
1581 1574 }
1582 1575 }
1583 1576
1584 1577 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1585 1578 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1586 1579 inst_addr);
1587 1580 /* Dump (unrecognized) instruction bytes in userspace */
1588 1581 vie_fallback_exitinfo(vie, vme);
1589 1582 return (-1);
1590 1583 }
1591 1584 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1592 1585 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1593 1586 /* Decoded GLA does not match GLA from VM exit state */
1594 1587 vie_fallback_exitinfo(vie, vme);
1595 1588 return (-1);
1596 1589 }
1597 1590
1598 1591 repeat:
1599 1592 error = vie_emulate_mmio(vie, vm, vcpuid);
1600 1593 if (error < 0) {
1601 1594 /*
1602 1595 * MMIO not handled by any of the in-kernel-emulated devices, so
1603 1596 * make a trip out to userspace for it.
1604 1597 */
1605 1598 vie_exitinfo(vie, vme);
1606 1599 } else if (error == EAGAIN) {
1607 1600 /*
1608 1601 * Continue emulating the rep-prefixed instruction, which has
1609 1602 * not completed its iterations.
1610 1603 *
1611 1604 * In case this can be emulated in-kernel and has a high
1612 1605 * repetition count (causing a tight spin), it should be
1613 1606 * deferential to yield conditions.
1614 1607 */
1615 1608 if (!vcpu_should_yield(vm, vcpuid)) {
1616 1609 goto repeat;
1617 1610 } else {
1618 1611 /*
1619 1612 * Defer to the contending load by making a trip to
1620 1613 * userspace with a no-op (BOGUS) exit reason.
1621 1614 */
1622 1615 vie_reset(vie);
1623 1616 vme->exitcode = VM_EXITCODE_BOGUS;
1624 1617 return (-1);
1625 1618 }
1626 1619 } else if (error == 0) {
1627 1620 /* Update %rip now that instruction has been emulated */
1628 1621 vie_advance_pc(vie, &vcpu->nextrip);
1629 1622 }
1630 1623 return (error);
1631 1624 }
1632 1625
1633 1626 static int
1634 1627 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1635 1628 {
1636 1629 struct vcpu *vcpu;
1637 1630 struct vie *vie;
1638 1631 int err;
1639 1632
1640 1633 vcpu = &vm->vcpu[vcpuid];
1641 1634 vie = vcpu->vie_ctx;
1642 1635
1643 1636 repeat:
1644 1637 err = vie_emulate_inout(vie, vm, vcpuid);
1645 1638
1646 1639 if (err < 0) {
1647 1640 /*
1648 1641 * In/out not handled by any of the in-kernel-emulated devices,
1649 1642 * so make a trip out to userspace for it.
1650 1643 */
1651 1644 vie_exitinfo(vie, vme);
1652 1645 return (err);
1653 1646 } else if (err == EAGAIN) {
1654 1647 /*
1655 1648 * Continue emulating the rep-prefixed ins/outs, which has not
1656 1649 * completed its iterations.
1657 1650 *
1658 1651 * In case this can be emulated in-kernel and has a high
1659 1652 * repetition count (causing a tight spin), it should be
1660 1653 * deferential to yield conditions.
1661 1654 */
1662 1655 if (!vcpu_should_yield(vm, vcpuid)) {
1663 1656 goto repeat;
1664 1657 } else {
1665 1658 /*
1666 1659 * Defer to the contending load by making a trip to
1667 1660 * userspace with a no-op (BOGUS) exit reason.
1668 1661 */
1669 1662 vie_reset(vie);
1670 1663 vme->exitcode = VM_EXITCODE_BOGUS;
1671 1664 return (-1);
1672 1665 }
1673 1666 } else if (err != 0) {
1674 1667 /* Emulation failure. Bail all the way out to userspace. */
1675 1668 vme->exitcode = VM_EXITCODE_INST_EMUL;
1676 1669 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1677 1670 return (-1);
1678 1671 }
1679 1672
1680 1673 vie_advance_pc(vie, &vcpu->nextrip);
1681 1674 return (0);
1682 1675 }
1683 1676
1684 1677 static int
1685 1678 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1686 1679 {
1687 1680 struct vie *vie;
1688 1681 struct vcpu *vcpu;
1689 1682 struct vm_exit *vme;
1690 1683 uint64_t cs_base;
1691 1684 int error, fault, cs_d;
1692 1685
1693 1686 vcpu = &vm->vcpu[vcpuid];
1694 1687 vme = &vcpu->exitinfo;
1695 1688 vie = vcpu->vie_ctx;
1696 1689
1697 1690 vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1698 1691
1699 1692 /* Fetch the faulting instruction */
1700 1693 ASSERT(vie_needs_fetch(vie));
1701 1694 error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1702 1695 &fault);
1703 1696 if (error != 0) {
1704 1697 return (error);
1705 1698 } else if (fault) {
1706 1699 /*
1707 1700 * If a fault during instruction fetch was encountered, it will
1708 1701 * have asserted that the appropriate exception be injected at
1709 1702 * next entry. No further work is required.
1710 1703 */
1711 1704 return (0);
1712 1705 }
1713 1706
1714 1707 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1715 1708 /* Dump (unrecognized) instruction bytes in userspace */
1716 1709 vie_fallback_exitinfo(vie, vme);
1717 1710 return (-1);
1718 1711 }
1719 1712
1720 1713 error = vie_emulate_other(vie, vm, vcpuid);
1721 1714 if (error != 0) {
1722 1715 /*
1723 1716 * Instruction emulation was unable to complete successfully, so
1724 1717 * kick it out to userspace for handling.
1725 1718 */
1726 1719 vie_fallback_exitinfo(vie, vme);
1727 1720 } else {
1728 1721 /* Update %rip now that instruction has been emulated */
1729 1722 vie_advance_pc(vie, &vcpu->nextrip);
1730 1723 }
1731 1724 return (error);
1732 1725 }
1733 1726
1734 1727 static int
1735 1728 vm_handle_suspend(struct vm *vm, int vcpuid)
1736 1729 {
1737 1730 int i;
1738 1731 struct vcpu *vcpu;
1739 1732
1740 1733 vcpu = &vm->vcpu[vcpuid];
1741 1734
1742 1735 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1743 1736
1744 1737 /*
1745 1738 * Wait until all 'active_cpus' have suspended themselves.
1746 1739 */
1747 1740 vcpu_lock(vcpu);
1748 1741 vcpu_ustate_change(vm, vcpuid, VU_INIT);
1749 1742 while (1) {
1750 1743 int rc;
1751 1744
1752 1745 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1753 1746 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1754 1747 break;
1755 1748 }
1756 1749
1757 1750 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1758 1751 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1759 1752 TR_CLOCK_TICK);
1760 1753 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1761 1754
1762 1755 /*
1763 1756 * If the userspace process driving the instance is killed, any
1764 1757 * vCPUs yet to be marked suspended (because they are not
1765 1758 * VM_RUN-ing in the kernel presently) will never reach that
1766 1759 * state.
1767 1760 *
1768 1761 * To avoid vm_handle_suspend() getting stuck in the kernel
1769 1762 * waiting for those vCPUs, offer a bail-out even though it
1770 1763 * means returning without all vCPUs in a suspended state.
1771 1764 */
1772 1765 if (rc <= 0) {
1773 1766 if ((curproc->p_flag & SEXITING) != 0) {
1774 1767 break;
1775 1768 }
1776 1769 }
1777 1770 }
1778 1771 vcpu_unlock(vcpu);
1779 1772
1780 1773 /*
1781 1774 * Wakeup the other sleeping vcpus and return to userspace.
1782 1775 */
1783 1776 for (i = 0; i < vm->maxcpus; i++) {
1784 1777 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1785 1778 vcpu_notify_event(vm, i);
1786 1779 }
1787 1780 }
1788 1781
1789 1782 return (-1);
1790 1783 }
1791 1784
1792 1785 static int
1793 1786 vm_handle_reqidle(struct vm *vm, int vcpuid)
1794 1787 {
1795 1788 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1796 1789
1797 1790 vcpu_lock(vcpu);
1798 1791 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1799 1792 vcpu->reqidle = 0;
1800 1793 vcpu_unlock(vcpu);
1801 1794 return (-1);
1802 1795 }
1803 1796
1804 1797 static int
1805 1798 vm_handle_run_state(struct vm *vm, int vcpuid)
1806 1799 {
1807 1800 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1808 1801 bool handled = false;
1809 1802
1810 1803 vcpu_lock(vcpu);
1811 1804 while (1) {
1812 1805 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1813 1806 vcpu_unlock(vcpu);
1814 1807 VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1815 1808 vcpu_lock(vcpu);
1816 1809
1817 1810 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1818 1811 vcpu->run_state |= VRS_INIT;
1819 1812 }
1820 1813
1821 1814 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1822 1815 (VRS_INIT | VRS_PEND_SIPI)) {
1823 1816 const uint8_t vector = vcpu->sipi_vector;
1824 1817
1825 1818 vcpu_unlock(vcpu);
1826 1819 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1827 1820 vcpu_lock(vcpu);
1828 1821
1829 1822 vcpu->run_state &= ~VRS_PEND_SIPI;
1830 1823 vcpu->run_state |= VRS_RUN;
1831 1824 }
1832 1825
1833 1826 /*
1834 1827 * If the vCPU is now in the running state, there is no need to
1835 1828 * wait for anything prior to re-entry.
1836 1829 */
1837 1830 if ((vcpu->run_state & VRS_RUN) != 0) {
1838 1831 handled = true;
1839 1832 break;
1840 1833 }
1841 1834
1842 1835 /*
1843 1836 * Also check for software events which would cause a wake-up.
1844 1837 * This will set the appropriate exitcode directly, rather than
1845 1838 * requiring a trip through VM_RUN().
1846 1839 */
1847 1840 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1848 1841 break;
1849 1842 }
1850 1843
1851 1844 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1852 1845 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1853 1846 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1854 1847 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1855 1848 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1856 1849 }
1857 1850 vcpu_unlock(vcpu);
1858 1851
1859 1852 return (handled ? 0 : -1);
1860 1853 }
1861 1854
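The loop above implements the INIT/SIPI startup handshake for a vCPU: a pending INIT forces an architectural reset into the wait-for-SIPI state, and a SIPI arriving in that state supplies the start vector and marks the vCPU runnable. A purely illustrative walk of the run_state flags (names taken from the code above) makes the transitions easier to follow:

/*
 * Illustrative vcpu->run_state transitions through vm_handle_run_state():
 *
 *   ... | VRS_PEND_INIT       --vcpu_arch_reset()-->   VRS_INIT
 *   VRS_INIT | VRS_PEND_SIPI  --vcpu_vector_sipi()-->  VRS_INIT | VRS_RUN
 *
 * Once VRS_RUN is set, the loop exits with handled == true and the vCPU
 * re-enters guest context; otherwise it sleeps on vcpu_cv until notified.
 */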
1862 1855 static int
1863 1856 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1864 1857 {
1865 1858 const uint32_t code = vme->u.msr.code;
1866 1859 uint64_t val = 0;
1867 1860
1868 1861 switch (code) {
1869 1862 case MSR_MCG_CAP:
1870 1863 case MSR_MCG_STATUS:
1871 1864 val = 0;
1872 1865 break;
1873 1866
1874 1867 case MSR_MTRRcap:
1875 1868 case MSR_MTRRdefType:
1876 1869 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
1877 1870 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1878 1871 case MSR_MTRR64kBase:
1879 1872 val = 0;
1880 1873 break;
1881 1874
1882 1875 case MSR_TSC:
1883 1876 /*
1884 1877 * In all likelihood, this should always be handled in guest
1885 1878 * context by VMX/SVM rather than taking an exit. (Both VMX and
1886 1879 * SVM pass through read-only access to MSR_TSC to the guest.)
1887 1880 *
1888 1881 * No physical offset is requested of vcpu_tsc_offset() since
1889 1882 * rdtsc_offset() takes care of that instead.
1890 1883 */
1891 1884 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
1892 1885 break;
1893 1886
1894 1887 default:
1895 1888 /*
1896 1889 * Anything not handled at this point will be kicked out to
1897 1890 * userspace for attempted processing there.
1898 1891 */
1899 1892 return (-1);
1900 1893 }
1901 1894
1902 1895 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
1903 1896 val & 0xffffffff));
1904 1897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
1905 1898 val >> 32));
1906 1899 return (0);
1907 1900 }
1908 1901
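As with the hardware RDMSR instruction, the 64-bit result is handed back to the guest split across %edx:%eax, which is what the two vm_set_register() calls above do. A quick worked example with a made-up value:

/*
 * val              = 0x0000001234abcdef   (hypothetical MSR contents)
 * val & 0xffffffff = 0x34abcdef           -> %rax (low 32 bits)
 * val >> 32        = 0x00000012           -> %rdx (high 32 bits)
 */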
1909 1902 static int
1910 1903 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1911 1904 {
1912 1905 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1913 1906 const uint32_t code = vme->u.msr.code;
1914 1907 const uint64_t val = vme->u.msr.wval;
1915 1908
1916 1909 switch (code) {
1917 1910 case MSR_MCG_CAP:
1918 1911 case MSR_MCG_STATUS:
1919 1912 /* Ignore writes */
1920 1913 break;
1921 1914
1922 1915 case MSR_MTRRcap:
1923 1916 vm_inject_gp(vm, vcpuid);
1924 1917 break;
1925 1918 case MSR_MTRRdefType:
1926 1919 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
1927 1920 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1928 1921 case MSR_MTRR64kBase:
1929 1922 /* Ignore writes */
1930 1923 break;
1931 1924
1932 1925 case MSR_TSC:
1933 1926 /*
1934 1927 * The effect of writing the TSC MSR is that a subsequent read
1935 1928 * of the TSC would report that value written (plus any time
1936 1929 * elapsed between the write and the read). The guest TSC value
1937 1930 * is calculated from a global offset for the guest (which
1938 1931 * effectively makes its TSC read 0 at guest boot) and a
1939 1932 * per-vCPU offset to handle these writes to the MSR.
1940 1933 *
1941 1934 * To calculate that per-vCPU offset, we can work backwards from
1942 1935 * the guest value at the time of write:
1943 1936 *
1944 1937 * value = host TSC + VM boot offset + vCPU offset
1945 1938 *
1946 1939 * so therefore:
1947 1940 *
1948 1941 * value - host TSC - VM boot offset = vCPU offset
1949 1942 */
1950 1943 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
1951 1944 break;
1952 1945
1953 1946 default:
1954 1947 /*
1955 1948 * Anything not handled at this point will be kicked out to
1956 1949 * userspace for attempted processing there.
1957 1950 */
1958 1951 return (-1);
1959 1952 }
1960 1953
1961 1954 return (0);
1962 1955 }
1963 1956
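The per-vCPU offset computed in the MSR_TSC case above can be checked with a short worked example. The numbers are invented; only the relationship between them comes from the comments in vm_handle_rdmsr() and vm_handle_wrmsr(), and since the fields are unsigned the negative values simply wrap modulo 2^64:

/*
 * At the moment of the guest WRMSR, suppose:
 *   rdtsc_offset()      = 1,000,000    (host TSC reading)
 *   vm->boot_tsc_offset =  -400,000    (zeroes the guest TSC at boot)
 *   val (guest wrote)   =    250,000
 *
 *   tsc_offset = val - boot_tsc_offset - rdtsc_offset()
 *              = 250,000 - (-400,000) - 1,000,000 = -350,000
 *
 * A subsequent guest RDTSC (see the MSR_TSC case in vm_handle_rdmsr())
 * then observes:
 *   rdtsc_offset() + boot_tsc_offset + tsc_offset
 *     = 1,000,000 + (-400,000) + (-350,000) = 250,000
 * i.e. the value just written, plus whatever time elapses in between.
 */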
1964 1957 int
1965 1958 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1966 1959 {
1967 1960 int i;
1968 1961
1969 1962 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1970 1963 return (EINVAL);
1971 1964
1972 1965 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1973 1966 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1974 1967 vm->suspend, how);
1975 1968 return (EALREADY);
1976 1969 }
1977 1970
1978 1971 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1979 1972
1980 1973 /*
1981 1974 * Notify all active vcpus that they are now suspended.
1982 1975 */
1983 1976 for (i = 0; i < vm->maxcpus; i++) {
1984 1977 if (CPU_ISSET(i, &vm->active_cpus))
1985 1978 vcpu_notify_event(vm, i);
1986 1979 }
1987 1980
1988 1981 return (0);
1989 1982 }
1990 1983
1991 1984 void
1992 1985 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1993 1986 {
1994 1987 struct vm_exit *vmexit;
1995 1988
1996 1989 vmexit = vm_exitinfo(vm, vcpuid);
1997 1990 vmexit->rip = rip;
1998 1991 vmexit->inst_length = 0;
1999 1992 vmexit->exitcode = VM_EXITCODE_RUN_STATE;
2000 1993 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
2001 1994 }
2002 1995
2003 1996 /*
2004 1997 * Some vmm resources, such as the lapic, may have CPU-specific resources
2005 1998 * allocated to them which would benefit from migration onto the host CPU which
2006 1999 * is processing the vcpu state.
2007 2000 */
2008 2001 static void
2009 2002 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2010 2003 {
2011 2004 /*
2012 2005 * Localizing cyclic resources requires acquisition of cpu_lock, and
2013 2006 * doing so with kpreempt disabled is a recipe for deadlock disaster.
2014 2007 */
2015 2008 VERIFY(curthread->t_preempt == 0);
2016 2009
2017 2010 /*
2018 2011 * Do not bother with localization if this vCPU is about to return to
2019 2012 * the host CPU it was last localized to.
2020 2013 */
2021 2014 if (vcpu->lastloccpu == curcpu)
2022 2015 return;
2023 2016
2024 2017 /*
2025 2018 * Localize system-wide resources to the primary boot vCPU. While any
2026 2019 * of the other vCPUs may access them, it keeps the potential interrupt
2027 2020 * footprint constrained to CPUs involved with this instance.
2028 2021 */
2029 2022 if (vcpu == &vm->vcpu[0]) {
2030 2023 vhpet_localize_resources(vm->vhpet);
2031 2024 vrtc_localize_resources(vm->vrtc);
2032 2025 vatpit_localize_resources(vm->vatpit);
2033 2026 }
2034 2027
2035 2028 vlapic_localize_resources(vcpu->vlapic);
2036 2029
2037 2030 vcpu->lastloccpu = curcpu;
2038 2031 }
2039 2032
2040 2033 static void
2041 2034 vmm_savectx(void *arg)
2042 2035 {
2043 2036 vm_thread_ctx_t *vtc = arg;
2044 2037 struct vm *vm = vtc->vtc_vm;
2045 2038 const int vcpuid = vtc->vtc_vcpuid;
2046 2039
2047 2040 if (ops->vmsavectx != NULL) {
2048 2041 ops->vmsavectx(vm->cookie, vcpuid);
2049 2042 }
2050 2043
2051 2044 /*
2052 2045 	 * Account for going off-cpu, unless the vCPU is idle, in which case
2053 2046 	 * being off-cpu is the explicit point.
2054 2047 */
2055 2048 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2056 2049 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2057 2050 vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2058 2051 }
2059 2052
2060 2053 /*
2061 2054 * If the CPU holds the restored guest FPU state, save it and restore
2062 2055 * the host FPU state before this thread goes off-cpu.
2063 2056 */
2064 2057 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2065 2058 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2066 2059
2067 2060 save_guest_fpustate(vcpu);
2068 2061 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2069 2062 }
2070 2063 }
2071 2064
2072 2065 static void
2073 2066 vmm_restorectx(void *arg)
2074 2067 {
2075 2068 vm_thread_ctx_t *vtc = arg;
2076 2069 struct vm *vm = vtc->vtc_vm;
2077 2070 const int vcpuid = vtc->vtc_vcpuid;
2078 2071
2079 2072 /* Complete microstate accounting for vCPU being off-cpu */
2080 2073 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2081 2074 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2082 2075 }
2083 2076
2084 2077 /*
2085 2078 * When coming back on-cpu, only restore the guest FPU status if the
2086 2079 * thread is in a context marked as requiring it. This should be rare,
2087 2080 * occurring only when a future logic error results in a voluntary
2088 2081 * sleep during the VMRUN critical section.
2089 2082 *
2090 2083 * The common case will result in elision of the guest FPU state
2091 2084 * restoration, deferring that action until it is clearly necessary
2092 2085 * during vm_run.
2093 2086 */
2094 2087 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2095 2088 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2096 2089 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2097 2090
2098 2091 restore_guest_fpustate(vcpu);
2099 2092 vtc->vtc_status |= VTCS_FPU_RESTORED;
2100 2093 }
2101 2094
2102 2095 if (ops->vmrestorectx != NULL) {
2103 2096 ops->vmrestorectx(vm->cookie, vcpuid);
2104 2097 }
2105 2098
2106 2099 }
2107 2100
2108 2101 /*
2109 2102 * If we're in removectx(), we might still have state to tidy up.
2110 2103 */
2111 2104 static void
2112 2105 vmm_freectx(void *arg, int isexec)
2113 2106 {
2114 2107 vmm_savectx(arg);
2115 2108 }
2116 2109
2117 2110 static int
2118 2111 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2119 2112 struct vm_exit *vme)
2120 2113 {
2121 2114 struct vcpu *vcpu;
2122 2115 struct vie *vie;
2123 2116 int err;
2124 2117
2125 2118 vcpu = &vm->vcpu[vcpuid];
2126 2119 vie = vcpu->vie_ctx;
2127 2120 err = 0;
2128 2121
2129 2122 switch (entry->cmd) {
2130 2123 case VEC_DEFAULT:
2131 2124 return (0);
2132 2125 case VEC_DISCARD_INSTR:
2133 2126 vie_reset(vie);
2134 2127 return (0);
2135 2128 case VEC_FULFILL_MMIO:
2136 2129 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2137 2130 if (err == 0) {
2138 2131 err = vie_emulate_mmio(vie, vm, vcpuid);
2139 2132 if (err == 0) {
2140 2133 vie_advance_pc(vie, &vcpu->nextrip);
2141 2134 } else if (err < 0) {
2142 2135 vie_exitinfo(vie, vme);
2143 2136 } else if (err == EAGAIN) {
2144 2137 /*
2145 2138 * Clear the instruction emulation state in
2146 2139 * order to re-enter VM context and continue
2147 2140 * this 'rep <instruction>'
2148 2141 */
2149 2142 vie_reset(vie);
2150 2143 err = 0;
2151 2144 }
2152 2145 }
2153 2146 break;
2154 2147 case VEC_FULFILL_INOUT:
2155 2148 err = vie_fulfill_inout(vie, &entry->u.inout);
2156 2149 if (err == 0) {
2157 2150 err = vie_emulate_inout(vie, vm, vcpuid);
2158 2151 if (err == 0) {
2159 2152 vie_advance_pc(vie, &vcpu->nextrip);
2160 2153 } else if (err < 0) {
2161 2154 vie_exitinfo(vie, vme);
2162 2155 } else if (err == EAGAIN) {
2163 2156 /*
2164 2157 * Clear the instruction emulation state in
2165 2158 * order to re-enter VM context and continue
2166 2159 * this 'rep ins/outs'
2167 2160 */
2168 2161 vie_reset(vie);
2169 2162 err = 0;
2170 2163 }
2171 2164 }
2172 2165 break;
2173 2166 default:
2174 2167 return (EINVAL);
2175 2168 }
2176 2169 return (err);
2177 2170 }
2178 2171
2179 2172 static int
2180 2173 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2181 2174 {
2182 2175 struct vie *vie;
2183 2176
2184 2177 vie = vm->vcpu[vcpuid].vie_ctx;
2185 2178
2186 2179 if (vie_pending(vie)) {
2187 2180 /*
2188 2181 * Userspace has not fulfilled the pending needs of the
2189 2182 * instruction emulation, so bail back out.
2190 2183 */
2191 2184 vie_exitinfo(vie, vme);
2192 2185 return (-1);
2193 2186 }
2194 2187
2195 2188 return (0);
2196 2189 }
2197 2190
2198 2191 int
2199 2192 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2200 2193 {
2201 2194 int error;
2202 2195 struct vcpu *vcpu;
2203 2196 struct vm_exit *vme;
2204 2197 bool intr_disabled;
2205 2198 pmap_t pmap;
2206 2199 vm_thread_ctx_t vtc;
2207 2200 int affinity_type = CPU_CURRENT;
2208 2201
2209 2202 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2210 2203 return (EINVAL);
2211 2204 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2212 2205 return (EINVAL);
2213 2206 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2214 2207 return (EINVAL);
2215 2208
2216 2209 pmap = vmspace_pmap(vm->vmspace);
2217 2210 vcpu = &vm->vcpu[vcpuid];
2218 2211 vme = &vcpu->exitinfo;
2219 2212
2220 2213 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2221 2214
2222 2215 vtc.vtc_vm = vm;
2223 2216 vtc.vtc_vcpuid = vcpuid;
2224 2217 vtc.vtc_status = 0;
2225 2218 installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2226 2219 NULL, vmm_freectx, NULL);
2227 2220
2228 2221 error = vm_entry_actions(vm, vcpuid, entry, vme);
2229 2222 if (error != 0) {
2230 2223 goto exit;
2231 2224 }
2232 2225
2233 2226 restart:
2234 2227 error = vm_loop_checks(vm, vcpuid, vme);
2235 2228 if (error != 0) {
2236 2229 goto exit;
2237 2230 }
2238 2231
2239 2232 thread_affinity_set(curthread, affinity_type);
2240 2233 /*
2241 2234 * Resource localization should happen after the CPU affinity for the
2242 2235 * thread has been set to ensure that access from restricted contexts,
2243 2236 * such as VMX-accelerated APIC operations, can occur without inducing
2244 2237 * cyclic cross-calls.
2245 2238 *
2246 2239 * This must be done prior to disabling kpreempt via critical_enter().
2247 2240 */
2248 2241 vm_localize_resources(vm, vcpu);
2249 2242 affinity_type = CPU_CURRENT;
2250 2243 critical_enter();
2251 2244
2252 2245 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2253 2246 ("vm_run: absurd pm_active"));
2254 2247
2255 2248 /* Force a trip through update_sregs to reload %fs/%gs and friends */
2256 2249 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2257 2250
2258 2251 if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2259 2252 restore_guest_fpustate(vcpu);
2260 2253 vtc.vtc_status |= VTCS_FPU_RESTORED;
2261 2254 }
2262 2255 vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2263 2256
2264 2257 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2265 2258 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2266 2259 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2267 2260
2268 2261 /*
2269 2262 * Once clear of the delicate contexts comprising the VM_RUN handler,
2270 2263 * thread CPU affinity can be loosened while other processing occurs.
2271 2264 */
2272 2265 vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2273 2266 thread_affinity_clear(curthread);
2274 2267 critical_exit();
2275 2268
2276 2269 if (error != 0) {
2277 2270 /* Communicate out any error from VMRUN() above */
2278 2271 goto exit;
2279 2272 }
2280 2273
2281 2274 vcpu->nextrip = vme->rip + vme->inst_length;
2282 2275 switch (vme->exitcode) {
2283 2276 case VM_EXITCODE_REQIDLE:
2284 2277 error = vm_handle_reqidle(vm, vcpuid);
2285 2278 break;
2286 2279 case VM_EXITCODE_RUN_STATE:
2287 2280 error = vm_handle_run_state(vm, vcpuid);
2288 2281 break;
2289 2282 case VM_EXITCODE_SUSPENDED:
2290 2283 error = vm_handle_suspend(vm, vcpuid);
2291 2284 break;
2292 2285 case VM_EXITCODE_IOAPIC_EOI:
2293 2286 vioapic_process_eoi(vm, vcpuid,
2294 2287 vme->u.ioapic_eoi.vector);
2295 2288 break;
2296 2289 case VM_EXITCODE_HLT:
2297 2290 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2298 2291 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2299 2292 break;
2300 2293 case VM_EXITCODE_PAGING:
2301 2294 error = vm_handle_paging(vm, vcpuid);
2302 2295 break;
2303 2296 case VM_EXITCODE_MMIO_EMUL:
2304 2297 error = vm_handle_mmio_emul(vm, vcpuid);
2305 2298 break;
2306 2299 case VM_EXITCODE_INOUT:
2307 2300 error = vm_handle_inout(vm, vcpuid, vme);
2308 2301 break;
2309 2302 case VM_EXITCODE_INST_EMUL:
2310 2303 error = vm_handle_inst_emul(vm, vcpuid);
2311 2304 break;
2312 2305 case VM_EXITCODE_MONITOR:
2313 2306 case VM_EXITCODE_MWAIT:
2314 2307 case VM_EXITCODE_VMINSN:
2315 2308 vm_inject_ud(vm, vcpuid);
2316 2309 break;
2317 2310 case VM_EXITCODE_RDMSR:
2318 2311 error = vm_handle_rdmsr(vm, vcpuid, vme);
2319 2312 break;
2320 2313 case VM_EXITCODE_WRMSR:
2321 2314 error = vm_handle_wrmsr(vm, vcpuid, vme);
2322 2315 break;
2323 2316 case VM_EXITCODE_HT:
2324 2317 affinity_type = CPU_BEST;
2325 2318 break;
2326 2319 case VM_EXITCODE_MTRAP:
2327 2320 vm_suspend_cpu(vm, vcpuid);
2328 2321 error = -1;
2329 2322 break;
2330 2323 default:
2331 2324 /* handled in userland */
2332 2325 error = -1;
2333 2326 break;
2334 2327 }
2335 2328
2336 2329 if (error == 0) {
2337 2330 /* VM exit conditions handled in-kernel, continue running */
2338 2331 goto restart;
2339 2332 }
2340 2333
2341 2334 exit:
2342 2335 removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2343 2336 NULL, vmm_freectx);
2344 2337
2345 2338 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2346 2339
2347 2340 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2348 2341 return (error);
2349 2342 }
2350 2343
2351 2344 int
2352 2345 vm_restart_instruction(void *arg, int vcpuid)
2353 2346 {
2354 2347 struct vm *vm;
2355 2348 struct vcpu *vcpu;
2356 2349 enum vcpu_state state;
2357 2350 uint64_t rip;
2358 2351 int error;
2359 2352
2360 2353 vm = arg;
2361 2354 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2362 2355 return (EINVAL);
2363 2356
2364 2357 vcpu = &vm->vcpu[vcpuid];
2365 2358 state = vcpu_get_state(vm, vcpuid, NULL);
2366 2359 if (state == VCPU_RUNNING) {
2367 2360 /*
2368 2361 * When a vcpu is "running" the next instruction is determined
2369 2362 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2370 2363 * Thus setting 'inst_length' to zero will cause the current
2371 2364 * instruction to be restarted.
2372 2365 */
2373 2366 vcpu->exitinfo.inst_length = 0;
2374 2367 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2375 2368 "setting inst_length to zero", vcpu->exitinfo.rip);
2376 2369 } else if (state == VCPU_FROZEN) {
2377 2370 /*
2378 2371 * When a vcpu is "frozen" it is outside the critical section
2379 2372 * around VMRUN() and 'nextrip' points to the next instruction.
2380 2373 * Thus instruction restart is achieved by setting 'nextrip'
2381 2374 * to the vcpu's %rip.
2382 2375 */
2383 2376 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2384 2377 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2385 2378 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2386 2379 "nextrip from %lx to %lx", vcpu->nextrip, rip);
2387 2380 vcpu->nextrip = rip;
2388 2381 } else {
2389 2382 panic("%s: invalid state %d", __func__, state);
2390 2383 }
2391 2384 return (0);
2392 2385 }
2393 2386
2394 2387 int
2395 2388 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2396 2389 {
2397 2390 struct vcpu *vcpu;
2398 2391 int type, vector;
2399 2392
2400 2393 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2401 2394 return (EINVAL);
2402 2395
2403 2396 vcpu = &vm->vcpu[vcpuid];
2404 2397
2405 2398 if (info & VM_INTINFO_VALID) {
2406 2399 type = info & VM_INTINFO_TYPE;
2407 2400 vector = info & 0xff;
2408 2401 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2409 2402 return (EINVAL);
2410 2403 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2411 2404 return (EINVAL);
2412 2405 if (info & VM_INTINFO_RSVD)
2413 2406 return (EINVAL);
2414 2407 } else {
2415 2408 info = 0;
2416 2409 }
2417 2410 VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2418 2411 vcpu->exitintinfo = info;
2419 2412 return (0);
2420 2413 }
2421 2414
2422 2415 enum exc_class {
2423 2416 EXC_BENIGN,
2424 2417 EXC_CONTRIBUTORY,
2425 2418 EXC_PAGEFAULT
2426 2419 };
2427 2420
2428 2421 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */
2429 2422
2430 2423 static enum exc_class
2431 2424 exception_class(uint64_t info)
2432 2425 {
2433 2426 int type, vector;
2434 2427
2435 2428 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2436 2429 type = info & VM_INTINFO_TYPE;
2437 2430 vector = info & 0xff;
2438 2431
2439 2432 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2440 2433 switch (type) {
2441 2434 case VM_INTINFO_HWINTR:
2442 2435 case VM_INTINFO_SWINTR:
2443 2436 case VM_INTINFO_NMI:
2444 2437 return (EXC_BENIGN);
2445 2438 default:
2446 2439 /*
2447 2440 * Hardware exception.
2448 2441 *
2449 2442 * SVM and VT-x use identical type values to represent NMI,
2450 2443 * hardware interrupt and software interrupt.
2451 2444 *
2452 2445 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2453 2446 * for exceptions except #BP and #OF. #BP and #OF use a type
2454 2447 * value of '5' or '6'. Therefore we don't check for explicit
2455 2448 * values of 'type' to classify 'intinfo' into a hardware
2456 2449 * exception.
2457 2450 */
2458 2451 break;
2459 2452 }
2460 2453
2461 2454 switch (vector) {
2462 2455 case IDT_PF:
2463 2456 case IDT_VE:
2464 2457 return (EXC_PAGEFAULT);
2465 2458 case IDT_DE:
2466 2459 case IDT_TS:
2467 2460 case IDT_NP:
2468 2461 case IDT_SS:
2469 2462 case IDT_GP:
2470 2463 return (EXC_CONTRIBUTORY);
2471 2464 default:
2472 2465 return (EXC_BENIGN);
2473 2466 }
2474 2467 }
2475 2468
2476 2469 static int
2477 2470 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2478 2471 uint64_t *retinfo)
2479 2472 {
2480 2473 enum exc_class exc1, exc2;
2481 2474 int type1, vector1;
2482 2475
2483 2476 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2484 2477 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2485 2478
2486 2479 /*
2487 2480 * If an exception occurs while attempting to call the double-fault
2488 2481 * handler the processor enters shutdown mode (aka triple fault).
2489 2482 */
2490 2483 type1 = info1 & VM_INTINFO_TYPE;
2491 2484 vector1 = info1 & 0xff;
2492 2485 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2493 2486 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2494 2487 info1, info2);
2495 2488 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2496 2489 *retinfo = 0;
2497 2490 return (0);
2498 2491 }
2499 2492
2500 2493 /*
2501 2494 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2502 2495 */
2503 2496 exc1 = exception_class(info1);
2504 2497 exc2 = exception_class(info2);
2505 2498 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2506 2499 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2507 2500 /* Convert nested fault into a double fault. */
2508 2501 *retinfo = IDT_DF;
2509 2502 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2510 2503 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2511 2504 } else {
2512 2505 /* Handle exceptions serially */
2513 2506 *retinfo = info2;
2514 2507 }
2515 2508 return (1);
2516 2509 }
2517 2510
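Since the classification above mirrors the Intel SDM's double-fault table, a few concrete combinations make the behaviour easy to verify. The snippet is only an illustration of the static helpers in this file; the intinfo encodings reuse the same VM_INTINFO_* bits that vcpu_exception_intinfo() below assembles:

/* vector | valid | hw-exception, as in vcpu_exception_intinfo() */
const uint64_t gp = IDT_GP | VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
const uint64_t pf = IDT_PF | VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
uint64_t out;

/* #GP raised while delivering a #GP: contributory + contributory -> #DF */
(void) nested_fault(vm, vcpuid, gp, gp, &out);	/* out encodes IDT_DF */

/* #GP raised while delivering a #PF: page fault + contributory -> #DF */
(void) nested_fault(vm, vcpuid, pf, gp, &out);	/* out encodes IDT_DF */

/* #PF raised while delivering a #GP: handled serially, no #DF */
(void) nested_fault(vm, vcpuid, gp, pf, &out);	/* out == pf */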
2518 2511 static uint64_t
2519 2512 vcpu_exception_intinfo(struct vcpu *vcpu)
2520 2513 {
2521 2514 uint64_t info = 0;
2522 2515
2523 2516 if (vcpu->exception_pending) {
2524 2517 info = vcpu->exc_vector & 0xff;
2525 2518 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2526 2519 if (vcpu->exc_errcode_valid) {
2527 2520 info |= VM_INTINFO_DEL_ERRCODE;
2528 2521 info |= (uint64_t)vcpu->exc_errcode << 32;
2529 2522 }
2530 2523 }
2531 2524 return (info);
2532 2525 }
2533 2526
2534 2527 int
2535 2528 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2536 2529 {
2537 2530 struct vcpu *vcpu;
2538 2531 uint64_t info1, info2;
2539 2532 int valid;
2540 2533
2541 2534 KASSERT(vcpuid >= 0 &&
2542 2535 vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2543 2536
2544 2537 vcpu = &vm->vcpu[vcpuid];
2545 2538
2546 2539 info1 = vcpu->exitintinfo;
2547 2540 vcpu->exitintinfo = 0;
2548 2541
2549 2542 info2 = 0;
2550 2543 if (vcpu->exception_pending) {
2551 2544 info2 = vcpu_exception_intinfo(vcpu);
2552 2545 vcpu->exception_pending = 0;
2553 2546 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2554 2547 vcpu->exc_vector, info2);
2555 2548 }
2556 2549
2557 2550 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2558 2551 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2559 2552 } else if (info1 & VM_INTINFO_VALID) {
2560 2553 *retinfo = info1;
2561 2554 valid = 1;
2562 2555 } else if (info2 & VM_INTINFO_VALID) {
2563 2556 *retinfo = info2;
2564 2557 valid = 1;
2565 2558 } else {
2566 2559 valid = 0;
2567 2560 }
2568 2561
2569 2562 if (valid) {
2570 2563 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2571 2564 "retinfo(%lx)", __func__, info1, info2, *retinfo);
2572 2565 }
2573 2566
2574 2567 return (valid);
2575 2568 }
2576 2569
2577 2570 int
2578 2571 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2579 2572 {
2580 2573 struct vcpu *vcpu;
2581 2574
2582 2575 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2583 2576 return (EINVAL);
2584 2577
2585 2578 vcpu = &vm->vcpu[vcpuid];
2586 2579 *info1 = vcpu->exitintinfo;
2587 2580 *info2 = vcpu_exception_intinfo(vcpu);
2588 2581 return (0);
2589 2582 }
2590 2583
2591 2584 int
2592 2585 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2593 2586 uint32_t errcode, int restart_instruction)
2594 2587 {
2595 2588 struct vcpu *vcpu;
2596 2589 uint64_t regval;
2597 2590 int error;
2598 2591
2599 2592 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2600 2593 return (EINVAL);
2601 2594
2602 2595 if (vector < 0 || vector >= 32)
2603 2596 return (EINVAL);
2604 2597
2605 2598 /*
2606 2599 * NMIs (which bear an exception vector of 2) are to be injected via
2607 2600 * their own specialized path using vm_inject_nmi().
2608 2601 */
2609 2602 if (vector == 2) {
2610 2603 return (EINVAL);
2611 2604 }
2612 2605
2613 2606 /*
2614 2607 * A double fault exception should never be injected directly into
2615 2608 * the guest. It is a derived exception that results from specific
2616 2609 * combinations of nested faults.
2617 2610 */
2618 2611 if (vector == IDT_DF)
2619 2612 return (EINVAL);
2620 2613
2621 2614 vcpu = &vm->vcpu[vcpuid];
2622 2615
2623 2616 if (vcpu->exception_pending) {
2624 2617 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2625 2618 "pending exception %d", vector, vcpu->exc_vector);
2626 2619 return (EBUSY);
2627 2620 }
2628 2621
2629 2622 if (errcode_valid) {
2630 2623 /*
2631 2624 * Exceptions don't deliver an error code in real mode.
2632 2625 */
2633 2626 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2634 2627 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2635 2628 if (!(regval & CR0_PE))
2636 2629 errcode_valid = 0;
2637 2630 }
2638 2631
2639 2632 /*
2640 2633 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2641 2634 *
2642 2635 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2643 2636 * one instruction or incurs an exception.
2644 2637 */
2645 2638 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2646 2639 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2647 2640 __func__, error));
2648 2641
2649 2642 if (restart_instruction)
2650 2643 vm_restart_instruction(vm, vcpuid);
2651 2644
2652 2645 vcpu->exception_pending = 1;
2653 2646 vcpu->exc_vector = vector;
2654 2647 vcpu->exc_errcode = errcode;
2655 2648 vcpu->exc_errcode_valid = errcode_valid;
2656 2649 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2657 2650 return (0);
2658 2651 }
2659 2652
2660 2653 void
2661 2654 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2662 2655 int errcode)
2663 2656 {
2664 2657 int error;
2665 2658
2666 2659 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2667 2660 errcode, 1);
2668 2661 KASSERT(error == 0, ("vm_inject_exception error %d", error));
2669 2662 }
2670 2663
2671 2664 void
2672 2665 vm_inject_ud(struct vm *vm, int vcpuid)
2673 2666 {
2674 2667 vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2675 2668 }
2676 2669
2677 2670 void
2678 2671 vm_inject_gp(struct vm *vm, int vcpuid)
2679 2672 {
2680 2673 vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2681 2674 }
2682 2675
2683 2676 void
2684 2677 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2685 2678 {
2686 2679 vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2687 2680 }
2688 2681
2689 2682 void
2690 2683 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2691 2684 {
2692 2685 vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2693 2686 }
2694 2687
2695 2688 void
2696 2689 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2697 2690 {
2698 2691 int error;
2699 2692
2700 2693 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2701 2694 error_code, cr2);
2702 2695
2703 2696 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2704 2697 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2705 2698
2706 2699 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2707 2700 }
2708 2701
2709 2702 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2710 2703
2711 2704 int
2712 2705 vm_inject_nmi(struct vm *vm, int vcpuid)
2713 2706 {
2714 2707 struct vcpu *vcpu;
2715 2708
2716 2709 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2717 2710 return (EINVAL);
2718 2711
2719 2712 vcpu = &vm->vcpu[vcpuid];
2720 2713
2721 2714 vcpu->nmi_pending = 1;
2722 2715 vcpu_notify_event(vm, vcpuid);
2723 2716 return (0);
2724 2717 }
2725 2718
2726 2719 int
2727 2720 vm_nmi_pending(struct vm *vm, int vcpuid)
2728 2721 {
2729 2722 struct vcpu *vcpu;
2730 2723
2731 2724 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2732 2725 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2733 2726
2734 2727 vcpu = &vm->vcpu[vcpuid];
2735 2728
2736 2729 return (vcpu->nmi_pending);
2737 2730 }
2738 2731
2739 2732 void
2740 2733 vm_nmi_clear(struct vm *vm, int vcpuid)
2741 2734 {
2742 2735 struct vcpu *vcpu;
2743 2736
2744 2737 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2745 2738 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2746 2739
2747 2740 vcpu = &vm->vcpu[vcpuid];
2748 2741
2749 2742 if (vcpu->nmi_pending == 0)
2750 2743 panic("vm_nmi_clear: inconsistent nmi_pending state");
2751 2744
2752 2745 vcpu->nmi_pending = 0;
2753 2746 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2754 2747 }
2755 2748
2756 2749 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2757 2750
2758 2751 int
2759 2752 vm_inject_extint(struct vm *vm, int vcpuid)
2760 2753 {
2761 2754 struct vcpu *vcpu;
2762 2755
2763 2756 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2764 2757 return (EINVAL);
2765 2758
2766 2759 vcpu = &vm->vcpu[vcpuid];
2767 2760
2768 2761 vcpu->extint_pending = 1;
2769 2762 vcpu_notify_event(vm, vcpuid);
2770 2763 return (0);
2771 2764 }
2772 2765
2773 2766 int
2774 2767 vm_extint_pending(struct vm *vm, int vcpuid)
2775 2768 {
2776 2769 struct vcpu *vcpu;
2777 2770
2778 2771 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2779 2772 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2780 2773
2781 2774 vcpu = &vm->vcpu[vcpuid];
2782 2775
2783 2776 return (vcpu->extint_pending);
2784 2777 }
2785 2778
2786 2779 void
2787 2780 vm_extint_clear(struct vm *vm, int vcpuid)
2788 2781 {
2789 2782 struct vcpu *vcpu;
2790 2783
2791 2784 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2792 2785 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2793 2786
2794 2787 vcpu = &vm->vcpu[vcpuid];
2795 2788
2796 2789 if (vcpu->extint_pending == 0)
2797 2790 panic("vm_extint_clear: inconsistent extint_pending state");
2798 2791
2799 2792 vcpu->extint_pending = 0;
2800 2793 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2801 2794 }
2802 2795
2803 2796 int
2804 2797 vm_inject_init(struct vm *vm, int vcpuid)
2805 2798 {
2806 2799 struct vcpu *vcpu;
2807 2800
2808 2801 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2809 2802 return (EINVAL);
2810 2803
2811 2804 vcpu = &vm->vcpu[vcpuid];
2812 2805 vcpu_lock(vcpu);
2813 2806 vcpu->run_state |= VRS_PEND_INIT;
2814 2807 /*
2815 2808 * As part of queuing the INIT request, clear any pending SIPI. It
2816 2809 * would not otherwise survive across the reset of the vCPU when it
2817 2810 * undergoes the requested INIT. We would not want it to linger when it
2818 2811 * could be mistaken as a subsequent (after the INIT) SIPI request.
2819 2812 */
2820 2813 vcpu->run_state &= ~VRS_PEND_SIPI;
2821 2814 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2822 2815
2823 2816 vcpu_unlock(vcpu);
2824 2817 return (0);
2825 2818 }
2826 2819
2827 2820 int
2828 2821 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2829 2822 {
2830 2823 struct vcpu *vcpu;
2831 2824
2832 2825 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2833 2826 return (EINVAL);
2834 2827
2835 2828 vcpu = &vm->vcpu[vcpuid];
2836 2829 vcpu_lock(vcpu);
2837 2830 vcpu->run_state |= VRS_PEND_SIPI;
2838 2831 vcpu->sipi_vector = vector;
2839 2832 /* SIPI is only actionable if the CPU is waiting in INIT state */
2840 2833 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2841 2834 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2842 2835 }
2843 2836 vcpu_unlock(vcpu);
2844 2837 return (0);
2845 2838 }
2846 2839
2847 2840 bool
2848 2841 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2849 2842 {
2850 2843 struct vcpu *vcpu;
2851 2844
2852 2845 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2853 2846 vcpu = &vm->vcpu[vcpuid];
2854 2847
2855 2848 /* Of interest: vCPU not in running state or with pending INIT */
2856 2849 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2857 2850 }
2858 2851
2859 2852 int
2860 2853 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2861 2854 {
2862 2855 struct seg_desc desc;
2863 2856 const enum vm_reg_name clear_regs[] = {
2864 2857 VM_REG_GUEST_CR2,
2865 2858 VM_REG_GUEST_CR3,
2866 2859 VM_REG_GUEST_CR4,
2867 2860 VM_REG_GUEST_RAX,
2868 2861 VM_REG_GUEST_RBX,
2869 2862 VM_REG_GUEST_RCX,
2870 2863 VM_REG_GUEST_RSI,
2871 2864 VM_REG_GUEST_RDI,
2872 2865 VM_REG_GUEST_RBP,
2873 2866 VM_REG_GUEST_RSP,
2874 2867 VM_REG_GUEST_R8,
2875 2868 VM_REG_GUEST_R9,
2876 2869 VM_REG_GUEST_R10,
2877 2870 VM_REG_GUEST_R11,
2878 2871 VM_REG_GUEST_R12,
2879 2872 VM_REG_GUEST_R13,
2880 2873 VM_REG_GUEST_R14,
2881 2874 VM_REG_GUEST_R15,
2882 2875 VM_REG_GUEST_DR0,
2883 2876 VM_REG_GUEST_DR1,
2884 2877 VM_REG_GUEST_DR2,
2885 2878 VM_REG_GUEST_DR3,
2886 2879 VM_REG_GUEST_EFER,
2887 2880 };
2888 2881 const enum vm_reg_name data_segs[] = {
2889 2882 VM_REG_GUEST_SS,
2890 2883 VM_REG_GUEST_DS,
2891 2884 VM_REG_GUEST_ES,
2892 2885 VM_REG_GUEST_FS,
2893 2886 VM_REG_GUEST_GS,
2894 2887 };
2895 2888 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2896 2889
2897 2890 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2898 2891 return (EINVAL);
2899 2892
2900 2893 for (uint_t i = 0; i < nitems(clear_regs); i++) {
2901 2894 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2902 2895 }
2903 2896
2904 2897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2905 2898 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2906 2899 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2907 2900
2908 2901 /*
2909 2902 * The prescribed contents of %rdx differ slightly between the Intel and
2910 2903 * AMD architectural definitions. The former expects the Extended Model
2911 2904 * in bits 16-19 where the latter expects all the Family, Model, and
2912 2905 	 * Stepping to be there. Common boot ROMs appear to disregard this
2913 2906 	 * anyway, so we stick with a compromise value similar to what is
2914 2907 * spelled out in the Intel SDM.
2915 2908 */
2916 2909 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2917 2910
2918 2911 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2919 2912 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2920 2913
2921 2914 /* CS: Present, R/W, Accessed */
2922 2915 desc.access = 0x0093;
2923 2916 desc.base = 0xffff0000;
2924 2917 desc.limit = 0xffff;
2925 2918 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2926 2919 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2927 2920
2928 2921 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2929 2922 desc.access = 0x0093;
2930 2923 desc.base = 0;
2931 2924 desc.limit = 0xffff;
2932 2925 for (uint_t i = 0; i < nitems(data_segs); i++) {
2933 2926 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2934 2927 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2935 2928 }
2936 2929
2937 2930 /* GDTR, IDTR */
2938 2931 desc.base = 0;
2939 2932 desc.limit = 0xffff;
2940 2933 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2941 2934 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2942 2935
2943 2936 /* LDTR: Present, LDT */
2944 2937 desc.access = 0x0082;
2945 2938 desc.base = 0;
2946 2939 desc.limit = 0xffff;
2947 2940 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2948 2941 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2949 2942
2950 2943 /* TR: Present, 32-bit TSS */
2951 2944 desc.access = 0x008b;
2952 2945 desc.base = 0;
2953 2946 desc.limit = 0xffff;
2954 2947 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2955 2948 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2956 2949
2957 2950 vlapic_reset(vm_lapic(vm, vcpuid));
2958 2951
2959 2952 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2960 2953
2961 2954 vcpu->exitintinfo = 0;
2962 2955 vcpu->exception_pending = 0;
2963 2956 vcpu->nmi_pending = 0;
2964 2957 vcpu->extint_pending = 0;
2965 2958
2966 2959 /*
2967 2960 * A CPU reset caused by power-on or system reset clears more state than
2968 2961 	 * one which is triggered from an INIT IPI.
2969 2962 */
2970 2963 if (!init_only) {
2971 2964 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2972 2965 fpu_save_area_reset(vcpu->guestfpu);
2973 2966
2974 2967 /* XXX: clear MSRs and other pieces */
2975 2968 }
2976 2969
2977 2970 return (0);
2978 2971 }
2979 2972
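The CS descriptor and %rip values written above combine to put the first instruction fetch at the architectural reset vector; a one-line check using only numbers that appear in the function:

/*
 * CS.base + %rip = 0xffff0000 + 0xfff0 = 0xfffffff0,
 * the x86 reset vector, 16 bytes below the top of 4 GiB.
 */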
2980 2973 static int
2981 2974 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2982 2975 {
2983 2976 struct seg_desc desc;
2984 2977
2985 2978 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2986 2979 return (EINVAL);
2987 2980
2988 2981 /* CS: Present, R/W, Accessed */
2989 2982 desc.access = 0x0093;
2990 2983 desc.base = (uint64_t)vector << 12;
2991 2984 desc.limit = 0xffff;
2992 2985 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2993 2986 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2994 2987 (uint64_t)vector << 8));
2995 2988
2996 2989 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2997 2990
2998 2991 return (0);
2999 2992 }
3000 2993
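The shifts above encode the usual SIPI convention: the 8-bit startup vector selects a 4 KiB-aligned real-mode entry point. A worked example with a hypothetical vector value:

/*
 * For a SIPI vector of 0x9a (hypothetical):
 *   CS.base     = 0x9a << 12 = 0x9a000
 *   CS selector = 0x9a <<  8 = 0x9a00    (selector * 16 == base, real mode)
 *   %rip        = 0
 * so the AP begins execution at physical address 0x9a000.
 */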
3001 2994 int
3002 2995 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
3003 2996 {
3004 2997 if (vcpu < 0 || vcpu >= vm->maxcpus)
3005 2998 return (EINVAL);
3006 2999
3007 3000 if (type < 0 || type >= VM_CAP_MAX)
3008 3001 return (EINVAL);
3009 3002
3010 3003 return (VMGETCAP(vm->cookie, vcpu, type, retval));
3011 3004 }
3012 3005
3013 3006 int
3014 3007 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3015 3008 {
3016 3009 if (vcpu < 0 || vcpu >= vm->maxcpus)
3017 3010 return (EINVAL);
3018 3011
3019 3012 if (type < 0 || type >= VM_CAP_MAX)
3020 3013 return (EINVAL);
3021 3014
3022 3015 return (VMSETCAP(vm->cookie, vcpu, type, val));
3023 3016 }
3024 3017
3025 3018 struct vlapic *
3026 3019 vm_lapic(struct vm *vm, int cpu)
3027 3020 {
3028 3021 return (vm->vcpu[cpu].vlapic);
3029 3022 }
3030 3023
3031 3024 struct vioapic *
3032 3025 vm_ioapic(struct vm *vm)
3033 3026 {
3034 3027
3035 3028 return (vm->vioapic);
3036 3029 }
3037 3030
3038 3031 struct vhpet *
3039 3032 vm_hpet(struct vm *vm)
3040 3033 {
3041 3034
3042 3035 return (vm->vhpet);
3043 3036 }
3044 3037
3045 3038 void *
3046 3039 vm_iommu_domain(struct vm *vm)
3047 3040 {
3048 3041
3049 3042 return (vm->iommu);
3050 3043 }
3051 3044
3052 3045 int
3053 3046 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3054 3047 bool from_idle)
3055 3048 {
3056 3049 int error;
3057 3050 struct vcpu *vcpu;
3058 3051
3059 3052 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3060 3053 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3061 3054
3062 3055 vcpu = &vm->vcpu[vcpuid];
3063 3056
3064 3057 vcpu_lock(vcpu);
3065 3058 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3066 3059 vcpu_unlock(vcpu);
3067 3060
3068 3061 return (error);
3069 3062 }
3070 3063
3071 3064 enum vcpu_state
3072 3065 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3073 3066 {
3074 3067 struct vcpu *vcpu;
3075 3068 enum vcpu_state state;
3076 3069
3077 3070 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3078 3071 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3079 3072
3080 3073 vcpu = &vm->vcpu[vcpuid];
3081 3074
3082 3075 vcpu_lock(vcpu);
3083 3076 state = vcpu->state;
3084 3077 if (hostcpu != NULL)
3085 3078 *hostcpu = vcpu->hostcpu;
3086 3079 vcpu_unlock(vcpu);
3087 3080
3088 3081 return (state);
3089 3082 }
3090 3083
3091 3084 uint64_t
3092 3085 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3093 3086 {
3094 3087 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3095 3088
3096 3089 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3097 3090
3098 3091 if (phys_adj) {
3099 3092 /* Include any offset for the current physical CPU too */
3100 3093 extern hrtime_t tsc_gethrtime_tick_delta(void);
3101 3094 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3102 3095 }
3103 3096
3104 3097 return (vcpu_off);
3105 3098 }
3106 3099
3107 3100 int
3108 3101 vm_activate_cpu(struct vm *vm, int vcpuid)
3109 3102 {
3110 3103
3111 3104 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3112 3105 return (EINVAL);
3113 3106
3114 3107 if (CPU_ISSET(vcpuid, &vm->active_cpus))
3115 3108 return (EBUSY);
3116 3109
3117 3110 VCPU_CTR0(vm, vcpuid, "activated");
3118 3111 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3119 3112 return (0);
3120 3113 }
3121 3114
3122 3115 int
3123 3116 vm_suspend_cpu(struct vm *vm, int vcpuid)
3124 3117 {
3125 3118 int i;
3126 3119
3127 3120 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3128 3121 return (EINVAL);
3129 3122
3130 3123 if (vcpuid == -1) {
3131 3124 vm->debug_cpus = vm->active_cpus;
3132 3125 for (i = 0; i < vm->maxcpus; i++) {
3133 3126 if (CPU_ISSET(i, &vm->active_cpus))
3134 3127 vcpu_notify_event(vm, i);
3135 3128 }
3136 3129 } else {
3137 3130 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3138 3131 return (EINVAL);
3139 3132
3140 3133 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3141 3134 vcpu_notify_event(vm, vcpuid);
3142 3135 }
3143 3136 return (0);
3144 3137 }
3145 3138
3146 3139 int
3147 3140 vm_resume_cpu(struct vm *vm, int vcpuid)
3148 3141 {
3149 3142
3150 3143 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3151 3144 return (EINVAL);
3152 3145
3153 3146 if (vcpuid == -1) {
3154 3147 CPU_ZERO(&vm->debug_cpus);
3155 3148 } else {
3156 3149 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3157 3150 return (EINVAL);
3158 3151
3159 3152 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3160 3153 }
3161 3154 return (0);
3162 3155 }
3163 3156
3164 3157 static bool
3165 3158 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3166 3159 uint64_t entry_rip)
3167 3160 {
3168 3161 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3169 3162 struct vm_exit *vme = &vcpu->exitinfo;
3170 3163 bool bail = false;
3171 3164
3172 3165 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3173 3166
3174 3167 if (vm->suspend) {
3175 3168 if (on_entry) {
3176 3169 VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3177 3170 vm->suspend < VM_SUSPEND_LAST);
3178 3171
3179 3172 vme->exitcode = VM_EXITCODE_SUSPENDED;
3180 3173 vme->u.suspended.how = vm->suspend;
3181 3174 } else {
3182 3175 /*
3183 3176 * Handling VM suspend is complicated, so if that
3184 3177 * condition is detected outside of VM-entry itself,
3185 3178 * just emit a BOGUS exitcode so we take a lap to pick
3186 3179 * up the event during an entry and are directed into
3187 3180 * the vm_handle_suspend() logic.
3188 3181 */
3189 3182 vme->exitcode = VM_EXITCODE_BOGUS;
3190 3183 }
3191 3184 bail = true;
3192 3185 }
3193 3186 if (vcpu->reqidle) {
3194 3187 vme->exitcode = VM_EXITCODE_REQIDLE;
3195 3188 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3196 3189
3197 3190 if (!on_entry) {
3198 3191 /*
3199 3192 * A reqidle request detected outside of VM-entry can be
3200 3193 * handled directly by clearing the request (and taking
3201 3194 * a lap to userspace).
3202 3195 */
3203 3196 vcpu_assert_locked(vcpu);
3204 3197 vcpu->reqidle = 0;
3205 3198 }
3206 3199 bail = true;
3207 3200 }
3208 3201 if (vcpu_should_yield(vm, vcpuid)) {
3209 3202 vme->exitcode = VM_EXITCODE_BOGUS;
3210 3203 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3211 3204 bail = true;
3212 3205 }
3213 3206 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3214 3207 vme->exitcode = VM_EXITCODE_DEBUG;
3215 3208 bail = true;
3216 3209 }
3217 3210
3218 3211 if (bail) {
3219 3212 if (on_entry) {
3220 3213 /*
3221 3214 * If bailing out during VM-entry, the current %rip must
3222 3215 * be recorded in the exitinfo.
3223 3216 */
3224 3217 vme->rip = entry_rip;
3225 3218 }
3226 3219 vme->inst_length = 0;
3227 3220 }
3228 3221 return (bail);
3229 3222 }
3230 3223
3231 3224 static bool
3232 3225 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3233 3226 {
3234 3227 /*
3235 3228 	 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3236 3229 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3237 3230 * structure, and we would only modify the exitcode.
3238 3231 */
3239 3232 return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3240 3233 }
3241 3234
3242 3235 bool
3243 3236 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3244 3237 {
3245 3238 /*
3246 3239 * Bail-out checks done as part of VM entry require an updated %rip to
3247 3240 * populate the vm_exit struct if any of the conditions of interest are
3248 3241 * matched in the check.
3249 3242 */
3250 3243 return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3251 3244 }
3252 3245
3253 3246 cpuset_t
3254 3247 vm_active_cpus(struct vm *vm)
3255 3248 {
3256 3249
3257 3250 return (vm->active_cpus);
3258 3251 }
3259 3252
3260 3253 cpuset_t
3261 3254 vm_debug_cpus(struct vm *vm)
3262 3255 {
3263 3256
3264 3257 return (vm->debug_cpus);
3265 3258 }
3266 3259
3267 3260 cpuset_t
3268 3261 vm_suspended_cpus(struct vm *vm)
3269 3262 {
3270 3263
3271 3264 return (vm->suspended_cpus);
3272 3265 }
3273 3266
3274 3267 void *
3275 3268 vcpu_stats(struct vm *vm, int vcpuid)
3276 3269 {
3277 3270
3278 3271 return (vm->vcpu[vcpuid].stats);
3279 3272 }
3280 3273
3281 3274 int
3282 3275 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3283 3276 {
3284 3277 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3285 3278 return (EINVAL);
3286 3279
3287 3280 *state = vm->vcpu[vcpuid].x2apic_state;
3288 3281
3289 3282 return (0);
3290 3283 }
3291 3284
3292 3285 int
3293 3286 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3294 3287 {
3295 3288 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3296 3289 return (EINVAL);
3297 3290
3298 3291 if (state >= X2APIC_STATE_LAST)
3299 3292 return (EINVAL);
3300 3293
3301 3294 vm->vcpu[vcpuid].x2apic_state = state;
3302 3295
3303 3296 vlapic_set_x2apic_state(vm, vcpuid, state);
3304 3297
3305 3298 return (0);
3306 3299 }
3307 3300
3308 3301 /*
3309 3302 * This function is called to ensure that a vcpu "sees" a pending event
3310 3303 * as soon as possible:
3311 3304 * - If the vcpu thread is sleeping then it is woken up.
3312 3305 * - If the vcpu is running on a different host_cpu then an IPI will be directed
3313 3306 * to the host_cpu to cause the vcpu to trap into the hypervisor.
3314 3307 */
3315 3308 static void
3316 3309 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3317 3310 {
3318 3311 int hostcpu;
3319 3312
3320 3313 	ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3321 3314
3322 3315 hostcpu = vcpu->hostcpu;
3323 3316 if (vcpu->state == VCPU_RUNNING) {
3324 3317 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3325 3318 if (hostcpu != curcpu) {
3326 3319 if (ntype == VCPU_NOTIFY_APIC) {
3327 3320 vlapic_post_intr(vcpu->vlapic, hostcpu,
3328 3321 vmm_ipinum);
3329 3322 } else {
3330 3323 ipi_cpu(hostcpu, vmm_ipinum);
3331 3324 }
3332 3325 } else {
3333 3326 /*
3334 3327 * If the 'vcpu' is running on 'curcpu' then it must
3335 3328 * be sending a notification to itself (e.g. SELF_IPI).
3336 3329 * The pending event will be picked up when the vcpu
3337 3330 * transitions back to guest context.
3338 3331 */
3339 3332 }
3340 3333 } else {
3341 3334 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3342 3335 "with hostcpu %d", vcpu->state, hostcpu));
3343 3336 if (vcpu->state == VCPU_SLEEPING) {
3344 3337 cv_signal(&vcpu->vcpu_cv);
3345 3338 }
3346 3339 }
3347 3340 }
3348 3341
3349 3342 void
3350 3343 vcpu_notify_event(struct vm *vm, int vcpuid)
3351 3344 {
3352 3345 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3353 3346
3354 3347 vcpu_lock(vcpu);
3355 3348 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3356 3349 vcpu_unlock(vcpu);
3357 3350 }
3358 3351
3359 3352 void
3360 3353 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3361 3354 {
3362 3355 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3363 3356
3364 3357 if (ntype == VCPU_NOTIFY_NONE) {
3365 3358 return;
3366 3359 }
3367 3360
3368 3361 vcpu_lock(vcpu);
3369 3362 vcpu_notify_event_locked(vcpu, ntype);
3370 3363 vcpu_unlock(vcpu);
3371 3364 }
3372 3365
3373 3366 void
3374 3367 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3375 3368 {
3376 3369 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3377 3370 hrtime_t now = gethrtime();
3378 3371
3379 3372 ASSERT3U(ustate, !=, vcpu->ustate);
3380 3373 ASSERT3S(ustate, <, VU_MAX);
3381 3374 ASSERT3S(ustate, >=, VU_INIT);
3382 3375
3383 3376 hrtime_t delta = now - vcpu->ustate_when;
3384 3377 vcpu->ustate_total[vcpu->ustate] += delta;
3385 3378
3386 3379 membar_producer();
3387 3380
3388 3381 vcpu->ustate_when = now;
3389 3382 vcpu->ustate = ustate;
3390 3383 }
3391 3384
3392 3385 struct vmspace *
3393 3386 vm_get_vmspace(struct vm *vm)
3394 3387 {
3395 3388
3396 3389 return (vm->vmspace);
3397 3390 }
3398 3391
3399 3392 int
3400 3393 vm_apicid2vcpuid(struct vm *vm, int apicid)
3401 3394 {
3402 3395 /*
3403 3396 * XXX apic id is assumed to be numerically identical to vcpu id
3404 3397 */
3405 3398 return (apicid);
3406 3399 }
3407 3400
3408 3401 struct vatpic *
3409 3402 vm_atpic(struct vm *vm)
3410 3403 {
3411 3404 return (vm->vatpic);
3412 3405 }
3413 3406
3414 3407 struct vatpit *
3415 3408 vm_atpit(struct vm *vm)
3416 3409 {
3417 3410 return (vm->vatpit);
3418 3411 }
3419 3412
3420 3413 struct vpmtmr *
3421 3414 vm_pmtmr(struct vm *vm)
3422 3415 {
3423 3416
3424 3417 return (vm->vpmtmr);
3425 3418 }
3426 3419
3427 3420 struct vrtc *
3428 3421 vm_rtc(struct vm *vm)
3429 3422 {
3430 3423
3431 3424 return (vm->vrtc);
3432 3425 }
3433 3426
3434 3427 enum vm_reg_name
3435 3428 vm_segment_name(int seg)
3436 3429 {
3437 3430 static enum vm_reg_name seg_names[] = {
3438 3431 VM_REG_GUEST_ES,
3439 3432 VM_REG_GUEST_CS,
3440 3433 VM_REG_GUEST_SS,
3441 3434 VM_REG_GUEST_DS,
3442 3435 VM_REG_GUEST_FS,
3443 3436 VM_REG_GUEST_GS
3444 3437 };
3445 3438
3446 3439 KASSERT(seg >= 0 && seg < nitems(seg_names),
3447 3440 ("%s: invalid segment encoding %d", __func__, seg));
3448 3441 return (seg_names[seg]);
3449 3442 }
3450 3443
3451 3444 void
3452 3445 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3453 3446 int num_copyinfo)
3454 3447 {
3455 3448 int idx;
3456 3449
3457 3450 for (idx = 0; idx < num_copyinfo; idx++) {
3458 3451 if (copyinfo[idx].cookie != NULL)
3459 3452 vm_gpa_release(copyinfo[idx].cookie);
3460 3453 }
3461 3454 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3462 3455 }
3463 3456
3464 3457 int
3465 3458 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3466 3459 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3467 3460 int num_copyinfo, int *fault)
3468 3461 {
3469 3462 int error, idx, nused;
3470 3463 size_t n, off, remaining;
3471 3464 void *hva, *cookie;
3472 3465 uint64_t gpa;
3473 3466
3474 3467 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3475 3468
3476 3469 nused = 0;
3477 3470 remaining = len;
3478 3471 while (remaining > 0) {
3479 3472 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3480 3473 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3481 3474 if (error || *fault)
3482 3475 return (error);
3483 3476 off = gpa & PAGE_MASK;
3484 3477 n = min(remaining, PAGE_SIZE - off);
3485 3478 copyinfo[nused].gpa = gpa;
3486 3479 copyinfo[nused].len = n;
3487 3480 remaining -= n;
3488 3481 gla += n;
3489 3482 nused++;
3490 3483 }
3491 3484
3492 3485 for (idx = 0; idx < nused; idx++) {
3493 3486 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3494 3487 copyinfo[idx].len, prot, &cookie);
3495 3488 if (hva == NULL)
3496 3489 break;
3497 3490 copyinfo[idx].hva = hva;
3498 3491 copyinfo[idx].cookie = cookie;
3499 3492 }
3500 3493
3501 3494 if (idx != nused) {
3502 3495 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3503 3496 return (EFAULT);
3504 3497 } else {
3505 3498 *fault = 0;
3506 3499 return (0);
3507 3500 }
3508 3501 }
3509 3502
3510 3503 void
3511 3504 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3512 3505 size_t len)
3513 3506 {
3514 3507 char *dst;
3515 3508 int idx;
3516 3509
3517 3510 dst = kaddr;
3518 3511 idx = 0;
3519 3512 while (len > 0) {
3520 3513 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3521 3514 len -= copyinfo[idx].len;
3522 3515 dst += copyinfo[idx].len;
3523 3516 idx++;
3524 3517 }
3525 3518 }
3526 3519
3527 3520 void
3528 3521 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3529 3522 struct vm_copyinfo *copyinfo, size_t len)
3530 3523 {
3531 3524 const char *src;
3532 3525 int idx;
3533 3526
3534 3527 src = kaddr;
3535 3528 idx = 0;
3536 3529 while (len > 0) {
3537 3530 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3538 3531 len -= copyinfo[idx].len;
3539 3532 src += copyinfo[idx].len;
3540 3533 idx++;
3541 3534 }
3542 3535 }
3543 3536
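A sketch of how the copy helpers above are typically combined by an emulation path: map the guest-linear range, bail out if translation faulted (the exception has already been queued for the guest), move the bytes, then drop the holds. The buffer size, copyinfo count, and surrounding variables (vm, vcpuid, paging, gla) are assumptions of the sketch; only the function signatures come from the code above.

struct vm_copyinfo copyinfo[2];		/* covers a range crossing one page */
uint8_t buf[16];
int error, fault;

error = vm_copy_setup(vm, vcpuid, paging, gla, sizeof (buf), PROT_READ,
    copyinfo, nitems(copyinfo), &fault);
if (error != 0 || fault != 0)
	return (error);

/* Read guest memory into buf; vm_copyout() is the mirror image for writes */
vm_copyin(vm, vcpuid, copyinfo, buf, sizeof (buf));
vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));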
3544 3537 /*
3545 3538 * Return the amount of in-use and wired memory for the VM. Since
3546 3539  * these are global stats, only return the values for vCPU 0
3547 3540 */
3548 3541 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3549 3542 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3550 3543
3551 3544 static void
3552 3545 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3553 3546 {
3554 3547
3555 3548 if (vcpu == 0) {
3556 3549 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3557 3550 PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3558 3551 }
3559 3552 }
3560 3553
3561 3554 static void
3562 3555 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3563 3556 {
3564 3557
3565 3558 if (vcpu == 0) {
3566 3559 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3567 3560 PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3568 3561 }
3569 3562 }
3570 3563
3571 3564 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3572 3565 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
3573 3566
3574 3567 int
3575 3568 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3576 3569 uint8_t bytes, uint32_t *val)
3577 3570 {
3578 3571 return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3579 3572 }
3580 3573
3581 3574 /*
3582 3575 * bhyve-internal interfaces to attach or detach IO port handlers.
3583 3576 * Must be called with VM write lock held for safety.
3584 3577 */
3585 3578 int
3586 3579 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3587 3580 void **cookie)
3588 3581 {
3589 3582 int err;
3590 3583 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3591 3584 if (err == 0) {
3592 3585 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3593 3586 }
3594 3587 return (err);
3595 3588 }
3596 3589 int
3597 3590 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3598 3591 void **old_arg)
3599 3592 {
3600 3593 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3601 3594 int err;
3602 3595
3603 3596 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3604 3597 if (err == 0) {
3605 3598 *cookie = NULL;
3606 3599 }
3607 3600 return (err);
3608 3601 }
3609 3602
3610 3603 /*
3611 3604 * External driver interfaces to attach or detach IO port handlers.
3612 3605 * Must be called with VM write lock held for safety.
3613 3606 */
3614 3607 int
3615 3608 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3616 3609 void *arg, void **cookie)
3617 3610 {
3618 3611 int err;
3619 3612
3620 3613 if (port == 0) {
3621 3614 return (EINVAL);
3622 3615 }
3623 3616
3624 3617 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3625 3618 if (err == 0) {
3626 3619 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3627 3620 }
3628 3621 return (err);
3629 3622 }
3630 3623 void
3631 3624 vm_ioport_unhook(struct vm *vm, void **cookie)
3632 3625 {
3633 3626 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3634 3627 ioport_handler_t old_func;
3635 3628 void *old_arg;
3636 3629 int err;
3637 3630
3638 3631 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3639 3632
3640 3633 /* ioport-hook-using drivers are expected to be well-behaved */
3641 3634 VERIFY0(err);
3642 3635 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3643 3636
3644 3637 *cookie = NULL;
3645 3638 }
3646 3639
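For an external driver, the hook/unhook pair above is used roughly as sketched below. The handler prototype is an assumption inferred from vm_ioport_access()/vm_inout_access() (arg, direction, port, width, value pointer), and the port number and driver names are hypothetical; treat this as a sketch, not the definitive ioport_handler_t definition.

/* Hypothetical handler for one emulated I/O port; prototype is assumed. */
static int
mydrv_ioport_handler(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	if (in)
		*val = 0;	/* reads return zero in this sketch */
	return (0);		/* writes are ignored */
}

static int
mydrv_attach_port(struct vm *vm, void *arg, void **cookiep)
{
	/* Caller must hold the VM write lock, per the comment above. */
	return (vm_ioport_hook(vm, 0x510, mydrv_ioport_handler, arg, cookiep));
}

static void
mydrv_detach_port(struct vm *vm, void **cookiep)
{
	vm_ioport_unhook(vm, cookiep);	/* *cookiep is cleared on return */
}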
3647 3640 int
3648 3641 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3649 3642 {
3650 3643 struct vm *vm = ksp->ks_private;
3651 3644 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3652 3645 const int vcpuid = vvk->vvk_vcpu.value.ui32;
3653 3646 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3654 3647
3655 3648 ASSERT3U(vcpuid, <, VM_MAXCPU);
3656 3649
3657 3650 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3658 3651 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3659 3652 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3660 3653 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3661 3654 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3662 3655 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3663 3656
3664 3657 return (0);
3665 3658 }
3666 -
3667 -int
3668 -vm_arc_resv(struct vm *vm, uint64_t len)
3669 -{
3670 - /* Since we already have the compat macros included, we use those */
3671 - size_t pages = (size_t)roundup2(len, PAGE_SIZE) >> PAGE_SHIFT;
3672 - int err = 0;
3673 -
3674 - err = arc_virt_machine_reserve(pages);
3675 - if (err != 0)
3676 - return (err);
3677 -
3678 - vm->arc_resv += pages;
3679 - return (0);
3680 -}