checkme Wdiff usr/src/uts/i86pc/io/vmm/amd/svm.c

Print this page

13275 bhyve needs richer INIT/SIPI support
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/i86pc/io/vmm/amd/svm.c
          +++ new/usr/src/uts/i86pc/io/vmm/amd/svm.c

   1    1  /*-
   2    2   * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3    3   *
   4    4   * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
   5    5   * All rights reserved.
   6    6   *
   7    7   * Redistribution and use in source and binary forms, with or without
   8    8   * modification, are permitted provided that the following conditions
   9    9   * are met:
  10   10   * 1. Redistributions of source code must retain the above copyright
  11   11   *    notice unmodified, this list of conditions, and the following
  12   12   *    disclaimer.
  13   13   * 2. Redistributions in binary form must reproduce the above copyright
  14   14   *    notice, this list of conditions and the following disclaimer in the
  15   15   *    documentation and/or other materials provided with the distribution.
  16   16   *
  17   17   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18   18   * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19   19   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20   20   * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21   21   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22   22   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23   23   * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24   24   * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25   25   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26   26   * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27   27   */
  28   28  
  29   29  /*
  30   30   * This file and its contents are supplied under the terms of the
  31   31   * Common Development and Distribution License ("CDDL"), version 1.0.
  32   32   * You may only use this file in accordance with the terms of version
  33   33   * 1.0 of the CDDL.
  34   34   *
  35   35   * A full copy of the text of the CDDL should have accompanied this
  36   36   * source.  A copy of the CDDL is also available via the Internet at
  37   37   * http://www.illumos.org/license/CDDL.
  38   38   *
  39   39   * Copyright 2018 Joyent, Inc.
  40   40   * Copyright 2020 Oxide Computer Company
  41   41   */
  42   42  
  43   43  #include <sys/cdefs.h>
  44   44  __FBSDID("$FreeBSD$");
  45   45  
  46   46  #include <sys/param.h>
  47   47  #include <sys/systm.h>
  48   48  #include <sys/smp.h>
  49   49  #include <sys/kernel.h>
  50   50  #include <sys/malloc.h>
  51   51  #include <sys/pcpu.h>
  52   52  #include <sys/proc.h>
  53   53  #include <sys/sysctl.h>
  54   54  
  55   55  #ifndef __FreeBSD__
  56   56  #include <sys/x86_archext.h>
  57   57  #include <sys/trap.h>
  58   58  #endif
  59   59  
  60   60  #include <vm/vm.h>
  61   61  #include <vm/pmap.h>
  62   62  
  63   63  #include <machine/cpufunc.h>
  64   64  #include <machine/psl.h>
  65   65  #include <machine/md_var.h>
  66   66  #include <machine/reg.h>
  67   67  #include <machine/specialreg.h>
  68   68  #include <machine/smp.h>
  69   69  #include <machine/vmm.h>
  70   70  #include <machine/vmm_dev.h>
  71   71  #include <sys/vmm_instruction_emul.h>
  72   72  
  73   73  #include "vmm_lapic.h"
  74   74  #include "vmm_stat.h"
  75   75  #include "vmm_ktr.h"
  76   76  #include "vmm_ioport.h"
  77   77  #include "vatpic.h"
  78   78  #include "vlapic.h"
  79   79  #include "vlapic_priv.h"
  80   80  
  81   81  #include "x86.h"
  82   82  #include "vmcb.h"
  83   83  #include "svm.h"
  84   84  #include "svm_softc.h"
  85   85  #include "svm_msr.h"
  86   86  #include "npt.h"
  87   87  
  88   88  SYSCTL_DECL(_hw_vmm);
  89   89  SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
  90   90      NULL);
  91   91  
  92   92  /*
  93   93   * SVM CPUID function 0x8000_000A, edx bit decoding.
  94   94   */
  95   95  #define AMD_CPUID_SVM_NP                BIT(0)  /* Nested paging or RVI */
  96   96  #define AMD_CPUID_SVM_LBR               BIT(1)  /* Last branch virtualization */
  97   97  #define AMD_CPUID_SVM_SVML              BIT(2)  /* SVM lock */
  98   98  #define AMD_CPUID_SVM_NRIP_SAVE         BIT(3)  /* Next RIP is saved */
  99   99  #define AMD_CPUID_SVM_TSC_RATE          BIT(4)  /* TSC rate control. */
 100  100  #define AMD_CPUID_SVM_VMCB_CLEAN        BIT(5)  /* VMCB state caching */
 101  101  #define AMD_CPUID_SVM_FLUSH_BY_ASID     BIT(6)  /* Flush by ASID */
 102  102  #define AMD_CPUID_SVM_DECODE_ASSIST     BIT(7)  /* Decode assist */
 103  103  #define AMD_CPUID_SVM_PAUSE_INC         BIT(10) /* Pause intercept filter. */
 104  104  #define AMD_CPUID_SVM_PAUSE_FTH         BIT(12) /* Pause filter threshold */
 105  105  #define AMD_CPUID_SVM_AVIC              BIT(13) /* AVIC present */
 106  106  
 107  107  #define VMCB_CACHE_DEFAULT      (VMCB_CACHE_ASID        |       \
 108  108                                  VMCB_CACHE_IOPM         |       \
 109  109                                  VMCB_CACHE_I            |       \
 110  110                                  VMCB_CACHE_TPR          |       \
 111  111                                  VMCB_CACHE_CR2          |       \
 112  112                                  VMCB_CACHE_CR           |       \
 113  113                                  VMCB_CACHE_DR           |       \
 114  114                                  VMCB_CACHE_DT           |       \
 115  115                                  VMCB_CACHE_SEG          |       \
 116  116                                  VMCB_CACHE_NP)
 117  117  
 118  118  static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
 119  119  SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
 120  120      0, NULL);
 121  121  
 122  122  static MALLOC_DEFINE(M_SVM, "svm", "svm");
 123  123  static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
 124  124  
 125  125  /* SVM features advertised by CPUID.8000000AH:EDX */
 126  126  static uint32_t svm_feature = ~0U;      /* AMD SVM features. */
 127  127  
 128  128  static int disable_npf_assist;
 129  129  
 130  130  static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
 131  131  static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
 132  132  static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
 133  133  
 134  134  static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
 135  135  
 136  136  static __inline int
 137  137  flush_by_asid(void)
 138  138  {
 139  139          return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
 140  140  }
 141  141  
 142  142  static __inline int
 143  143  decode_assist(void)
 144  144  {
 145  145          return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST);
 146  146  }
 147  147  
 148  148  #ifdef __FreeBSD__
 149  149  static void
 150  150  svm_disable(void *arg __unused)
 151  151  {
 152  152          uint64_t efer;
 153  153  
 154  154          efer = rdmsr(MSR_EFER);
 155  155          efer &= ~EFER_SVM;
 156  156          wrmsr(MSR_EFER, efer);
 157  157  }
 158  158  
 159  159  /*
 160  160   * Disable SVM on all CPUs.
 161  161   */
 162  162  static int
 163  163  svm_cleanup(void)
 164  164  {
 165  165  
 166  166          smp_rendezvous(NULL, svm_disable, NULL, NULL);
 167  167          return (0);
 168  168  }
 169  169  
 170  170  /*
 171  171   * Verify that all the features required by bhyve are available.
 172  172   */
 173  173  static int
 174  174  check_svm_features(void)
 175  175  {
 176  176          uint_t regs[4];
 177  177  
 178  178          /* CPUID Fn8000_000A is for SVM */
 179  179          do_cpuid(0x8000000A, regs);
 180  180          svm_feature &= regs[3];
 181  181  
 182  182          /*
 183  183           * The number of ASIDs can be configured to be less than what is
 184  184           * supported by the hardware but not more.
 185  185           */
 186  186          if (nasid == 0 || nasid > regs[1])
 187  187                  nasid = regs[1];
 188  188          KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %x", nasid));
 189  189  
 190  190          /* bhyve requires the Nested Paging feature */
 191  191          if (!(svm_feature & AMD_CPUID_SVM_NP)) {
 192  192                  printf("SVM: Nested Paging feature not available.\n");
 193  193                  return (ENXIO);
 194  194          }
 195  195  
 196  196          /* bhyve requires the NRIP Save feature */
 197  197          if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) {
 198  198                  printf("SVM: NRIP Save feature not available.\n");
 199  199                  return (ENXIO);
 200  200          }
 201  201  
 202  202          return (0);
 203  203  }
 204  204  
 205  205  static void
 206  206  svm_enable(void *arg __unused)
 207  207  {
 208  208          uint64_t efer;
 209  209  
 210  210          efer = rdmsr(MSR_EFER);
 211  211          efer |= EFER_SVM;
 212  212          wrmsr(MSR_EFER, efer);
 213  213  
 214  214          wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));
 215  215  }
 216  216  
 217  217  /*
 218  218   * Return 1 if SVM is enabled on this processor and 0 otherwise.
 219  219   */
 220  220  static int
 221  221  svm_available(void)
 222  222  {
 223  223          uint64_t msr;
 224  224  
 225  225  #ifdef __FreeBSD__
 226  226          /* Section 15.4 Enabling SVM from APM2. */
 227  227          if ((amd_feature2 & AMDID2_SVM) == 0) {
 228  228                  printf("SVM: not available.\n");
 229  229                  return (0);
 230  230          }
 231  231  #else
 232  232          if (!is_x86_feature(x86_featureset, X86FSET_SVM)) {
 233  233                  cmn_err(CE_WARN, "processor does not support SVM operation\n");
 234  234                  return (0);
 235  235          }
 236  236  #endif
 237  237  
 238  238          msr = rdmsr(MSR_VM_CR);
 239  239          if ((msr & VM_CR_SVMDIS) != 0) {
 240  240  #ifdef __FreeBSD__
 241  241                  printf("SVM: disabled by BIOS.\n");
 242  242  #else
 243  243                  cmn_err(CE_WARN, "SVM disabled by BIOS.\n");
 244  244  #endif
 245  245                  return (0);
 246  246          }
 247  247  
 248  248          return (1);
 249  249  }
 250  250  
 251  251  static int
 252  252  svm_init(int ipinum)
 253  253  {
 254  254          int error, cpu;
 255  255  
 256  256          if (!svm_available())
 257  257                  return (ENXIO);
 258  258  
 259  259          error = check_svm_features();
 260  260          if (error)
 261  261                  return (error);
 262  262  
 263  263          vmcb_clean &= VMCB_CACHE_DEFAULT;
 264  264  
 265  265          for (cpu = 0; cpu < MAXCPU; cpu++) {
 266  266                  /*
 267  267                   * Initialize the host ASIDs to their "highest" valid values.
 268  268                   *
 269  269                   * The next ASID allocation will rollover both 'gen' and 'num'
 270  270                   * and start off the sequence at {1,1}.
 271  271                   */
 272  272                  asid[cpu].gen = ~0UL;
 273  273                  asid[cpu].num = nasid - 1;
 274  274          }
 275  275  
 276  276          svm_msr_init();
 277  277          svm_npt_init(ipinum);
 278  278  
 279  279          /* Enable SVM on all CPUs */
 280  280          smp_rendezvous(NULL, svm_enable, NULL, NULL);
 281  281  
 282  282          return (0);
 283  283  }
 284  284  
 285  285  static void
 286  286  svm_restore(void)
 287  287  {
 288  288  
 289  289          svm_enable(NULL);
 290  290  }
 291  291  #else /* __FreeBSD__ */
 292  292  static int
 293  293  svm_cleanup(void)
 294  294  {
 295  295          /* This is taken care of by the hma registration */
 296  296          return (0);
 297  297  }
 298  298  
 299  299  static int
 300  300  svm_init(int ipinum)
 301  301  {
 302  302          vmcb_clean &= VMCB_CACHE_DEFAULT;
 303  303  
 304  304          svm_msr_init();
 305  305          svm_npt_init(ipinum);
 306  306  
 307  307          return (0);
 308  308  }
 309  309  
 310  310  static void
 311  311  svm_restore(void)
 312  312  {
 313  313          /* No-op on illumos */
 314  314  }
 315  315  #endif /* __FreeBSD__ */
 316  316  
 317  317  /* Pentium compatible MSRs */
 318  318  #define MSR_PENTIUM_START       0
 319  319  #define MSR_PENTIUM_END         0x1FFF
 320  320  /* AMD 6th generation and Intel compatible MSRs */
 321  321  #define MSR_AMD6TH_START        0xC0000000UL
 322  322  #define MSR_AMD6TH_END          0xC0001FFFUL
 323  323  /* AMD 7th and 8th generation compatible MSRs */
 324  324  #define MSR_AMD7TH_START        0xC0010000UL
 325  325  #define MSR_AMD7TH_END          0xC0011FFFUL
 326  326  
 327  327  /*
 328  328   * Get the index and bit position for a MSR in permission bitmap.
 329  329   * Two bits are used for each MSR: lower bit for read and higher bit for write.
 330  330   */
 331  331  static int
 332  332  svm_msr_index(uint64_t msr, int *index, int *bit)
 333  333  {
 334  334          uint32_t base, off;
 335  335  
 336  336          *index = -1;
 337  337          *bit = (msr % 4) * 2;
 338  338          base = 0;
 339  339  
 340  340          if (msr <= MSR_PENTIUM_END) {
 341  341                  *index = msr / 4;
 342  342                  return (0);
 343  343          }
 344  344  
 345  345          base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
 346  346          if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
 347  347                  off = (msr - MSR_AMD6TH_START);
 348  348                  *index = (off + base) / 4;
 349  349                  return (0);
 350  350          }
 351  351  
 352  352          base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
 353  353          if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
 354  354                  off = (msr - MSR_AMD7TH_START);
 355  355                  *index = (off + base) / 4;
 356  356                  return (0);
 357  357          }
 358  358  
 359  359          return (EINVAL);
 360  360  }
 361  361  
 362  362  /*
 363  363   * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
 364  364   */
 365  365  static void
 366  366  svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
 367  367  {
 368  368          int index, bit, error;
 369  369  
 370  370          error = svm_msr_index(msr, &index, &bit);
 371  371          KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr));
 372  372          KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
 373  373              ("%s: invalid index %d for msr %lx", __func__, index, msr));
 374  374          KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
 375  375              "msr %lx", __func__, bit, msr));
 376  376  
 377  377          if (read)
 378  378                  perm_bitmap[index] &= ~(1UL << bit);
 379  379  
 380  380          if (write)
 381  381                  perm_bitmap[index] &= ~(2UL << bit);
 382  382  }
 383  383  
 384  384  static void
 385  385  svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
 386  386  {
 387  387  
 388  388          svm_msr_perm(perm_bitmap, msr, true, true);
 389  389  }
 390  390  
 391  391  static void
 392  392  svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
 393  393  {
 394  394  
 395  395          svm_msr_perm(perm_bitmap, msr, true, false);
 396  396  }
 397  397  
 398  398  static __inline int
 399  399  svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
 400  400  {
 401  401          struct vmcb_ctrl *ctrl;
 402  402  
 403  403          KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
 404  404  
 405  405          ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 406  406          return (ctrl->intercept[idx] & bitmask ? 1 : 0);
 407  407  }
 408  408  
 409  409  static __inline void
 410  410  svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
 411  411      int enabled)
 412  412  {
 413  413          struct vmcb_ctrl *ctrl;
 414  414          uint32_t oldval;
 415  415  
 416  416          KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
 417  417  
 418  418          ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 419  419          oldval = ctrl->intercept[idx];
 420  420  
 421  421          if (enabled)
 422  422                  ctrl->intercept[idx] |= bitmask;
 423  423          else
 424  424                  ctrl->intercept[idx] &= ~bitmask;
 425  425  
 426  426          if (ctrl->intercept[idx] != oldval) {
 427  427                  svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
 428  428                  VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
 429  429                      "from %x to %x", idx, oldval, ctrl->intercept[idx]);
 430  430          }
 431  431  }
 432  432  
 433  433  static __inline void
 434  434  svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
 435  435  {
 436  436  
 437  437          svm_set_intercept(sc, vcpu, off, bitmask, 0);
 438  438  }
 439  439  
 440  440  static __inline void
 441  441  svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
 442  442  {
 443  443  
 444  444          svm_set_intercept(sc, vcpu, off, bitmask, 1);
 445  445  }
 446  446  
 447  447  static void
 448  448  vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
 449  449      uint64_t msrpm_base_pa, uint64_t np_pml4)
 450  450  {
 451  451          struct vmcb_ctrl *ctrl;
 452  452          struct vmcb_state *state;
 453  453          uint32_t mask;
 454  454          int n;
 455  455  
 456  456          ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 457  457          state = svm_get_vmcb_state(sc, vcpu);
 458  458  
 459  459          ctrl->iopm_base_pa = iopm_base_pa;
 460  460          ctrl->msrpm_base_pa = msrpm_base_pa;
 461  461  
 462  462          /* Enable nested paging */
 463  463          ctrl->np_ctrl = NP_ENABLE;
 464  464          ctrl->n_cr3 = np_pml4;
 465  465  
 466  466          /*
 467  467           * Intercept accesses to the control registers that are not shadowed
 468  468           * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
 469  469           */
 470  470          for (n = 0; n < 16; n++) {
 471  471                  mask = (BIT(n) << 16) | BIT(n);
 472  472                  if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
 473  473                          svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
 474  474                  else
 475  475                          svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
 476  476          }
 477  477  
 478  478  
 479  479          /*
 480  480           * Intercept everything when tracing guest exceptions otherwise
 481  481           * just intercept machine check exception.
 482  482           */
 483  483          if (vcpu_trace_exceptions(sc->vm, vcpu)) {
 484  484                  for (n = 0; n < 32; n++) {
 485  485                          /*
 486  486                           * Skip unimplemented vectors in the exception bitmap.
 487  487                           */
 488  488                          if (n == 2 || n == 9) {
 489  489                                  continue;
 490  490                          }
 491  491                          svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
 492  492                  }
 493  493          } else {
 494  494                  svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
 495  495          }
 496  496  
 497  497          /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */
 498  498          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
 499  499          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
 500  500          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
 501  501          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
 502  502          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
 503  503          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
 504  504          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
 505  505          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
 506  506          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 507  507              VMCB_INTCPT_FERR_FREEZE);
 508  508  
 509  509          svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
 510  510          svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
 511  511  
 512  512          /* Intercept privileged invalidation instructions. */
 513  513          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
 514  514          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);
 515  515  
 516  516          /*
 517  517           * Intercept all virtualization-related instructions.
 518  518           *
 519  519           * From section "Canonicalization and Consistency Checks" in APMv2
 520  520           * the VMRUN intercept bit must be set to pass the consistency check.
 521  521           */
 522  522          svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
 523  523          svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL);
 524  524          svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
 525  525          svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
 526  526          svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
 527  527          svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
 528  528          svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
 529  529  
 530  530          /*
 531  531           * The ASID will be set to a non-zero value just before VMRUN.
 532  532           */
 533  533          ctrl->asid = 0;
 534  534  
 535  535          /*
 536  536           * Section 15.21.1, Interrupt Masking in EFLAGS
 537  537           * Section 15.21.2, Virtualizing APIC.TPR
 538  538           *
 539  539           * This must be set for %rflag and %cr8 isolation of guest and host.
 540  540           */
 541  541          ctrl->v_intr_ctrl |= V_INTR_MASKING;
 542  542  
 543  543          /* Enable Last Branch Record aka LBR for debugging */
 544  544          ctrl->misc_ctrl |= LBR_VIRT_ENABLE;
 545  545          state->dbgctl = BIT(0);
 546  546  
 547  547          /* EFER_SVM must always be set when the guest is executing */
 548  548          state->efer = EFER_SVM;
 549  549  
 550  550          /* Set up the PAT to power-on state */
 551  551          state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)     |
 552  552              PAT_VALUE(1, PAT_WRITE_THROUGH)     |
 553  553              PAT_VALUE(2, PAT_UNCACHED)          |
 554  554              PAT_VALUE(3, PAT_UNCACHEABLE)       |
 555  555              PAT_VALUE(4, PAT_WRITE_BACK)        |
 556  556              PAT_VALUE(5, PAT_WRITE_THROUGH)     |
 557  557              PAT_VALUE(6, PAT_UNCACHED)          |
 558  558              PAT_VALUE(7, PAT_UNCACHEABLE);
 559  559  
 560  560          /* Set up DR6/7 to power-on state */
 561  561          state->dr6 = DBREG_DR6_RESERVED1;
 562  562          state->dr7 = DBREG_DR7_RESERVED1;
 563  563  }
 564  564  
 565  565  /*
 566  566   * Initialize a virtual machine.
 567  567   */
 568  568  static void *
 569  569  svm_vminit(struct vm *vm, pmap_t pmap)
 570  570  {
 571  571          struct svm_softc *svm_sc;
 572  572          struct svm_vcpu *vcpu;
 573  573          vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
 574  574          int i;
 575  575          uint16_t maxcpus;
 576  576  
 577  577          svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO);
 578  578          if (((uintptr_t)svm_sc & PAGE_MASK) != 0)
 579  579                  panic("malloc of svm_softc not aligned on page boundary");
 580  580  
 581  581          svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM,
 582  582              M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
 583  583          if (svm_sc->msr_bitmap == NULL)
 584  584                  panic("contigmalloc of SVM MSR bitmap failed");
 585  585          svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM,
 586  586              M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
 587  587          if (svm_sc->iopm_bitmap == NULL)
 588  588                  panic("contigmalloc of SVM IO bitmap failed");
 589  589  
 590  590          svm_sc->vm = vm;
 591  591          svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
 592  592  
 593  593          /*
 594  594           * Intercept read and write accesses to all MSRs.
 595  595           */
 596  596          memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
 597  597  
 598  598          /*
 599  599           * Access to the following MSRs is redirected to the VMCB when the
 600  600           * guest is executing. Therefore it is safe to allow the guest to
 601  601           * read/write these MSRs directly without hypervisor involvement.
 602  602           */
 603  603          svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
 604  604          svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
 605  605          svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
 606  606  
 607  607          svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
 608  608          svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
 609  609          svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
 610  610          svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
 611  611          svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
 612  612          svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
 613  613          svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
 614  614          svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
 615  615  
 616  616          svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
 617  617  
 618  618          /*
 619  619           * Intercept writes to make sure that the EFER_SVM bit is not cleared.
 620  620           */
 621  621          svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
 622  622  
 623  623          /* Intercept access to all I/O ports. */
 624  624          memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
 625  625  
 626  626          iopm_pa = vtophys(svm_sc->iopm_bitmap);
 627  627          msrpm_pa = vtophys(svm_sc->msr_bitmap);
 628  628          pml4_pa = svm_sc->nptp;
 629  629          maxcpus = vm_get_maxcpus(svm_sc->vm);
 630  630          for (i = 0; i < maxcpus; i++) {
 631  631                  vcpu = svm_get_vcpu(svm_sc, i);
 632  632                  vcpu->nextrip = ~0;
 633  633                  vcpu->lastcpu = NOCPU;
 634  634                  vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
 635  635                  vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
 636  636                  svm_msr_guest_init(svm_sc, i);
 637  637          }
 638  638          return (svm_sc);
 639  639  }
 640  640  
 641  641  /*
 642  642   * Collateral for a generic SVM VM-exit.
 643  643   */
 644  644  static void
 645  645  vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
 646  646  {
 647  647  
 648  648          vme->exitcode = VM_EXITCODE_SVM;
 649  649          vme->u.svm.exitcode = code;
 650  650          vme->u.svm.exitinfo1 = info1;
 651  651          vme->u.svm.exitinfo2 = info2;
 652  652  }
 653  653  
 654  654  static int
 655  655  svm_cpl(struct vmcb_state *state)
 656  656  {
 657  657  
 658  658          /*
 659  659           * From APMv2:
 660  660           *   "Retrieve the CPL from the CPL field in the VMCB, not
 661  661           *    from any segment DPL"
 662  662           */
 663  663          return (state->cpl);
 664  664  }
 665  665  
 666  666  static enum vm_cpu_mode
 667  667  svm_vcpu_mode(struct vmcb *vmcb)
 668  668  {
 669  669          struct vmcb_state *state;
 670  670  
 671  671          state = &vmcb->state;
 672  672  
 673  673          if (state->efer & EFER_LMA) {
 674  674                  struct vmcb_segment *seg;
 675  675  
 676  676                  /*
 677  677                   * Section 4.8.1 for APM2, check if Code Segment has
 678  678                   * Long attribute set in descriptor.
 679  679                   */
 680  680                  seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
 681  681                  if (seg->attrib & VMCB_CS_ATTRIB_L)
 682  682                          return (CPU_MODE_64BIT);
 683  683                  else
 684  684                          return (CPU_MODE_COMPATIBILITY);
 685  685          } else  if (state->cr0 & CR0_PE) {
 686  686                  return (CPU_MODE_PROTECTED);
 687  687          } else {
 688  688                  return (CPU_MODE_REAL);
 689  689          }
 690  690  }
 691  691  
 692  692  static enum vm_paging_mode
 693  693  svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
 694  694  {
 695  695  
 696  696          if ((cr0 & CR0_PG) == 0)
 697  697                  return (PAGING_MODE_FLAT);
 698  698          if ((cr4 & CR4_PAE) == 0)
 699  699                  return (PAGING_MODE_32);
 700  700          if (efer & EFER_LME)
 701  701                  return (PAGING_MODE_64);
 702  702          else
 703  703                  return (PAGING_MODE_PAE);
 704  704  }
 705  705  
 706  706  /*
 707  707   * ins/outs utility routines
 708  708   */
 709  709  
 710  710  static void
 711  711  svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
 712  712  {
 713  713          struct vmcb_state *state;
 714  714  
 715  715          state = &vmcb->state;
 716  716          paging->cr3 = state->cr3;
 717  717          paging->cpl = svm_cpl(state);
 718  718          paging->cpu_mode = svm_vcpu_mode(vmcb);
 719  719          paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
 720  720              state->efer);
 721  721  }
 722  722  
 723  723  #define UNHANDLED 0
           /* Value svm_handle_inout() returns; svm_vmexit() stores it in 'handled'. */
 724  724  
 725  725  /*
 726  726   * Handle guest I/O intercept.
            *
            * EXITINFO1 is decoded into vmexit->u.inout: direction (bit 0), string
            * (bit 2) and REP (bit 3) flags, the size field in bits 4-6, the port in
            * bits 16-31, plus the low 32 bits of the guest %rax.
            *
            * For string operations on a CPU without decode assist the exit is turned
            * into VM_EXITCODE_INST_EMUL with a zeroed inst_emul payload. Otherwise
            * the exit is VM_EXITCODE_INOUT and the vcpu's 'vie' context is primed
            * with vie_init_inout().
            *
            * Always returns UNHANDLED.
 727  727   */
 728  728  static int
 729  729  svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
 730  730  {
 731  731          struct vmcb_ctrl *ctrl;
 732  732          struct vmcb_state *state;
 733  733          struct vm_inout *inout;
 734  734          struct vie *vie;
 735  735          uint64_t info1;
 736  736          struct vm_guest_paging paging;
 737  737  
 738  738          state = svm_get_vmcb_state(svm_sc, vcpu);
 739  739          ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
 740  740          inout = &vmexit->u.inout;
 741  741          info1 = ctrl->exitinfo1;
 742  742  
                   /* Decode the IOIO EXITINFO1 fields into the vm_inout description */
 743  743          inout->bytes = (info1 >> 4) & 0x7;
 744  744          inout->flags = 0;
 745  745          inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0;
 746  746          inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0;
 747  747          inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0;
 748  748          inout->port = (uint16_t)(info1 >> 16);
 749  749          inout->eax = (uint32_t)(state->rax);
 750  750  
 751  751          if ((inout->flags & INOUT_STR) != 0) {
 752  752                  /*
 753  753                   * The effective segment number in EXITINFO1[12:10] is populated
 754  754                   * only if the processor has the DecodeAssist capability.
 755  755                   *
 756  756                   * This is not specified explicitly in APMv2 but can be verified
 757  757                   * empirically.
 758  758                   */
 759  759                  if (!decode_assist()) {
 760  760                          /*
 761  761                           * Without decoding assistance, force the task of
 762  762                           * emulating the ins/outs on userspace.
 763  763                           */
 764  764                          vmexit->exitcode = VM_EXITCODE_INST_EMUL;
 765  765                          bzero(&vmexit->u.inst_emul,
 766  766                              sizeof (vmexit->u.inst_emul));
 767  767                          return (UNHANDLED);
 768  768                  }
 769  769  
 770  770                  /*
 771  771                   * Bits 7-9 encode the address size of ins/outs operations where
 772  772                   * the 1/2/4 values correspond to 16/32/64 bit sizes.
                            *
                            * Doubling the field yields the address size in bytes
                            * (2, 4 or 8); any other value trips the VERIFY below.
 773  773                   */
 774  774                  inout->addrsize = 2 * ((info1 >> 7) & 0x7);
 775  775                  VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
 776  776                      inout->addrsize == 8);
 777  777  
 778  778                  if (inout->flags & INOUT_IN) {
 779  779                          /*
 780  780                           * For INS instructions, %es (encoded as 0) is the
 781  781                           * implied segment for the operation.
 782  782                           */
 783  783                          inout->segment = 0;
 784  784                  } else {
 785  785                          /*
 786  786                           * Bits 10-12 encode the segment for OUTS.
 787  787                           * This value follows the standard x86 segment order.
 788  788                           */
 789  789                          inout->segment = (info1 >> 10) & 0x7;
 790  790                  }
 791  791          }
 792  792  
                   /*
                    * Pass the decoded request, the instruction length and the guest
                    * paging context to the vcpu's instruction emulation context.
                    */
 793  793          vmexit->exitcode = VM_EXITCODE_INOUT;
 794  794          svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
 795  795          vie = vm_vie_ctx(svm_sc->vm, vcpu);
 796  796          vie_init_inout(vie, inout, vmexit->inst_length, &paging);
 797  797  
 798  798          /* The in/out emulation will handle advancing %rip */
 799  799          vmexit->inst_length = 0;
 800  800  
 801  801          return (UNHANDLED);
 802  802  }
 803  803  
 804  804  static int
 805  805  npf_fault_type(uint64_t exitinfo1)
 806  806  {
                   /*
                    * Map the EXITINFO1 bits of a nested page fault to a VM_PROT_*
                    * access type. The write bit is tested first, then the ID bit
                    * (reported as VM_PROT_EXECUTE); everything else is a read.
                    */
 807  807  
 808  808          if (exitinfo1 & VMCB_NPF_INFO1_W)
 809  809                  return (VM_PROT_WRITE);
 810  810          else if (exitinfo1 & VMCB_NPF_INFO1_ID)
 811  811                  return (VM_PROT_EXECUTE);
 812  812          else
 813  813                  return (VM_PROT_READ);
 814  814  }
 815  815  
 816  816  static bool
 817  817  svm_npf_emul_fault(uint64_t exitinfo1)
 818  818  {
                   /*
                    * Decide from EXITINFO1 whether a nested page fault qualifies for
                    * MMIO emulation; svm_vmexit() calls svm_handle_mmio_emul() only
                    * when this returns true. The ID and GPT bits must be clear and
                    * the GPA bit must be set.
                    *
                    * NOTE(review): going by the macro names, ID is an instruction
                    * fetch, GPT a fault during the guest page table walk and GPA a
                    * fault on the final guest physical address -- confirm in APMv2.
                    */
 819  819          if (exitinfo1 & VMCB_NPF_INFO1_ID) {
 820  820                  return (false);
 821  821          }
 822  822  
 823  823          if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
 824  824                  return (false);
 825  825          }
 826  826  
 827  827          if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
 828  828                  return (false);
 829  829          }
 830  830  
 831  831          return (true);
 832  832  }
 833  833  
 834  834  static void
 835  835  svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
 836  836      uint64_t gpa)
 837  837  {
                   /*
                    * Set up a VM_EXITCODE_MMIO_EMUL exit for the access to 'gpa'.
                    *
                    * The guest-linear address is recorded as VIE_INVALID_GLA, the
                    * %cs base and D bit are captured according to the CPU mode, and
                    * the vcpu's 'vie' context is primed with vie_init_mmio().
                    */
 838  838          struct vmcb_ctrl *ctrl;
 839  839          struct vmcb *vmcb;
 840  840          struct vie *vie;
 841  841          struct vm_guest_paging paging;
 842  842          struct vmcb_segment *seg;
 843  843          char *inst_bytes = NULL;
 844  844          uint8_t inst_len = 0;
 845  845  
 846  846          vmcb = svm_get_vmcb(svm_sc, vcpu);
 847  847          ctrl = &vmcb->ctrl;
 848  848  
 849  849          vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
 850  850          vmexit->u.mmio_emul.gpa = gpa;
 851  851          vmexit->u.mmio_emul.gla = VIE_INVALID_GLA;
 852  852          svm_paging_info(vmcb, &paging);
 853  853  
                   /*
                    * Real mode uses the %cs base with cs_d forced to 0, protected and
                    * compatibility modes take both from the %cs segment, and every
                    * other mode reports a zero base and cs_d of 0.
                    */
 854  854          switch (paging.cpu_mode) {
 855  855          case CPU_MODE_REAL:
 856  856                  seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
 857  857                  vmexit->u.mmio_emul.cs_base = seg->base;
 858  858                  vmexit->u.mmio_emul.cs_d = 0;
 859  859                  break;
 860  860          case CPU_MODE_PROTECTED:
 861  861          case CPU_MODE_COMPATIBILITY:
 862  862                  seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
 863  863                  vmexit->u.mmio_emul.cs_base = seg->base;
 864  864  
 865  865                  /*
 866  866                   * Section 4.8.1 of APM2, Default Operand Size or D bit.
 867  867                   */
 868  868                  vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
 869  869                      1 : 0;
 870  870                  break;
 871  871          default:
 872  872                  vmexit->u.mmio_emul.cs_base = 0;
 873  873                  vmexit->u.mmio_emul.cs_d = 0;
 874  874                  break;
 875  875          }
 876  876  
 877  877          /*
 878  878           * Copy the instruction bytes into 'vie' if available.
                    *
                    * They are taken from the VMCB only when decode assist is present
                    * and 'disable_npf_assist' is clear; otherwise vie_init_mmio() is
                    * given a NULL buffer with a length of zero.
 879  879           */
 880  880          if (decode_assist() && !disable_npf_assist) {
 881  881                  inst_len = ctrl->inst_len;
 882  882                  inst_bytes = (char *)ctrl->inst_bytes;
 883  883          }
 884  884          vie = vm_vie_ctx(svm_sc->vm, vcpu);
 885  885          vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa);
 886  886  }
 887  887  
 888  888  static void
 889  889  svm_update_virqinfo(struct svm_softc *sc, int vcpu)
 890  890  {
                   /*
                    * Called from svm_vmexit(): hand the VMCB's v_tpr value to the
                    * emulated local APIC and check that no virtual interrupt vector
                    * has been programmed into the VMCB.
                    */
 891  891          struct vm *vm;
 892  892          struct vlapic *vlapic;
 893  893          struct vmcb_ctrl *ctrl;
 894  894  
 895  895          vm = sc->vm;
 896  896          vlapic = vm_lapic(vm, vcpu);
 897  897          ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 898  898  
 899  899          /* Update %cr8 in the emulated vlapic */
 900  900          vlapic_set_cr8(vlapic, ctrl->v_tpr);
 901  901  
 902  902          /* Virtual interrupt injection is not used. */
 903  903          KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
 904  904              "v_intr_vector %d", __func__, ctrl->v_intr_vector));
 905  905  }
 906  906  
 907  907  static void
 908  908  svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu)
 909  909  {
                   /*
                    * Called from svm_vmexit(): if the VMCB's EXITINTINFO field is
                    * marked valid, count it and pass it on through vm_exit_intinfo().
                    * Nothing is done when the field is not valid.
                    */
 910  910          struct vmcb_ctrl *ctrl;
 911  911          uint64_t intinfo;
 912  912  
 913  913          ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
 914  914          intinfo = ctrl->exitintinfo;
 915  915          if (!VMCB_EXITINTINFO_VALID(intinfo))
 916  916                  return;
 917  917  
 918  918          /*
 919  919           * From APMv2, Section "Intercepts during IDT interrupt delivery"
 920  920           *
 921  921           * If a #VMEXIT happened during event delivery then record the event
 922  922           * that was being delivered.
 923  923           */
 924  924          VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
 925  925              intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
 926  926          vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
 927  927          vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
 928  928  }
 929  929  
 930  930  static __inline int
 931  931  vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
 932  932  {
                   /*
                    * Report what svm_get_intercept() returns for the VINTR bit of the
                    * VMCB_CTRL1_INTCPT intercept word of this vcpu.
                    */
 933  933  
 934  934          return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 935  935              VMCB_INTCPT_VINTR));
 936  936  }
 937  937  
 938  938  static void
 939  939  svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
 940  940  {
                   /*
                    * Request an exit at the next interrupt window: set V_IRQ together
                    * with V_IGN_TPR and a zero vector, flag VMCB_CACHE_TPR through
                    * svm_set_dirty() and enable the VINTR intercept.
                    */
 941  941          struct vmcb_ctrl *ctrl;
 942  942          struct vmcb_state *state;
 943  943  
 944  944          ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 945  945          state = svm_get_vmcb_state(sc, vcpu);
 946  946  
                   /* Already armed: only sanity-check the rest of the state. */
 947  947          if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) {
 948  948                  KASSERT(ctrl->v_intr_prio & V_IGN_TPR,
 949  949                      ("%s: invalid v_ign_tpr", __func__));
 950  950                  KASSERT(vintr_intercept_enabled(sc, vcpu),
 951  951                      ("%s: vintr intercept should be enabled", __func__));
 952  952                  return;
 953  953          }
 954  954  
 955  955          /*
 956  956           * We use V_IRQ in conjunction with the VINTR intercept to trap into the
 957  957           * hypervisor as soon as a virtual interrupt can be delivered.
 958  958           *
 959  959           * Since injected events are not subject to intercept checks we need to
 960  960           * ensure that the V_IRQ is not actually going to be delivered on VM
 961  961           * entry.
                    *
                    * The VERIFY therefore requires at least one of: an event already
                    * queued in EVENTINJ, guest interrupts disabled (PSL_I clear), or an
                    * interrupt shadow in effect.
 962  962           */
 963  963          VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
 964  964              (state->rflags & PSL_I) == 0 || ctrl->intr_shadow);
 965  965  
 966  966          VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
 967  967          ctrl->v_irq |= V_IRQ;
 968  968          ctrl->v_intr_prio |= V_IGN_TPR;
 969  969          ctrl->v_intr_vector = 0;
 970  970          svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
 971  971          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
 972  972  }
 973  973  
 974  974  static void
 975  975  svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
 976  976  {
                   /*
                    * Undo svm_enable_intr_window_exiting(): clear V_IRQ and the vector,
                    * flag VMCB_CACHE_TPR through svm_set_dirty() and disable the VINTR
                    * intercept. V_IGN_TPR is left as it is.
                    */
 977  977          struct vmcb_ctrl *ctrl;
 978  978  
 979  979          ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 980  980  
                   /* Nothing armed: only check that the intercept is off as well. */
 981  981          if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) {
 982  982                  KASSERT(!vintr_intercept_enabled(sc, vcpu),
 983  983                      ("%s: vintr intercept should be disabled", __func__));
 984  984                  return;
 985  985          }
 986  986  
 987  987          VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
 988  988          ctrl->v_irq &= ~V_IRQ;
 989  989          ctrl->v_intr_vector = 0;
 990  990          svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
 991  991          svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
 992  992  }
 993  993  
 994  994  /*
 995  995   * Once an NMI is injected it blocks delivery of further NMIs until the handler
 996  996   * executes an IRET. The IRET intercept is enabled when an NMI is injected
 997  997   * to track when the vcpu is done handling the NMI.
            *
            * The state of that intercept doubles as the "NMI blocked" indication:
            * this returns whatever svm_get_intercept() reports for the IRET bit.
 998  998   */
 999  999  static int
1000 1000  svm_nmi_blocked(struct svm_softc *sc, int vcpu)
1001 1001  {
1002 1002          return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1003 1003              VMCB_INTCPT_IRET));
1004 1004  }
1005 1005  
1006 1006  static void
1007 1007  svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu)
1008 1008  {
                   /*
                    * End virtual NMI blocking: drop the IRET intercept and set an
                    * interrupt shadow in the VMCB. The caller is expected to invoke
                    * this only while blocking is in effect (asserted below).
                    */
1009 1009          struct vmcb_ctrl *ctrl;
1010 1010  
1011 1011          KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
1012 1012          VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
1013 1013          /*
1014 1014           * When the IRET intercept is cleared the vcpu will attempt to execute
1015 1015           * the "iret" when it runs next. However, it is possible to inject
1016 1016           * another NMI into the vcpu before the "iret" has actually executed.
1017 1017           *
1018 1018           * For e.g. if the "iret" encounters a #NPF when accessing the stack
1019 1019           * it will trap back into the hypervisor. If an NMI is pending for
1020 1020           * the vcpu it will be injected into the guest.
1021 1021           *
1022 1022           * XXX this needs to be fixed
1023 1023           */
1024 1024          svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1025 1025  
1026 1026          /*
1027 1027           * Set an interrupt shadow to prevent an NMI from being immediately
1028 1028           * injected on the next VMRUN.
1029 1029           */
1030 1030          ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1031 1031          ctrl->intr_shadow = 1;
1032 1032  }
1033 1033  
1034 1034  static void
1035 1035  svm_inject_event(struct svm_softc *sc, int vcpu, uint64_t intinfo)
1036 1036  {
                   /*
                    * Program the VMCB EVENTINJ field from 'intinfo', which is decoded
                    * with the VMCB_EXITINTINFO_* accessors. The event must be marked
                    * valid. An unknown event type panics, and an exception must use a
                    * vector of at most 31 other than 2.
                    */
1037 1037          struct vmcb_ctrl *ctrl;
1038 1038          uint8_t vector;
1039 1039          uint32_t evtype;
1040 1040  
1041 1041          ASSERT(VMCB_EXITINTINFO_VALID(intinfo));
1042 1042  
1043 1043          ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1044 1044          vector = VMCB_EXITINTINFO_VECTOR(intinfo);
1045 1045          evtype = VMCB_EXITINTINFO_TYPE(intinfo);
1046 1046  
1047 1047          switch (evtype) {
1048 1048          case VMCB_EVENTINJ_TYPE_INTR:
1049 1049          case VMCB_EVENTINJ_TYPE_NMI:
1050 1050          case VMCB_EVENTINJ_TYPE_INTn:
1051 1051                  break;
1052 1052          case VMCB_EVENTINJ_TYPE_EXCEPTION:
1053 1053                  VERIFY(vector <= 31);
1054 1054                  /*
1055 1055                   * NMIs are expected to be injected with VMCB_EVENTINJ_TYPE_NMI,
1056 1056                   * rather than as an exception with the NMI vector.
1057 1057                   */
1058 1058                  VERIFY(vector != 2);
1059 1059                  break;
1060 1060          default:
1061 1061                  panic("unexpected event type %x", evtype);
1062 1062          }
1063 1063  
                   /* The error code, when one is present, goes in bits 32 and up. */
1064 1064          ctrl->eventinj = VMCB_EVENTINJ_VALID | evtype | vector;
1065 1065          if (VMCB_EXITINTINFO_EC_VALID(intinfo)) {
1066 1066                  ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
1067 1067                  ctrl->eventinj |= (uint64_t)VMCB_EXITINTINFO_EC(intinfo) << 32;
1068 1068          }
1069 1069  }
1070 1070  
1071 1071  static void
1072 1072  svm_inject_nmi(struct svm_softc *sc, int vcpu)
1073 1073  {
                   /*
                    * Queue an NMI in the VMCB EVENTINJ field, clear the pending NMI
                    * via vm_nmi_clear() and start virtual NMI blocking. Must not be
                    * called while blocking is already in effect (asserted below).
                    */
1074 1074          struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1075 1075  
1076 1076          ASSERT(!svm_nmi_blocked(sc, vcpu));
1077 1077  
1078 1078          ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI;
1079 1079          vm_nmi_clear(sc->vm, vcpu);
1080 1080  
1081 1081          /*
1082 1082           * Virtual NMI blocking is now in effect.
1083 1083           *
1084 1084           * Not only does this block a subsequent NMI injection from taking
1085 1085           * place, it also configures an intercept on the IRET so we can track
1086 1086           * when the next injection can take place.
1087 1087           */
1088 1088          svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1089 1089  }
1090 1090  
1091 1091  static void
1092 1092  svm_inject_irq(struct svm_softc *sc, int vcpu, int vector)
1093 1093  {
                   /*
                    * Queue interrupt 'vector' (0-255, asserted below) in the VMCB
                    * EVENTINJ field.
                    *
                    * NOTE(review): no event type bits are OR'd in here, so this relies
                    * on the interrupt type encoding being 0 -- confirm against the
                    * definition of VMCB_EVENTINJ_TYPE_INTR.
                    */
1094 1094          struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1095 1095  
1096 1096          ASSERT(vector >= 0 && vector <= 255);
1097 1097  
1098 1098          ctrl->eventinj = VMCB_EVENTINJ_VALID | vector;
1099 1099  }
1100 1100  
1101 1101  #define EFER_MBZ_BITS   0xFFFFFFFFFFFF0200UL
           /* Bit 9 and bits 16-63: svm_write_efer() injects #GP if any of them is set. */
1102 1102  
1103 1103  static int
1104 1104  svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval)
1105 1105  {
                   /*
                    * Emulate a guest write of 'newval' to EFER.
                    *
                    * Returns 0 when the value was accepted and stored with
                    * svm_setreg(), and also when it was rejected by injecting #GP.
                    * Returns -1 for an attempt to set EFER_LMSLE, after filling in the
                    * vcpu's exit information with vm_exit_svm().
                    */
1106 1106          struct vm_exit *vme;
1107 1107          struct vmcb_state *state;
1108 1108          uint64_t changed, lma, oldval;
1109 1109          int error;
1110 1110  
1111 1111          state = svm_get_vmcb_state(sc, vcpu);
1112 1112  
1113 1113          oldval = state->efer;
1114 1114          VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %lx/%lx", oldval, newval);
1115 1115  
1116 1116          newval &= ~0xFE;                /* clear the Read-As-Zero (RAZ) bits */
1117 1117          changed = oldval ^ newval;
1118 1118  
1119 1119          if (newval & EFER_MBZ_BITS)
1120 1120                  goto gpf;
1121 1121  
1122 1122          /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
                   /* EFER.LME may not be toggled while paging (CR0.PG) is enabled. */
1123 1123          if (changed & EFER_LME) {
1124 1124                  if (state->cr0 & CR0_PG)
1125 1125                          goto gpf;
1126 1126          }
1127 1127  
1128 1128          /* EFER.LMA = EFER.LME & CR0.PG */
1129 1129          if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0)
1130 1130                  lma = EFER_LMA;
1131 1131          else
1132 1132                  lma = 0;
1133 1133  
                   /* The guest-supplied LMA bit has to match the value computed above. */
1134 1134          if ((newval & EFER_LMA) != lma)
1135 1135                  goto gpf;
1136 1136  
                   /* Optional features may be enabled only if the vcpu advertises them. */
1137 1137          if (newval & EFER_NXE) {
1138 1138                  if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE))
1139 1139                          goto gpf;
1140 1140          }
1141 1141  
1142 1142          /*
1143 1143           * XXX bhyve does not enforce segment limits in 64-bit mode. Until
1144 1144           * this is fixed flag guest attempt to set EFER_LMSLE as an error.
1145 1145           */
1146 1146          if (newval & EFER_LMSLE) {
1147 1147                  vme = vm_exitinfo(sc->vm, vcpu);
1148 1148                  vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0);
1149 1149                  return (-1);
1150 1150          }
1151 1151  
1152 1152          if (newval & EFER_FFXSR) {
1153 1153                  if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR))
1154 1154                          goto gpf;
1155 1155          }
1156 1156  
1157 1157          if (newval & EFER_TCE) {
1158 1158                  if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE))
1159 1159                          goto gpf;
1160 1160          }
1161 1161  
1162 1162          error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
1163 1163          KASSERT(error == 0, ("%s: error %d updating efer", __func__, error));
1164 1164          return (0);
1165 1165  gpf:
1166 1166          vm_inject_gp(sc->vm, vcpu);
1167 1167          return (0);
1168 1168  }
1169 1169  
1170 1170  static int
1171 1171  emulate_wrmsr(struct svm_softc *sc, int vcpu, uint_t num, uint64_t val)
1172 1172  {
                   /*
                    * Dispatch a guest WRMSR of 'val' to MSR 'num': local APIC MSRs go
                    * to lapic_wrmsr(), MSR_EFER to svm_write_efer() and everything else
                    * to svm_wrmsr(). The handler's result is returned unchanged.
                    *
                    * svm_vmexit() treats 0 as handled and a positive value as a
                    * request for a VM_EXITCODE_WRMSR exit; for a negative value it
                    * asserts that an exit code has already been filled in.
                    */
1173 1173          int error;
1174 1174  
1175 1175          if (lapic_msr(num))
1176 1176                  error = lapic_wrmsr(sc->vm, vcpu, num, val);
1177 1177          else if (num == MSR_EFER)
1178 1178                  error = svm_write_efer(sc, vcpu, val);
1179 1179          else
1180 1180                  error = svm_wrmsr(sc, vcpu, num, val);
1181 1181  
1182 1182          return (error);
1183 1183  }
1184 1184  
1185 1185  static int
1186 1186  emulate_rdmsr(struct svm_softc *sc, int vcpu, uint_t num)
1187 1187  {
                   /*
                    * Dispatch a guest RDMSR of MSR 'num' to lapic_rdmsr() for local
                    * APIC MSRs or to svm_rdmsr() otherwise, and return the handler's
                    * result. The guest registers are updated only when it returns 0.
                    */
1188 1188          struct vmcb_state *state;
1189 1189          struct svm_regctx *ctx;
1190 1190          uint64_t result;
1191 1191          int error;
1192 1192  
1193 1193          if (lapic_msr(num))
1194 1194                  error = lapic_rdmsr(sc->vm, vcpu, num, &result);
1195 1195          else
1196 1196                  error = svm_rdmsr(sc, vcpu, num, &result);
1197 1197  
                   /* Low 32 bits of the value go to guest %rax, high 32 bits to %rdx. */
1198 1198          if (error == 0) {
1199 1199                  state = svm_get_vmcb_state(sc, vcpu);
1200 1200                  ctx = svm_get_guest_regctx(sc, vcpu);
1201 1201                  state->rax = result & 0xffffffff;
1202 1202                  ctx->sctx_rdx = result >> 32;
1203 1203          }
1204 1204  
1205 1205          return (error);
1206 1206  }
1207 1207  
1208 1208  /*
1209 1209   * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
1210 1210   * that are due to instruction intercepts as well as MSR and IOIO intercepts
1211 1211   * and exceptions caused by INT3, INTO and BOUND instructions.
1212 1212   *
1213 1213   * Return 1 if the nRIP is valid and 0 otherwise.
            *
            * svm_vmexit() uses the result to decide whether 'nrip - rip' may be taken
            * as the length of the intercepted instruction. The "lo ... hi" case labels
            * below are the GNU C case-range extension.
1214 1214   */
1215 1215  static int
1216 1216  nrip_valid(uint64_t exitcode)
1217 1217  {
1218 1218          switch (exitcode) {
1219 1219          case 0x00 ... 0x0F:     /* read of CR0 through CR15 */
1220 1220          case 0x10 ... 0x1F:     /* write of CR0 through CR15 */
1221 1221          case 0x20 ... 0x2F:     /* read of DR0 through DR15 */
1222 1222          case 0x30 ... 0x3F:     /* write of DR0 through DR15 */
1223 1223          case 0x43:              /* INT3 */
1224 1224          case 0x44:              /* INTO */
1225 1225          case 0x45:              /* BOUND */
1226 1226          case 0x65 ... 0x7C:     /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
1227 1227          case 0x80 ... 0x8D:     /* VMEXIT_VMRUN ... VMEXIT_XSETBV */
1228 1228                  return (1);
1229 1229          default:
1230 1230                  return (0);
1231 1231          }
1232 1232  }
1233 1233  
1234 1234  static int
1235 1235  svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
1236 1236  {
1237 1237          struct vmcb *vmcb;
1238 1238          struct vmcb_state *state;
1239 1239          struct vmcb_ctrl *ctrl;
1240 1240          struct svm_regctx *ctx;
1241 1241          uint64_t code, info1, info2, val;
1242 1242          uint32_t eax, ecx, edx;
1243 1243  #ifdef __FreeBSD__
1244 1244          int error, errcode_valid, handled, idtvec, reflect;
1245 1245  #else
1246 1246          int error, errcode_valid = 0, handled, idtvec, reflect;
1247 1247  #endif
1248 1248  
1249 1249          ctx = svm_get_guest_regctx(svm_sc, vcpu);
1250 1250          vmcb = svm_get_vmcb(svm_sc, vcpu);
1251 1251          state = &vmcb->state;
1252 1252          ctrl = &vmcb->ctrl;
1253 1253  
1254 1254          handled = 0;
1255 1255          code = ctrl->exitcode;
1256 1256          info1 = ctrl->exitinfo1;
1257 1257          info2 = ctrl->exitinfo2;
1258 1258  
1259 1259          vmexit->exitcode = VM_EXITCODE_BOGUS;
1260 1260          vmexit->rip = state->rip;
1261 1261          vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
1262 1262  
1263 1263          vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1264 1264  
1265 1265          /*
1266 1266           * #VMEXIT(INVALID) needs to be handled early because the VMCB is
1267 1267           * in an inconsistent state and can trigger assertions that would
1268 1268           * never happen otherwise.
1269 1269           */
1270 1270          if (code == VMCB_EXIT_INVALID) {
1271 1271                  vm_exit_svm(vmexit, code, info1, info2);
1272 1272                  return (0);
1273 1273          }
1274 1274  
1275 1275          KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1276 1276              "injection valid bit is set %lx", __func__, ctrl->eventinj));
1277 1277  
1278 1278          KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
1279 1279              ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)",
1280 1280              vmexit->inst_length, code, info1, info2));
1281 1281  
1282 1282          svm_update_virqinfo(svm_sc, vcpu);
1283 1283          svm_save_exitintinfo(svm_sc, vcpu);
1284 1284  
1285 1285          switch (code) {
1286 1286          case VMCB_EXIT_IRET:
1287 1287                  /*
1288 1288                   * Restart execution at "iret" but with the intercept cleared.
1289 1289                   */
1290 1290                  vmexit->inst_length = 0;
1291 1291                  svm_clear_nmi_blocking(svm_sc, vcpu);
1292 1292                  handled = 1;
1293 1293                  break;
1294 1294          case VMCB_EXIT_VINTR:   /* interrupt window exiting */
1295 1295                  vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1296 1296                  svm_disable_intr_window_exiting(svm_sc, vcpu);
1297 1297                  handled = 1;
1298 1298                  break;
1299 1299          case VMCB_EXIT_INTR:    /* external interrupt */
1300 1300                  vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1301 1301                  handled = 1;
1302 1302                  break;
1303 1303          case VMCB_EXIT_NMI:     /* external NMI */
1304 1304                  handled = 1;
1305 1305                  break;
1306 1306          case 0x40 ... 0x5F:
1307 1307                  vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
1308 1308                  reflect = 1;
1309 1309                  idtvec = code - 0x40;
1310 1310                  switch (idtvec) {
1311 1311                  case IDT_MC:
1312 1312                          /*
1313 1313                           * Call the machine check handler by hand. Also don't
1314 1314                           * reflect the machine check back into the guest.
1315 1315                           */
1316 1316                          reflect = 0;
1317 1317                          VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler");
1318 1318  #ifdef __FreeBSD__
1319 1319                          __asm __volatile("int $18");
1320 1320  #else
1321 1321                          vmm_call_trap(T_MCE);
1322 1322  #endif
1323 1323                          break;
1324 1324                  case IDT_PF:
1325 1325                          error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
1326 1326                              info2);
1327 1327                          KASSERT(error == 0, ("%s: error %d updating cr2",
1328 1328                              __func__, error));
1329 1329                          /* fallthru */
1330 1330                  case IDT_NP:
1331 1331                  case IDT_SS:
1332 1332                  case IDT_GP:
1333 1333                  case IDT_AC:
1334 1334                  case IDT_TS:
1335 1335                          errcode_valid = 1;
1336 1336                          break;
1337 1337  
1338 1338                  case IDT_DF:
1339 1339                          errcode_valid = 1;
1340 1340                          info1 = 0;
1341 1341                          break;
1342 1342  
1343 1343                  case IDT_BP:
1344 1344                  case IDT_OF:
1345 1345                  case IDT_BR:
1346 1346                          /*
1347 1347                           * The 'nrip' field is populated for INT3, INTO and
1348 1348                           * BOUND exceptions and this also implies that
1349 1349                           * 'inst_length' is non-zero.
1350 1350                           *
1351 1351                           * Reset 'inst_length' to zero so the guest %rip at
1352 1352                           * event injection is identical to what it was when
1353 1353                           * the exception originally happened.
1354 1354                           */
1355 1355                          VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d "
1356 1356                              "to zero before injecting exception %d",
1357 1357                              vmexit->inst_length, idtvec);
1358 1358                          vmexit->inst_length = 0;
1359 1359                          /* fallthru */
1360 1360                  default:
1361 1361                          errcode_valid = 0;
1362 1362                          info1 = 0;
1363 1363                          break;
1364 1364                  }
1365 1365                  KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
1366 1366                      "when reflecting exception %d into guest",
1367 1367                      vmexit->inst_length, idtvec));
1368 1368  
1369 1369                  if (reflect) {
1370 1370                          /* Reflect the exception back into the guest */
1371 1371                          VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception "
1372 1372                              "%d/%x into the guest", idtvec, (int)info1);
1373 1373                          error = vm_inject_exception(svm_sc->vm, vcpu, idtvec,
1374 1374                              errcode_valid, info1, 0);
1375 1375                          KASSERT(error == 0, ("%s: vm_inject_exception error %d",
1376 1376                              __func__, error));
1377 1377                  }
1378 1378                  handled = 1;
1379 1379                  break;
1380 1380          case VMCB_EXIT_MSR:     /* MSR access. */
1381 1381                  eax = state->rax;
1382 1382                  ecx = ctx->sctx_rcx;
1383 1383                  edx = ctx->sctx_rdx;
1384 1384  
1385 1385                  if (info1) {
1386 1386                          vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
1387 1387                          val = (uint64_t)edx << 32 | eax;
1388 1388                          VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %x val %lx",
1389 1389                              ecx, val);
1390 1390                          error = emulate_wrmsr(svm_sc, vcpu, ecx, val);
1391 1391                          if (error == 0) {
1392 1392                                  handled = 1;
1393 1393                          } else if (error > 0) {
1394 1394                                  vmexit->exitcode = VM_EXITCODE_WRMSR;
1395 1395                                  vmexit->u.msr.code = ecx;
1396 1396                                  vmexit->u.msr.wval = val;
1397 1397                          } else {
1398 1398                                  KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1399 1399                                      ("emulate_wrmsr retu with bogus exitcode"));
1400 1400                          }
1401 1401                  } else {
1402 1402                          VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %x", ecx);
1403 1403                          vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
1404 1404                          error = emulate_rdmsr(svm_sc, vcpu, ecx);
1405 1405                          if (error == 0) {
1406 1406                                  handled = 1;
1407 1407                          } else if (error > 0) {
1408 1408                                  vmexit->exitcode = VM_EXITCODE_RDMSR;
1409 1409                                  vmexit->u.msr.code = ecx;
1410 1410                          } else {
1411 1411                                  KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1412 1412                                      ("emulate_rdmsr retu with bogus exitcode"));
1413 1413                          }
1414 1414                  }
1415 1415                  break;
1416 1416          case VMCB_EXIT_IO:
1417 1417                  handled = svm_handle_inout(svm_sc, vcpu, vmexit);
1418 1418                  vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1419 1419                  break;
1420 1420          case VMCB_EXIT_SHUTDOWN:
1421 1421                  vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT);
1422 1422                  handled = 1;
1423 1423                  break;
1424 1424          case VMCB_EXIT_INVD:
1425 1425          case VMCB_EXIT_INVLPGA:
1426 1426                  /* privileged invalidation instructions */
1427 1427                  vm_inject_ud(svm_sc->vm, vcpu);
1428 1428                  handled = 1;
1429 1429                  break;
1430 1430          case VMCB_EXIT_VMRUN:
1431 1431          case VMCB_EXIT_VMLOAD:
1432 1432          case VMCB_EXIT_VMSAVE:
1433 1433          case VMCB_EXIT_STGI:
1434 1434          case VMCB_EXIT_CLGI:
1435 1435          case VMCB_EXIT_SKINIT:
1436 1436                  /* privileged vmm instructions */
1437 1437                  vm_inject_ud(svm_sc->vm, vcpu);
1438 1438                  handled = 1;
1439 1439                  break;
1440 1440          case VMCB_EXIT_VMMCALL:
1441 1441                  /* No handlers make use of VMMCALL for now */
1442 1442                  vm_inject_ud(svm_sc->vm, vcpu);
1443 1443                  handled = 1;
1444 1444                  break;
1445 1445          case VMCB_EXIT_CPUID:
1446 1446                  vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1447 1447                  handled = x86_emulate_cpuid(svm_sc->vm, vcpu, &state->rax,
1448 1448                      &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx);
1449 1449                  break;
1450 1450          case VMCB_EXIT_HLT:
1451 1451                  vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1452 1452                  vmexit->exitcode = VM_EXITCODE_HLT;
1453 1453                  vmexit->u.hlt.rflags = state->rflags;
1454 1454                  break;
1455 1455          case VMCB_EXIT_PAUSE:
1456 1456                  vmexit->exitcode = VM_EXITCODE_PAUSE;
1457 1457                  vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1458 1458                  break;
1459 1459          case VMCB_EXIT_NPF:
1460 1460                  /* EXITINFO2 contains the faulting guest physical address */
1461 1461                  if (info1 & VMCB_NPF_INFO1_RSV) {
1462 1462                          VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
1463 1463                              "reserved bits set: info1(%lx) info2(%lx)",
1464 1464                              info1, info2);
1465 1465                  } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
1466 1466                          vmexit->exitcode = VM_EXITCODE_PAGING;
1467 1467                          vmexit->u.paging.gpa = info2;
1468 1468                          vmexit->u.paging.fault_type = npf_fault_type(info1);
1469 1469                          vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1470 1470                          VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
1471 1471                              "on gpa %lx/%lx at rip %lx",
1472 1472                              info2, info1, state->rip);
1473 1473                  } else if (svm_npf_emul_fault(info1)) {
1474 1474                          svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2);
1475 1475                          vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
1476 1476                          VCPU_CTR3(svm_sc->vm, vcpu, "mmio_emul fault "
1477 1477                              "for gpa %lx/%lx at rip %lx",
1478 1478                              info2, info1, state->rip);
1479 1479                  }
1480 1480                  break;
1481 1481          case VMCB_EXIT_MONITOR:
1482 1482                  vmexit->exitcode = VM_EXITCODE_MONITOR;
1483 1483                  break;
1484 1484          case VMCB_EXIT_MWAIT:
1485 1485                  vmexit->exitcode = VM_EXITCODE_MWAIT;
1486 1486                  break;
1487 1487          default:
1488 1488                  vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1489 1489                  break;
1490 1490          }
1491 1491  
1492 1492          DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t,
1493 1493              code);
1494 1494  
1495 1495          if (handled) {
1496 1496                  vmexit->rip += vmexit->inst_length;
1497 1497                  vmexit->inst_length = 0;
1498 1498                  state->rip = vmexit->rip;
1499 1499          } else {
1500 1500                  if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1501 1501                          /*
1502 1502                           * If this VM exit was not claimed by anybody then
1503 1503                           * treat it as a generic SVM exit.
1504 1504                           */
1505 1505                          vm_exit_svm(vmexit, code, info1, info2);
1506 1506                  } else {
1507 1507                          /*
1508 1508                           * The exitcode and collateral have been populated.
1509 1509                           * The VM exit will be processed further in userland.
1510 1510                           */
1511 1511                  }
1512 1512          }
1513 1513          return (handled);
1514 1514  }
1515 1515  
1516 1516  /*
1517 1517   * Inject exceptions, NMIs, and ExtINTs.
1518 1518   *
1519 1519   * The logic behind these is complicated and may involve mutex contention, so
1520 1520   * the injection is performed without the protection of host CPU interrupts
1521 1521   * being disabled.  This means a racing notification could be "lost",
1522 1522   * necessitating a later call to svm_inject_recheck() to close that window
1523 1523   * of opportunity.
1524 1524   */
1525 1525  static enum event_inject_state
1526 1526  svm_inject_events(struct svm_softc *sc, int vcpu)
1527 1527  {
1528 1528          struct vmcb_ctrl *ctrl;
1529 1529          struct vmcb_state *state;
1530 1530          struct svm_vcpu *vcpustate;
1531 1531          uint64_t intinfo;
1532 1532          enum event_inject_state ev_state;
1533 1533  
1534 1534          state = svm_get_vmcb_state(sc, vcpu);
1535 1535          ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1536 1536          vcpustate = svm_get_vcpu(sc, vcpu);
1537 1537          ev_state = EIS_CAN_INJECT;
1538 1538  
1539 1539          /* Clear any interrupt shadow if guest %rip has changed */
1540 1540          if (vcpustate->nextrip != state->rip) {
1541 1541                  ctrl->intr_shadow = 0;
1542 1542          }
1543 1543  
1544 1544          /*
1545 1545           * An event is already pending for injection.  This can occur when the
1546 1546           * vCPU exits prior to VM entry (like for an AST).
1547 1547           */
1548 1548          if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1549 1549                  return (EIS_EV_EXISTING | EIS_REQ_EXIT);
1550 1550          }
1551 1551  
1552 1552          /*
1553 1553           * Inject pending events or exceptions for this vcpu.
1554 1554           *
1555 1555           * An event might be pending because the previous #VMEXIT happened
1556 1556           * during event delivery (i.e. ctrl->exitintinfo).
1557 1557           *
1558 1558           * An event might also be pending because an exception was injected
1559 1559           * by the hypervisor (e.g. #PF during instruction emulation).
1560 1560           */
1561 1561          if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) {
1562 1562                  ASSERT(VMCB_EXITINTINFO_VALID(intinfo));
1563 1563  
1564 1564                  svm_inject_event(sc, vcpu, intinfo);
1565 1565                  vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1566 1566                  ev_state = EIS_EV_INJECTED;
1567 1567          }
1568 1568  
1569 1569          /* NMI event has priority over interrupts. */
1570 1570          if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) {
1571 1571                  if (ev_state == EIS_CAN_INJECT) {
1572 1572                          /* Can't inject NMI if vcpu is in an intr_shadow. */
1573 1573                          if (ctrl->intr_shadow) {
1574 1574                                  return (EIS_GI_BLOCK);
1575 1575                          }
1576 1576  
1577 1577                          svm_inject_nmi(sc, vcpu);
1578 1578                          ev_state = EIS_EV_INJECTED;
1579 1579                  } else {
1580 1580                          return (ev_state | EIS_REQ_EXIT);
1581 1581                  }
1582 1582          }
1583 1583  
1584 1584          if (vm_extint_pending(sc->vm, vcpu)) {
1585 1585                  int vector;
1586 1586  
1587 1587                  if (ev_state != EIS_CAN_INJECT) {
1588 1588                          return (ev_state | EIS_REQ_EXIT);
1589 1589                  }
1590 1590  
1591 1591                  /*
1592 1592                   * If the guest has disabled interrupts or is in an interrupt
1593 1593                   * shadow then we cannot inject the pending interrupt.
1594 1594                   */
1595 1595                  if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1596 1596                          return (EIS_GI_BLOCK);
1597 1597                  }
1598 1598  
1599 1599                  /* Ask the legacy pic for a vector to inject */
1600 1600                  vatpic_pending_intr(sc->vm, &vector);
1601 1601                  KASSERT(vector >= 0 && vector <= 255,
1602 1602                      ("invalid vector %d from INTR", vector));
1603 1603  
1604 1604                  svm_inject_irq(sc, vcpu, vector);
1605 1605                  vm_extint_clear(sc->vm, vcpu);
1606 1606                  vatpic_intr_accepted(sc->vm, vector);
1607 1607                  ev_state = EIS_EV_INJECTED;
1608 1608          }
1609 1609  
1610 1610          return (ev_state);
1611 1611  }
1612 1612  
1613 1613  /*
1614 1614   * Synchronize vLAPIC state and inject any interrupts pending on it.
1615 1615   *
1616 1616   * This is done with host CPU interrupts disabled so notification IPIs will be
1617 1617   * queued on the host APIC and recognized when entering SVM guest context.
1618 1618   */
1619 1619  static enum event_inject_state
1620 1620  svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic,
1621 1621      enum event_inject_state ev_state)
1622 1622  {
1623 1623          struct vmcb_ctrl *ctrl;
1624 1624          struct vmcb_state *state;
1625 1625          int vector;
1626 1626          uint8_t v_tpr;
1627 1627  
1628 1628          state = svm_get_vmcb_state(sc, vcpu);
1629 1629          ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1630 1630  
1631 1631          /*
1632 1632           * The guest can modify the TPR by writing to %cr8. In guest mode the
1633 1633           * CPU reflects this write to V_TPR without hypervisor intervention.
1634 1634           *
1635 1635           * The guest can also modify the TPR by writing to it via the memory
1636 1636           * mapped APIC page. In this case, the write will be emulated by the
1637 1637           * hypervisor. For this reason V_TPR must be updated before every
1638 1638           * VMRUN.
1639 1639           */
1640 1640          v_tpr = vlapic_get_cr8(vlapic);
1641 1641          KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr));
1642 1642          if (ctrl->v_tpr != v_tpr) {
1643 1643                  ctrl->v_tpr = v_tpr;
1644 1644                  svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1645 1645          }
1646 1646  
1647 1647          /* If an event cannot otherwise be injected, we are done for now */
1648 1648          if (ev_state != EIS_CAN_INJECT) {
1649 1649                  return (ev_state);
1650 1650          }
1651 1651  
1652 1652          if (!vlapic_pending_intr(vlapic, &vector)) {
1653 1653                  return (EIS_CAN_INJECT);
1654 1654          }
1655 1655          KASSERT(vector >= 16 && vector <= 255,
1656 1656              ("invalid vector %d from local APIC", vector));
1657 1657  
1658 1658          /*
1659 1659           * If the guest has disabled interrupts or is in an interrupt shadow
1660 1660           * then we cannot inject the pending interrupt.
1661 1661           */
1662 1662          if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1663 1663                  return (EIS_GI_BLOCK);
1664 1664          }
1665 1665  
1666 1666          svm_inject_irq(sc, vcpu, vector);
1667 1667          vlapic_intr_accepted(vlapic, vector);
1668 1668          return (EIS_EV_INJECTED);
1669 1669  }
1670 1670  
1671 1671  /*
1672 1672   * Re-check for events to be injected.
1673 1673   *
1674 1674   * Once host CPU interrupts are disabled, check for the presence of any events
1675 1675   * which require injection processing.  If an exit is required upon injection,
1676 1676   * or once the guest becomes interruptible, that will be configured too.
1677 1677   */
1678 1678  static bool
1679 1679  svm_inject_recheck(struct svm_softc *sc, int vcpu,
1680 1680      enum event_inject_state ev_state)
1681 1681  {
1682 1682          struct vmcb_ctrl *ctrl;
1683 1683  
1684 1684          ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1685 1685  
1686 1686          if (ev_state == EIS_CAN_INJECT) {
1687 1687                  /*
1688 1688                   * An active interrupt shadow would preclude us from injecting
1689 1689                   * any events picked up during a re-check.
1690 1690                   */
1691 1691                  if (ctrl->intr_shadow != 0) {
1692 1692                          return (false);
1693 1693                  }
1694 1694  
1695 1695                  if (vm_nmi_pending(sc->vm, vcpu) &&
1696 1696                      !svm_nmi_blocked(sc, vcpu)) {
1697 1697                          /* queued NMI not blocked by NMI-window-exiting */
1698 1698                          return (true);
1699 1699                  }
1700 1700                  if (vm_extint_pending(sc->vm, vcpu)) {
1701 1701                          /* queued ExtINT not blocked by existing injection */
1702 1702                          return (true);
1703 1703                  }
1704 1704          } else {
1705 1705                  if ((ev_state & EIS_REQ_EXIT) != 0) {
1706 1706                          /*
1707 1707                           * Use a self-IPI to force an immediate exit after
1708 1708                           * event injection has occurred.
1709 1709                           */
1710 1710                          poke_cpu(CPU->cpu_id);
1711 1711                  } else {
1712 1712                          /*
1713 1713                           * If any event is being injected, an exit immediately
1714 1714                           * upon becoming interruptible again will allow pending
1715 1715                           * or newly queued events to be injected in a timely
1716 1716                           * manner.
1717 1717                           */
1718 1718                          svm_enable_intr_window_exiting(sc, vcpu);
1719 1719                  }
1720 1720          }
1721 1721          return (false);
1722 1722  }
1723 1723  
1724 1724  
1725 1725  #ifdef __FreeBSD__
1726 1726  static void
1727 1727  check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, uint_t thiscpu)
1728 1728  {
1729 1729          struct svm_vcpu *vcpustate;
1730 1730          struct vmcb_ctrl *ctrl;
1731 1731          long eptgen;
1732 1732          bool alloc_asid;
1733 1733  
1734 1734          KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
1735 1735              "active on cpu %u", __func__, thiscpu));
1736 1736  
1737 1737          vcpustate = svm_get_vcpu(sc, vcpuid);
1738 1738          ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1739 1739  
1740 1740          /*
1741 1741           * The TLB entries associated with the vcpu's ASID are not valid
1742 1742           * if either of the following conditions is true:
1743 1743           *
1744 1744           * 1. The vcpu's ASID generation is different than the host cpu's
1745 1745           *    ASID generation. This happens when the vcpu migrates to a new
1746 1746           *    host cpu. It can also happen when the number of vcpus executing
1747 1747           *    on a host cpu is greater than the number of ASIDs available.
1748 1748           *
1749 1749           * 2. The pmap generation number is different than the value cached in
1750 1750           *    the 'vcpustate'. This happens when the host invalidates pages
1751 1751           *    belonging to the guest.
1752 1752           *
1753 1753           *      asidgen         eptgen          Action
1754 1754           *      mismatch        mismatch
1755 1755           *      0               0               (a)
1756 1756           *      0               1               (b1) or (b2)
1757 1757           *      1               0               (c)
1758 1758           *      1               1               (d)
1759 1759           *
1760 1760           * (a)  There is no mismatch in eptgen or ASID generation and therefore
1761 1761           *      no further action is needed.
1762 1762           *
1763 1763           * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
1764 1764           *      retained and the TLB entries associated with this ASID
1765 1765           *      are flushed by VMRUN.
1766 1766           *
1767 1767           * (b2) If the cpu does not support FlushByAsid then a new ASID is
1768 1768           *      allocated.
1769 1769           *
1770 1770           * (c)  A new ASID is allocated.
1771 1771           *
1772 1772           * (d)  A new ASID is allocated.
1773 1773           */
1774 1774  
1775 1775          alloc_asid = false;
1776 1776          eptgen = pmap->pm_eptgen;
1777 1777          ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
1778 1778  
1779 1779          if (vcpustate->asid.gen != asid[thiscpu].gen) {
1780 1780                  alloc_asid = true;      /* (c) and (d) */
1781 1781          } else if (vcpustate->eptgen != eptgen) {
1782 1782                  if (flush_by_asid())
1783 1783                          ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;  /* (b1) */
1784 1784                  else
1785 1785                          alloc_asid = true;                      /* (b2) */
1786 1786          } else {
1787 1787                  /*
1788 1788                   * This is the common case (a).
1789 1789                   */
1790 1790                  KASSERT(!alloc_asid, ("ASID allocation not necessary"));
1791 1791                  KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
1792 1792                      ("Invalid VMCB tlb_ctrl: %x", ctrl->tlb_ctrl));
1793 1793          }
1794 1794  
1795 1795          if (alloc_asid) {
1796 1796                  if (++asid[thiscpu].num >= nasid) {
1797 1797                          asid[thiscpu].num = 1;
1798 1798                          if (++asid[thiscpu].gen == 0)
1799 1799                                  asid[thiscpu].gen = 1;
1800 1800                          /*
1801 1801                           * If this cpu does not support "flush-by-asid"
1802 1802                           * then flush the entire TLB on a generation
1803 1803                           * bump. Subsequent ASID allocation in this
1804 1804                           * generation can be done without a TLB flush.
1805 1805                           */
1806 1806                          if (!flush_by_asid())
1807 1807                                  ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
1808 1808                  }
1809 1809                  vcpustate->asid.gen = asid[thiscpu].gen;
1810 1810                  vcpustate->asid.num = asid[thiscpu].num;
1811 1811  
1812 1812                  ctrl->asid = vcpustate->asid.num;
1813 1813                  svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1814 1814                  /*
1815 1815                   * If this cpu supports "flush-by-asid" then the TLB
1816 1816                   * was not flushed after the generation bump. The TLB
1817 1817                   * is flushed selectively after every new ASID allocation.
1818 1818                   */
1819 1819                  if (flush_by_asid())
1820 1820                          ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
1821 1821          }
1822 1822          vcpustate->eptgen = eptgen;
1823 1823  
1824 1824          KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
1825 1825          KASSERT(ctrl->asid == vcpustate->asid.num,
1826 1826              ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
1827 1827  }
1828 1828  #else /* __FreeBSD__ */
1829 1829  static void
1830 1830  check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, uint_t thiscpu)
1831 1831  {
1832 1832          struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1833 1833          struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1834 1834          long eptgen;
1835 1835          uint8_t flush;
1836 1836  
1837 1837          eptgen = pmap->pm_eptgen;
1838 1838          flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
1839 1839              vcpustate->eptgen == eptgen);
1840 1840  
1841 1841          if (flush != VMCB_TLB_FLUSH_NOTHING) {
1842 1842                  ctrl->asid = vcpustate->hma_asid.hsa_asid;
1843 1843                  svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1844 1844          }
1845 1845          ctrl->tlb_ctrl = flush;
1846 1846          vcpustate->eptgen = eptgen;
1847 1847  }
1848 1848  #endif /* __FreeBSD__ */
1849 1849  
1850 1850  static __inline void
1851 1851  disable_gintr(void)
1852 1852  {
1853 1853          __asm __volatile("clgi");
1854 1854  }
1855 1855  
1856 1856  static __inline void
1857 1857  enable_gintr(void)
1858 1858  {
1859 1859          __asm __volatile("stgi");
1860 1860  }
1861 1861  
1862 1862  static __inline void
1863 1863  svm_dr_enter_guest(struct svm_regctx *gctx)
1864 1864  {
1865 1865  
1866 1866          /* Save host control debug registers. */
1867 1867          gctx->host_dr7 = rdr7();
1868 1868          gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
1869 1869  
1870 1870          /*
1871 1871           * Disable debugging in DR7 and DEBUGCTL to avoid triggering
1872 1872           * exceptions in the host based on the guest DRx values.  The
1873 1873           * guest DR6, DR7, and DEBUGCTL are saved/restored in the
1874 1874           * VMCB.
1875 1875           */
1876 1876          load_dr7(0);
1877 1877          wrmsr(MSR_DEBUGCTLMSR, 0);
1878 1878  
1879 1879          /* Save host debug registers. */
1880 1880          gctx->host_dr0 = rdr0();
1881 1881          gctx->host_dr1 = rdr1();
1882 1882          gctx->host_dr2 = rdr2();
1883 1883          gctx->host_dr3 = rdr3();
1884 1884          gctx->host_dr6 = rdr6();
1885 1885  
1886 1886          /* Restore guest debug registers. */
1887 1887          load_dr0(gctx->sctx_dr0);
1888 1888          load_dr1(gctx->sctx_dr1);
1889 1889          load_dr2(gctx->sctx_dr2);
1890 1890          load_dr3(gctx->sctx_dr3);
1891 1891  }
1892 1892  
1893 1893  static __inline void
1894 1894  svm_dr_leave_guest(struct svm_regctx *gctx)
1895 1895  {
1896 1896  
1897 1897          /* Save guest debug registers. */
1898 1898          gctx->sctx_dr0 = rdr0();
1899 1899          gctx->sctx_dr1 = rdr1();
1900 1900          gctx->sctx_dr2 = rdr2();
1901 1901          gctx->sctx_dr3 = rdr3();
1902 1902  
1903 1903          /*
1904 1904           * Restore host debug registers.  Restore DR7 and DEBUGCTL
1905 1905           * last.
1906 1906           */
1907 1907          load_dr0(gctx->host_dr0);
1908 1908          load_dr1(gctx->host_dr1);
1909 1909          load_dr2(gctx->host_dr2);

↓ open down ↓

1909 lines elided

↑ open up ↑

1910 1910          load_dr3(gctx->host_dr3);
1911 1911          load_dr6(gctx->host_dr6);
1912 1912          wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
1913 1913          load_dr7(gctx->host_dr7);
1914 1914  }
1915 1915  
1916 1916  /*
1917 1917   * Start vcpu with specified RIP.
1918 1918   */
1919 1919  static int
1920      -svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap,
1921      -    struct vm_eventinfo *evinfo)
     1920 +svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
1922 1921  {
1923 1922          struct svm_regctx *gctx;
1924 1923          struct svm_softc *svm_sc;
1925 1924          struct svm_vcpu *vcpustate;
1926 1925          struct vmcb_state *state;
1927 1926          struct vmcb_ctrl *ctrl;
1928 1927          struct vm_exit *vmexit;
1929 1928          struct vlapic *vlapic;
1930 1929          struct vm *vm;
1931 1930          uint64_t vmcb_pa;

1932 1931          int handled;
1933 1932          uint16_t ldt_sel;
1934 1933  
1935 1934          svm_sc = arg;
1936 1935          vm = svm_sc->vm;
1937 1936  
1938 1937          vcpustate = svm_get_vcpu(svm_sc, vcpu);
1939 1938          state = svm_get_vmcb_state(svm_sc, vcpu);
1940 1939          ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
1941 1940          vmexit = vm_exitinfo(vm, vcpu);
1942 1941          vlapic = vm_lapic(vm, vcpu);
1943 1942  
1944 1943          gctx = svm_get_guest_regctx(svm_sc, vcpu);
1945 1944          vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1946 1945  
1947 1946          if (vcpustate->lastcpu != curcpu) {
1948 1947                  /*
1949 1948                   * Force new ASID allocation by invalidating the generation.
1950 1949                   */
1951 1950  #ifdef __FreeBSD__
1952 1951                  vcpustate->asid.gen = 0;
1953 1952  #else
1954 1953                  vcpustate->hma_asid.hsa_gen = 0;
1955 1954  #endif
1956 1955  
1957 1956                  /*
1958 1957                   * Invalidate the VMCB state cache by marking all fields dirty.
1959 1958                   */
1960 1959                  svm_set_dirty(svm_sc, vcpu, 0xffffffff);
1961 1960  
1962 1961                  /*
1963 1962                   * XXX
1964 1963                   * Setting 'vcpustate->lastcpu' here is a bit premature because
1965 1964                   * we may return from this function without actually executing
1966 1965                   * the VMRUN instruction. This could happen if an AST or yield
1967 1966                   * condition is pending on the first time through the loop.
1968 1967                   *
1969 1968                   * This works for now but any new side-effects of vcpu
1970 1969                   * migration should take this case into account.
1971 1970                   */
1972 1971                  vcpustate->lastcpu = curcpu;
1973 1972                  vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1974 1973          }
1975 1974  
1976 1975          svm_msr_guest_enter(svm_sc, vcpu);
1977 1976  
1978 1977  #ifndef __FreeBSD__
1979 1978          VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
1980 1979          vcpustate->loaded = B_TRUE;
1981 1980  #endif
1982 1981  
1983 1982          /* Update Guest RIP */
1984 1983          state->rip = rip;
1985 1984  
1986 1985          do {
1987 1986                  enum event_inject_state inject_state;
1988 1987  
1989 1988                  /*
1990 1989                   * Initial event injection is complex and may involve mutex
1991 1990                   * contention, so it must be performed with global interrupts
1992 1991                   * still enabled.
1993 1992                   */
1994 1993                  inject_state = svm_inject_events(svm_sc, vcpu);
1995 1994                  handled = 0;
1996 1995  
1997 1996                  /*
1998 1997                   * Disable global interrupts to guarantee atomicity during
1999 1998                   * loading of guest state. This includes not only the state
2000 1999                   * loaded by the "vmrun" instruction but also software state
2001 2000                   * maintained by the hypervisor: suspended and rendezvous
2002 2001                   * state, NPT generation number, vlapic interrupts etc.

↓ open down ↓

71 lines elided

↑ open up ↑

2003 2002                   */
2004 2003                  disable_gintr();
2005 2004  
2006 2005                  /*
2007 2006                   * Synchronizing and injecting vlapic state is lock-free and is
2008 2007                   * safe (and prudent) to perform with interrupts disabled.
2009 2008                   */
2010 2009                  inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
2011 2010                      inject_state);
2012 2011  
2013      -                if (vcpu_suspended(evinfo)) {
     2012 +                /*
     2013 +                 * Check for vCPU bail-out conditions.  This must be done after
     2014 +                 * svm_inject_events() to detect a triple-fault condition.
     2015 +                 */
     2016 +                if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
2014 2017                          enable_gintr();
2015      -                        vm_exit_suspended(vm, vcpu, state->rip);
2016 2018                          break;
2017 2019                  }
2018 2020  
2019      -                if (vcpu_runblocked(evinfo)) {
     2021 +                if (vcpu_run_state_pending(vm, vcpu)) {
2020 2022                          enable_gintr();
2021      -                        vm_exit_runblock(vm, vcpu, state->rip);
     2023 +                        vm_exit_run_state(vm, vcpu, state->rip);
2022 2024                          break;
2023 2025                  }
2024 2026  
2025      -                if (vcpu_reqidle(evinfo)) {
2026      -                        enable_gintr();
2027      -                        vm_exit_reqidle(vm, vcpu, state->rip);
2028      -                        break;
2029      -                }
2030      -
2031      -                /* We are asked to give the cpu by scheduler. */
2032      -                if (vcpu_should_yield(vm, vcpu)) {
2033      -                        enable_gintr();
2034      -                        vm_exit_astpending(vm, vcpu, state->rip);
2035      -                        break;
2036      -                }
2037      -
2038      -                if (vcpu_debugged(vm, vcpu)) {
2039      -                        enable_gintr();
2040      -                        vm_exit_debug(vm, vcpu, state->rip);
2041      -                        break;
2042      -                }
2043      -
2044 2027                  /*
2045 2028                   * If subsequent activity queued events which require injection
2046 2029                   * handling, take another lap to handle them.
2047 2030                   */
2048 2031                  if (svm_inject_recheck(svm_sc, vcpu, inject_state)) {
2049 2032                          enable_gintr();
2050 2033                          handled = 1;
2051 2034                          continue;
2052 2035                  }
2053 2036

2054 2037                  /*
2055 2038                   * #VMEXIT resumes the host with the guest LDTR, so
2056 2039                   * save the current LDT selector so it can be restored
2057 2040                   * after an exit.  The userspace hypervisor probably
2058 2041                   * doesn't use a LDT, but save and restore it to be
2059 2042                   * safe.
2060 2043                   */
2061 2044                  ldt_sel = sldt();
2062 2045  
2063 2046                  /* Activate the nested pmap on 'curcpu' */
2064 2047                  CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active);
2065 2048  
2066 2049                  /*
2067 2050                   * Check the pmap generation and the ASID generation to
2068 2051                   * ensure that the vcpu does not use stale TLB mappings.
2069 2052                   */
2070 2053                  check_asid(svm_sc, vcpu, pmap, curcpu);
2071 2054  
2072 2055                  ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
2073 2056                  vcpustate->dirty = 0;
2074 2057                  VCPU_CTR1(vm, vcpu, "vmcb clean %x", ctrl->vmcb_clean);
2075 2058  
2076 2059                  /* Launch Virtual Machine. */
2077 2060                  VCPU_CTR1(vm, vcpu, "Resume execution at %lx", state->rip);
2078 2061                  svm_dr_enter_guest(gctx);
2079 2062                  svm_launch(vmcb_pa, gctx, get_pcpu());
2080 2063                  svm_dr_leave_guest(gctx);
2081 2064  
2082 2065                  CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);
2083 2066  
2084 2067                  /* Restore host LDTR. */
2085 2068                  lldt(ldt_sel);
2086 2069  
2087 2070                  /* #VMEXIT disables interrupts so re-enable them here. */
2088 2071                  enable_gintr();
2089 2072  
2090 2073                  /* Update 'nextrip' */
2091 2074                  vcpustate->nextrip = state->rip;
2092 2075  
2093 2076                  /* Handle #VMEXIT and if required return to user space. */
2094 2077                  handled = svm_vmexit(svm_sc, vcpu, vmexit);
2095 2078          } while (handled);
2096 2079  
2097 2080          svm_msr_guest_exit(svm_sc, vcpu);
2098 2081  
2099 2082  #ifndef __FreeBSD__
2100 2083          VERIFY(vcpustate->loaded && curthread->t_preempt != 0);
2101 2084          vcpustate->loaded = B_FALSE;
2102 2085  #endif
2103 2086  
2104 2087          return (0);
2105 2088  }
2106 2089  
2107 2090  static void
2108 2091  svm_vmcleanup(void *arg)
2109 2092  {
2110 2093          struct svm_softc *sc = arg;
2111 2094  
2112 2095          contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM);
2113 2096          contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM);
2114 2097          free(sc, M_SVM);
2115 2098  }
2116 2099  
2117 2100  static uint64_t *
2118 2101  swctx_regptr(struct svm_regctx *regctx, int reg)
2119 2102  {
2120 2103          switch (reg) {
2121 2104          case VM_REG_GUEST_RBX:
2122 2105                  return (&regctx->sctx_rbx);
2123 2106          case VM_REG_GUEST_RCX:
2124 2107                  return (&regctx->sctx_rcx);
2125 2108          case VM_REG_GUEST_RDX:
2126 2109                  return (&regctx->sctx_rdx);
2127 2110          case VM_REG_GUEST_RDI:
2128 2111                  return (&regctx->sctx_rdi);
2129 2112          case VM_REG_GUEST_RSI:
2130 2113                  return (&regctx->sctx_rsi);
2131 2114          case VM_REG_GUEST_RBP:
2132 2115                  return (&regctx->sctx_rbp);
2133 2116          case VM_REG_GUEST_R8:
2134 2117                  return (&regctx->sctx_r8);
2135 2118          case VM_REG_GUEST_R9:
2136 2119                  return (&regctx->sctx_r9);
2137 2120          case VM_REG_GUEST_R10:
2138 2121                  return (&regctx->sctx_r10);
2139 2122          case VM_REG_GUEST_R11:
2140 2123                  return (&regctx->sctx_r11);
2141 2124          case VM_REG_GUEST_R12:
2142 2125                  return (&regctx->sctx_r12);
2143 2126          case VM_REG_GUEST_R13:
2144 2127                  return (&regctx->sctx_r13);
2145 2128          case VM_REG_GUEST_R14:
2146 2129                  return (&regctx->sctx_r14);
2147 2130          case VM_REG_GUEST_R15:
2148 2131                  return (&regctx->sctx_r15);
2149 2132          case VM_REG_GUEST_DR0:
2150 2133                  return (&regctx->sctx_dr0);
2151 2134          case VM_REG_GUEST_DR1:
2152 2135                  return (&regctx->sctx_dr1);
2153 2136          case VM_REG_GUEST_DR2:
2154 2137                  return (&regctx->sctx_dr2);
2155 2138          case VM_REG_GUEST_DR3:
2156 2139                  return (&regctx->sctx_dr3);
2157 2140          default:
2158 2141                  return (NULL);
2159 2142          }
2160 2143  }
2161 2144  
2162 2145  static int
2163 2146  svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
2164 2147  {
2165 2148          struct svm_softc *sc;
2166 2149          struct vmcb *vmcb;
2167 2150          uint64_t *regp;
2168 2151          uint64_t *fieldp;
2169 2152          struct vmcb_segment *seg;
2170 2153  
2171 2154          sc = arg;
2172 2155          vmcb = svm_get_vmcb(sc, vcpu);
2173 2156  
2174 2157          regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2175 2158          if (regp != NULL) {
2176 2159                  *val = *regp;
2177 2160                  return (0);
2178 2161          }
2179 2162  
2180 2163          switch (ident) {
2181 2164          case VM_REG_GUEST_INTR_SHADOW:
2182 2165                  *val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0;
2183 2166                  break;
2184 2167  
2185 2168          case VM_REG_GUEST_CR0:
2186 2169          case VM_REG_GUEST_CR2:
2187 2170          case VM_REG_GUEST_CR3:
2188 2171          case VM_REG_GUEST_CR4:
2189 2172          case VM_REG_GUEST_DR6:
2190 2173          case VM_REG_GUEST_DR7:
2191 2174          case VM_REG_GUEST_EFER:
2192 2175          case VM_REG_GUEST_RAX:
2193 2176          case VM_REG_GUEST_RFLAGS:
2194 2177          case VM_REG_GUEST_RIP:
2195 2178          case VM_REG_GUEST_RSP:
2196 2179                  fieldp = vmcb_regptr(vmcb, ident, NULL);
2197 2180                  *val = *fieldp;
2198 2181                  break;
2199 2182  
2200 2183          case VM_REG_GUEST_CS:
2201 2184          case VM_REG_GUEST_DS:
2202 2185          case VM_REG_GUEST_ES:
2203 2186          case VM_REG_GUEST_FS:
2204 2187          case VM_REG_GUEST_GS:
2205 2188          case VM_REG_GUEST_SS:
2206 2189          case VM_REG_GUEST_LDTR:
2207 2190          case VM_REG_GUEST_TR:
2208 2191                  seg = vmcb_segptr(vmcb, ident);
2209 2192                  *val = seg->selector;
2210 2193                  break;
2211 2194  
2212 2195          case VM_REG_GUEST_GDTR:
2213 2196          case VM_REG_GUEST_IDTR:
2214 2197                  /* GDTR and IDTR don't have segment selectors */
2215 2198                  return (EINVAL);
2216 2199  
2217 2200          default:
2218 2201                  return (EINVAL);
2219 2202          }
2220 2203  
2221 2204          return (0);
2222 2205  }
2223 2206  
2224 2207  static int
2225 2208  svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
2226 2209  {
2227 2210          struct svm_softc *sc;
2228 2211          struct vmcb *vmcb;
2229 2212          uint64_t *regp;
2230 2213          uint64_t *fieldp;
2231 2214          uint32_t dirty;
2232 2215          struct vmcb_segment *seg;
2233 2216  
2234 2217          sc = arg;
2235 2218          vmcb = svm_get_vmcb(sc, vcpu);
2236 2219  
2237 2220          regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2238 2221          if (regp != NULL) {
2239 2222                  *regp = val;
2240 2223                  return (0);
2241 2224          }
2242 2225  
2243 2226          dirty = VMCB_CACHE_NONE;
2244 2227          switch (ident) {
2245 2228          case VM_REG_GUEST_INTR_SHADOW:
2246 2229                  vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0;
2247 2230                  break;
2248 2231  
2249 2232          case VM_REG_GUEST_EFER:
2250 2233                  fieldp = vmcb_regptr(vmcb, ident, &dirty);
2251 2234                  /* EFER_SVM must always be set when the guest is executing */
2252 2235                  *fieldp = val | EFER_SVM;
2253 2236                  dirty |= VMCB_CACHE_CR;
2254 2237                  break;
2255 2238  
2256 2239          case VM_REG_GUEST_CR0:
2257 2240          case VM_REG_GUEST_CR2:
2258 2241          case VM_REG_GUEST_CR3:
2259 2242          case VM_REG_GUEST_CR4:
2260 2243          case VM_REG_GUEST_DR6:
2261 2244          case VM_REG_GUEST_DR7:
2262 2245          case VM_REG_GUEST_RAX:
2263 2246          case VM_REG_GUEST_RFLAGS:
2264 2247          case VM_REG_GUEST_RIP:
2265 2248          case VM_REG_GUEST_RSP:
2266 2249                  fieldp = vmcb_regptr(vmcb, ident, &dirty);
2267 2250                  *fieldp = val;
2268 2251                  break;
2269 2252  
2270 2253          case VM_REG_GUEST_CS:
2271 2254          case VM_REG_GUEST_DS:
2272 2255          case VM_REG_GUEST_ES:
2273 2256          case VM_REG_GUEST_SS:
2274 2257          case VM_REG_GUEST_FS:
2275 2258          case VM_REG_GUEST_GS:
2276 2259          case VM_REG_GUEST_LDTR:
2277 2260          case VM_REG_GUEST_TR:
2278 2261                  dirty |= VMCB_CACHE_SEG;
2279 2262                  seg = vmcb_segptr(vmcb, ident);
2280 2263                  seg->selector = (uint16_t)val;
2281 2264                  break;
2282 2265  
2283 2266          case VM_REG_GUEST_GDTR:
2284 2267          case VM_REG_GUEST_IDTR:
2285 2268                  /* GDTR and IDTR don't have segment selectors */
2286 2269                  return (EINVAL);
2287 2270  
2288 2271          default:
2289 2272                  return (EINVAL);
2290 2273          }
2291 2274  
2292 2275          if (dirty != VMCB_CACHE_NONE) {
2293 2276                  svm_set_dirty(sc, vcpu, dirty);
2294 2277          }
2295 2278

↓ open down ↓

242 lines elided

↑ open up ↑

2296 2279          /*
2297 2280           * XXX deal with CR3 and invalidate TLB entries tagged with the
2298 2281           * vcpu's ASID. This needs to be treated differently depending on
2299 2282           * whether 'running' is true/false.
2300 2283           */
2301 2284  
2302 2285          return (0);
2303 2286  }
2304 2287  
2305 2288  static int
2306      -svm_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
     2289 +svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc)
2307 2290  {
2308 2291          struct vmcb *vmcb;
2309 2292          struct svm_softc *sc;
2310 2293          struct vmcb_segment *seg;
2311 2294  
2312 2295          sc = arg;
2313 2296          vmcb = svm_get_vmcb(sc, vcpu);
2314 2297  
2315 2298          switch (reg) {
2316 2299          case VM_REG_GUEST_CS:

2317 2300          case VM_REG_GUEST_DS:
2318 2301          case VM_REG_GUEST_ES:
2319 2302          case VM_REG_GUEST_SS:
2320 2303          case VM_REG_GUEST_FS:
2321 2304          case VM_REG_GUEST_GS:
2322 2305          case VM_REG_GUEST_LDTR:
2323 2306          case VM_REG_GUEST_TR:
2324 2307                  svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
2325 2308                  seg = vmcb_segptr(vmcb, reg);
2326 2309                  /*
2327 2310                   * Map seg_desc access to VMCB attribute format.
2328 2311                   *
2329 2312                   * SVM uses the 'P' bit in the segment attributes to indicate a
2330 2313                   * NULL segment so clear it if the segment is marked unusable.
2331 2314                   */
2332 2315                  seg->attrib = VMCB_ACCESS2ATTR(desc->access);
2333 2316                  if (SEG_DESC_UNUSABLE(desc->access)) {
2334 2317                          seg->attrib &= ~0x80;
2335 2318                  }
2336 2319                  break;
2337 2320  
2338 2321          case VM_REG_GUEST_GDTR:
2339 2322          case VM_REG_GUEST_IDTR:
2340 2323                  svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
2341 2324                  seg = vmcb_segptr(vmcb, reg);
2342 2325                  break;
2343 2326  
2344 2327          default:
2345 2328                  return (EINVAL);
2346 2329          }
2347 2330  
2348 2331          ASSERT(seg != NULL);
2349 2332          seg->base = desc->base;
2350 2333          seg->limit = desc->limit;
2351 2334  
2352 2335          return (0);
2353 2336  }
2354 2337  
2355 2338  static int
2356 2339  svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2357 2340  {
2358 2341          struct vmcb *vmcb;
2359 2342          struct svm_softc *sc;
2360 2343          struct vmcb_segment *seg;
2361 2344  
2362 2345          sc = arg;
2363 2346          vmcb = svm_get_vmcb(sc, vcpu);
2364 2347  
2365 2348          switch (reg) {
2366 2349          case VM_REG_GUEST_DS:
2367 2350          case VM_REG_GUEST_ES:
2368 2351          case VM_REG_GUEST_FS:
2369 2352          case VM_REG_GUEST_GS:
2370 2353          case VM_REG_GUEST_SS:
2371 2354          case VM_REG_GUEST_LDTR:
2372 2355                  seg = vmcb_segptr(vmcb, reg);
2373 2356                  desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2374 2357                  /*
2375 2358                   * VT-x uses bit 16 to indicate a segment that has been loaded
2376 2359                   * with a NULL selector (aka unusable). The 'desc->access'
2377 2360                   * field is interpreted in the VT-x format by the
2378 2361                   * processor-independent code.
2379 2362                   *
2380 2363                   * SVM uses the 'P' bit to convey the same information so
2381 2364                   * convert it into the VT-x format. For more details refer to
2382 2365                   * section "Segment State in the VMCB" in APMv2.
2383 2366                   */
2384 2367                  if ((desc->access & 0x80) == 0) {
2385 2368                          /* Unusable segment */
2386 2369                          desc->access |= 0x10000;
2387 2370                  }
2388 2371                  break;
2389 2372  
2390 2373          case VM_REG_GUEST_CS:
2391 2374          case VM_REG_GUEST_TR:
2392 2375                  seg = vmcb_segptr(vmcb, reg);
2393 2376                  desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2394 2377                  break;
2395 2378  
2396 2379          case VM_REG_GUEST_GDTR:
2397 2380          case VM_REG_GUEST_IDTR:
2398 2381                  seg = vmcb_segptr(vmcb, reg);
2399 2382                  /*
2400 2383                   * Since there are no access bits associated with the GDTR or
2401 2384                   * the IDTR, zero out the field to ensure it does not contain
2402 2385                   * garbage which might confuse the consumer.
2403 2386                   */
2404 2387                  desc->access = 0;
2405 2388                  break;
2406 2389  
2407 2390          default:
2408 2391                  return (EINVAL);
2409 2392          }
2410 2393  
2411 2394          ASSERT(seg != NULL);
2412 2395          desc->base = seg->base;
2413 2396          desc->limit = seg->limit;
2414 2397          return (0);
2415 2398  }
2416 2399  
2417 2400  static int
2418 2401  svm_setcap(void *arg, int vcpu, int type, int val)
2419 2402  {
2420 2403          struct svm_softc *sc;
2421 2404          int error;
2422 2405  
2423 2406          sc = arg;
2424 2407          error = 0;
2425 2408          switch (type) {
2426 2409          case VM_CAP_HALT_EXIT:
2427 2410                  svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2428 2411                      VMCB_INTCPT_HLT, val);
2429 2412                  break;
2430 2413          case VM_CAP_PAUSE_EXIT:
2431 2414                  svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2432 2415                      VMCB_INTCPT_PAUSE, val);
2433 2416                  break;
2434 2417          default:
2435 2418                  error = ENOENT;
2436 2419                  break;
2437 2420          }
2438 2421          return (error);
2439 2422  }
2440 2423  
2441 2424  static int
2442 2425  svm_getcap(void *arg, int vcpu, int type, int *retval)
2443 2426  {
2444 2427          struct svm_softc *sc;
2445 2428          int error;
2446 2429  
2447 2430          sc = arg;
2448 2431          error = 0;
2449 2432  
2450 2433          switch (type) {
2451 2434          case VM_CAP_HALT_EXIT:
2452 2435                  *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2453 2436                      VMCB_INTCPT_HLT);
2454 2437                  break;
2455 2438          case VM_CAP_PAUSE_EXIT:
2456 2439                  *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2457 2440                      VMCB_INTCPT_PAUSE);
2458 2441                  break;
2459 2442          default:
2460 2443                  error = ENOENT;
2461 2444                  break;
2462 2445          }
2463 2446          return (error);
2464 2447  }
2465 2448  
2466 2449  static struct vlapic *
2467 2450  svm_vlapic_init(void *arg, int vcpuid)
2468 2451  {
2469 2452          struct svm_softc *svm_sc;
2470 2453          struct vlapic *vlapic;
2471 2454  
2472 2455          svm_sc = arg;
2473 2456          vlapic = malloc(sizeof (struct vlapic), M_SVM_VLAPIC,
2474 2457              M_WAITOK | M_ZERO);
2475 2458          vlapic->vm = svm_sc->vm;
2476 2459          vlapic->vcpuid = vcpuid;
2477 2460          vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2478 2461  
2479 2462          vlapic_init(vlapic);
2480 2463  
2481 2464          return (vlapic);
2482 2465  }
2483 2466  
2484 2467  static void
2485 2468  svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2486 2469  {
2487 2470          vlapic_cleanup(vlapic);
2488 2471          free(vlapic, M_SVM_VLAPIC);
2489 2472  }
2490 2473  
2491 2474  #ifndef __FreeBSD__
2492 2475  static void
2493 2476  svm_savectx(void *arg, int vcpu)
2494 2477  {
2495 2478          struct svm_softc *sc = arg;
2496 2479  
2497 2480          if (sc->vcpu[vcpu].loaded) {
2498 2481                  svm_msr_guest_exit(sc, vcpu);
2499 2482          }
2500 2483  }
2501 2484  
2502 2485  static void
2503 2486  svm_restorectx(void *arg, int vcpu)
2504 2487  {
2505 2488          struct svm_softc *sc = arg;
2506 2489  
2507 2490          if (sc->vcpu[vcpu].loaded) {
2508 2491                  svm_msr_guest_enter(sc, vcpu);
2509 2492          }
2510 2493  }
2511 2494  #endif /* __FreeBSD__ */
2512 2495  
2513 2496  struct vmm_ops vmm_ops_amd = {
2514 2497          .init           = svm_init,
2515 2498          .cleanup        = svm_cleanup,
2516 2499          .resume         = svm_restore,
2517 2500          .vminit         = svm_vminit,
2518 2501          .vmrun          = svm_vmrun,
2519 2502          .vmcleanup      = svm_vmcleanup,
2520 2503          .vmgetreg       = svm_getreg,
2521 2504          .vmsetreg       = svm_setreg,
2522 2505          .vmgetdesc      = svm_getdesc,
2523 2506          .vmsetdesc      = svm_setdesc,
2524 2507          .vmgetcap       = svm_getcap,
2525 2508          .vmsetcap       = svm_setcap,
2526 2509          .vmspace_alloc  = svm_npt_alloc,
2527 2510          .vmspace_free   = svm_npt_free,
2528 2511          .vlapic_init    = svm_vlapic_init,
2529 2512          .vlapic_cleanup = svm_vlapic_cleanup,
2530 2513  #ifndef __FreeBSD__
2531 2514          .vmsavectx      = svm_savectx,
2532 2515          .vmrestorectx   = svm_restorectx,
2533 2516  #endif
2534 2517  };

↓ open down ↓

218 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX