1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice unmodified, this list of conditions, and the following
  12  *    disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28 
  29 /*
  30  * This file and its contents are supplied under the terms of the
  31  * Common Development and Distribution License ("CDDL"), version 1.0.
  32  * You may only use this file in accordance with the terms of version
  33  * 1.0 of the CDDL.
  34  *
  35  * A full copy of the text of the CDDL should have accompanied this
  36  * source.  A copy of the CDDL is also available via the Internet at
  37  * http://www.illumos.org/license/CDDL.
  38  *
  39  * Copyright 2018 Joyent, Inc.
  40  * Copyright 2020 Oxide Computer Company
  41  */
  42 
  43 #include <sys/cdefs.h>
  44 __FBSDID("$FreeBSD$");
  45 
  46 #include <sys/param.h>
  47 #include <sys/systm.h>
  48 #include <sys/smp.h>
  49 #include <sys/kernel.h>
  50 #include <sys/malloc.h>
  51 #include <sys/pcpu.h>
  52 #include <sys/proc.h>
  53 #include <sys/sysctl.h>
  54 
  55 #ifndef __FreeBSD__
  56 #include <sys/x86_archext.h>
  57 #include <sys/trap.h>
  58 #endif
  59 
  60 #include <vm/vm.h>
  61 #include <vm/pmap.h>
  62 
  63 #include <machine/cpufunc.h>
  64 #include <machine/psl.h>
  65 #include <machine/md_var.h>
  66 #include <machine/reg.h>
  67 #include <machine/specialreg.h>
  68 #include <machine/smp.h>
  69 #include <machine/vmm.h>
  70 #include <machine/vmm_dev.h>
  71 #include <sys/vmm_instruction_emul.h>
  72 
  73 #include "vmm_lapic.h"
  74 #include "vmm_stat.h"
  75 #include "vmm_ktr.h"
  76 #include "vmm_ioport.h"
  77 #include "vatpic.h"
  78 #include "vlapic.h"
  79 #include "vlapic_priv.h"
  80 
  81 #include "x86.h"
  82 #include "vmcb.h"
  83 #include "svm.h"
  84 #include "svm_softc.h"
  85 #include "svm_msr.h"
  86 #include "npt.h"
  87 
  88 SYSCTL_DECL(_hw_vmm);
  89 SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
  90     NULL);
  91 
  92 /*
  93  * SVM CPUID function 0x8000_000A, edx bit decoding.
  94  */
  95 #define AMD_CPUID_SVM_NP                BIT(0)  /* Nested paging or RVI */
  96 #define AMD_CPUID_SVM_LBR               BIT(1)  /* Last branch virtualization */
  97 #define AMD_CPUID_SVM_SVML              BIT(2)  /* SVM lock */
  98 #define AMD_CPUID_SVM_NRIP_SAVE         BIT(3)  /* Next RIP is saved */
  99 #define AMD_CPUID_SVM_TSC_RATE          BIT(4)  /* TSC rate control. */
 100 #define AMD_CPUID_SVM_VMCB_CLEAN        BIT(5)  /* VMCB state caching */
 101 #define AMD_CPUID_SVM_FLUSH_BY_ASID     BIT(6)  /* Flush by ASID */
 102 #define AMD_CPUID_SVM_DECODE_ASSIST     BIT(7)  /* Decode assist */
 103 #define AMD_CPUID_SVM_PAUSE_INC         BIT(10) /* Pause intercept filter. */
 104 #define AMD_CPUID_SVM_PAUSE_FTH         BIT(12) /* Pause filter threshold */
 105 #define AMD_CPUID_SVM_AVIC              BIT(13) /* AVIC present */
 106 
 107 #define VMCB_CACHE_DEFAULT      (VMCB_CACHE_ASID        |       \
 108                                 VMCB_CACHE_IOPM         |       \
 109                                 VMCB_CACHE_I            |       \
 110                                 VMCB_CACHE_TPR          |       \
 111                                 VMCB_CACHE_CR2          |       \
 112                                 VMCB_CACHE_CR           |       \
 113                                 VMCB_CACHE_DR           |       \
 114                                 VMCB_CACHE_DT           |       \
 115                                 VMCB_CACHE_SEG          |       \
 116                                 VMCB_CACHE_NP)
 117 
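/*
 * Mask of VMCB "clean" bits that may be asserted when entering the guest.  A
 * set clean bit tells the CPU that the corresponding VMCB state has not been
 * modified since the last VMRUN, so its cached copy may be reused.
 */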
 118 static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
 119 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
 120     0, NULL);
 121 
 122 static MALLOC_DEFINE(M_SVM, "svm", "svm");
 123 static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
 124 
 125 /* SVM features advertised by CPUID.8000000AH:EDX */
 126 static uint32_t svm_feature = ~0U;      /* AMD SVM features. */
 127 
 128 static int disable_npf_assist;
 129 
 130 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
 131 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
 132 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
 133 
 134 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
 135 
 136 static __inline int
 137 flush_by_asid(void)
 138 {
 139         return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
 140 }
 141 
 142 static __inline int
 143 decode_assist(void)
 144 {
 145         return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST);
 146 }
 147 
 148 #ifdef __FreeBSD__
 149 static void
 150 svm_disable(void *arg __unused)
 151 {
 152         uint64_t efer;
 153 
 154         efer = rdmsr(MSR_EFER);
 155         efer &= ~EFER_SVM;
 156         wrmsr(MSR_EFER, efer);
 157 }
 158 
 159 /*
 160  * Disable SVM on all CPUs.
 161  */
 162 static int
 163 svm_cleanup(void)
 164 {
 165 
 166         smp_rendezvous(NULL, svm_disable, NULL, NULL);
 167         return (0);
 168 }
 169 
 170 /*
 171  * Verify that all the features required by bhyve are available.
 172  */
 173 static int
 174 check_svm_features(void)
 175 {
 176         uint_t regs[4];
 177 
 178         /* CPUID Fn8000_000A is for SVM */
 179         do_cpuid(0x8000000A, regs);
 180         svm_feature &= regs[3];
 181 
 182         /*
 183          * The number of ASIDs can be configured to be less than what is
 184          * supported by the hardware but not more.
 185          */
 186         if (nasid == 0 || nasid > regs[1])
 187                 nasid = regs[1];
 188         KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %x", nasid));
 189 
 190         /* bhyve requires the Nested Paging feature */
 191         if (!(svm_feature & AMD_CPUID_SVM_NP)) {
 192                 printf("SVM: Nested Paging feature not available.\n");
 193                 return (ENXIO);
 194         }
 195 
 196         /* bhyve requires the NRIP Save feature */
 197         if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) {
 198                 printf("SVM: NRIP Save feature not available.\n");
 199                 return (ENXIO);
 200         }
 201 
 202         return (0);
 203 }
 204 
 205 static void
 206 svm_enable(void *arg __unused)
 207 {
 208         uint64_t efer;
 209 
 210         efer = rdmsr(MSR_EFER);
 211         efer |= EFER_SVM;
 212         wrmsr(MSR_EFER, efer);
 213 
 214         wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));
 215 }
 216 
/*
 * Return 1 if SVM is available on this processor (supported and not disabled
 * by the BIOS) and 0 otherwise.
 */
 220 static int
 221 svm_available(void)
 222 {
 223         uint64_t msr;
 224 
 225 #ifdef __FreeBSD__
 226         /* Section 15.4 Enabling SVM from APM2. */
 227         if ((amd_feature2 & AMDID2_SVM) == 0) {
 228                 printf("SVM: not available.\n");
 229                 return (0);
 230         }
 231 #else
 232         if (!is_x86_feature(x86_featureset, X86FSET_SVM)) {
 233                 cmn_err(CE_WARN, "processor does not support SVM operation\n");
 234                 return (0);
 235         }
 236 #endif
 237 
 238         msr = rdmsr(MSR_VM_CR);
 239         if ((msr & VM_CR_SVMDIS) != 0) {
 240 #ifdef __FreeBSD__
 241                 printf("SVM: disabled by BIOS.\n");
 242 #else
 243                 cmn_err(CE_WARN, "SVM disabled by BIOS.\n");
 244 #endif
 245                 return (0);
 246         }
 247 
 248         return (1);
 249 }
 250 
 251 static int
 252 svm_init(int ipinum)
 253 {
 254         int error, cpu;
 255 
 256         if (!svm_available())
 257                 return (ENXIO);
 258 
 259         error = check_svm_features();
 260         if (error)
 261                 return (error);
 262 
 263         vmcb_clean &= VMCB_CACHE_DEFAULT;
 264 
 265         for (cpu = 0; cpu < MAXCPU; cpu++) {
 266                 /*
 267                  * Initialize the host ASIDs to their "highest" valid values.
 268                  *
		 * The next ASID allocation will roll over both 'gen' and 'num'
 270                  * and start off the sequence at {1,1}.
 271                  */
 272                 asid[cpu].gen = ~0UL;
 273                 asid[cpu].num = nasid - 1;
 274         }
 275 
 276         svm_msr_init();
 277         svm_npt_init(ipinum);
 278 
 279         /* Enable SVM on all CPUs */
 280         smp_rendezvous(NULL, svm_enable, NULL, NULL);
 281 
 282         return (0);
 283 }
 284 
 285 static void
 286 svm_restore(void)
 287 {
 288 
 289         svm_enable(NULL);
 290 }
 291 #else /* __FreeBSD__ */
 292 static int
 293 svm_cleanup(void)
 294 {
 295         /* This is taken care of by the hma registration */
 296         return (0);
 297 }
 298 
 299 static int
 300 svm_init(int ipinum)
 301 {
 302         vmcb_clean &= VMCB_CACHE_DEFAULT;
 303 
 304         svm_msr_init();
 305         svm_npt_init(ipinum);
 306 
 307         return (0);
 308 }
 309 
 310 static void
 311 svm_restore(void)
 312 {
 313         /* No-op on illumos */
 314 }
 315 #endif /* __FreeBSD__ */
 316 
 317 /* Pentium compatible MSRs */
 318 #define MSR_PENTIUM_START       0
 319 #define MSR_PENTIUM_END         0x1FFF
 320 /* AMD 6th generation and Intel compatible MSRs */
 321 #define MSR_AMD6TH_START        0xC0000000UL
 322 #define MSR_AMD6TH_END          0xC0001FFFUL
 323 /* AMD 7th and 8th generation compatible MSRs */
 324 #define MSR_AMD7TH_START        0xC0010000UL
 325 #define MSR_AMD7TH_END          0xC0011FFFUL
 326 
/*
 * Get the index and bit position for an MSR in the permission bitmap.
 * Two bits are used for each MSR: lower bit for read and higher bit for write.
 */
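/*
 * For example, MSR 0xC0000082 (LSTAR) lies in the AMD 6th generation range at
 * offset 0x82.  The 0x2000 Pentium-range MSRs are mapped first, so its index
 * is (0x2000 + 0x82) / 4 = 0x820, with the read permission in bit 4 and the
 * write permission in bit 5 of that byte.
 */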
 331 static int
 332 svm_msr_index(uint64_t msr, int *index, int *bit)
 333 {
 334         uint32_t base, off;
 335 
 336         *index = -1;
 337         *bit = (msr % 4) * 2;
 338         base = 0;
 339 
 340         if (msr <= MSR_PENTIUM_END) {
 341                 *index = msr / 4;
 342                 return (0);
 343         }
 344 
 345         base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
 346         if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
 347                 off = (msr - MSR_AMD6TH_START);
 348                 *index = (off + base) / 4;
 349                 return (0);
 350         }
 351 
 352         base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
 353         if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
 354                 off = (msr - MSR_AMD7TH_START);
 355                 *index = (off + base) / 4;
 356                 return (0);
 357         }
 358 
 359         return (EINVAL);
 360 }
 361 
 362 /*
 363  * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
 364  */
 365 static void
 366 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
 367 {
 368         int index, bit, error;
 369 
 370         error = svm_msr_index(msr, &index, &bit);
 371         KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr));
 372         KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
 373             ("%s: invalid index %d for msr %lx", __func__, index, msr));
 374         KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
 375             "msr %lx", __func__, bit, msr));
 376 
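	/*
	 * A set bit in the permission bitmap causes the access to be
	 * intercepted: clearing the lower bit of the pair allows reads and
	 * clearing the upper bit allows writes.
	 */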
 377         if (read)
 378                 perm_bitmap[index] &= ~(1UL << bit);
 379 
 380         if (write)
 381                 perm_bitmap[index] &= ~(2UL << bit);
 382 }
 383 
 384 static void
 385 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
 386 {
 387 
 388         svm_msr_perm(perm_bitmap, msr, true, true);
 389 }
 390 
 391 static void
 392 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
 393 {
 394 
 395         svm_msr_perm(perm_bitmap, msr, true, false);
 396 }
 397 
 398 static __inline int
 399 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
 400 {
 401         struct vmcb_ctrl *ctrl;
 402 
 403         KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
 404 
 405         ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 406         return (ctrl->intercept[idx] & bitmask ? 1 : 0);
 407 }
 408 
 409 static __inline void
 410 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
 411     int enabled)
 412 {
 413         struct vmcb_ctrl *ctrl;
 414         uint32_t oldval;
 415 
 416         KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
 417 
 418         ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 419         oldval = ctrl->intercept[idx];
 420 
 421         if (enabled)
 422                 ctrl->intercept[idx] |= bitmask;
 423         else
 424                 ctrl->intercept[idx] &= ~bitmask;
 425 
 426         if (ctrl->intercept[idx] != oldval) {
 427                 svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
 428                 VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
 429                     "from %x to %x", idx, oldval, ctrl->intercept[idx]);
 430         }
 431 }
 432 
 433 static __inline void
 434 svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
 435 {
 436 
 437         svm_set_intercept(sc, vcpu, off, bitmask, 0);
 438 }
 439 
 440 static __inline void
 441 svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
 442 {
 443 
 444         svm_set_intercept(sc, vcpu, off, bitmask, 1);
 445 }
 446 
 447 static void
 448 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
 449     uint64_t msrpm_base_pa, uint64_t np_pml4)
 450 {
 451         struct vmcb_ctrl *ctrl;
 452         struct vmcb_state *state;
 453         uint32_t mask;
 454         int n;
 455 
 456         ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 457         state = svm_get_vmcb_state(sc, vcpu);
 458 
 459         ctrl->iopm_base_pa = iopm_base_pa;
 460         ctrl->msrpm_base_pa = msrpm_base_pa;
 461 
 462         /* Enable nested paging */
 463         ctrl->np_ctrl = NP_ENABLE;
 464         ctrl->n_cr3 = np_pml4;
 465 
 466         /*
 467          * Intercept accesses to the control registers that are not shadowed
 468          * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
 469          */
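	/*
	 * The low 16 bits of the CR intercept vector control reads of
	 * CR0-CR15 and the high 16 bits control writes, hence the paired
	 * bitmask below.
	 */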
 470         for (n = 0; n < 16; n++) {
 471                 mask = (BIT(n) << 16) | BIT(n);
 472                 if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
 473                         svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
 474                 else
 475                         svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
 476         }

	/*
	 * Intercept everything when tracing guest exceptions, otherwise
	 * just intercept the machine check exception.
	 */
 483         if (vcpu_trace_exceptions(sc->vm, vcpu)) {
 484                 for (n = 0; n < 32; n++) {
 485                         /*
 486                          * Skip unimplemented vectors in the exception bitmap.
 487                          */
 488                         if (n == 2 || n == 9) {
 489                                 continue;
 490                         }
 491                         svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
 492                 }
 493         } else {
 494                 svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
 495         }
 496 
	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
 498         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
 499         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
 500         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
 501         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
 502         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
 503         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
 504         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
 505         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
 506         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 507             VMCB_INTCPT_FERR_FREEZE);
 508 
 509         svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
 510         svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
 511 
 512         /* Intercept privileged invalidation instructions. */
 513         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
 514         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);
 515 
 516         /*
 517          * Intercept all virtualization-related instructions.
 518          *
 519          * From section "Canonicalization and Consistency Checks" in APMv2
 520          * the VMRUN intercept bit must be set to pass the consistency check.
 521          */
 522         svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
 523         svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL);
 524         svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
 525         svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
 526         svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
 527         svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
 528         svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
 529 
 530         /*
 531          * The ASID will be set to a non-zero value just before VMRUN.
 532          */
 533         ctrl->asid = 0;
 534 
 535         /*
 536          * Section 15.21.1, Interrupt Masking in EFLAGS
 537          * Section 15.21.2, Virtualizing APIC.TPR
 538          *
	 * This must be set for %rflags and %cr8 isolation of guest and host.
 540          */
 541         ctrl->v_intr_ctrl |= V_INTR_MASKING;
 542 
 543         /* Enable Last Branch Record aka LBR for debugging */
 544         ctrl->misc_ctrl |= LBR_VIRT_ENABLE;
 545         state->dbgctl = BIT(0);
 546 
 547         /* EFER_SVM must always be set when the guest is executing */
 548         state->efer = EFER_SVM;
 549 
 550         /* Set up the PAT to power-on state */
 551         state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)  |
 552             PAT_VALUE(1, PAT_WRITE_THROUGH)     |
 553             PAT_VALUE(2, PAT_UNCACHED)          |
 554             PAT_VALUE(3, PAT_UNCACHEABLE)       |
 555             PAT_VALUE(4, PAT_WRITE_BACK)        |
 556             PAT_VALUE(5, PAT_WRITE_THROUGH)     |
 557             PAT_VALUE(6, PAT_UNCACHED)          |
 558             PAT_VALUE(7, PAT_UNCACHEABLE);
 559 
 560         /* Set up DR6/7 to power-on state */
 561         state->dr6 = DBREG_DR6_RESERVED1;
 562         state->dr7 = DBREG_DR7_RESERVED1;
 563 }
 564 
 565 /*
 566  * Initialize a virtual machine.
 567  */
 568 static void *
 569 svm_vminit(struct vm *vm, pmap_t pmap)
 570 {
 571         struct svm_softc *svm_sc;
 572         struct svm_vcpu *vcpu;
 573         vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
 574         int i;
 575         uint16_t maxcpus;
 576 
 577         svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO);
 578         if (((uintptr_t)svm_sc & PAGE_MASK) != 0)
 579                 panic("malloc of svm_softc not aligned on page boundary");
 580 
 581         svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM,
 582             M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
 583         if (svm_sc->msr_bitmap == NULL)
 584                 panic("contigmalloc of SVM MSR bitmap failed");
 585         svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM,
 586             M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
 587         if (svm_sc->iopm_bitmap == NULL)
 588                 panic("contigmalloc of SVM IO bitmap failed");
 589 
 590         svm_sc->vm = vm;
 591         svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
 592 
 593         /*
 594          * Intercept read and write accesses to all MSRs.
 595          */
 596         memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
 597 
 598         /*
 599          * Access to the following MSRs is redirected to the VMCB when the
 600          * guest is executing. Therefore it is safe to allow the guest to
 601          * read/write these MSRs directly without hypervisor involvement.
 602          */
 603         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
 604         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
 605         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
 606 
 607         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
 608         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
 609         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
 610         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
 611         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
 612         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
 613         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
 614         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
 615 
 616         svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
 617 
 618         /*
 619          * Intercept writes to make sure that the EFER_SVM bit is not cleared.
 620          */
 621         svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
 622 
 623         /* Intercept access to all I/O ports. */
 624         memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
 625 
 626         iopm_pa = vtophys(svm_sc->iopm_bitmap);
 627         msrpm_pa = vtophys(svm_sc->msr_bitmap);
 628         pml4_pa = svm_sc->nptp;
 629         maxcpus = vm_get_maxcpus(svm_sc->vm);
 630         for (i = 0; i < maxcpus; i++) {
 631                 vcpu = svm_get_vcpu(svm_sc, i);
 632                 vcpu->nextrip = ~0;
 633                 vcpu->lastcpu = NOCPU;
 634                 vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
 635                 vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
 636                 svm_msr_guest_init(svm_sc, i);
 637         }
 638         return (svm_sc);
 639 }
 640 
 641 /*
 642  * Collateral for a generic SVM VM-exit.
 643  */
 644 static void
 645 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
 646 {
 647 
 648         vme->exitcode = VM_EXITCODE_SVM;
 649         vme->u.svm.exitcode = code;
 650         vme->u.svm.exitinfo1 = info1;
 651         vme->u.svm.exitinfo2 = info2;
 652 }
 653 
 654 static int
 655 svm_cpl(struct vmcb_state *state)
 656 {
 657 
 658         /*
 659          * From APMv2:
 660          *   "Retrieve the CPL from the CPL field in the VMCB, not
 661          *    from any segment DPL"
 662          */
 663         return (state->cpl);
 664 }
 665 
 666 static enum vm_cpu_mode
 667 svm_vcpu_mode(struct vmcb *vmcb)
 668 {
 669         struct vmcb_state *state;
 670 
 671         state = &vmcb->state;
 672 
 673         if (state->efer & EFER_LMA) {
 674                 struct vmcb_segment *seg;
 675 
 676                 /*
 677                  * Section 4.8.1 for APM2, check if Code Segment has
 678                  * Long attribute set in descriptor.
 679                  */
 680                 seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
 681                 if (seg->attrib & VMCB_CS_ATTRIB_L)
 682                         return (CPU_MODE_64BIT);
 683                 else
 684                         return (CPU_MODE_COMPATIBILITY);
 685         } else  if (state->cr0 & CR0_PE) {
 686                 return (CPU_MODE_PROTECTED);
 687         } else {
 688                 return (CPU_MODE_REAL);
 689         }
 690 }
 691 
 692 static enum vm_paging_mode
 693 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
 694 {
 695 
 696         if ((cr0 & CR0_PG) == 0)
 697                 return (PAGING_MODE_FLAT);
 698         if ((cr4 & CR4_PAE) == 0)
 699                 return (PAGING_MODE_32);
 700         if (efer & EFER_LME)
 701                 return (PAGING_MODE_64);
 702         else
 703                 return (PAGING_MODE_PAE);
 704 }
 705 
 706 /*
 707  * ins/outs utility routines
 708  */
 709 
 710 static void
 711 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
 712 {
 713         struct vmcb_state *state;
 714 
 715         state = &vmcb->state;
 716         paging->cr3 = state->cr3;
 717         paging->cpl = svm_cpl(state);
 718         paging->cpu_mode = svm_vcpu_mode(vmcb);
 719         paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
 720             state->efer);
 721 }
 722 
 723 #define UNHANDLED 0
 724 
 725 /*
 726  * Handle guest I/O intercept.
 727  */
 728 static int
 729 svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
 730 {
 731         struct vmcb_ctrl *ctrl;
 732         struct vmcb_state *state;
 733         struct vm_inout *inout;
 734         struct vie *vie;
 735         uint64_t info1;
 736         struct vm_guest_paging paging;
 737 
 738         state = svm_get_vmcb_state(svm_sc, vcpu);
 739         ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
 740         inout = &vmexit->u.inout;
 741         info1 = ctrl->exitinfo1;
 742 
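	/*
	 * Decode the IOIO EXITINFO1 fields: bit 0 distinguishes IN from OUT,
	 * bit 2 flags a string operation, bit 3 flags a REP prefix, bits 4-6
	 * give the operand size in bytes and bits 16-31 hold the port number.
	 */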
 743         inout->bytes = (info1 >> 4) & 0x7;
 744         inout->flags = 0;
 745         inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0;
 746         inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0;
 747         inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0;
 748         inout->port = (uint16_t)(info1 >> 16);
 749         inout->eax = (uint32_t)(state->rax);
 750 
 751         if ((inout->flags & INOUT_STR) != 0) {
 752                 /*
 753                  * The effective segment number in EXITINFO1[12:10] is populated
 754                  * only if the processor has the DecodeAssist capability.
 755                  *
 756                  * This is not specified explicitly in APMv2 but can be verified
 757                  * empirically.
 758                  */
 759                 if (!decode_assist()) {
 760                         /*
 761                          * Without decoding assistance, force the task of
 762                          * emulating the ins/outs on userspace.
 763                          */
 764                         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
 765                         bzero(&vmexit->u.inst_emul,
 766                             sizeof (vmexit->u.inst_emul));
 767                         return (UNHANDLED);
 768                 }
 769 
 770                 /*
 771                  * Bits 7-9 encode the address size of ins/outs operations where
 772                  * the 1/2/4 values correspond to 16/32/64 bit sizes.
 773                  */
 774                 inout->addrsize = 2 * ((info1 >> 7) & 0x7);
 775                 VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
 776                     inout->addrsize == 8);
 777 
 778                 if (inout->flags & INOUT_IN) {
 779                         /*
 780                          * For INS instructions, %es (encoded as 0) is the
 781                          * implied segment for the operation.
 782                          */
 783                         inout->segment = 0;
 784                 } else {
 785                         /*
 786                          * Bits 10-12 encode the segment for OUTS.
 787                          * This value follows the standard x86 segment order.
 788                          */
 789                         inout->segment = (info1 >> 10) & 0x7;
 790                 }
 791         }
 792 
 793         vmexit->exitcode = VM_EXITCODE_INOUT;
 794         svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
 795         vie = vm_vie_ctx(svm_sc->vm, vcpu);
 796         vie_init_inout(vie, inout, vmexit->inst_length, &paging);
 797 
 798         /* The in/out emulation will handle advancing %rip */
 799         vmexit->inst_length = 0;
 800 
 801         return (UNHANDLED);
 802 }
 803 
 804 static int
 805 npf_fault_type(uint64_t exitinfo1)
 806 {
 807 
 808         if (exitinfo1 & VMCB_NPF_INFO1_W)
 809                 return (VM_PROT_WRITE);
 810         else if (exitinfo1 & VMCB_NPF_INFO1_ID)
 811                 return (VM_PROT_EXECUTE);
 812         else
 813                 return (VM_PROT_READ);
 814 }
 815 
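/*
 * A nested page fault is a candidate for MMIO emulation only when it is a
 * data access (not an instruction fetch), did not occur during a guest
 * page-table walk, and reported a valid guest physical address.
 */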
 816 static bool
 817 svm_npf_emul_fault(uint64_t exitinfo1)
 818 {
 819         if (exitinfo1 & VMCB_NPF_INFO1_ID) {
 820                 return (false);
 821         }
 822 
 823         if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
 824                 return (false);
 825         }
 826 
 827         if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
 828                 return (false);
 829         }
 830 
 831         return (true);
 832 }
 833 
 834 static void
 835 svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
 836     uint64_t gpa)
 837 {
 838         struct vmcb_ctrl *ctrl;
 839         struct vmcb *vmcb;
 840         struct vie *vie;
 841         struct vm_guest_paging paging;
 842         struct vmcb_segment *seg;
 843         char *inst_bytes = NULL;
 844         uint8_t inst_len = 0;
 845 
 846         vmcb = svm_get_vmcb(svm_sc, vcpu);
 847         ctrl = &vmcb->ctrl;
 848 
 849         vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
 850         vmexit->u.mmio_emul.gpa = gpa;
 851         vmexit->u.mmio_emul.gla = VIE_INVALID_GLA;
 852         svm_paging_info(vmcb, &paging);
 853 
 854         switch (paging.cpu_mode) {
 855         case CPU_MODE_REAL:
 856                 seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
 857                 vmexit->u.mmio_emul.cs_base = seg->base;
 858                 vmexit->u.mmio_emul.cs_d = 0;
 859                 break;
 860         case CPU_MODE_PROTECTED:
 861         case CPU_MODE_COMPATIBILITY:
 862                 seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
 863                 vmexit->u.mmio_emul.cs_base = seg->base;
 864 
 865                 /*
 866                  * Section 4.8.1 of APM2, Default Operand Size or D bit.
 867                  */
 868                 vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
 869                     1 : 0;
 870                 break;
 871         default:
 872                 vmexit->u.mmio_emul.cs_base = 0;
 873                 vmexit->u.mmio_emul.cs_d = 0;
 874                 break;
 875         }
 876 
 877         /*
 878          * Copy the instruction bytes into 'vie' if available.
 879          */
 880         if (decode_assist() && !disable_npf_assist) {
 881                 inst_len = ctrl->inst_len;
 882                 inst_bytes = (char *)ctrl->inst_bytes;
 883         }
 884         vie = vm_vie_ctx(svm_sc->vm, vcpu);
 885         vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa);
 886 }
 887 
 888 static void
 889 svm_update_virqinfo(struct svm_softc *sc, int vcpu)
 890 {
 891         struct vm *vm;
 892         struct vlapic *vlapic;
 893         struct vmcb_ctrl *ctrl;
 894 
 895         vm = sc->vm;
 896         vlapic = vm_lapic(vm, vcpu);
 897         ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 898 
 899         /* Update %cr8 in the emulated vlapic */
 900         vlapic_set_cr8(vlapic, ctrl->v_tpr);
 901 
 902         /* Virtual interrupt injection is not used. */
 903         KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
 904             "v_intr_vector %d", __func__, ctrl->v_intr_vector));
 905 }
 906 
 907 static void
 908 svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu)
 909 {
 910         struct vmcb_ctrl *ctrl;
 911         uint64_t intinfo;
 912 
 913         ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
 914         intinfo = ctrl->exitintinfo;
 915         if (!VMCB_EXITINTINFO_VALID(intinfo))
 916                 return;
 917 
 918         /*
 919          * From APMv2, Section "Intercepts during IDT interrupt delivery"
 920          *
 921          * If a #VMEXIT happened during event delivery then record the event
 922          * that was being delivered.
 923          */
 924         VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
 925             intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
 926         vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
 927         vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
 928 }
 929 
 930 static __inline int
 931 vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
 932 {
 933 
 934         return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 935             VMCB_INTCPT_VINTR));
 936 }
 937 
 938 static void
 939 svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
 940 {
 941         struct vmcb_ctrl *ctrl;
 942         struct vmcb_state *state;
 943 
 944         ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 945         state = svm_get_vmcb_state(sc, vcpu);
 946 
 947         if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) {
 948                 KASSERT(ctrl->v_intr_prio & V_IGN_TPR,
 949                     ("%s: invalid v_ign_tpr", __func__));
 950                 KASSERT(vintr_intercept_enabled(sc, vcpu),
 951                     ("%s: vintr intercept should be enabled", __func__));
 952                 return;
 953         }
 954 
 955         /*
 956          * We use V_IRQ in conjunction with the VINTR intercept to trap into the
 957          * hypervisor as soon as a virtual interrupt can be delivered.
 958          *
 959          * Since injected events are not subject to intercept checks we need to
 960          * ensure that the V_IRQ is not actually going to be delivered on VM
 961          * entry.
 962          */
 963         VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
 964             (state->rflags & PSL_I) == 0 || ctrl->intr_shadow);
 965 
 966         VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
 967         ctrl->v_irq |= V_IRQ;
 968         ctrl->v_intr_prio |= V_IGN_TPR;
 969         ctrl->v_intr_vector = 0;
 970         svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
 971         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
 972 }
 973 
 974 static void
 975 svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
 976 {
 977         struct vmcb_ctrl *ctrl;
 978 
 979         ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 980 
 981         if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) {
 982                 KASSERT(!vintr_intercept_enabled(sc, vcpu),
 983                     ("%s: vintr intercept should be disabled", __func__));
 984                 return;
 985         }
 986 
 987         VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
 988         ctrl->v_irq &= ~V_IRQ;
 989         ctrl->v_intr_vector = 0;
 990         svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
 991         svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
 992 }
 993 
 994 /*
 995  * Once an NMI is injected it blocks delivery of further NMIs until the handler
 * executes an IRET. The IRET intercept is enabled when an NMI is injected to
 * track when the vcpu is done handling the NMI.
 998  */
 999 static int
1000 svm_nmi_blocked(struct svm_softc *sc, int vcpu)
1001 {
1002         return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1003             VMCB_INTCPT_IRET));
1004 }
1005 
1006 static void
1007 svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu)
1008 {
1009         struct vmcb_ctrl *ctrl;
1010 
1011         KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
1012         VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
1013         /*
1014          * When the IRET intercept is cleared the vcpu will attempt to execute
1015          * the "iret" when it runs next. However, it is possible to inject
1016          * another NMI into the vcpu before the "iret" has actually executed.
1017          *
	 * For example, if the "iret" encounters a #NPF when accessing the stack
1019          * it will trap back into the hypervisor. If an NMI is pending for
1020          * the vcpu it will be injected into the guest.
1021          *
1022          * XXX this needs to be fixed
1023          */
1024         svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1025 
1026         /*
1027          * Set an interrupt shadow to prevent an NMI from being immediately
1028          * injected on the next VMRUN.
1029          */
1030         ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1031         ctrl->intr_shadow = 1;
1032 }
1033 
1034 static void
1035 svm_inject_event(struct svm_softc *sc, int vcpu, uint64_t intinfo)
1036 {
1037         struct vmcb_ctrl *ctrl;
1038         uint8_t vector;
1039         uint32_t evtype;
1040 
1041         ASSERT(VMCB_EXITINTINFO_VALID(intinfo));
1042 
1043         ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1044         vector = VMCB_EXITINTINFO_VECTOR(intinfo);
1045         evtype = VMCB_EXITINTINFO_TYPE(intinfo);
1046 
1047         switch (evtype) {
1048         case VMCB_EVENTINJ_TYPE_INTR:
1049         case VMCB_EVENTINJ_TYPE_NMI:
1050         case VMCB_EVENTINJ_TYPE_INTn:
1051                 break;
1052         case VMCB_EVENTINJ_TYPE_EXCEPTION:
1053                 VERIFY(vector <= 31);
1054                 /*
1055                  * NMIs are expected to be injected with VMCB_EVENTINJ_TYPE_NMI,
1056                  * rather than as an exception with the NMI vector.
1057                  */
1058                 VERIFY(vector != 2);
1059                 break;
1060         default:
1061                 panic("unexpected event type %x", evtype);
1062         }
1063 
1064         ctrl->eventinj = VMCB_EVENTINJ_VALID | evtype | vector;
1065         if (VMCB_EXITINTINFO_EC_VALID(intinfo)) {
1066                 ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
1067                 ctrl->eventinj |= (uint64_t)VMCB_EXITINTINFO_EC(intinfo) << 32;
1068         }
1069 }
1070 
1071 static void
1072 svm_inject_nmi(struct svm_softc *sc, int vcpu)
1073 {
1074         struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1075 
1076         ASSERT(!svm_nmi_blocked(sc, vcpu));
1077 
1078         ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI;
1079         vm_nmi_clear(sc->vm, vcpu);
1080 
1081         /*
1082          * Virtual NMI blocking is now in effect.
1083          *
1084          * Not only does this block a subsequent NMI injection from taking
1085          * place, it also configures an intercept on the IRET so we can track
1086          * when the next injection can take place.
1087          */
1088         svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1089 }
1090 
1091 static void
1092 svm_inject_irq(struct svm_softc *sc, int vcpu, int vector)
1093 {
1094         struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1095 
1096         ASSERT(vector >= 0 && vector <= 255);
1097 
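	/*
	 * External interrupts use event type INTR, which is encoded as zero,
	 * so only the valid bit and the vector need to be set here.
	 */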
1098         ctrl->eventinj = VMCB_EVENTINJ_VALID | vector;
1099 }
1100 
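/* EFER bits that are reserved and must be zero; writes setting any of them #GP. */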
1101 #define EFER_MBZ_BITS   0xFFFFFFFFFFFF0200UL
1102 
1103 static int
1104 svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval)
1105 {
1106         struct vm_exit *vme;
1107         struct vmcb_state *state;
1108         uint64_t changed, lma, oldval;
1109         int error;
1110 
1111         state = svm_get_vmcb_state(sc, vcpu);
1112 
1113         oldval = state->efer;
1114         VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %lx/%lx", oldval, newval);
1115 
1116         newval &= ~0xFE;            /* clear the Read-As-Zero (RAZ) bits */
1117         changed = oldval ^ newval;
1118 
1119         if (newval & EFER_MBZ_BITS)
1120                 goto gpf;
1121 
1122         /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
1123         if (changed & EFER_LME) {
1124                 if (state->cr0 & CR0_PG)
1125                         goto gpf;
1126         }
1127 
1128         /* EFER.LMA = EFER.LME & CR0.PG */
1129         if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0)
1130                 lma = EFER_LMA;
1131         else
1132                 lma = 0;
1133 
1134         if ((newval & EFER_LMA) != lma)
1135                 goto gpf;
1136 
1137         if (newval & EFER_NXE) {
1138                 if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE))
1139                         goto gpf;
1140         }
1141 
1142         /*
1143          * XXX bhyve does not enforce segment limits in 64-bit mode. Until
	 * this is fixed, flag a guest attempt to set EFER_LMSLE as an error.
1145          */
1146         if (newval & EFER_LMSLE) {
1147                 vme = vm_exitinfo(sc->vm, vcpu);
1148                 vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0);
1149                 return (-1);
1150         }
1151 
1152         if (newval & EFER_FFXSR) {
1153                 if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR))
1154                         goto gpf;
1155         }
1156 
1157         if (newval & EFER_TCE) {
1158                 if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE))
1159                         goto gpf;
1160         }
1161 
1162         error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
1163         KASSERT(error == 0, ("%s: error %d updating efer", __func__, error));
1164         return (0);
1165 gpf:
1166         vm_inject_gp(sc->vm, vcpu);
1167         return (0);
1168 }
1169 
1170 static int
1171 emulate_wrmsr(struct svm_softc *sc, int vcpu, uint_t num, uint64_t val)
1172 {
1173         int error;
1174 
1175         if (lapic_msr(num))
1176                 error = lapic_wrmsr(sc->vm, vcpu, num, val);
1177         else if (num == MSR_EFER)
1178                 error = svm_write_efer(sc, vcpu, val);
1179         else
1180                 error = svm_wrmsr(sc, vcpu, num, val);
1181 
1182         return (error);
1183 }
1184 
1185 static int
1186 emulate_rdmsr(struct svm_softc *sc, int vcpu, uint_t num)
1187 {
1188         struct vmcb_state *state;
1189         struct svm_regctx *ctx;
1190         uint64_t result;
1191         int error;
1192 
1193         if (lapic_msr(num))
1194                 error = lapic_rdmsr(sc->vm, vcpu, num, &result);
1195         else
1196                 error = svm_rdmsr(sc, vcpu, num, &result);
1197 
1198         if (error == 0) {
1199                 state = svm_get_vmcb_state(sc, vcpu);
1200                 ctx = svm_get_guest_regctx(sc, vcpu);
1201                 state->rax = result & 0xffffffff;
1202                 ctx->sctx_rdx = result >> 32;
1203         }
1204 
1205         return (error);
1206 }
1207 
1208 /*
1209  * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
 * that are due to instruction intercepts, as well as MSR and IOIO intercepts,
 * and exceptions caused by INT3, INTO and BOUND instructions.
1212  *
1213  * Return 1 if the nRIP is valid and 0 otherwise.
1214  */
1215 static int
1216 nrip_valid(uint64_t exitcode)
1217 {
1218         switch (exitcode) {
1219         case 0x00 ... 0x0F:     /* read of CR0 through CR15 */
1220         case 0x10 ... 0x1F:     /* write of CR0 through CR15 */
1221         case 0x20 ... 0x2F:     /* read of DR0 through DR15 */
1222         case 0x30 ... 0x3F:     /* write of DR0 through DR15 */
1223         case 0x43:              /* INT3 */
1224         case 0x44:              /* INTO */
1225         case 0x45:              /* BOUND */
1226         case 0x65 ... 0x7C:     /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
1227         case 0x80 ... 0x8D:     /* VMEXIT_VMRUN ... VMEXIT_XSETBV */
1228                 return (1);
1229         default:
1230                 return (0);
1231         }
1232 }
1233 
1234 static int
1235 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
1236 {
1237         struct vmcb *vmcb;
1238         struct vmcb_state *state;
1239         struct vmcb_ctrl *ctrl;
1240         struct svm_regctx *ctx;
1241         uint64_t code, info1, info2, val;
1242         uint32_t eax, ecx, edx;
1243 #ifdef __FreeBSD__
1244         int error, errcode_valid, handled, idtvec, reflect;
1245 #else
1246         int error, errcode_valid = 0, handled, idtvec, reflect;
1247 #endif
1248 
1249         ctx = svm_get_guest_regctx(svm_sc, vcpu);
1250         vmcb = svm_get_vmcb(svm_sc, vcpu);
1251         state = &vmcb->state;
1252         ctrl = &vmcb->ctrl;
1253 
1254         handled = 0;
1255         code = ctrl->exitcode;
1256         info1 = ctrl->exitinfo1;
1257         info2 = ctrl->exitinfo2;
1258 
1259         vmexit->exitcode = VM_EXITCODE_BOGUS;
1260         vmexit->rip = state->rip;
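	/*
	 * When the nRIP is valid for this exit code, the instruction length
	 * is the distance from the current %rip to the next one saved by
	 * hardware; otherwise it is unknown and left at zero.
	 */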
1261         vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
1262 
1263         vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1264 
1265         /*
1266          * #VMEXIT(INVALID) needs to be handled early because the VMCB is
1267          * in an inconsistent state and can trigger assertions that would
1268          * never happen otherwise.
1269          */
1270         if (code == VMCB_EXIT_INVALID) {
1271                 vm_exit_svm(vmexit, code, info1, info2);
1272                 return (0);
1273         }
1274 
1275         KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1276             "injection valid bit is set %lx", __func__, ctrl->eventinj));
1277 
1278         KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
1279             ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)",
1280             vmexit->inst_length, code, info1, info2));
1281 
1282         svm_update_virqinfo(svm_sc, vcpu);
1283         svm_save_exitintinfo(svm_sc, vcpu);
1284 
1285         switch (code) {
1286         case VMCB_EXIT_IRET:
1287                 /*
1288                  * Restart execution at "iret" but with the intercept cleared.
1289                  */
1290                 vmexit->inst_length = 0;
1291                 svm_clear_nmi_blocking(svm_sc, vcpu);
1292                 handled = 1;
1293                 break;
1294         case VMCB_EXIT_VINTR:   /* interrupt window exiting */
1295                 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1296                 svm_disable_intr_window_exiting(svm_sc, vcpu);
1297                 handled = 1;
1298                 break;
1299         case VMCB_EXIT_INTR:    /* external interrupt */
1300                 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1301                 handled = 1;
1302                 break;
1303         case VMCB_EXIT_NMI:     /* external NMI */
1304                 handled = 1;
1305                 break;
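	/* Exception exits occupy codes 0x40-0x5F; the vector is code - 0x40. */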
1306         case 0x40 ... 0x5F:
1307                 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
1308                 reflect = 1;
1309                 idtvec = code - 0x40;
1310                 switch (idtvec) {
1311                 case IDT_MC:
1312                         /*
1313                          * Call the machine check handler by hand. Also don't
1314                          * reflect the machine check back into the guest.
1315                          */
1316                         reflect = 0;
1317                         VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler");
1318 #ifdef __FreeBSD__
1319                         __asm __volatile("int $18");
1320 #else
1321                         vmm_call_trap(T_MCE);
1322 #endif
1323                         break;
1324                 case IDT_PF:
1325                         error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
1326                             info2);
1327                         KASSERT(error == 0, ("%s: error %d updating cr2",
1328                             __func__, error));
1329                         /* fallthru */
1330                 case IDT_NP:
1331                 case IDT_SS:
1332                 case IDT_GP:
1333                 case IDT_AC:
1334                 case IDT_TS:
1335                         errcode_valid = 1;
1336                         break;
1337 
1338                 case IDT_DF:
1339                         errcode_valid = 1;
1340                         info1 = 0;
1341                         break;
1342 
1343                 case IDT_BP:
1344                 case IDT_OF:
1345                 case IDT_BR:
1346                         /*
1347                          * The 'nrip' field is populated for INT3, INTO and
1348                          * BOUND exceptions and this also implies that
1349                          * 'inst_length' is non-zero.
1350                          *
1351                          * Reset 'inst_length' to zero so the guest %rip at
1352                          * event injection is identical to what it was when
1353                          * the exception originally happened.
1354                          */
1355                         VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d "
1356                             "to zero before injecting exception %d",
1357                             vmexit->inst_length, idtvec);
1358                         vmexit->inst_length = 0;
1359                         /* fallthru */
1360                 default:
1361                         errcode_valid = 0;
1362                         info1 = 0;
1363                         break;
1364                 }
1365                 KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
1366                     "when reflecting exception %d into guest",
1367                     vmexit->inst_length, idtvec));
1368 
1369                 if (reflect) {
1370                         /* Reflect the exception back into the guest */
1371                         VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception "
1372                             "%d/%x into the guest", idtvec, (int)info1);
1373                         error = vm_inject_exception(svm_sc->vm, vcpu, idtvec,
1374                             errcode_valid, info1, 0);
1375                         KASSERT(error == 0, ("%s: vm_inject_exception error %d",
1376                             __func__, error));
1377                 }
1378                 handled = 1;
1379                 break;
1380         case VMCB_EXIT_MSR:     /* MSR access. */
1381                 eax = state->rax;
1382                 ecx = ctx->sctx_rcx;
1383                 edx = ctx->sctx_rdx;
1384 
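		/* EXITINFO1 is 1 for WRMSR and 0 for RDMSR. */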
1385                 if (info1) {
1386                         vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
1387                         val = (uint64_t)edx << 32 | eax;
1388                         VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %x val %lx",
1389                             ecx, val);
1390                         error = emulate_wrmsr(svm_sc, vcpu, ecx, val);
1391                         if (error == 0) {
1392                                 handled = 1;
1393                         } else if (error > 0) {
1394                                 vmexit->exitcode = VM_EXITCODE_WRMSR;
1395                                 vmexit->u.msr.code = ecx;
1396                                 vmexit->u.msr.wval = val;
1397                         } else {
1398                                 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1399                                     ("emulate_wrmsr retu with bogus exitcode"));
1400                         }
1401                 } else {
1402                         VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %x", ecx);
1403                         vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
1404                         error = emulate_rdmsr(svm_sc, vcpu, ecx);
1405                         if (error == 0) {
1406                                 handled = 1;
1407                         } else if (error > 0) {
1408                                 vmexit->exitcode = VM_EXITCODE_RDMSR;
1409                                 vmexit->u.msr.code = ecx;
1410                         } else {
1411                                 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1412                                     ("emulate_rdmsr retu with bogus exitcode"));
1413                         }
1414                 }
1415                 break;
1416         case VMCB_EXIT_IO:
1417                 handled = svm_handle_inout(svm_sc, vcpu, vmexit);
1418                 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1419                 break;
1420         case VMCB_EXIT_SHUTDOWN:
1421                 vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT);
1422                 handled = 1;
1423                 break;
1424         case VMCB_EXIT_INVD:
1425         case VMCB_EXIT_INVLPGA:
1426                 /* privileged invalidation instructions */
1427                 vm_inject_ud(svm_sc->vm, vcpu);
1428                 handled = 1;
1429                 break;
1430         case VMCB_EXIT_VMRUN:
1431         case VMCB_EXIT_VMLOAD:
1432         case VMCB_EXIT_VMSAVE:
1433         case VMCB_EXIT_STGI:
1434         case VMCB_EXIT_CLGI:
1435         case VMCB_EXIT_SKINIT:
1436                 /* privileged vmm instructions */
1437                 vm_inject_ud(svm_sc->vm, vcpu);
1438                 handled = 1;
1439                 break;
1440         case VMCB_EXIT_VMMCALL:
1441                 /* No handlers make use of VMMCALL for now */
1442                 vm_inject_ud(svm_sc->vm, vcpu);
1443                 handled = 1;
1444                 break;
1445         case VMCB_EXIT_CPUID:
1446                 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1447                 handled = x86_emulate_cpuid(svm_sc->vm, vcpu, &state->rax,
1448                     &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx);
1449                 break;
1450         case VMCB_EXIT_HLT:
1451                 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1452                 vmexit->exitcode = VM_EXITCODE_HLT;
1453                 vmexit->u.hlt.rflags = state->rflags;
1454                 break;
1455         case VMCB_EXIT_PAUSE:
1456                 vmexit->exitcode = VM_EXITCODE_PAUSE;
1457                 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1458                 break;
1459         case VMCB_EXIT_NPF:
1460                 /* EXITINFO2 contains the faulting guest physical address */
1461                 if (info1 & VMCB_NPF_INFO1_RSV) {
1462                         VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
1463                             "reserved bits set: info1(%lx) info2(%lx)",
1464                             info1, info2);
1465                 } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
1466                         vmexit->exitcode = VM_EXITCODE_PAGING;
1467                         vmexit->u.paging.gpa = info2;
1468                         vmexit->u.paging.fault_type = npf_fault_type(info1);
1469                         vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1470                         VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
1471                             "on gpa %lx/%lx at rip %lx",
1472                             info2, info1, state->rip);
1473                 } else if (svm_npf_emul_fault(info1)) {
1474                         svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2);
1475                         vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
1476                         VCPU_CTR3(svm_sc->vm, vcpu, "mmio_emul fault "
1477                             "for gpa %lx/%lx at rip %lx",
1478                             info2, info1, state->rip);
1479                 }
1480                 break;
1481         case VMCB_EXIT_MONITOR:
1482                 vmexit->exitcode = VM_EXITCODE_MONITOR;
1483                 break;
1484         case VMCB_EXIT_MWAIT:
1485                 vmexit->exitcode = VM_EXITCODE_MWAIT;
1486                 break;
1487         default:
1488                 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1489                 break;
1490         }
1491 
1492         DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t,
1493             code);
1494 
1495         if (handled) {
1496                 vmexit->rip += vmexit->inst_length;
1497                 vmexit->inst_length = 0;
1498                 state->rip = vmexit->rip;
1499         } else {
1500                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1501                         /*
1502                          * If this VM exit was not claimed by anybody then
1503                          * treat it as a generic SVM exit.
1504                          */
1505                         vm_exit_svm(vmexit, code, info1, info2);
1506                 } else {
1507                         /*
1508                          * The exitcode and collateral have been populated.
1509                          * The VM exit will be processed further in userland.
1510                          */
1511                 }
1512         }
1513         return (handled);
1514 }
1515 
1516 /*
1517  * Inject exceptions, NMIs, and ExtINTs.
1518  *
1519  * The logic behind these is complicated and may involve mutex contention, so
1520  * the injection is performed without the protection of host CPU interrupts
1521  * being disabled.  This means a racing notification could be "lost",
1522  * necessitating a later call to svm_inject_recheck() to close that window
1523  * of opportunity.
1524  */
1525 static enum event_inject_state
1526 svm_inject_events(struct svm_softc *sc, int vcpu)
1527 {
1528         struct vmcb_ctrl *ctrl;
1529         struct vmcb_state *state;
1530         struct svm_vcpu *vcpustate;
1531         uint64_t intinfo;
1532         enum event_inject_state ev_state;
1533 
1534         state = svm_get_vmcb_state(sc, vcpu);
1535         ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1536         vcpustate = svm_get_vcpu(sc, vcpu);
1537         ev_state = EIS_CAN_INJECT;
1538 
1539         /* Clear any interrupt shadow if guest %rip has changed */
1540         if (vcpustate->nextrip != state->rip) {
1541                 ctrl->intr_shadow = 0;
1542         }
1543 
1544         /*
1545          * An event is already pending for injection.  This can occur when the
1546          * vCPU exits prior to VM entry (like for an AST).
1547          */
1548         if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1549                 return (EIS_EV_EXISTING | EIS_REQ_EXIT);
1550         }
1551 
1552         /*
1553          * Inject pending events or exceptions for this vcpu.
1554          *
1555          * An event might be pending because the previous #VMEXIT happened
1556          * during event delivery (i.e. ctrl->exitintinfo).
1557          *
1558          * An event might also be pending because an exception was injected
1559          * by the hypervisor (e.g. #PF during instruction emulation).
1560          */
1561         if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) {
1562                 ASSERT(VMCB_EXITINTINFO_VALID(intinfo));
1563 
1564                 svm_inject_event(sc, vcpu, intinfo);
1565                 vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1566                 ev_state = EIS_EV_INJECTED;
1567         }
1568 
1569         /* NMI event has priority over interrupts. */
1570         if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) {
1571                 if (ev_state == EIS_CAN_INJECT) {
1572                         /* Can't inject NMI if vcpu is in an intr_shadow. */
1573                         if (ctrl->intr_shadow) {
1574                                 return (EIS_GI_BLOCK);
1575                         }
1576 
1577                         svm_inject_nmi(sc, vcpu);
1578                         ev_state = EIS_EV_INJECTED;
1579                 } else {
1580                         return (ev_state | EIS_REQ_EXIT);
1581                 }
1582         }
1583 
1584         if (vm_extint_pending(sc->vm, vcpu)) {
1585                 int vector;
1586 
1587                 if (ev_state != EIS_CAN_INJECT) {
1588                         return (ev_state | EIS_REQ_EXIT);
1589                 }
1590 
1591                 /*
1592                  * If the guest has disabled interrupts or is in an interrupt
1593                  * shadow then we cannot inject the pending interrupt.
1594                  */
1595                 if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1596                         return (EIS_GI_BLOCK);
1597                 }
1598 
1599                 /* Ask the legacy pic for a vector to inject */
1600                 vatpic_pending_intr(sc->vm, &vector);
1601                 KASSERT(vector >= 0 && vector <= 255,
1602                     ("invalid vector %d from INTR", vector));
1603 
1604                 svm_inject_irq(sc, vcpu, vector);
1605                 vm_extint_clear(sc->vm, vcpu);
1606                 vatpic_intr_accepted(sc->vm, vector);
1607                 ev_state = EIS_EV_INJECTED;
1608         }
1609 
1610         return (ev_state);
1611 }
1612 
1613 /*
1614  * Synchronize vLAPIC state and inject any interrupts pending on it.
1615  *
1616  * This is done with host CPU interrupts disabled so notification IPIs will be
1617  * queued on the host APIC and recognized when entering SVM guest context.
1618  */
1619 static enum event_inject_state
1620 svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic,
1621     enum event_inject_state ev_state)
1622 {
1623         struct vmcb_ctrl *ctrl;
1624         struct vmcb_state *state;
1625         int vector;
1626         uint8_t v_tpr;
1627 
1628         state = svm_get_vmcb_state(sc, vcpu);
1629         ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1630 
1631         /*
1632          * The guest can modify the TPR by writing to %cr8. In guest mode the
1633          * CPU reflects this write to V_TPR without hypervisor intervention.
1634          *
1635          * The guest can also modify the TPR by writing to it via the memory
1636          * mapped APIC page. In this case, the write will be emulated by the
1637          * hypervisor. For this reason V_TPR must be updated before every
1638          * VMRUN.
1639          */
1640         v_tpr = vlapic_get_cr8(vlapic);
1641         KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr));
1642         if (ctrl->v_tpr != v_tpr) {
1643                 ctrl->v_tpr = v_tpr;
1644                 svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1645         }
1646 
1647         /* If an event cannot otherwise be injected, we are done for now */
1648         if (ev_state != EIS_CAN_INJECT) {
1649                 return (ev_state);
1650         }
1651 
1652         if (!vlapic_pending_intr(vlapic, &vector)) {
1653                 return (EIS_CAN_INJECT);
1654         }
1655         KASSERT(vector >= 16 && vector <= 255,
1656             ("invalid vector %d from local APIC", vector));
1657 
1658         /*
1659          * If the guest has disabled interrupts or is in an interrupt shadow
1660          * then we cannot inject the pending interrupt.
1661          */
1662         if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1663                 return (EIS_GI_BLOCK);
1664         }
1665 
1666         svm_inject_irq(sc, vcpu, vector);
1667         vlapic_intr_accepted(vlapic, vector);
1668         return (EIS_EV_INJECTED);
1669 }
1670 
1671 /*
1672  * Re-check for events to be injected.
1673  *
1674  * Once host CPU interrupts are disabled, check for the presence of any events
1675  * which require injection processing.  If an exit is required upon injection,
1676  * or once the guest becomes interruptible, that will be configured too.
1677  */
1678 static bool
1679 svm_inject_recheck(struct svm_softc *sc, int vcpu,
1680     enum event_inject_state ev_state)
1681 {
1682         struct vmcb_ctrl *ctrl;
1683 
1684         ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1685 
1686         if (ev_state == EIS_CAN_INJECT) {
1687                 /*
1688                  * An active interrupt shadow would preclude us from injecting
1689                  * any events picked up during a re-check.
1690                  */
1691                 if (ctrl->intr_shadow != 0) {
1692                         return (false);
1693                 }
1694 
1695                 if (vm_nmi_pending(sc->vm, vcpu) &&
1696                     !svm_nmi_blocked(sc, vcpu)) {
1697                         /* queued NMI not blocked by NMI-window-exiting */
1698                         return (true);
1699                 }
1700                 if (vm_extint_pending(sc->vm, vcpu)) {
1701                         /* queued ExtINT not blocked by existing injection */
1702                         return (true);
1703                 }
1704         } else {
1705                 if ((ev_state & EIS_REQ_EXIT) != 0) {
1706                         /*
1707                          * Use a self-IPI to force an immediate exit after
1708                          * event injection has occurred.
1709                          */
1710                         poke_cpu(CPU->cpu_id);
1711                 } else {
1712                         /*
1713                          * If any event is being injected, an exit immediately
1714                          * upon becoming interruptible again will allow pending
1715                          * or newly queued events to be injected in a timely
1716                          * manner.
1717                          */
1718                         svm_enable_intr_window_exiting(sc, vcpu);
1719                 }
1720         }
1721         return (false);
1722 }
1723 
1724 
1725 #ifdef __FreeBSD__
1726 static void
1727 check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, uint_t thiscpu)
1728 {
1729         struct svm_vcpu *vcpustate;
1730         struct vmcb_ctrl *ctrl;
1731         long eptgen;
1732         bool alloc_asid;
1733 
1734         KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
1735             "active on cpu %u", __func__, thiscpu));
1736 
1737         vcpustate = svm_get_vcpu(sc, vcpuid);
1738         ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1739 
1740         /*
1741          * The TLB entries associated with the vcpu's ASID are not valid
1742          * if either of the following conditions is true:
1743          *
1744          * 1. The vcpu's ASID generation is different than the host cpu's
1745          *    ASID generation. This happens when the vcpu migrates to a new
1746          *    host cpu. It can also happen when the number of vcpus executing
1747          *    on a host cpu is greater than the number of ASIDs available.
1748          *
1749          * 2. The pmap generation number is different than the value cached in
1750          *    the 'vcpustate'. This happens when the host invalidates pages
1751          *    belonging to the guest.
1752          *
1753          *      asidgen         eptgen          Action
1754          *      mismatch        mismatch
1755          *      0               0               (a)
1756          *      0               1               (b1) or (b2)
1757          *      1               0               (c)
1758          *      1               1               (d)
1759          *
1760          * (a)  There is no mismatch in eptgen or ASID generation and therefore
1761          *      no further action is needed.
1762          *
1763          * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
1764          *      retained and the TLB entries associated with this ASID
1765          *      are flushed by VMRUN.
1766          *
1767          * (b2) If the cpu does not support FlushByAsid then a new ASID is
1768          *      allocated.
1769          *
1770          * (c)  A new ASID is allocated.
1771          *
1772          * (d)  A new ASID is allocated.
1773          */
1774 
1775         alloc_asid = false;
1776         eptgen = pmap->pm_eptgen;
1777         ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
1778 
1779         if (vcpustate->asid.gen != asid[thiscpu].gen) {
1780                 alloc_asid = true;      /* (c) and (d) */
1781         } else if (vcpustate->eptgen != eptgen) {
1782                 if (flush_by_asid())
1783                         ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;       /* (b1) */
1784                 else
1785                         alloc_asid = true;                      /* (b2) */
1786         } else {
1787                 /*
1788                  * This is the common case (a).
1789                  */
1790                 KASSERT(!alloc_asid, ("ASID allocation not necessary"));
1791                 KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
1792                     ("Invalid VMCB tlb_ctrl: %x", ctrl->tlb_ctrl));
1793         }
1794 
1795         if (alloc_asid) {
1796                 if (++asid[thiscpu].num >= nasid) {
1797                         asid[thiscpu].num = 1;
1798                         if (++asid[thiscpu].gen == 0)
1799                                 asid[thiscpu].gen = 1;
1800                         /*
1801                          * If this cpu does not support "flush-by-asid"
1802                          * then flush the entire TLB on a generation
1803                          * bump. Subsequent ASID allocation in this
1804                          * generation can be done without a TLB flush.
1805                          */
1806                         if (!flush_by_asid())
1807                                 ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
1808                 }
1809                 vcpustate->asid.gen = asid[thiscpu].gen;
1810                 vcpustate->asid.num = asid[thiscpu].num;
1811 
1812                 ctrl->asid = vcpustate->asid.num;
1813                 svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1814                 /*
1815                  * If this cpu supports "flush-by-asid" then the TLB
1816                  * was not flushed after the generation bump. The TLB
1817                  * is flushed selectively after every new ASID allocation.
1818                  */
1819                 if (flush_by_asid())
1820                         ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
1821         }
1822         vcpustate->eptgen = eptgen;
1823 
1824         KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
1825         KASSERT(ctrl->asid == vcpustate->asid.num,
1826             ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
1827 }
1828 #else /* __FreeBSD__ */
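/*
 * On illumos, ASID allocation and the TLB flush policy are delegated to the
 * host's hma_svm_asid_update(), which is told whether flush-by-ASID is
 * available and whether the cached nested page table generation is still
 * current for this vcpu.
 */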
1829 static void
1830 check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, uint_t thiscpu)
1831 {
1832         struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1833         struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1834         long eptgen;
1835         uint8_t flush;
1836 
1837         eptgen = pmap->pm_eptgen;
1838         flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
1839             vcpustate->eptgen == eptgen);
1840 
1841         if (flush != VMCB_TLB_FLUSH_NOTHING) {
1842                 ctrl->asid = vcpustate->hma_asid.hsa_asid;
1843                 svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1844         }
1845         ctrl->tlb_ctrl = flush;
1846         vcpustate->eptgen = eptgen;
1847 }
1848 #endif /* __FreeBSD__ */
1849 
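/*
 * Global interrupt flag (GIF) control: CLGI masks external interrupts and
 * NMIs on the host while guest state is loaded around VMRUN, and STGI
 * re-enables them afterwards.
 */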
1850 static __inline void
1851 disable_gintr(void)
1852 {
1853         __asm __volatile("clgi");
1854 }
1855 
1856 static __inline void
1857 enable_gintr(void)
1858 {
1859         __asm __volatile("stgi");
1860 }
1861 
1862 static __inline void
1863 svm_dr_enter_guest(struct svm_regctx *gctx)
1864 {
1865 
1866         /* Save host control debug registers. */
1867         gctx->host_dr7 = rdr7();
1868         gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
1869 
1870         /*
1871          * Disable debugging in DR7 and DEBUGCTL to avoid triggering
1872          * exceptions in the host based on the guest DRx values.  The
1873          * guest DR6, DR7, and DEBUGCTL are saved/restored in the
1874          * VMCB.
1875          */
1876         load_dr7(0);
1877         wrmsr(MSR_DEBUGCTLMSR, 0);
1878 
1879         /* Save host debug registers. */
1880         gctx->host_dr0 = rdr0();
1881         gctx->host_dr1 = rdr1();
1882         gctx->host_dr2 = rdr2();
1883         gctx->host_dr3 = rdr3();
1884         gctx->host_dr6 = rdr6();
1885 
1886         /* Restore guest debug registers. */
1887         load_dr0(gctx->sctx_dr0);
1888         load_dr1(gctx->sctx_dr1);
1889         load_dr2(gctx->sctx_dr2);
1890         load_dr3(gctx->sctx_dr3);
1891 }
1892 
1893 static __inline void
1894 svm_dr_leave_guest(struct svm_regctx *gctx)
1895 {
1896 
1897         /* Save guest debug registers. */
1898         gctx->sctx_dr0 = rdr0();
1899         gctx->sctx_dr1 = rdr1();
1900         gctx->sctx_dr2 = rdr2();
1901         gctx->sctx_dr3 = rdr3();
1902 
1903         /*
1904          * Restore host debug registers.  Restore DR7 and DEBUGCTL
1905          * last.
1906          */
1907         load_dr0(gctx->host_dr0);
1908         load_dr1(gctx->host_dr1);
1909         load_dr2(gctx->host_dr2);
1910         load_dr3(gctx->host_dr3);
1911         load_dr6(gctx->host_dr6);
1912         wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
1913         load_dr7(gctx->host_dr7);
1914 }
1915 
1916 /*
1917  * Start vcpu with specified RIP.
1918  */
1919 static int
1920 svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
1921 {
1922         struct svm_regctx *gctx;
1923         struct svm_softc *svm_sc;
1924         struct svm_vcpu *vcpustate;
1925         struct vmcb_state *state;
1926         struct vmcb_ctrl *ctrl;
1927         struct vm_exit *vmexit;
1928         struct vlapic *vlapic;
1929         struct vm *vm;
1930         uint64_t vmcb_pa;
1931         int handled;
1932         uint16_t ldt_sel;
1933 
1934         svm_sc = arg;
1935         vm = svm_sc->vm;
1936 
1937         vcpustate = svm_get_vcpu(svm_sc, vcpu);
1938         state = svm_get_vmcb_state(svm_sc, vcpu);
1939         ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
1940         vmexit = vm_exitinfo(vm, vcpu);
1941         vlapic = vm_lapic(vm, vcpu);
1942 
1943         gctx = svm_get_guest_regctx(svm_sc, vcpu);
1944         vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1945 
1946         if (vcpustate->lastcpu != curcpu) {
1947                 /*
1948                  * Force new ASID allocation by invalidating the generation.
1949                  */
1950 #ifdef __FreeBSD__
1951                 vcpustate->asid.gen = 0;
1952 #else
1953                 vcpustate->hma_asid.hsa_gen = 0;
1954 #endif
1955 
1956                 /*
1957                  * Invalidate the VMCB state cache by marking all fields dirty.
1958                  */
1959                 svm_set_dirty(svm_sc, vcpu, 0xffffffff);
1960 
1961                 /*
1962                  * XXX
1963                  * Setting 'vcpustate->lastcpu' here is a bit premature because
1964                  * we may return from this function without actually executing
1965                  * the VMRUN instruction. This could happen if an AST or yield
1966                  * condition is pending on the first time through the loop.
1967                  *
1968                  * This works for now but any new side-effects of vcpu
1969                  * migration should take this case into account.
1970                  */
1971                 vcpustate->lastcpu = curcpu;
1972                 vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1973         }
1974 
1975         svm_msr_guest_enter(svm_sc, vcpu);
1976 
1977 #ifndef __FreeBSD__
1978         VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
1979         vcpustate->loaded = B_TRUE;
1980 #endif
1981 
1982         /* Update Guest RIP */
1983         state->rip = rip;
1984 
1985         do {
1986                 enum event_inject_state inject_state;
1987 
1988                 /*
1989                  * Initial event injection is complex and may involve mutex
1990                  * contention, so it must be performed with global interrupts
1991                  * still enabled.
1992                  */
1993                 inject_state = svm_inject_events(svm_sc, vcpu);
1994                 handled = 0;
1995 
1996                 /*
1997                  * Disable global interrupts to guarantee atomicity during
1998                  * loading of guest state. This includes not only the state
1999                  * loaded by the "vmrun" instruction but also software state
2000                  * maintained by the hypervisor: suspended and rendezvous
2001                  * state, NPT generation number, vlapic interrupts etc.
2002                  */
2003                 disable_gintr();
2004 
2005                 /*
2006                  * Synchronizing and injecting vlapic state is lock-free and is
2007                  * safe (and prudent) to perform with interrupts disabled.
2008                  */
2009                 inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
2010                     inject_state);
2011 
2012                 /*
2013                  * Check for vCPU bail-out conditions.  This must be done after
2014                  * svm_inject_events() to detect a triple-fault condition.
2015                  */
2016                 if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
2017                         enable_gintr();
2018                         break;
2019                 }
2020 
2021                 if (vcpu_run_state_pending(vm, vcpu)) {
2022                         enable_gintr();
2023                         vm_exit_run_state(vm, vcpu, state->rip);
2024                         break;
2025                 }
2026 
2027                 /*
2028                  * If subsequent activity queued events which require injection
2029                  * handling, take another lap to handle them.
2030                  */
2031                 if (svm_inject_recheck(svm_sc, vcpu, inject_state)) {
2032                         enable_gintr();
2033                         handled = 1;
2034                         continue;
2035                 }
2036 
2037                 /*
2038                  * #VMEXIT resumes the host with the guest LDTR, so
2039                  * save the current LDT selector so it can be restored
2040                  * after an exit.  The userspace hypervisor probably
2041                  * doesn't use an LDT, but save and restore it to be safe.
2042                  * safe.
2043                  */
2044                 ldt_sel = sldt();
2045 
2046                 /* Activate the nested pmap on 'curcpu' */
2047                 CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active);
2048 
2049                 /*
2050                  * Check the pmap generation and the ASID generation to
2051                  * ensure that the vcpu does not use stale TLB mappings.
2052                  */
2053                 check_asid(svm_sc, vcpu, pmap, curcpu);
2054 
2055                 ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
2056                 vcpustate->dirty = 0;
2057                 VCPU_CTR1(vm, vcpu, "vmcb clean %x", ctrl->vmcb_clean);
2058 
2059                 /* Launch Virtual Machine. */
2060                 VCPU_CTR1(vm, vcpu, "Resume execution at %lx", state->rip);
2061                 svm_dr_enter_guest(gctx);
2062                 svm_launch(vmcb_pa, gctx, get_pcpu());
2063                 svm_dr_leave_guest(gctx);
2064 
2065                 CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);
2066 
2067                 /* Restore host LDTR. */
2068                 lldt(ldt_sel);
2069 
2070                 /* #VMEXIT disables interrupts so re-enable them here. */
2071                 enable_gintr();
2072 
2073                 /* Update 'nextrip' */
2074                 vcpustate->nextrip = state->rip;
2075 
2076                 /* Handle #VMEXIT and if required return to user space. */
2077                 handled = svm_vmexit(svm_sc, vcpu, vmexit);
2078         } while (handled);
2079 
2080         svm_msr_guest_exit(svm_sc, vcpu);
2081 
2082 #ifndef __FreeBSD__
2083         VERIFY(vcpustate->loaded && curthread->t_preempt != 0);
2084         vcpustate->loaded = B_FALSE;
2085 #endif
2086 
2087         return (0);
2088 }
2089 
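/*
 * Tear down the per-VM state: release the I/O and MSR permission bitmaps and
 * the softc itself.
 */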
2090 static void
2091 svm_vmcleanup(void *arg)
2092 {
2093         struct svm_softc *sc = arg;
2094 
2095         contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM);
2096         contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM);
2097         free(sc, M_SVM);
2098 }
2099 
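/*
 * Translate a VM_REG_GUEST_* identifier into a pointer to the corresponding
 * field of the software-maintained register context.  Registers which live in
 * the VMCB itself (such as %rax, %rsp, %rip, and %rflags) are not covered
 * here and result in a NULL return.
 */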
2100 static uint64_t *
2101 swctx_regptr(struct svm_regctx *regctx, int reg)
2102 {
2103         switch (reg) {
2104         case VM_REG_GUEST_RBX:
2105                 return (&regctx->sctx_rbx);
2106         case VM_REG_GUEST_RCX:
2107                 return (&regctx->sctx_rcx);
2108         case VM_REG_GUEST_RDX:
2109                 return (&regctx->sctx_rdx);
2110         case VM_REG_GUEST_RDI:
2111                 return (&regctx->sctx_rdi);
2112         case VM_REG_GUEST_RSI:
2113                 return (&regctx->sctx_rsi);
2114         case VM_REG_GUEST_RBP:
2115                 return (&regctx->sctx_rbp);
2116         case VM_REG_GUEST_R8:
2117                 return (&regctx->sctx_r8);
2118         case VM_REG_GUEST_R9:
2119                 return (&regctx->sctx_r9);
2120         case VM_REG_GUEST_R10:
2121                 return (&regctx->sctx_r10);
2122         case VM_REG_GUEST_R11:
2123                 return (&regctx->sctx_r11);
2124         case VM_REG_GUEST_R12:
2125                 return (&regctx->sctx_r12);
2126         case VM_REG_GUEST_R13:
2127                 return (&regctx->sctx_r13);
2128         case VM_REG_GUEST_R14:
2129                 return (&regctx->sctx_r14);
2130         case VM_REG_GUEST_R15:
2131                 return (&regctx->sctx_r15);
2132         case VM_REG_GUEST_DR0:
2133                 return (&regctx->sctx_dr0);
2134         case VM_REG_GUEST_DR1:
2135                 return (&regctx->sctx_dr1);
2136         case VM_REG_GUEST_DR2:
2137                 return (&regctx->sctx_dr2);
2138         case VM_REG_GUEST_DR3:
2139                 return (&regctx->sctx_dr3);
2140         default:
2141                 return (NULL);
2142         }
2143 }
2144 
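/*
 * Read a guest register, consulting the software register context first and
 * falling back to the VMCB for state which is kept there.
 */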
2145 static int
2146 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
2147 {
2148         struct svm_softc *sc;
2149         struct vmcb *vmcb;
2150         uint64_t *regp;
2151         uint64_t *fieldp;
2152         struct vmcb_segment *seg;
2153 
2154         sc = arg;
2155         vmcb = svm_get_vmcb(sc, vcpu);
2156 
2157         regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2158         if (regp != NULL) {
2159                 *val = *regp;
2160                 return (0);
2161         }
2162 
2163         switch (ident) {
2164         case VM_REG_GUEST_INTR_SHADOW:
2165                 *val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0;
2166                 break;
2167 
2168         case VM_REG_GUEST_CR0:
2169         case VM_REG_GUEST_CR2:
2170         case VM_REG_GUEST_CR3:
2171         case VM_REG_GUEST_CR4:
2172         case VM_REG_GUEST_DR6:
2173         case VM_REG_GUEST_DR7:
2174         case VM_REG_GUEST_EFER:
2175         case VM_REG_GUEST_RAX:
2176         case VM_REG_GUEST_RFLAGS:
2177         case VM_REG_GUEST_RIP:
2178         case VM_REG_GUEST_RSP:
2179                 fieldp = vmcb_regptr(vmcb, ident, NULL);
2180                 *val = *fieldp;
2181                 break;
2182 
2183         case VM_REG_GUEST_CS:
2184         case VM_REG_GUEST_DS:
2185         case VM_REG_GUEST_ES:
2186         case VM_REG_GUEST_FS:
2187         case VM_REG_GUEST_GS:
2188         case VM_REG_GUEST_SS:
2189         case VM_REG_GUEST_LDTR:
2190         case VM_REG_GUEST_TR:
2191                 seg = vmcb_segptr(vmcb, ident);
2192                 *val = seg->selector;
2193                 break;
2194 
2195         case VM_REG_GUEST_GDTR:
2196         case VM_REG_GUEST_IDTR:
2197                 /* GDTR and IDTR don't have segment selectors */
2198                 return (EINVAL);
2199 
2200         default:
2201                 return (EINVAL);
2202         }
2203 
2204         return (0);
2205 }
2206 
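/*
 * Write a guest register, marking the affected VMCB cache sections dirty so
 * that stale cached state is not reused on the next VMRUN.
 */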
2207 static int
2208 svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
2209 {
2210         struct svm_softc *sc;
2211         struct vmcb *vmcb;
2212         uint64_t *regp;
2213         uint64_t *fieldp;
2214         uint32_t dirty;
2215         struct vmcb_segment *seg;
2216 
2217         sc = arg;
2218         vmcb = svm_get_vmcb(sc, vcpu);
2219 
2220         regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2221         if (regp != NULL) {
2222                 *regp = val;
2223                 return (0);
2224         }
2225 
2226         dirty = VMCB_CACHE_NONE;
2227         switch (ident) {
2228         case VM_REG_GUEST_INTR_SHADOW:
2229                 vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0;
2230                 break;
2231 
2232         case VM_REG_GUEST_EFER:
2233                 fieldp = vmcb_regptr(vmcb, ident, &dirty);
2234                 /* EFER_SVM must always be set when the guest is executing */
2235                 *fieldp = val | EFER_SVM;
2236                 dirty |= VMCB_CACHE_CR;
2237                 break;
2238 
2239         case VM_REG_GUEST_CR0:
2240         case VM_REG_GUEST_CR2:
2241         case VM_REG_GUEST_CR3:
2242         case VM_REG_GUEST_CR4:
2243         case VM_REG_GUEST_DR6:
2244         case VM_REG_GUEST_DR7:
2245         case VM_REG_GUEST_RAX:
2246         case VM_REG_GUEST_RFLAGS:
2247         case VM_REG_GUEST_RIP:
2248         case VM_REG_GUEST_RSP:
2249                 fieldp = vmcb_regptr(vmcb, ident, &dirty);
2250                 *fieldp = val;
2251                 break;
2252 
2253         case VM_REG_GUEST_CS:
2254         case VM_REG_GUEST_DS:
2255         case VM_REG_GUEST_ES:
2256         case VM_REG_GUEST_SS:
2257         case VM_REG_GUEST_FS:
2258         case VM_REG_GUEST_GS:
2259         case VM_REG_GUEST_LDTR:
2260         case VM_REG_GUEST_TR:
2261                 dirty |= VMCB_CACHE_SEG;
2262                 seg = vmcb_segptr(vmcb, ident);
2263                 seg->selector = (uint16_t)val;
2264                 break;
2265 
2266         case VM_REG_GUEST_GDTR:
2267         case VM_REG_GUEST_IDTR:
2268                 /* GDTR and IDTR don't have segment selectors */
2269                 return (EINVAL);
2270 
2271         default:
2272                 return (EINVAL);
2273         }
2274 
2275         if (dirty != VMCB_CACHE_NONE) {
2276                 svm_set_dirty(sc, vcpu, dirty);
2277         }
2278 
2279         /*
2280          * XXX deal with CR3 and invalidate TLB entries tagged with the
2281          * vcpu's ASID. This needs to be treated differently depending on
2282          * whether 'running' is true/false.
2283          */
2284 
2285         return (0);
2286 }
2287 
2288 static int
2289 svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc)
2290 {
2291         struct vmcb *vmcb;
2292         struct svm_softc *sc;
2293         struct vmcb_segment *seg;
2294 
2295         sc = arg;
2296         vmcb = svm_get_vmcb(sc, vcpu);
2297 
2298         switch (reg) {
2299         case VM_REG_GUEST_CS:
2300         case VM_REG_GUEST_DS:
2301         case VM_REG_GUEST_ES:
2302         case VM_REG_GUEST_SS:
2303         case VM_REG_GUEST_FS:
2304         case VM_REG_GUEST_GS:
2305         case VM_REG_GUEST_LDTR:
2306         case VM_REG_GUEST_TR:
2307                 svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
2308                 seg = vmcb_segptr(vmcb, reg);
2309                 /*
2310                  * Map seg_desc access to VMCB attribute format.
2311                  *
2312                  * SVM uses the 'P' bit in the segment attributes to indicate a
2313                  * NULL segment so clear it if the segment is marked unusable.
2314                  */
2315                 seg->attrib = VMCB_ACCESS2ATTR(desc->access);
2316                 if (SEG_DESC_UNUSABLE(desc->access)) {
2317                         seg->attrib &= ~0x80;
2318                 }
2319                 break;
2320 
2321         case VM_REG_GUEST_GDTR:
2322         case VM_REG_GUEST_IDTR:
2323                 svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
2324                 seg = vmcb_segptr(vmcb, reg);
2325                 break;
2326 
2327         default:
2328                 return (EINVAL);
2329         }
2330 
2331         ASSERT(seg != NULL);
2332         seg->base = desc->base;
2333         seg->limit = desc->limit;
2334 
2335         return (0);
2336 }
2337 
2338 static int
2339 svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2340 {
2341         struct vmcb *vmcb;
2342         struct svm_softc *sc;
2343         struct vmcb_segment *seg;
2344 
2345         sc = arg;
2346         vmcb = svm_get_vmcb(sc, vcpu);
2347 
2348         switch (reg) {
2349         case VM_REG_GUEST_DS:
2350         case VM_REG_GUEST_ES:
2351         case VM_REG_GUEST_FS:
2352         case VM_REG_GUEST_GS:
2353         case VM_REG_GUEST_SS:
2354         case VM_REG_GUEST_LDTR:
2355                 seg = vmcb_segptr(vmcb, reg);
2356                 desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2357                 /*
2358                  * VT-x uses bit 16 to indicate a segment that has been loaded
2359                  * with a NULL selector (aka unusable). The 'desc->access'
2360                  * field is interpreted in the VT-x format by the
2361                  * processor-independent code.
2362                  *
2363                  * SVM uses the 'P' bit to convey the same information so
2364                  * convert it into the VT-x format. For more details refer to
2365                  * section "Segment State in the VMCB" in APMv2.
2366                  */
2367                 if ((desc->access & 0x80) == 0) {
2368                         /* Unusable segment */
2369                         desc->access |= 0x10000;
2370                 }
2371                 break;
2372 
2373         case VM_REG_GUEST_CS:
2374         case VM_REG_GUEST_TR:
2375                 seg = vmcb_segptr(vmcb, reg);
2376                 desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2377                 break;
2378 
2379         case VM_REG_GUEST_GDTR:
2380         case VM_REG_GUEST_IDTR:
2381                 seg = vmcb_segptr(vmcb, reg);
2382                 /*
2383                  * Since there are no access bits associated with the GDTR or
2384                  * the IDTR, zero out the field to ensure it does not contain
2385                  * garbage which might confuse the consumer.
2386                  */
2387                 desc->access = 0;
2388                 break;
2389 
2390         default:
2391                 return (EINVAL);
2392         }
2393 
2394         ASSERT(seg != NULL);
2395         desc->base = seg->base;
2396         desc->limit = seg->limit;
2397         return (0);
2398 }
2399 
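/*
 * Set a capability by toggling the corresponding VMCB intercept.  Only the
 * HLT and PAUSE exit capabilities are supported.
 */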
2400 static int
2401 svm_setcap(void *arg, int vcpu, int type, int val)
2402 {
2403         struct svm_softc *sc;
2404         int error;
2405 
2406         sc = arg;
2407         error = 0;
2408         switch (type) {
2409         case VM_CAP_HALT_EXIT:
2410                 svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2411                     VMCB_INTCPT_HLT, val);
2412                 break;
2413         case VM_CAP_PAUSE_EXIT:
2414                 svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2415                     VMCB_INTCPT_PAUSE, val);
2416                 break;
2417         default:
2418                 error = ENOENT;
2419                 break;
2420         }
2421         return (error);
2422 }
2423 
2424 static int
2425 svm_getcap(void *arg, int vcpu, int type, int *retval)
2426 {
2427         struct svm_softc *sc;
2428         int error;
2429 
2430         sc = arg;
2431         error = 0;
2432 
2433         switch (type) {
2434         case VM_CAP_HALT_EXIT:
2435                 *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2436                     VMCB_INTCPT_HLT);
2437                 break;
2438         case VM_CAP_PAUSE_EXIT:
2439                 *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2440                     VMCB_INTCPT_PAUSE);
2441                 break;
2442         default:
2443                 error = ENOENT;
2444                 break;
2445         }
2446         return (error);
2447 }
2448 
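/*
 * Allocate and initialize the virtual local APIC for a vcpu, backed by the
 * per-vcpu APIC page embedded in the softc.
 */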
2449 static struct vlapic *
2450 svm_vlapic_init(void *arg, int vcpuid)
2451 {
2452         struct svm_softc *svm_sc;
2453         struct vlapic *vlapic;
2454 
2455         svm_sc = arg;
2456         vlapic = malloc(sizeof (struct vlapic), M_SVM_VLAPIC,
2457             M_WAITOK | M_ZERO);
2458         vlapic->vm = svm_sc->vm;
2459         vlapic->vcpuid = vcpuid;
2460         vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2461 
2462         vlapic_init(vlapic);
2463 
2464         return (vlapic);
2465 }
2466 
2467 static void
2468 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2469 {
2470         vlapic_cleanup(vlapic);
2471         free(vlapic, M_SVM_VLAPIC);
2472 }
2473 
2474 #ifndef __FreeBSD__
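/*
 * Context save/restore hooks: while a vcpu's state is loaded, guest MSR state
 * is unloaded on context save and reloaded on context restore.
 */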
2475 static void
2476 svm_savectx(void *arg, int vcpu)
2477 {
2478         struct svm_softc *sc = arg;
2479 
2480         if (sc->vcpu[vcpu].loaded) {
2481                 svm_msr_guest_exit(sc, vcpu);
2482         }
2483 }
2484 
2485 static void
2486 svm_restorectx(void *arg, int vcpu)
2487 {
2488         struct svm_softc *sc = arg;
2489 
2490         if (sc->vcpu[vcpu].loaded) {
2491                 svm_msr_guest_enter(sc, vcpu);
2492         }
2493 }
2494 #endif /* __FreeBSD__ */
2495 
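/*
 * The SVM implementation of the vmm_ops interface, consumed by the
 * machine-independent vmm code.
 */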
2496 struct vmm_ops vmm_ops_amd = {
2497         .init           = svm_init,
2498         .cleanup        = svm_cleanup,
2499         .resume         = svm_restore,
2500         .vminit         = svm_vminit,
2501         .vmrun          = svm_vmrun,
2502         .vmcleanup      = svm_vmcleanup,
2503         .vmgetreg       = svm_getreg,
2504         .vmsetreg       = svm_setreg,
2505         .vmgetdesc      = svm_getdesc,
2506         .vmsetdesc      = svm_setdesc,
2507         .vmgetcap       = svm_getcap,
2508         .vmsetcap       = svm_setcap,
2509         .vmspace_alloc  = svm_npt_alloc,
2510         .vmspace_free   = svm_npt_free,
2511         .vlapic_init    = svm_vlapic_init,
2512         .vlapic_cleanup = svm_vlapic_cleanup,
2513 #ifndef __FreeBSD__
2514         .vmsavectx      = svm_savectx,
2515         .vmrestorectx   = svm_restorectx,
2516 #endif
2517 };