1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2011 NetApp, Inc.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  *
  28  * $FreeBSD$
  29  */
  30 /*
  31  * This file and its contents are supplied under the terms of the
  32  * Common Development and Distribution License ("CDDL"), version 1.0.
  33  * You may only use this file in accordance with the terms of version
  34  * 1.0 of the CDDL.
  35  *
  36  * A full copy of the text of the CDDL should have accompanied this
  37  * source.  A copy of the CDDL is also available via the Internet at
  38  * http://www.illumos.org/license/CDDL.
  39  *
  40  * Copyright 2015 Pluribus Networks Inc.
  41  * Copyright 2018 Joyent, Inc.
  42  * Copyright 2021 Oxide Computer Company
  43  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
  44  */
  45 
  46 #include <sys/cdefs.h>
  47 __FBSDID("$FreeBSD$");
  48 
  49 #include <sys/param.h>
  50 #include <sys/systm.h>
  51 #include <sys/kernel.h>
  52 #include <sys/module.h>
  53 #include <sys/sysctl.h>
  54 #include <sys/malloc.h>
  55 #include <sys/pcpu.h>
  56 #include <sys/lock.h>
  57 #include <sys/mutex.h>
  58 #include <sys/proc.h>
  59 #include <sys/rwlock.h>
  60 #include <sys/sched.h>
  61 #include <sys/smp.h>
  62 #include <sys/systm.h>
  63 
  64 #include <machine/pcb.h>
  65 #include <machine/smp.h>
  66 #include <machine/md_var.h>
  67 #include <x86/psl.h>
  68 #include <x86/apicreg.h>
  69 
  70 #include <machine/specialreg.h>
  71 #include <machine/vmm.h>
  72 #include <machine/vmm_dev.h>
  73 #include <machine/vmparam.h>
  74 #include <sys/vmm_instruction_emul.h>
  75 #include <sys/vmm_vm.h>
  76 
  77 #include "vmm_ioport.h"
  78 #include "vmm_ktr.h"
  79 #include "vmm_host.h"
  80 #include "vmm_mem.h"
  81 #include "vmm_util.h"
  82 #include "vatpic.h"
  83 #include "vatpit.h"
  84 #include "vhpet.h"
  85 #include "vioapic.h"
  86 #include "vlapic.h"
  87 #include "vpmtmr.h"
  88 #include "vrtc.h"
  89 #include "vmm_stat.h"
  90 #include "vmm_lapic.h"
  91 
  92 #include "io/ppt.h"
  93 #include "io/iommu.h"
  94 
  95 struct vlapic;
  96 
  97 /*
  98  * Initialization:
  99  * (a) allocated when vcpu is created
 100  * (i) initialized when vcpu is created and when it is reinitialized
 101  * (o) initialized the first time the vcpu is created
 102  * (x) initialized before use
 103  */
 104 struct vcpu {
 105         /* (o) protects state, run_state, hostcpu, sipi_vector */
 106         struct mtx      mtx;
 107 
 108         enum vcpu_state state;          /* (o) vcpu state */
 109         enum vcpu_run_state run_state;  /* (i) vcpu init/sipi/run state */
 110         kcondvar_t      vcpu_cv;        /* (o) cpu waiter cv */
 111         kcondvar_t      state_cv;       /* (o) IDLE-transition cv */
 112         int             hostcpu;        /* (o) vcpu's current host cpu */
 113         int             lastloccpu;     /* (o) last host cpu localized to */
 114         int             reqidle;        /* (i) request vcpu to idle */
 115         struct vlapic   *vlapic;        /* (i) APIC device model */
 116         enum x2apic_state x2apic_state; /* (i) APIC mode */
 117         uint64_t        exitintinfo;    /* (i) events pending at VM exit */
 118         int             nmi_pending;    /* (i) NMI pending */
 119         int             extint_pending; /* (i) INTR pending */
 120         int     exception_pending;      /* (i) exception pending */
 121         int     exc_vector;             /* (x) exception collateral */
 122         int     exc_errcode_valid;
 123         uint32_t exc_errcode;
 124         uint8_t         sipi_vector;    /* (i) SIPI vector */
 125         struct savefpu  *guestfpu;      /* (a,i) guest fpu state */
 126         uint64_t        guest_xcr0;     /* (i) guest %xcr0 register */
 127         void            *stats;         /* (a,i) statistics */
 128         struct vm_exit  exitinfo;       /* (x) exit reason and collateral */
 129         uint64_t        nextrip;        /* (x) next instruction to execute */
 130         struct vie      *vie_ctx;       /* (x) instruction emulation context */
 131         uint64_t        tsc_offset;     /* (x) offset from host TSC */
 132 
 133         enum vcpu_ustate ustate;        /* (i) microstate for the vcpu */
 134         hrtime_t        ustate_when;    /* (i) time of last ustate change */
 135         uint64_t ustate_total[VU_MAX];  /* (o) total time spent in ustates */
 136 };
 137 
 138 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 139 #define vcpu_lock_init(v)       mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 140 #define vcpu_lock(v)            mtx_lock_spin(&((v)->mtx))
 141 #define vcpu_unlock(v)          mtx_unlock_spin(&((v)->mtx))
 142 #define vcpu_assert_locked(v)   mtx_assert(&((v)->mtx), MA_OWNED)
 143 
 144 struct mem_seg {
 145         size_t  len;
 146         bool    sysmem;
 147         struct vm_object *object;
 148 };
 149 #define VM_MAX_MEMSEGS  4
 150 
 151 struct mem_map {
 152         vm_paddr_t      gpa;
 153         size_t          len;
 154         vm_ooffset_t    segoff;
 155         int             segid;
 156         int             prot;
 157         int             flags;
 158 };
 159 #define VM_MAX_MEMMAPS  8
 160 
 161 /*
 162  * Initialization:
 163  * (o) initialized the first time the VM is created
 164  * (i) initialized when VM is created and when it is reinitialized
 165  * (x) initialized before use
 166  */
 167 struct vm {
 168         void            *cookie;                /* (i) cpu-specific data */
 169         void            *iommu;                 /* (x) iommu-specific data */
 170         struct vhpet    *vhpet;                 /* (i) virtual HPET */
 171         struct vioapic  *vioapic;               /* (i) virtual ioapic */
 172         struct vatpic   *vatpic;                /* (i) virtual atpic */
 173         struct vatpit   *vatpit;                /* (i) virtual atpit */
 174         struct vpmtmr   *vpmtmr;                /* (i) virtual ACPI PM timer */
 175         struct vrtc     *vrtc;                  /* (o) virtual RTC */
 176         volatile cpuset_t active_cpus;          /* (i) active vcpus */
 177         volatile cpuset_t debug_cpus;           /* (i) vcpus stopped for dbg */
 178         int             suspend;                /* (i) stop VM execution */
 179         volatile cpuset_t suspended_cpus;       /* (i) suspended vcpus */
 180         volatile cpuset_t halted_cpus;          /* (x) cpus in a hard halt */
 181         struct mem_map  mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
 182         struct mem_seg  mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
 183         struct vmspace  *vmspace;               /* (o) guest's address space */
 184         char            name[VM_MAX_NAMELEN];   /* (o) virtual machine name */
 185         struct vcpu     vcpu[VM_MAXCPU];        /* (i) guest vcpus */
 186         /* The following describe the vm cpu topology */
 187         uint16_t        sockets;                /* (o) num of sockets */
 188         uint16_t        cores;                  /* (o) num of cores/socket */
 189         uint16_t        threads;                /* (o) num of threads/core */
 190         uint16_t        maxcpus;                /* (o) max pluggable cpus */
 191         uint64_t        boot_tsc_offset;        /* (i) TSC offset at VM boot */
 192 
 193         struct ioport_config ioports;           /* (o) ioport handling */
 194 };
 195 
 196 static int vmm_initialized;
 197 
 198 
 199 static void
 200 nullop_panic(void)
 201 {
 202         panic("null vmm operation call");
 203 }
 204 
 205 /* Do not allow use of an un-set `ops` to do anything but panic */
 206 static struct vmm_ops vmm_ops_null = {
 207         .init           = (vmm_init_func_t)nullop_panic,
 208         .cleanup        = (vmm_cleanup_func_t)nullop_panic,
 209         .resume         = (vmm_resume_func_t)nullop_panic,
 210         .vminit         = (vmi_init_func_t)nullop_panic,
 211         .vmrun          = (vmi_run_func_t)nullop_panic,
 212         .vmcleanup      = (vmi_cleanup_func_t)nullop_panic,
 213         .vmgetreg       = (vmi_get_register_t)nullop_panic,
 214         .vmsetreg       = (vmi_set_register_t)nullop_panic,
 215         .vmgetdesc      = (vmi_get_desc_t)nullop_panic,
 216         .vmsetdesc      = (vmi_set_desc_t)nullop_panic,
 217         .vmgetcap       = (vmi_get_cap_t)nullop_panic,
 218         .vmsetcap       = (vmi_set_cap_t)nullop_panic,
 219         .vmspace_alloc  = (vmi_vmspace_alloc)nullop_panic,
 220         .vmspace_free   = (vmi_vmspace_free)nullop_panic,
 221         .vlapic_init    = (vmi_vlapic_init)nullop_panic,
 222         .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
 223         .vmsavectx      = (vmi_savectx)nullop_panic,
 224         .vmrestorectx   = (vmi_restorectx)nullop_panic,
 225 };
 226 
 227 static struct vmm_ops *ops = &vmm_ops_null;
 228 
 229 #define VMM_INIT(num)                   ((*ops->init)(num))
 230 #define VMM_CLEANUP()                   ((*ops->cleanup)())
 231 #define VMM_RESUME()                    ((*ops->resume)())
 232 
 233 #define VMINIT(vm, pmap)                ((*ops->vminit)(vm, pmap))
 234 #define VMRUN(vmi, vcpu, rip, pmap) \
 235         ((*ops->vmrun)(vmi, vcpu, rip, pmap))
 236 #define VMCLEANUP(vmi)                  ((*ops->vmcleanup)(vmi))
 237 #define VMSPACE_ALLOC(min, max)         ((*ops->vmspace_alloc)(min, max))
 238 #define VMSPACE_FREE(vmspace)           ((*ops->vmspace_free)(vmspace))
 239 
 240 #define VMGETREG(vmi, vcpu, num, rv)    ((*ops->vmgetreg)(vmi, vcpu, num, rv))
 241 #define VMSETREG(vmi, vcpu, num, val)   ((*ops->vmsetreg)(vmi, vcpu, num, val))
 242 #define VMGETDESC(vmi, vcpu, num, dsc)  ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
 243 #define VMSETDESC(vmi, vcpu, num, dsc)  ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
 244 #define VMGETCAP(vmi, vcpu, num, rv)    ((*ops->vmgetcap)(vmi, vcpu, num, rv))
 245 #define VMSETCAP(vmi, vcpu, num, val)   ((*ops->vmsetcap)(vmi, vcpu, num, val))
 246 #define VLAPIC_INIT(vmi, vcpu)          ((*ops->vlapic_init)(vmi, vcpu))
 247 #define VLAPIC_CLEANUP(vmi, vlapic)     ((*ops->vlapic_cleanup)(vmi, vlapic))
 248 
 249 #define fpu_start_emulating()   load_cr0(rcr0() | CR0_TS)
 250 #define fpu_stop_emulating()    clts()
 251 
 252 SDT_PROVIDER_DEFINE(vmm);
 253 
 254 static MALLOC_DEFINE(M_VM, "vm", "vm");
 255 
 256 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 257     NULL);
 258 
 259 /*
 260  * Halt the guest if all vcpus are executing a HLT instruction with
 261  * interrupts disabled.
 262  */
 263 static int halt_detection_enabled = 1;
 264 
 265 /* IPI vector used for vcpu notifications */
 266 static int vmm_ipinum;
 267 
 268 /* Trap into hypervisor on all guest exceptions and reflect them back */
 269 static int trace_guest_exceptions;
 270 
 271 static void vm_free_memmap(struct vm *vm, int ident);
 272 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 273 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
 274 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
 275 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
 276 
 277 /* Flags for vtc_status */
 278 #define VTCS_FPU_RESTORED       1 /* guest FPU restored, host FPU saved */
 279 #define VTCS_FPU_CTX_CRITICAL   2 /* in ctx where FPU restore cannot be lazy */
 280 
 281 typedef struct vm_thread_ctx {
 282         struct vm       *vtc_vm;
 283         int             vtc_vcpuid;
 284         uint_t          vtc_status;
 285         enum vcpu_ustate vtc_ustate;
 286 } vm_thread_ctx_t;
 287 
 288 #ifdef KTR
 289 static const char *
 290 vcpu_state2str(enum vcpu_state state)
 291 {
 292 
 293         switch (state) {
 294         case VCPU_IDLE:
 295                 return ("idle");
 296         case VCPU_FROZEN:
 297                 return ("frozen");
 298         case VCPU_RUNNING:
 299                 return ("running");
 300         case VCPU_SLEEPING:
 301                 return ("sleeping");
 302         default:
 303                 return ("unknown");
 304         }
 305 }
 306 #endif
 307 
 308 static void
 309 vcpu_cleanup(struct vm *vm, int i, bool destroy)
 310 {
 311         struct vcpu *vcpu = &vm->vcpu[i];
 312 
 313         VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
 314         if (destroy) {
 315                 vmm_stat_free(vcpu->stats);
 316                 fpu_save_area_free(vcpu->guestfpu);
 317                 vie_free(vcpu->vie_ctx);
 318                 vcpu->vie_ctx = NULL;
 319         }
 320 }
 321 
 322 static void
 323 vcpu_init(struct vm *vm, int vcpu_id, bool create)
 324 {
 325         struct vcpu *vcpu;
 326 
 327         KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
 328             ("vcpu_init: invalid vcpu %d", vcpu_id));
 329 
 330         vcpu = &vm->vcpu[vcpu_id];
 331 
 332         if (create) {
 333                 vcpu_lock_init(vcpu);
 334                 vcpu->state = VCPU_IDLE;
 335                 vcpu->hostcpu = NOCPU;
 336                 vcpu->lastloccpu = NOCPU;
 337                 vcpu->guestfpu = fpu_save_area_alloc();
 338                 vcpu->stats = vmm_stat_alloc();
 339                 vcpu->vie_ctx = vie_alloc();
 340 
 341                 vcpu->ustate = VU_INIT;
 342                 vcpu->ustate_when = gethrtime();
 343         } else {
 344                 vie_reset(vcpu->vie_ctx);
 345                 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
 346                 if (vcpu->ustate != VU_INIT) {
 347                         vcpu_ustate_change(vm, vcpu_id, VU_INIT);
 348                 }
 349         }
 350 
 351         vcpu->run_state = VRS_HALT;
 352         vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 353         vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
 354         vcpu->reqidle = 0;
 355         vcpu->exitintinfo = 0;
 356         vcpu->nmi_pending = 0;
 357         vcpu->extint_pending = 0;
 358         vcpu->exception_pending = 0;
 359         vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
 360         fpu_save_area_reset(vcpu->guestfpu);
 361         vmm_stat_init(vcpu->stats);
 362         vcpu->tsc_offset = 0;
 363 }
 364 
 365 int
 366 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
 367 {
 368 
 369         return (trace_guest_exceptions);
 370 }
 371 
 372 struct vm_exit *
 373 vm_exitinfo(struct vm *vm, int cpuid)
 374 {
 375         struct vcpu *vcpu;
 376 
 377         if (cpuid < 0 || cpuid >= vm->maxcpus)
 378                 panic("vm_exitinfo: invalid cpuid %d", cpuid);
 379 
 380         vcpu = &vm->vcpu[cpuid];
 381 
 382         return (&vcpu->exitinfo);
 383 }
 384 
 385 struct vie *
 386 vm_vie_ctx(struct vm *vm, int cpuid)
 387 {
 388         if (cpuid < 0 || cpuid >= vm->maxcpus)
 389                 panic("vm_vie_ctx: invalid cpuid %d", cpuid);
 390 
 391         return (vm->vcpu[cpuid].vie_ctx);
 392 }
 393 
 394 static int
 395 vmm_init(void)
 396 {
 397         int error;
 398 
 399         vmm_host_state_init();
 400 
 401         /* We use cpu_poke() for IPIs */
 402         vmm_ipinum = 0;
 403 
 404         error = vmm_mem_init();
 405         if (error)
 406                 return (error);
 407 
 408         if (vmm_is_intel())
 409                 ops = &vmm_ops_intel;
 410         else if (vmm_is_svm())
 411                 ops = &vmm_ops_amd;
 412         else
 413                 return (ENXIO);
 414 
 415         return (VMM_INIT(vmm_ipinum));
 416 }
 417 
 418 int
 419 vmm_mod_load()
 420 {
 421         int     error;
 422 
 423         VERIFY(vmm_initialized == 0);
 424 
 425         error = vmm_init();
 426         if (error == 0)
 427                 vmm_initialized = 1;
 428 
 429         return (error);
 430 }
 431 
 432 int
 433 vmm_mod_unload()
 434 {
 435         int     error;
 436 
 437         VERIFY(vmm_initialized == 1);
 438 
 439         iommu_cleanup();
 440         error = VMM_CLEANUP();
 441         if (error)
 442                 return (error);
 443         vmm_initialized = 0;
 444 
 445         return (0);
 446 }
 447 
 448 static void
 449 vm_init(struct vm *vm, bool create)
 450 {
 451         int i;
 452 
 453         vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
 454         vm->iommu = NULL;
 455         vm->vioapic = vioapic_init(vm);
 456         vm->vhpet = vhpet_init(vm);
 457         vm->vatpic = vatpic_init(vm);
 458         vm->vatpit = vatpit_init(vm);
 459         vm->vpmtmr = vpmtmr_init(vm);
 460         if (create)
 461                 vm->vrtc = vrtc_init(vm);
 462 
 463         vm_inout_init(vm, &vm->ioports);
 464 
 465         CPU_ZERO(&vm->active_cpus);
 466         CPU_ZERO(&vm->debug_cpus);
 467 
 468         vm->suspend = 0;
 469         CPU_ZERO(&vm->suspended_cpus);
 470 
 471         for (i = 0; i < vm->maxcpus; i++)
 472                 vcpu_init(vm, i, create);
 473 
 474         /*
 475          * Configure the VM-wide TSC offset so that the call to vm_init()
 476          * represents the boot time (when the TSC(s) read 0).  Each vCPU will
 477          * have its own offset from this, which is altered if/when the guest
 478          * writes to MSR_TSC.
 479          *
 480          * The TSC offsetting math is all unsigned, using overflow for negative
 481          * offets.  A reading of the TSC is negated to form the boot offset.
 482          */
 483         vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
 484 }
 485 
 486 /*
 487  * The default CPU topology is a single thread per package.
 488  */
 489 uint_t cores_per_package = 1;
 490 uint_t threads_per_core = 1;
 491 
 492 int
 493 vm_create(const char *name, struct vm **retvm)
 494 {
 495         struct vm *vm;
 496         struct vmspace *vmspace;
 497 
 498         /*
 499          * If vmm.ko could not be successfully initialized then don't attempt
 500          * to create the virtual machine.
 501          */
 502         if (!vmm_initialized)
 503                 return (ENXIO);
 504 
 505         if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
 506                 return (EINVAL);
 507 
 508         vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
 509         if (vmspace == NULL)
 510                 return (ENOMEM);
 511 
 512         vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
 513         strcpy(vm->name, name);
 514         vm->vmspace = vmspace;
 515 
 516         vm->sockets = 1;
 517         vm->cores = cores_per_package;       /* XXX backwards compatibility */
 518         vm->threads = threads_per_core;      /* XXX backwards compatibility */
 519         vm->maxcpus = VM_MAXCPU;     /* XXX temp to keep code working */
 520 
 521         vm_init(vm, true);
 522 
 523         *retvm = vm;
 524         return (0);
 525 }
 526 
 527 void
 528 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
 529     uint16_t *threads, uint16_t *maxcpus)
 530 {
 531         *sockets = vm->sockets;
 532         *cores = vm->cores;
 533         *threads = vm->threads;
 534         *maxcpus = vm->maxcpus;
 535 }
 536 
 537 uint16_t
 538 vm_get_maxcpus(struct vm *vm)
 539 {
 540         return (vm->maxcpus);
 541 }
 542 
 543 int
 544 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
 545     uint16_t threads, uint16_t maxcpus)
 546 {
 547         if (maxcpus != 0)
 548                 return (EINVAL);        /* XXX remove when supported */
 549         if ((sockets * cores * threads) > vm->maxcpus)
 550                 return (EINVAL);
 551         /* XXX need to check sockets * cores * threads == vCPU, how? */
 552         vm->sockets = sockets;
 553         vm->cores = cores;
 554         vm->threads = threads;
 555         vm->maxcpus = VM_MAXCPU;     /* XXX temp to keep code working */
 556         return (0);
 557 }
 558 
 559 static void
 560 vm_cleanup(struct vm *vm, bool destroy)
 561 {
 562         struct mem_map *mm;
 563         int i;
 564 
 565         ppt_unassign_all(vm);
 566 
 567         if (vm->iommu != NULL)
 568                 iommu_destroy_domain(vm->iommu);
 569 
 570         /*
 571          * Devices which attach their own ioport hooks should be cleaned up
 572          * first so they can tear down those registrations.
 573          */
 574         vpmtmr_cleanup(vm->vpmtmr);
 575 
 576         vm_inout_cleanup(vm, &vm->ioports);
 577 
 578         if (destroy)
 579                 vrtc_cleanup(vm->vrtc);
 580         else
 581                 vrtc_reset(vm->vrtc);
 582 
 583         vatpit_cleanup(vm->vatpit);
 584         vhpet_cleanup(vm->vhpet);
 585         vatpic_cleanup(vm->vatpic);
 586         vioapic_cleanup(vm->vioapic);
 587 
 588         for (i = 0; i < vm->maxcpus; i++)
 589                 vcpu_cleanup(vm, i, destroy);
 590 
 591         VMCLEANUP(vm->cookie);
 592 
 593         /*
 594          * System memory is removed from the guest address space only when
 595          * the VM is destroyed. This is because the mapping remains the same
 596          * across VM reset.
 597          *
 598          * Device memory can be relocated by the guest (e.g. using PCI BARs)
 599          * so those mappings are removed on a VM reset.
 600          */
 601         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 602                 mm = &vm->mem_maps[i];
 603                 if (destroy || !sysmem_mapping(vm, mm)) {
 604                         vm_free_memmap(vm, i);
 605                 } else {
 606                         /*
 607                          * We need to reset the IOMMU flag so this mapping can
 608                          * be reused when a VM is rebooted. Since the IOMMU
 609                          * domain has already been destroyed we can just reset
 610                          * the flag here.
 611                          */
 612                         mm->flags &= ~VM_MEMMAP_F_IOMMU;
 613                 }
 614         }
 615 
 616         if (destroy) {
 617                 for (i = 0; i < VM_MAX_MEMSEGS; i++)
 618                         vm_free_memseg(vm, i);
 619 
 620                 VMSPACE_FREE(vm->vmspace);
 621                 vm->vmspace = NULL;
 622         }
 623 }
 624 
 625 void
 626 vm_destroy(struct vm *vm)
 627 {
 628         vm_cleanup(vm, true);
 629         free(vm, M_VM);
 630 }
 631 
 632 int
 633 vm_reinit(struct vm *vm)
 634 {
 635         int error;
 636 
 637         /*
 638          * A virtual machine can be reset only if all vcpus are suspended.
 639          */
 640         if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
 641                 vm_cleanup(vm, false);
 642                 vm_init(vm, false);
 643                 error = 0;
 644         } else {
 645                 error = EBUSY;
 646         }
 647 
 648         return (error);
 649 }
 650 
 651 const char *
 652 vm_name(struct vm *vm)
 653 {
 654         return (vm->name);
 655 }
 656 
 657 int
 658 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 659 {
 660         vm_object_t obj;
 661 
 662         if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
 663                 return (ENOMEM);
 664         else
 665                 return (0);
 666 }
 667 
 668 int
 669 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
 670 {
 671         return (vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len));
 672 }
 673 
 674 /*
 675  * Return 'true' if 'gpa' is allocated in the guest address space.
 676  *
 677  * This function is called in the context of a running vcpu which acts as
 678  * an implicit lock on 'vm->mem_maps[]'.
 679  */
 680 bool
 681 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
 682 {
 683         struct mem_map *mm;
 684         int i;
 685 
 686 #ifdef INVARIANTS
 687         int hostcpu, state;
 688         state = vcpu_get_state(vm, vcpuid, &hostcpu);
 689         KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
 690             ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
 691 #endif
 692 
 693         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 694                 mm = &vm->mem_maps[i];
 695                 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
 696                         return (true);          /* 'gpa' is sysmem or devmem */
 697         }
 698 
 699         if (ppt_is_mmio(vm, gpa))
 700                 return (true);                  /* 'gpa' is pci passthru mmio */
 701 
 702         return (false);
 703 }
 704 
 705 int
 706 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
 707 {
 708         struct mem_seg *seg;
 709         vm_object_t obj;
 710 
 711 #ifndef __FreeBSD__
 712         extern pgcnt_t get_max_page_get(void);
 713 #endif
 714 
 715         if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 716                 return (EINVAL);
 717 
 718         if (len == 0 || (len & PAGE_MASK))
 719                 return (EINVAL);
 720 
 721 #ifndef __FreeBSD__
 722         if (len > ptob(get_max_page_get()))
 723                 return (EINVAL);
 724 #endif
 725 
 726         seg = &vm->mem_segs[ident];
 727         if (seg->object != NULL) {
 728                 if (seg->len == len && seg->sysmem == sysmem)
 729                         return (EEXIST);
 730                 else
 731                         return (EINVAL);
 732         }
 733 
 734         obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
 735         if (obj == NULL)
 736                 return (ENOMEM);
 737 
 738         seg->len = len;
 739         seg->object = obj;
 740         seg->sysmem = sysmem;
 741         return (0);
 742 }
 743 
 744 int
 745 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
 746     vm_object_t *objptr)
 747 {
 748         struct mem_seg *seg;
 749 
 750         if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 751                 return (EINVAL);
 752 
 753         seg = &vm->mem_segs[ident];
 754         if (len)
 755                 *len = seg->len;
 756         if (sysmem)
 757                 *sysmem = seg->sysmem;
 758         if (objptr)
 759                 *objptr = seg->object;
 760         return (0);
 761 }
 762 
 763 void
 764 vm_free_memseg(struct vm *vm, int ident)
 765 {
 766         struct mem_seg *seg;
 767 
 768         KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
 769             ("%s: invalid memseg ident %d", __func__, ident));
 770 
 771         seg = &vm->mem_segs[ident];
 772         if (seg->object != NULL) {
 773                 vm_object_deallocate(seg->object);
 774                 bzero(seg, sizeof (struct mem_seg));
 775         }
 776 }
 777 
 778 int
 779 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
 780     size_t len, int prot, int flags)
 781 {
 782         struct mem_seg *seg;
 783         struct mem_map *m, *map;
 784         vm_ooffset_t last;
 785         int i, error;
 786 
 787         if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
 788                 return (EINVAL);
 789 
 790         if (flags & ~VM_MEMMAP_F_WIRED)
 791                 return (EINVAL);
 792 
 793         if (segid < 0 || segid >= VM_MAX_MEMSEGS)
 794                 return (EINVAL);
 795 
 796         seg = &vm->mem_segs[segid];
 797         if (seg->object == NULL)
 798                 return (EINVAL);
 799 
 800         last = first + len;
 801         if (first < 0 || first >= last || last > seg->len)
 802                 return (EINVAL);
 803 
 804         if ((gpa | first | last) & PAGE_MASK)
 805                 return (EINVAL);
 806 
 807         map = NULL;
 808         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 809                 m = &vm->mem_maps[i];
 810                 if (m->len == 0) {
 811                         map = m;
 812                         break;
 813                 }
 814         }
 815 
 816         if (map == NULL)
 817                 return (ENOSPC);
 818 
 819         error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
 820             len, 0, VMFS_NO_SPACE, prot, prot, 0);
 821         if (error != 0)
 822                 return (EFAULT);
 823 
 824         vm_object_reference(seg->object);
 825 
 826         if ((flags & VM_MEMMAP_F_WIRED) != 0) {
 827                 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
 828                     VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 829                 if (error != 0) {
 830                         vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
 831                         return (EFAULT);
 832                 }
 833         }
 834 
 835         map->gpa = gpa;
 836         map->len = len;
 837         map->segoff = first;
 838         map->segid = segid;
 839         map->prot = prot;
 840         map->flags = flags;
 841         return (0);
 842 }
 843 
 844 int
 845 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
 846 {
 847         struct mem_map *m;
 848         int i;
 849 
 850         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 851                 m = &vm->mem_maps[i];
 852                 if (m->gpa == gpa && m->len == len &&
 853                     (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
 854                         vm_free_memmap(vm, i);
 855                         return (0);
 856                 }
 857         }
 858 
 859         return (EINVAL);
 860 }
 861 
 862 int
 863 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
 864     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
 865 {
 866         struct mem_map *mm, *mmnext;
 867         int i;
 868 
 869         mmnext = NULL;
 870         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 871                 mm = &vm->mem_maps[i];
 872                 if (mm->len == 0 || mm->gpa < *gpa)
 873                         continue;
 874                 if (mmnext == NULL || mm->gpa < mmnext->gpa)
 875                         mmnext = mm;
 876         }
 877 
 878         if (mmnext != NULL) {
 879                 *gpa = mmnext->gpa;
 880                 if (segid)
 881                         *segid = mmnext->segid;
 882                 if (segoff)
 883                         *segoff = mmnext->segoff;
 884                 if (len)
 885                         *len = mmnext->len;
 886                 if (prot)
 887                         *prot = mmnext->prot;
 888                 if (flags)
 889                         *flags = mmnext->flags;
 890                 return (0);
 891         } else {
 892                 return (ENOENT);
 893         }
 894 }
 895 
 896 static void
 897 vm_free_memmap(struct vm *vm, int ident)
 898 {
 899         struct mem_map *mm;
 900         int error;
 901 
 902         mm = &vm->mem_maps[ident];
 903         if (mm->len) {
 904                 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
 905                     mm->gpa + mm->len);
 906                 KASSERT(error == 0, ("%s: vm_map_remove error %d",
 907                     __func__, error));
 908                 bzero(mm, sizeof (struct mem_map));
 909         }
 910 }
 911 
 912 static __inline bool
 913 sysmem_mapping(struct vm *vm, struct mem_map *mm)
 914 {
 915 
 916         if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
 917                 return (true);
 918         else
 919                 return (false);
 920 }
 921 
 922 vm_paddr_t
 923 vmm_sysmem_maxaddr(struct vm *vm)
 924 {
 925         struct mem_map *mm;
 926         vm_paddr_t maxaddr;
 927         int i;
 928 
 929         maxaddr = 0;
 930         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 931                 mm = &vm->mem_maps[i];
 932                 if (sysmem_mapping(vm, mm)) {
 933                         if (maxaddr < mm->gpa + mm->len)
 934                                 maxaddr = mm->gpa + mm->len;
 935                 }
 936         }
 937         return (maxaddr);
 938 }
 939 
 940 static void
 941 vm_iommu_modify(struct vm *vm, bool map)
 942 {
 943         int i, sz;
 944         vm_paddr_t gpa, hpa;
 945         struct mem_map *mm;
 946 #ifdef __FreeBSD__
 947         void *vp, *cookie, *host_domain;
 948 #else
 949         void *vp, *cookie, *host_domain __unused;
 950 #endif
 951 
 952         sz = PAGE_SIZE;
 953         host_domain = iommu_host_domain();
 954 
 955         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 956                 mm = &vm->mem_maps[i];
 957                 if (!sysmem_mapping(vm, mm))
 958                         continue;
 959 
 960                 if (map) {
 961                         KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
 962                             ("iommu map found invalid memmap %lx/%lx/%x",
 963                             mm->gpa, mm->len, mm->flags));
 964                         if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
 965                                 continue;
 966                         mm->flags |= VM_MEMMAP_F_IOMMU;
 967                 } else {
 968                         if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
 969                                 continue;
 970                         mm->flags &= ~VM_MEMMAP_F_IOMMU;
 971                         KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
 972                             ("iommu unmap found invalid memmap %lx/%lx/%x",
 973                             mm->gpa, mm->len, mm->flags));
 974                 }
 975 
 976                 gpa = mm->gpa;
 977                 while (gpa < mm->gpa + mm->len) {
 978                         vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, PROT_WRITE,
 979                             &cookie);
 980                         KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
 981                             vm_name(vm), gpa));
 982 
 983                         vm_gpa_release(cookie);
 984 
 985                         hpa = DMAP_TO_PHYS((uintptr_t)vp);
 986                         if (map) {
 987                                 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
 988 #ifdef __FreeBSD__
 989                                 iommu_remove_mapping(host_domain, hpa, sz);
 990 #endif
 991                         } else {
 992                                 iommu_remove_mapping(vm->iommu, gpa, sz);
 993 #ifdef __FreeBSD__
 994                                 iommu_create_mapping(host_domain, hpa, hpa, sz);
 995 #endif
 996                         }
 997 
 998                         gpa += PAGE_SIZE;
 999                 }
1000         }
1001 
1002         /*
1003          * Invalidate the cached translations associated with the domain
1004          * from which pages were removed.
1005          */
1006 #ifdef __FreeBSD__
1007         if (map)
1008                 iommu_invalidate_tlb(host_domain);
1009         else
1010                 iommu_invalidate_tlb(vm->iommu);
1011 #else
1012         iommu_invalidate_tlb(vm->iommu);
1013 #endif
1014 }
1015 
1016 #define vm_iommu_unmap(vm)      vm_iommu_modify((vm), false)
1017 #define vm_iommu_map(vm)        vm_iommu_modify((vm), true)
1018 
1019 int
1020 vm_unassign_pptdev(struct vm *vm, int pptfd)
1021 {
1022         int error;
1023 
1024         error = ppt_unassign_device(vm, pptfd);
1025         if (error)
1026                 return (error);
1027 
1028         if (ppt_assigned_devices(vm) == 0)
1029                 vm_iommu_unmap(vm);
1030 
1031         return (0);
1032 }
1033 
1034 int
1035 vm_assign_pptdev(struct vm *vm, int pptfd)
1036 {
1037         int error;
1038         vm_paddr_t maxaddr;
1039 
1040         /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1041         if (ppt_assigned_devices(vm) == 0) {
1042                 KASSERT(vm->iommu == NULL,
1043                     ("vm_assign_pptdev: iommu must be NULL"));
1044                 maxaddr = vmm_sysmem_maxaddr(vm);
1045                 vm->iommu = iommu_create_domain(maxaddr);
1046                 if (vm->iommu == NULL)
1047                         return (ENXIO);
1048                 vm_iommu_map(vm);
1049         }
1050 
1051         error = ppt_assign_device(vm, pptfd);
1052         return (error);
1053 }
1054 
1055 void *
1056 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
1057     void **cookie)
1058 {
1059         int i, count, pageoff;
1060         struct mem_map *mm;
1061         vm_page_t m;
1062 #ifdef INVARIANTS
1063         /*
1064          * All vcpus are frozen by ioctls that modify the memory map
1065          * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
1066          * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
1067          */
1068         int state;
1069         KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
1070             __func__, vcpuid));
1071         for (i = 0; i < vm->maxcpus; i++) {
1072                 if (vcpuid != -1 && vcpuid != i)
1073                         continue;
1074                 state = vcpu_get_state(vm, i, NULL);
1075                 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1076                     __func__, state));
1077         }
1078 #endif
1079         pageoff = gpa & PAGE_MASK;
1080         if (len > PAGE_SIZE - pageoff)
1081                 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1082 
1083         count = 0;
1084         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1085                 mm = &vm->mem_maps[i];
1086                 if (mm->len == 0) {
1087                         continue;
1088                 }
1089                 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1090                         count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1091                             trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1092                         break;
1093                 }
1094         }
1095 
1096         if (count == 1) {
1097                 *cookie = m;
1098                 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1099         } else {
1100                 *cookie = NULL;
1101                 return (NULL);
1102         }
1103 }
1104 
1105 void
1106 vm_gpa_release(void *cookie)
1107 {
1108         vm_page_t m = cookie;
1109 
1110         vm_page_unwire(m, PQ_ACTIVE);
1111 }
1112 
1113 int
1114 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1115 {
1116 
1117         if (vcpu < 0 || vcpu >= vm->maxcpus)
1118                 return (EINVAL);
1119 
1120         if (reg >= VM_REG_LAST)
1121                 return (EINVAL);
1122 
1123         return (VMGETREG(vm->cookie, vcpu, reg, retval));
1124 }
1125 
1126 int
1127 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1128 {
1129         struct vcpu *vcpu;
1130         int error;
1131 
1132         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1133                 return (EINVAL);
1134 
1135         if (reg >= VM_REG_LAST)
1136                 return (EINVAL);
1137 
1138         error = VMSETREG(vm->cookie, vcpuid, reg, val);
1139         if (error || reg != VM_REG_GUEST_RIP)
1140                 return (error);
1141 
1142         /* Set 'nextrip' to match the value of %rip */
1143         VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1144         vcpu = &vm->vcpu[vcpuid];
1145         vcpu->nextrip = val;
1146         return (0);
1147 }
1148 
1149 static bool
1150 is_descriptor_table(int reg)
1151 {
1152         switch (reg) {
1153         case VM_REG_GUEST_IDTR:
1154         case VM_REG_GUEST_GDTR:
1155                 return (true);
1156         default:
1157                 return (false);
1158         }
1159 }
1160 
1161 static bool
1162 is_segment_register(int reg)
1163 {
1164         switch (reg) {
1165         case VM_REG_GUEST_ES:
1166         case VM_REG_GUEST_CS:
1167         case VM_REG_GUEST_SS:
1168         case VM_REG_GUEST_DS:
1169         case VM_REG_GUEST_FS:
1170         case VM_REG_GUEST_GS:
1171         case VM_REG_GUEST_TR:
1172         case VM_REG_GUEST_LDTR:
1173                 return (true);
1174         default:
1175                 return (false);
1176         }
1177 }
1178 
1179 int
1180 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1181 {
1182 
1183         if (vcpu < 0 || vcpu >= vm->maxcpus)
1184                 return (EINVAL);
1185 
1186         if (!is_segment_register(reg) && !is_descriptor_table(reg))
1187                 return (EINVAL);
1188 
1189         return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1190 }
1191 
1192 int
1193 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1194 {
1195         if (vcpu < 0 || vcpu >= vm->maxcpus)
1196                 return (EINVAL);
1197 
1198         if (!is_segment_register(reg) && !is_descriptor_table(reg))
1199                 return (EINVAL);
1200 
1201         return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1202 }
1203 
1204 int
1205 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1206 {
1207         struct vcpu *vcpu;
1208 
1209         if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1210                 return (EINVAL);
1211         }
1212 
1213         vcpu = &vm->vcpu[vcpuid];
1214 
1215         vcpu_lock(vcpu);
1216         *state = vcpu->run_state;
1217         *sipi_vec = vcpu->sipi_vector;
1218         vcpu_unlock(vcpu);
1219 
1220         return (0);
1221 }
1222 
1223 int
1224 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1225 {
1226         struct vcpu *vcpu;
1227 
1228         if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1229                 return (EINVAL);
1230         }
1231         if (!VRS_IS_VALID(state)) {
1232                 return (EINVAL);
1233         }
1234 
1235         vcpu = &vm->vcpu[vcpuid];
1236 
1237         vcpu_lock(vcpu);
1238         vcpu->run_state = state;
1239         vcpu->sipi_vector = sipi_vec;
1240         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1241         vcpu_unlock(vcpu);
1242 
1243         return (0);
1244 }
1245 
1246 
1247 static void
1248 restore_guest_fpustate(struct vcpu *vcpu)
1249 {
1250 
1251         /* flush host state to the pcb */
1252         fpuexit(curthread);
1253 
1254         /* restore guest FPU state */
1255         fpu_stop_emulating();
1256         fpurestore(vcpu->guestfpu);
1257 
1258         /* restore guest XCR0 if XSAVE is enabled in the host */
1259         if (rcr4() & CR4_XSAVE)
1260                 load_xcr(0, vcpu->guest_xcr0);
1261 
1262         /*
1263          * The FPU is now "dirty" with the guest's state so turn on emulation
1264          * to trap any access to the FPU by the host.
1265          */
1266         fpu_start_emulating();
1267 }
1268 
1269 static void
1270 save_guest_fpustate(struct vcpu *vcpu)
1271 {
1272 
1273         if ((rcr0() & CR0_TS) == 0)
1274                 panic("fpu emulation not enabled in host!");
1275 
1276         /* save guest XCR0 and restore host XCR0 */
1277         if (rcr4() & CR4_XSAVE) {
1278                 vcpu->guest_xcr0 = rxcr(0);
1279                 load_xcr(0, vmm_get_host_xcr0());
1280         }
1281 
1282         /* save guest FPU state */
1283         fpu_stop_emulating();
1284         fpusave(vcpu->guestfpu);
1285         /*
1286          * When the host state has been restored, we should not re-enable
1287          * CR0.TS on illumos for eager FPU.
1288          */
1289 }
1290 
1291 static int
1292 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1293     bool from_idle)
1294 {
1295         struct vcpu *vcpu;
1296         int error;
1297 
1298         vcpu = &vm->vcpu[vcpuid];
1299         vcpu_assert_locked(vcpu);
1300 
1301         /*
1302          * State transitions from the vmmdev_ioctl() must always begin from
1303          * the VCPU_IDLE state. This guarantees that there is only a single
1304          * ioctl() operating on a vcpu at any point.
1305          */
1306         if (from_idle) {
1307                 while (vcpu->state != VCPU_IDLE) {
1308                         vcpu->reqidle = 1;
1309                         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1310                         VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1311                             "idle requested", vcpu_state2str(vcpu->state));
1312                         cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1313                 }
1314         } else {
1315                 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1316                     "vcpu idle state"));
1317         }
1318 
1319         if (vcpu->state == VCPU_RUNNING) {
1320                 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1321                     "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1322         } else {
1323                 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1324                     "vcpu that is not running", vcpu->hostcpu));
1325         }
1326 
1327         /*
1328          * The following state transitions are allowed:
1329          * IDLE -> FROZEN -> IDLE
1330          * FROZEN -> RUNNING -> FROZEN
1331          * FROZEN -> SLEEPING -> FROZEN
1332          */
1333         switch (vcpu->state) {
1334         case VCPU_IDLE:
1335         case VCPU_RUNNING:
1336         case VCPU_SLEEPING:
1337                 error = (newstate != VCPU_FROZEN);
1338                 break;
1339         case VCPU_FROZEN:
1340                 error = (newstate == VCPU_FROZEN);
1341                 break;
1342         default:
1343                 error = 1;
1344                 break;
1345         }
1346 
1347         if (error)
1348                 return (EBUSY);
1349 
1350         VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1351             vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1352 
1353         vcpu->state = newstate;
1354         if (newstate == VCPU_RUNNING)
1355                 vcpu->hostcpu = curcpu;
1356         else
1357                 vcpu->hostcpu = NOCPU;
1358 
1359         if (newstate == VCPU_IDLE) {
1360                 cv_broadcast(&vcpu->state_cv);
1361         }
1362 
1363         return (0);
1364 }
1365 
1366 static void
1367 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1368 {
1369         int error;
1370 
1371         if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1372                 panic("Error %d setting state to %d\n", error, newstate);
1373 }
1374 
1375 static void
1376 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1377 {
1378         int error;
1379 
1380         if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1381                 panic("Error %d setting state to %d", error, newstate);
1382 }
1383 
1384 /*
1385  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1386  */
1387 static int
1388 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1389 {
1390         struct vcpu *vcpu;
1391         int vcpu_halted, vm_halted;
1392         bool userspace_exit = false;
1393 
1394         KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1395 
1396         vcpu = &vm->vcpu[vcpuid];
1397         vcpu_halted = 0;
1398         vm_halted = 0;
1399 
1400         vcpu_lock(vcpu);
1401         while (1) {
1402                 /*
1403                  * Do a final check for pending interrupts (including NMI and
1404                  * INIT) before putting this thread to sleep.
1405                  */
1406                 if (vm_nmi_pending(vm, vcpuid))
1407                         break;
1408                 if (vcpu_run_state_pending(vm, vcpuid))
1409                         break;
1410                 if (!intr_disabled) {
1411                         if (vm_extint_pending(vm, vcpuid) ||
1412                             vlapic_pending_intr(vcpu->vlapic, NULL)) {
1413                                 break;
1414                         }
1415                 }
1416 
1417                 /*
1418                  * Also check for software events which would cause a wake-up.
1419                  * This will set the appropriate exitcode directly, rather than
1420                  * requiring a trip through VM_RUN().
1421                  */
1422                 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1423                         userspace_exit = true;
1424                         break;
1425                 }
1426 
1427                 /*
1428                  * Some Linux guests implement "halt" by having all vcpus
1429                  * execute HLT with interrupts disabled. 'halted_cpus' keeps
1430                  * track of the vcpus that have entered this state. When all
1431                  * vcpus enter the halted state the virtual machine is halted.
1432                  */
1433                 if (intr_disabled) {
1434                         if (!vcpu_halted && halt_detection_enabled) {
1435                                 vcpu_halted = 1;
1436                                 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1437                         }
1438                         if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1439                                 vm_halted = 1;
1440                                 break;
1441                         }
1442                 }
1443 
1444                 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1445                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1446                 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1447                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1448                 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1449         }
1450 
1451         if (vcpu_halted)
1452                 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1453 
1454         vcpu_unlock(vcpu);
1455 
1456         if (vm_halted)
1457                 vm_suspend(vm, VM_SUSPEND_HALT);
1458 
1459         return (userspace_exit ? -1 : 0);
1460 }
1461 
1462 static int
1463 vm_handle_paging(struct vm *vm, int vcpuid)
1464 {
1465         int rv, ftype;
1466         struct vm_map *map;
1467         struct vcpu *vcpu;
1468         struct vm_exit *vme;
1469 
1470         vcpu = &vm->vcpu[vcpuid];
1471         vme = &vcpu->exitinfo;
1472 
1473         KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1474             __func__, vme->inst_length));
1475 
1476         ftype = vme->u.paging.fault_type;
1477         KASSERT(ftype == PROT_READ ||
1478             ftype == PROT_WRITE || ftype == PROT_EXEC,
1479             ("vm_handle_paging: invalid fault_type %d", ftype));
1480 
1481         if (ftype == PROT_READ || ftype == PROT_WRITE) {
1482                 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1483                     vme->u.paging.gpa, ftype);
1484                 if (rv == 0) {
1485                         VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
1486                             ftype == PROT_READ ? "accessed" : "dirty",
1487                             vme->u.paging.gpa);
1488                         goto done;
1489                 }
1490         }
1491 
1492         map = &vm->vmspace->vm_map;
1493         rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1494 
1495         VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1496             "ftype = %d", rv, vme->u.paging.gpa, ftype);
1497 
1498         if (rv != 0)
1499                 return (EFAULT);
1500 done:
1501         return (0);
1502 }
1503 
1504 int
1505 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1506     int rsize)
1507 {
1508         int err = ESRCH;
1509 
1510         if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1511                 err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1512         } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1513                 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1514         } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1515                 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1516         }
1517 
1518         return (err);
1519 }
1520 
1521 int
1522 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1523     int wsize)
1524 {
1525         int err = ESRCH;
1526 
1527         if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1528                 err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1529         } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1530                 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1531         } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1532                 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1533         }
1534 
1535         return (err);
1536 }
1537 
1538 static int
1539 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1540 {
1541         struct vie *vie;
1542         struct vcpu *vcpu;
1543         struct vm_exit *vme;
1544         uint64_t inst_addr;
1545         int error, fault, cs_d;
1546 
1547         vcpu = &vm->vcpu[vcpuid];
1548         vme = &vcpu->exitinfo;
1549         vie = vcpu->vie_ctx;
1550 
1551         KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1552             __func__, vme->inst_length));
1553 
1554         inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1555         cs_d = vme->u.mmio_emul.cs_d;
1556 
1557         VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1558             vme->u.mmio_emul.gpa);
1559 
1560         /* Fetch the faulting instruction */
1561         if (vie_needs_fetch(vie)) {
1562                 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1563                     &fault);
1564                 if (error != 0) {
1565                         return (error);
1566                 } else if (fault) {
1567                         /*
1568                          * If a fault during instruction fetch was encountered,
1569                          * it will have asserted that the appropriate exception
1570                          * be injected at next entry.
1571                          * No further work is required.
1572                          */
1573                         return (0);
1574                 }
1575         }
1576 
1577         if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1578                 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1579                     inst_addr);
1580                 /* Dump (unrecognized) instruction bytes in userspace */
1581                 vie_fallback_exitinfo(vie, vme);
1582                 return (-1);
1583         }
1584         if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1585             vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1586                 /* Decoded GLA does not match GLA from VM exit state */
1587                 vie_fallback_exitinfo(vie, vme);
1588                 return (-1);
1589         }
1590 
1591 repeat:
1592         error = vie_emulate_mmio(vie, vm, vcpuid);
1593         if (error < 0) {
1594                 /*
1595                  * MMIO not handled by any of the in-kernel-emulated devices, so
1596                  * make a trip out to userspace for it.
1597                  */
1598                 vie_exitinfo(vie, vme);
1599         } else if (error == EAGAIN) {
1600                 /*
1601                  * Continue emulating the rep-prefixed instruction, which has
1602                  * not completed its iterations.
1603                  *
1604                  * Since this may be emulated entirely in-kernel with a
1605                  * high repetition count (causing a tight spin), it should
1606                  * defer to any pending yield conditions.
1607                  */
1608                 if (!vcpu_should_yield(vm, vcpuid)) {
1609                         goto repeat;
1610                 } else {
1611                         /*
1612                          * Defer to the contending load by making a trip to
1613                          * userspace with a no-op (BOGUS) exit reason.
1614                          */
1615                         vie_reset(vie);
1616                         vme->exitcode = VM_EXITCODE_BOGUS;
1617                         return (-1);
1618                 }
1619         } else if (error == 0) {
1620                 /* Update %rip now that instruction has been emulated */
1621                 vie_advance_pc(vie, &vcpu->nextrip);
1622         }
1623         return (error);
1624 }
1625 
1626 static int
1627 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1628 {
1629         struct vcpu *vcpu;
1630         struct vie *vie;
1631         int err;
1632 
1633         vcpu = &vm->vcpu[vcpuid];
1634         vie = vcpu->vie_ctx;
1635 
1636 repeat:
1637         err = vie_emulate_inout(vie, vm, vcpuid);
1638 
1639         if (err < 0) {
1640                 /*
1641                  * In/out not handled by any of the in-kernel-emulated devices,
1642                  * so make a trip out to userspace for it.
1643                  */
1644                 vie_exitinfo(vie, vme);
1645                 return (err);
1646         } else if (err == EAGAIN) {
1647                 /*
1648                  * Continue emulating the rep-prefixed ins/outs instruction,
1649                  * which has not completed its iterations.
1650                  *
1651                  * Since this may be emulated entirely in-kernel with a
1652                  * high repetition count (causing a tight spin), it should
1653                  * defer to any pending yield conditions.
1654                  */
1655                 if (!vcpu_should_yield(vm, vcpuid)) {
1656                         goto repeat;
1657                 } else {
1658                         /*
1659                          * Defer to the contending load by making a trip to
1660                          * userspace with a no-op (BOGUS) exit reason.
1661                          */
1662                         vie_reset(vie);
1663                         vme->exitcode = VM_EXITCODE_BOGUS;
1664                         return (-1);
1665                 }
1666         } else if (err != 0) {
1667                 /* Emulation failure.  Bail all the way out to userspace. */
1668                 vme->exitcode = VM_EXITCODE_INST_EMUL;
1669                 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1670                 return (-1);
1671         }
1672 
1673         vie_advance_pc(vie, &vcpu->nextrip);
1674         return (0);
1675 }
1676 
1677 static int
1678 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1679 {
1680         struct vie *vie;
1681         struct vcpu *vcpu;
1682         struct vm_exit *vme;
1683         uint64_t cs_base;
1684         int error, fault, cs_d;
1685 
1686         vcpu = &vm->vcpu[vcpuid];
1687         vme = &vcpu->exitinfo;
1688         vie = vcpu->vie_ctx;
1689 
1690         vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1691 
1692         /* Fetch the faulting instruction */
1693         ASSERT(vie_needs_fetch(vie));
1694         error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1695             &fault);
1696         if (error != 0) {
1697                 return (error);
1698         } else if (fault) {
1699                 /*
1700                  * If a fault during instruction fetch was encountered, it
1701                  * will have asserted that the appropriate exception be
1702                  * injected at next entry.  No further work is required.
1703                  */
1704                 return (0);
1705         }
1706 
1707         if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1708                 /* Dump (unrecognized) instruction bytes in userspace */
1709                 vie_fallback_exitinfo(vie, vme);
1710                 return (-1);
1711         }
1712 
1713         error = vie_emulate_other(vie, vm, vcpuid);
1714         if (error != 0) {
1715                 /*
1716                  * Instruction emulation was unable to complete successfully, so
1717                  * kick it out to userspace for handling.
1718                  */
1719                 vie_fallback_exitinfo(vie, vme);
1720         } else {
1721                 /* Update %rip now that instruction has been emulated */
1722                 vie_advance_pc(vie, &vcpu->nextrip);
1723         }
1724         return (error);
1725 }
1726 
1727 static int
1728 vm_handle_suspend(struct vm *vm, int vcpuid)
1729 {
1730         int i;
1731         struct vcpu *vcpu;
1732 
1733         vcpu = &vm->vcpu[vcpuid];
1734 
1735         CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1736 
1737         /*
1738          * Wait until all 'active_cpus' have suspended themselves.
1739          */
1740         vcpu_lock(vcpu);
1741         vcpu_ustate_change(vm, vcpuid, VU_INIT);
1742         while (1) {
1743                 int rc;
1744 
1745                 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1746                         VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1747                         break;
1748                 }
1749 
1750                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1751                 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1752                     TR_CLOCK_TICK);
1753                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1754 
1755                 /*
1756                  * If the userspace process driving the instance is killed, any
1757                  * vCPUs yet to be marked suspended (because they are not
1758                  * VM_RUN-ing in the kernel presently) will never reach that
1759                  * state.
1760                  *
1761                  * To avoid vm_handle_suspend() getting stuck in the kernel
1762                  * waiting for those vCPUs, offer a bail-out even though it
1763                  * means returning without all vCPUs in a suspended state.
1764                  */
1765                 if (rc <= 0) {
1766                         if ((curproc->p_flag & SEXITING) != 0) {
1767                                 break;
1768                         }
1769                 }
1770         }
1771         vcpu_unlock(vcpu);
1772 
1773         /*
1774          * Wakeup the other sleeping vcpus and return to userspace.
1775          */
1776         for (i = 0; i < vm->maxcpus; i++) {
1777                 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1778                         vcpu_notify_event(vm, i);
1779                 }
1780         }
1781 
1782         return (-1);
1783 }
1784 
1785 static int
1786 vm_handle_reqidle(struct vm *vm, int vcpuid)
1787 {
1788         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1789 
1790         vcpu_lock(vcpu);
1791         KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1792         vcpu->reqidle = 0;
1793         vcpu_unlock(vcpu);
1794         return (-1);
1795 }
1796 
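     /*
      * Handle a VM_EXITCODE_RUN_STATE exit: apply any pending INIT and/or SIPI
      * to the vCPU and, if it is still not runnable, sleep until it becomes so
      * or a bail-out condition (suspend, reqidle, etc.) requires a return to
      * userspace.
      */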
1797 static int
1798 vm_handle_run_state(struct vm *vm, int vcpuid)
1799 {
1800         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1801         bool handled = false;
1802 
1803         vcpu_lock(vcpu);
1804         while (1) {
1805                 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1806                         vcpu_unlock(vcpu);
1807                         VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1808                         vcpu_lock(vcpu);
1809 
1810                         vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1811                         vcpu->run_state |= VRS_INIT;
1812                 }
1813 
1814                 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1815                     (VRS_INIT | VRS_PEND_SIPI)) {
1816                         const uint8_t vector = vcpu->sipi_vector;
1817 
1818                         vcpu_unlock(vcpu);
1819                         VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1820                         vcpu_lock(vcpu);
1821 
1822                         vcpu->run_state &= ~VRS_PEND_SIPI;
1823                         vcpu->run_state |= VRS_RUN;
1824                 }
1825 
1826                 /*
1827                  * If the vCPU is now in the running state, there is no need to
1828                  * wait for anything prior to re-entry.
1829                  */
1830                 if ((vcpu->run_state & VRS_RUN) != 0) {
1831                         handled = true;
1832                         break;
1833                 }
1834 
1835                 /*
1836                  * Also check for software events which would cause a wake-up.
1837                  * This will set the appropriate exitcode directly, rather than
1838                  * requiring a trip through VM_RUN().
1839                  */
1840                 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1841                         break;
1842                 }
1843 
1844                 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1845                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1846                 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1847                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1848                 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1849         }
1850         vcpu_unlock(vcpu);
1851 
1852         return (handled ? 0 : -1);
1853 }
1854 
1855 static int
1856 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1857 {
1858         const uint32_t code = vme->u.msr.code;
1859         uint64_t val = 0;
1860 
1861         switch (code) {
1862         case MSR_MCG_CAP:
1863         case MSR_MCG_STATUS:
1864                 val = 0;
1865                 break;
1866 
1867         case MSR_MTRRcap:
1868         case MSR_MTRRdefType:
1869         case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1870         case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1871         case MSR_MTRR64kBase:
1872                 val = 0;
1873                 break;
1874 
1875         case MSR_TSC:
1876                 /*
1877                  * In all likelihood, this should always be handled in guest
1878                  * context by VMX/SVM rather than taking an exit.  (Both VMX and
1879                  * SVM pass through read-only access to MSR_TSC to the guest.)
1880                  *
1881                  * No physical offset is requested of vcpu_tsc_offset() since
1882                  * rdtsc_offset() takes care of that instead.
1883                  */
1884                 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
1885                 break;
1886 
1887         default:
1888                 /*
1889                  * Anything not handled at this point will be kicked out to
1890                  * userspace for attempted processing there.
1891                  */
1892                 return (-1);
1893         }
1894 
1895         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
1896             val & 0xffffffff));
1897         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
1898             val >> 32));
1899         return (0);
1900 }
1901 
1902 static int
1903 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1904 {
1905         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1906         const uint32_t code = vme->u.msr.code;
1907         const uint64_t val = vme->u.msr.wval;
1908 
1909         switch (code) {
1910         case MSR_MCG_CAP:
1911         case MSR_MCG_STATUS:
1912                 /* Ignore writes */
1913                 break;
1914 
1915         case MSR_MTRRcap:
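                     /* MTRRcap is read-only; a write attempt warrants a #GP */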
1916                 vm_inject_gp(vm, vcpuid);
1917                 break;
1918         case MSR_MTRRdefType:
1919         case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1920         case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1921         case MSR_MTRR64kBase:
1922                 /* Ignore writes */
1923                 break;
1924 
1925         case MSR_TSC:
1926                 /*
1927                  * The effect of writing the TSC MSR is that a subsequent read
1928                  * of the TSC would report that value written (plus any time
1929                  * elapsed between the write and the read).  The guest TSC value
1930                  * is calculated from a global offset for the guest (which
1931                  * effectively makes its TSC read 0 at guest boot) and a
1932                  * per-vCPU offset to handle these writes to the MSR.
1933                  *
1934                  * To calculate that per-vCPU offset, we can work backwards from
1935                  * the guest value at the time of write:
1936                  *
1937                  * value = host TSC + VM boot offset + vCPU offset
1938                  *
1939                  * so therefore:
1940                  *
1941                  * value - host TSC - VM boot offset = vCPU offset
1942                  */
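                     /*
                      * As a worked example: if the guest writes value V while
                      * the host TSC reads H, the per-vCPU offset becomes
                      * V - B - H (with B being the VM boot offset).  A later
                      * read taken at host TSC H' then yields
                      * H' + B + (V - B - H) = V + (H' - H): the written value
                      * plus the time elapsed since the write.
                      */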
1943                 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
1944                 break;
1945 
1946         default:
1947                 /*
1948                  * Anything not handled at this point will be kicked out to
1949                  * userspace for attempted processing there.
1950                  */
1951                 return (-1);
1952         }
1953 
1954         return (0);
1955 }
1956 
1957 int
1958 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1959 {
1960         int i;
1961 
1962         if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1963                 return (EINVAL);
1964 
1965         if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1966                 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1967                     vm->suspend, how);
1968                 return (EALREADY);
1969         }
1970 
1971         VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1972 
1973         /*
1974          * Notify all active vcpus that they are now suspended.
1975          */
1976         for (i = 0; i < vm->maxcpus; i++) {
1977                 if (CPU_ISSET(i, &vm->active_cpus))
1978                         vcpu_notify_event(vm, i);
1979         }
1980 
1981         return (0);
1982 }
1983 
1984 void
1985 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1986 {
1987         struct vm_exit *vmexit;
1988 
1989         vmexit = vm_exitinfo(vm, vcpuid);
1990         vmexit->rip = rip;
1991         vmexit->inst_length = 0;
1992         vmexit->exitcode = VM_EXITCODE_RUN_STATE;
1993         vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
1994 }
1995 
1996 /*
1997  * Some vmm resources, such as the lapic, may have CPU-specific resources
1998  * allocated to them which would benefit from migration onto the host CPU which
1999  * is processing the vcpu state.
2000  */
2001 static void
2002 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2003 {
2004         /*
2005          * Localizing cyclic resources requires acquisition of cpu_lock, and
2006          * doing so with kpreempt disabled is a recipe for deadlock disaster.
2007          */
2008         VERIFY(curthread->t_preempt == 0);
2009 
2010         /*
2011          * Do not bother with localization if this vCPU is about to return to
2012          * the host CPU it was last localized to.
2013          */
2014         if (vcpu->lastloccpu == curcpu)
2015                 return;
2016 
2017         /*
2018          * Localize system-wide resources to the primary boot vCPU.  While any
2019          * of the other vCPUs may access them, it keeps the potential interrupt
2020          * footprint constrained to CPUs involved with this instance.
2021          */
2022         if (vcpu == &vm->vcpu[0]) {
2023                 vhpet_localize_resources(vm->vhpet);
2024                 vrtc_localize_resources(vm->vrtc);
2025                 vatpit_localize_resources(vm->vatpit);
2026         }
2027 
2028         vlapic_localize_resources(vcpu->vlapic);
2029 
2030         vcpu->lastloccpu = curcpu;
2031 }
2032 
2033 static void
2034 vmm_savectx(void *arg)
2035 {
2036         vm_thread_ctx_t *vtc = arg;
2037         struct vm *vm = vtc->vtc_vm;
2038         const int vcpuid = vtc->vtc_vcpuid;
2039 
2040         if (ops->vmsavectx != NULL) {
2041                 ops->vmsavectx(vm->cookie, vcpuid);
2042         }
2043 
2044         /*
2045          * Account for going off-cpu, unless the vCPU is idled, in which
2046          * case being off-cpu is the entire point.
2047          */
2048         if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2049                 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2050                 vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2051         }
2052 
2053         /*
2054          * If the CPU holds the restored guest FPU state, save it and restore
2055          * the host FPU state before this thread goes off-cpu.
2056          */
2057         if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2058                 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2059 
2060                 save_guest_fpustate(vcpu);
2061                 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2062         }
2063 }
2064 
2065 static void
2066 vmm_restorectx(void *arg)
2067 {
2068         vm_thread_ctx_t *vtc = arg;
2069         struct vm *vm = vtc->vtc_vm;
2070         const int vcpuid = vtc->vtc_vcpuid;
2071 
2072         /* Complete microstate accounting for vCPU being off-cpu */
2073         if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2074                 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2075         }
2076 
2077         /*
2078          * When coming back on-cpu, only restore the guest FPU status if the
2079          * thread is in a context marked as requiring it.  This should be rare,
2080          * occurring only when a future logic error results in a voluntary
2081          * sleep during the VMRUN critical section.
2082          *
2083          * The common case will result in elision of the guest FPU state
2084          * restoration, deferring that action until it is clearly necessary
2085          * during vm_run.
2086          */
2087         VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2088         if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2089                 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2090 
2091                 restore_guest_fpustate(vcpu);
2092                 vtc->vtc_status |= VTCS_FPU_RESTORED;
2093         }
2094 
2095         if (ops->vmrestorectx != NULL) {
2096                 ops->vmrestorectx(vm->cookie, vcpuid);
2097         }
2099 }
2100 
2101 /*
2102  * If we're in removectx(), we might still have state to tidy up.
2103  */
2104 static void
2105 vmm_freectx(void *arg, int isexec)
2106 {
2107         vmm_savectx(arg);
2108 }
2109 
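     /*
      * Apply any entry actions requested by userspace, such as fulfilling an
      * MMIO or in/out operation which was previously punted out for userspace
      * handling, before the vCPU re-enters guest context.
      */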
2110 static int
2111 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2112     struct vm_exit *vme)
2113 {
2114         struct vcpu *vcpu;
2115         struct vie *vie;
2116         int err;
2117 
2118         vcpu = &vm->vcpu[vcpuid];
2119         vie = vcpu->vie_ctx;
2120         err = 0;
2121 
2122         switch (entry->cmd) {
2123         case VEC_DEFAULT:
2124                 return (0);
2125         case VEC_DISCARD_INSTR:
2126                 vie_reset(vie);
2127                 return (0);
2128         case VEC_FULFILL_MMIO:
2129                 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2130                 if (err == 0) {
2131                         err = vie_emulate_mmio(vie, vm, vcpuid);
2132                         if (err == 0) {
2133                                 vie_advance_pc(vie, &vcpu->nextrip);
2134                         } else if (err < 0) {
2135                                 vie_exitinfo(vie, vme);
2136                         } else if (err == EAGAIN) {
2137                                 /*
2138                                  * Clear the instruction emulation state in
2139                                  * order to re-enter VM context and continue
2140                                  * this 'rep <instruction>'
2141                                  */
2142                                 vie_reset(vie);
2143                                 err = 0;
2144                         }
2145                 }
2146                 break;
2147         case VEC_FULFILL_INOUT:
2148                 err = vie_fulfill_inout(vie, &entry->u.inout);
2149                 if (err == 0) {
2150                         err = vie_emulate_inout(vie, vm, vcpuid);
2151                         if (err == 0) {
2152                                 vie_advance_pc(vie, &vcpu->nextrip);
2153                         } else if (err < 0) {
2154                                 vie_exitinfo(vie, vme);
2155                         } else if (err == EAGAIN) {
2156                                 /*
2157                                  * Clear the instruction emulation state in
2158                                  * order to re-enter VM context and continue
2159                                  * this 'rep ins/outs'
2160                                  */
2161                                 vie_reset(vie);
2162                                 err = 0;
2163                         }
2164                 }
2165                 break;
2166         default:
2167                 return (EINVAL);
2168         }
2169         return (err);
2170 }
2171 
2172 static int
2173 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2174 {
2175         struct vie *vie;
2176 
2177         vie = vm->vcpu[vcpuid].vie_ctx;
2178 
2179         if (vie_pending(vie)) {
2180                 /*
2181                  * Userspace has not fulfilled the pending needs of the
2182                  * instruction emulation, so bail back out.
2183                  */
2184                 vie_exitinfo(vie, vme);
2185                 return (-1);
2186         }
2187 
2188         return (0);
2189 }
2190 
2191 int
2192 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2193 {
2194         int error;
2195         struct vcpu *vcpu;
2196         struct vm_exit *vme;
2197         bool intr_disabled;
2198         pmap_t pmap;
2199         vm_thread_ctx_t vtc;
2200         int affinity_type = CPU_CURRENT;
2201 
2202         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2203                 return (EINVAL);
2204         if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2205                 return (EINVAL);
2206         if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2207                 return (EINVAL);
2208 
2209         pmap = vmspace_pmap(vm->vmspace);
2210         vcpu = &vm->vcpu[vcpuid];
2211         vme = &vcpu->exitinfo;
2212 
2213         vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2214 
2215         vtc.vtc_vm = vm;
2216         vtc.vtc_vcpuid = vcpuid;
2217         vtc.vtc_status = 0;
2218         installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2219             NULL, vmm_freectx, NULL);
2220 
2221         error = vm_entry_actions(vm, vcpuid, entry, vme);
2222         if (error != 0) {
2223                 goto exit;
2224         }
2225 
2226 restart:
2227         error = vm_loop_checks(vm, vcpuid, vme);
2228         if (error != 0) {
2229                 goto exit;
2230         }
2231 
2232         thread_affinity_set(curthread, affinity_type);
2233         /*
2234          * Resource localization should happen after the CPU affinity for the
2235          * thread has been set to ensure that access from restricted contexts,
2236          * such as VMX-accelerated APIC operations, can occur without inducing
2237          * cyclic cross-calls.
2238          *
2239          * This must be done prior to disabling kpreempt via critical_enter().
2240          */
2241         vm_localize_resources(vm, vcpu);
2242         affinity_type = CPU_CURRENT;
2243         critical_enter();
2244 
2245         KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2246             ("vm_run: absurd pm_active"));
2247 
2248         /* Force a trip through update_sregs to reload %fs/%gs and friends */
2249         PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2250 
2251         if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2252                 restore_guest_fpustate(vcpu);
2253                 vtc.vtc_status |= VTCS_FPU_RESTORED;
2254         }
2255         vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2256 
2257         vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2258         error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2259         vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2260 
2261         /*
2262          * Once clear of the delicate contexts comprising the VM_RUN handler,
2263          * thread CPU affinity can be loosened while other processing occurs.
2264          */
2265         vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2266         thread_affinity_clear(curthread);
2267         critical_exit();
2268 
2269         if (error != 0) {
2270                 /* Communicate out any error from VMRUN() above */
2271                 goto exit;
2272         }
2273 
2274         vcpu->nextrip = vme->rip + vme->inst_length;
2275         switch (vme->exitcode) {
2276         case VM_EXITCODE_REQIDLE:
2277                 error = vm_handle_reqidle(vm, vcpuid);
2278                 break;
2279         case VM_EXITCODE_RUN_STATE:
2280                 error = vm_handle_run_state(vm, vcpuid);
2281                 break;
2282         case VM_EXITCODE_SUSPENDED:
2283                 error = vm_handle_suspend(vm, vcpuid);
2284                 break;
2285         case VM_EXITCODE_IOAPIC_EOI:
2286                 vioapic_process_eoi(vm, vcpuid,
2287                     vme->u.ioapic_eoi.vector);
2288                 break;
2289         case VM_EXITCODE_HLT:
2290                 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2291                 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2292                 break;
2293         case VM_EXITCODE_PAGING:
2294                 error = vm_handle_paging(vm, vcpuid);
2295                 break;
2296         case VM_EXITCODE_MMIO_EMUL:
2297                 error = vm_handle_mmio_emul(vm, vcpuid);
2298                 break;
2299         case VM_EXITCODE_INOUT:
2300                 error = vm_handle_inout(vm, vcpuid, vme);
2301                 break;
2302         case VM_EXITCODE_INST_EMUL:
2303                 error = vm_handle_inst_emul(vm, vcpuid);
2304                 break;
2305         case VM_EXITCODE_MONITOR:
2306         case VM_EXITCODE_MWAIT:
2307         case VM_EXITCODE_VMINSN:
2308                 vm_inject_ud(vm, vcpuid);
2309                 break;
2310         case VM_EXITCODE_RDMSR:
2311                 error = vm_handle_rdmsr(vm, vcpuid, vme);
2312                 break;
2313         case VM_EXITCODE_WRMSR:
2314                 error = vm_handle_wrmsr(vm, vcpuid, vme);
2315                 break;
2316         case VM_EXITCODE_HT:
2317                 affinity_type = CPU_BEST;
2318                 break;
2319         case VM_EXITCODE_MTRAP:
2320                 vm_suspend_cpu(vm, vcpuid);
2321                 error = -1;
2322                 break;
2323         default:
2324                 /* handled in userland */
2325                 error = -1;
2326                 break;
2327         }
2328 
2329         if (error == 0) {
2330                 /* VM exit conditions handled in-kernel, continue running */
2331                 goto restart;
2332         }
2333 
2334 exit:
2335         removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2336             NULL, vmm_freectx);
2337 
2338         VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2339 
2340         vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2341         return (error);
2342 }
2343 
2344 int
2345 vm_restart_instruction(void *arg, int vcpuid)
2346 {
2347         struct vm *vm;
2348         struct vcpu *vcpu;
2349         enum vcpu_state state;
2350         uint64_t rip;
2351         int error;
2352 
2353         vm = arg;
2354         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2355                 return (EINVAL);
2356 
2357         vcpu = &vm->vcpu[vcpuid];
2358         state = vcpu_get_state(vm, vcpuid, NULL);
2359         if (state == VCPU_RUNNING) {
2360                 /*
2361                  * When a vcpu is "running" the next instruction is determined
2362                  * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2363                  * Thus setting 'inst_length' to zero will cause the current
2364                  * instruction to be restarted.
2365                  */
2366                 vcpu->exitinfo.inst_length = 0;
2367                 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2368                     "setting inst_length to zero", vcpu->exitinfo.rip);
2369         } else if (state == VCPU_FROZEN) {
2370                 /*
2371                  * When a vcpu is "frozen" it is outside the critical section
2372                  * around VMRUN() and 'nextrip' points to the next instruction.
2373                  * Thus instruction restart is achieved by setting 'nextrip'
2374                  * to the vcpu's %rip.
2375                  */
2376                 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2377                 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2378                 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2379                     "nextrip from %lx to %lx", vcpu->nextrip, rip);
2380                 vcpu->nextrip = rip;
2381         } else {
2382                 panic("%s: invalid state %d", __func__, state);
2383         }
2384         return (0);
2385 }
2386 
2387 int
2388 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2389 {
2390         struct vcpu *vcpu;
2391         int type, vector;
2392 
2393         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2394                 return (EINVAL);
2395 
2396         vcpu = &vm->vcpu[vcpuid];
2397 
2398         if (info & VM_INTINFO_VALID) {
2399                 type = info & VM_INTINFO_TYPE;
2400                 vector = info & 0xff;
2401                 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2402                         return (EINVAL);
2403                 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2404                         return (EINVAL);
2405                 if (info & VM_INTINFO_RSVD)
2406                         return (EINVAL);
2407         } else {
2408                 info = 0;
2409         }
2410         VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2411         vcpu->exitintinfo = info;
2412         return (0);
2413 }
2414 
2415 enum exc_class {
2416         EXC_BENIGN,
2417         EXC_CONTRIBUTORY,
2418         EXC_PAGEFAULT
2419 };
2420 
2421 #define IDT_VE  20      /* Virtualization Exception (Intel specific) */
2422 
2423 static enum exc_class
2424 exception_class(uint64_t info)
2425 {
2426         int type, vector;
2427 
2428         KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2429         type = info & VM_INTINFO_TYPE;
2430         vector = info & 0xff;
2431 
2432         /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2433         switch (type) {
2434         case VM_INTINFO_HWINTR:
2435         case VM_INTINFO_SWINTR:
2436         case VM_INTINFO_NMI:
2437                 return (EXC_BENIGN);
2438         default:
2439                 /*
2440                  * Hardware exception.
2441                  *
2442                  * SVM and VT-x use identical type values to represent NMI,
2443                  * hardware interrupt and software interrupt.
2444                  *
2445                  * SVM uses type '3' for all exceptions. VT-x uses type '3'
2446                  * for exceptions except #BP and #OF. #BP and #OF use a type
2447                  * value of '5' or '6'. Therefore we don't check for explicit
2448                  * values of 'type' to classify 'intinfo' into a hardware
2449                  * exception.
2450                  */
2451                 break;
2452         }
2453 
2454         switch (vector) {
2455         case IDT_PF:
2456         case IDT_VE:
2457                 return (EXC_PAGEFAULT);
2458         case IDT_DE:
2459         case IDT_TS:
2460         case IDT_NP:
2461         case IDT_SS:
2462         case IDT_GP:
2463                 return (EXC_CONTRIBUTORY);
2464         default:
2465                 return (EXC_BENIGN);
2466         }
2467 }
2468 
2469 static int
2470 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2471     uint64_t *retinfo)
2472 {
2473         enum exc_class exc1, exc2;
2474         int type1, vector1;
2475 
2476         KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2477         KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2478 
2479         /*
2480          * If an exception occurs while attempting to call the double-fault
2481          * handler the processor enters shutdown mode (aka triple fault).
2482          */
2483         type1 = info1 & VM_INTINFO_TYPE;
2484         vector1 = info1 & 0xff;
2485         if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2486                 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2487                     info1, info2);
2488                 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2489                 *retinfo = 0;
2490                 return (0);
2491         }
2492 
2493         /*
2494          * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2495          */
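             /*
              * For example: a #GP raised while delivering a #PF is promoted to
              * #DF, as is a contributory fault raised while delivering another
              * contributory fault (say, #GP during #NP delivery).  A benign
              * event such as a hardware interrupt followed by any exception is
              * simply delivered serially.
              */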
2496         exc1 = exception_class(info1);
2497         exc2 = exception_class(info2);
2498         if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2499             (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2500                 /* Convert nested fault into a double fault. */
2501                 *retinfo = IDT_DF;
2502                 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2503                 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2504         } else {
2505                 /* Handle exceptions serially */
2506                 *retinfo = info2;
2507         }
2508         return (1);
2509 }
2510 
2511 static uint64_t
2512 vcpu_exception_intinfo(struct vcpu *vcpu)
2513 {
2514         uint64_t info = 0;
2515 
2516         if (vcpu->exception_pending) {
2517                 info = vcpu->exc_vector & 0xff;
2518                 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2519                 if (vcpu->exc_errcode_valid) {
2520                         info |= VM_INTINFO_DEL_ERRCODE;
2521                         info |= (uint64_t)vcpu->exc_errcode << 32;
2522                 }
2523         }
2524         return (info);
2525 }
2526 
2527 int
2528 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2529 {
2530         struct vcpu *vcpu;
2531         uint64_t info1, info2;
2532         int valid;
2533 
2534         KASSERT(vcpuid >= 0 &&
2535             vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2536 
2537         vcpu = &vm->vcpu[vcpuid];
2538 
2539         info1 = vcpu->exitintinfo;
2540         vcpu->exitintinfo = 0;
2541 
2542         info2 = 0;
2543         if (vcpu->exception_pending) {
2544                 info2 = vcpu_exception_intinfo(vcpu);
2545                 vcpu->exception_pending = 0;
2546                 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2547                     vcpu->exc_vector, info2);
2548         }
2549 
2550         if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2551                 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2552         } else if (info1 & VM_INTINFO_VALID) {
2553                 *retinfo = info1;
2554                 valid = 1;
2555         } else if (info2 & VM_INTINFO_VALID) {
2556                 *retinfo = info2;
2557                 valid = 1;
2558         } else {
2559                 valid = 0;
2560         }
2561 
2562         if (valid) {
2563                 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2564                     "retinfo(%lx)", __func__, info1, info2, *retinfo);
2565         }
2566 
2567         return (valid);
2568 }
2569 
2570 int
2571 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2572 {
2573         struct vcpu *vcpu;
2574 
2575         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2576                 return (EINVAL);
2577 
2578         vcpu = &vm->vcpu[vcpuid];
2579         *info1 = vcpu->exitintinfo;
2580         *info2 = vcpu_exception_intinfo(vcpu);
2581         return (0);
2582 }
2583 
2584 int
2585 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2586     uint32_t errcode, int restart_instruction)
2587 {
2588         struct vcpu *vcpu;
2589         uint64_t regval;
2590         int error;
2591 
2592         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2593                 return (EINVAL);
2594 
2595         if (vector < 0 || vector >= 32)
2596                 return (EINVAL);
2597 
2598         /*
2599          * NMIs (which bear an exception vector of 2) are to be injected via
2600          * their own specialized path using vm_inject_nmi().
2601          */
2602         if (vector == 2) {
2603                 return (EINVAL);
2604         }
2605 
2606         /*
2607          * A double fault exception should never be injected directly into
2608          * the guest. It is a derived exception that results from specific
2609          * combinations of nested faults.
2610          */
2611         if (vector == IDT_DF)
2612                 return (EINVAL);
2613 
2614         vcpu = &vm->vcpu[vcpuid];
2615 
2616         if (vcpu->exception_pending) {
2617                 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2618                     "pending exception %d", vector, vcpu->exc_vector);
2619                 return (EBUSY);
2620         }
2621 
2622         if (errcode_valid) {
2623                 /*
2624                  * Exceptions don't deliver an error code in real mode.
2625                  */
2626                 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2627                 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2628                 if (!(regval & CR0_PE))
2629                         errcode_valid = 0;
2630         }
2631 
2632         /*
2633          * From section 26.6.1 "Interruptibility State" in Intel SDM:
2634          *
2635          * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2636          * one instruction or incurs an exception.
2637          */
2638         error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2639         KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2640             __func__, error));
2641 
2642         if (restart_instruction)
2643                 vm_restart_instruction(vm, vcpuid);
2644 
2645         vcpu->exception_pending = 1;
2646         vcpu->exc_vector = vector;
2647         vcpu->exc_errcode = errcode;
2648         vcpu->exc_errcode_valid = errcode_valid;
2649         VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2650         return (0);
2651 }
2652 
2653 void
2654 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2655     int errcode)
2656 {
2657         int error;
2658 
2659         error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2660             errcode, 1);
2661         KASSERT(error == 0, ("vm_inject_exception error %d", error));
2662 }
2663 
2664 void
2665 vm_inject_ud(struct vm *vm, int vcpuid)
2666 {
2667         vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2668 }
2669 
2670 void
2671 vm_inject_gp(struct vm *vm, int vcpuid)
2672 {
2673         vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2674 }
2675 
2676 void
2677 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2678 {
2679         vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2680 }
2681 
2682 void
2683 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2684 {
2685         vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2686 }
2687 
2688 void
2689 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2690 {
2691         int error;
2692 
2693         VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2694             error_code, cr2);
2695 
2696         error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2697         KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2698 
2699         vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2700 }
2701 
2702 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2703 
2704 int
2705 vm_inject_nmi(struct vm *vm, int vcpuid)
2706 {
2707         struct vcpu *vcpu;
2708 
2709         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2710                 return (EINVAL);
2711 
2712         vcpu = &vm->vcpu[vcpuid];
2713 
2714         vcpu->nmi_pending = 1;
2715         vcpu_notify_event(vm, vcpuid);
2716         return (0);
2717 }
2718 
2719 int
2720 vm_nmi_pending(struct vm *vm, int vcpuid)
2721 {
2722         struct vcpu *vcpu;
2723 
2724         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2725                 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2726 
2727         vcpu = &vm->vcpu[vcpuid];
2728 
2729         return (vcpu->nmi_pending);
2730 }
2731 
2732 void
2733 vm_nmi_clear(struct vm *vm, int vcpuid)
2734 {
2735         struct vcpu *vcpu;
2736 
2737         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2738                 panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
2739 
2740         vcpu = &vm->vcpu[vcpuid];
2741 
2742         if (vcpu->nmi_pending == 0)
2743                 panic("vm_nmi_clear: inconsistent nmi_pending state");
2744 
2745         vcpu->nmi_pending = 0;
2746         vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2747 }
2748 
2749 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2750 
2751 int
2752 vm_inject_extint(struct vm *vm, int vcpuid)
2753 {
2754         struct vcpu *vcpu;
2755 
2756         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2757                 return (EINVAL);
2758 
2759         vcpu = &vm->vcpu[vcpuid];
2760 
2761         vcpu->extint_pending = 1;
2762         vcpu_notify_event(vm, vcpuid);
2763         return (0);
2764 }
2765 
2766 int
2767 vm_extint_pending(struct vm *vm, int vcpuid)
2768 {
2769         struct vcpu *vcpu;
2770 
2771         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2772                 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2773 
2774         vcpu = &vm->vcpu[vcpuid];
2775 
2776         return (vcpu->extint_pending);
2777 }
2778 
2779 void
2780 vm_extint_clear(struct vm *vm, int vcpuid)
2781 {
2782         struct vcpu *vcpu;
2783 
2784         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2785                 panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
2786 
2787         vcpu = &vm->vcpu[vcpuid];
2788 
2789         if (vcpu->extint_pending == 0)
2790                 panic("vm_extint_clear: inconsistent extint_pending state");
2791 
2792         vcpu->extint_pending = 0;
2793         vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2794 }
2795 
2796 int
2797 vm_inject_init(struct vm *vm, int vcpuid)
2798 {
2799         struct vcpu *vcpu;
2800 
2801         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2802                 return (EINVAL);
2803 
2804         vcpu = &vm->vcpu[vcpuid];
2805         vcpu_lock(vcpu);
2806         vcpu->run_state |= VRS_PEND_INIT;
2807         /*
2808          * As part of queuing the INIT request, clear any pending SIPI.  It
2809          * would not otherwise survive across the reset of the vCPU when it
2810          * undergoes the requested INIT.  We would not want it to linger when
2811          * it could be mistaken for a subsequent (post-INIT) SIPI request.
2812          */
2813         vcpu->run_state &= ~VRS_PEND_SIPI;
2814         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2815 
2816         vcpu_unlock(vcpu);
2817         return (0);
2818 }
2819 
2820 int
2821 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2822 {
2823         struct vcpu *vcpu;
2824 
2825         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2826                 return (EINVAL);
2827 
2828         vcpu = &vm->vcpu[vcpuid];
2829         vcpu_lock(vcpu);
2830         vcpu->run_state |= VRS_PEND_SIPI;
2831         vcpu->sipi_vector = vector;
2832         /* SIPI is only actionable if the CPU is waiting in INIT state */
2833         if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2834                 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2835         }
2836         vcpu_unlock(vcpu);
2837         return (0);
2838 }
2839 
2840 bool
2841 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2842 {
2843         struct vcpu *vcpu;
2844 
2845         ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2846         vcpu = &vm->vcpu[vcpuid];
2847 
2848         /* Of interest: vCPU not in running state or with pending INIT */
2849         return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2850 }
2851 
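     /*
      * Reset the architectural state of a vCPU back to its power-on values.
      * When init_only is set, state which an INIT does not clear (such as the
      * FPU contents and guest %xcr0) is left untouched.
      */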
2852 int
2853 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2854 {
2855         struct seg_desc desc;
2856         const enum vm_reg_name clear_regs[] = {
2857                 VM_REG_GUEST_CR2,
2858                 VM_REG_GUEST_CR3,
2859                 VM_REG_GUEST_CR4,
2860                 VM_REG_GUEST_RAX,
2861                 VM_REG_GUEST_RBX,
2862                 VM_REG_GUEST_RCX,
2863                 VM_REG_GUEST_RSI,
2864                 VM_REG_GUEST_RDI,
2865                 VM_REG_GUEST_RBP,
2866                 VM_REG_GUEST_RSP,
2867                 VM_REG_GUEST_R8,
2868                 VM_REG_GUEST_R9,
2869                 VM_REG_GUEST_R10,
2870                 VM_REG_GUEST_R11,
2871                 VM_REG_GUEST_R12,
2872                 VM_REG_GUEST_R13,
2873                 VM_REG_GUEST_R14,
2874                 VM_REG_GUEST_R15,
2875                 VM_REG_GUEST_DR0,
2876                 VM_REG_GUEST_DR1,
2877                 VM_REG_GUEST_DR2,
2878                 VM_REG_GUEST_DR3,
2879                 VM_REG_GUEST_EFER,
2880         };
2881         const enum vm_reg_name data_segs[] = {
2882                 VM_REG_GUEST_SS,
2883                 VM_REG_GUEST_DS,
2884                 VM_REG_GUEST_ES,
2885                 VM_REG_GUEST_FS,
2886                 VM_REG_GUEST_GS,
2887         };
2888         struct vcpu *vcpu = &vm->vcpu[vcpuid];
2889 
2890         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2891                 return (EINVAL);
2892 
2893         for (uint_t i = 0; i < nitems(clear_regs); i++) {
2894                 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2895         }
2896 
2897         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2898         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2899         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2900 
2901         /*
2902          * The prescribed contents of %rdx differ slightly between the Intel
2903          * and AMD architectural definitions.  The former expects the Extended
2904          * Model in bits 16-19, whereas the latter expects all of the Family,
2905          * Model, and Stepping to be there.  Common boot ROMs appear to
2906          * disregard this anyway, so we stick with a compromise value similar
2907          * to what is spelled out in the Intel SDM.
2908          */
2909         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2910 
2911         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2912         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2913 
2914         /* CS: Present, R/W, Accessed */
2915         desc.access = 0x0093;
2916         desc.base = 0xffff0000;
2917         desc.limit = 0xffff;
2918         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2919         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
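             /*
              * Combined with the %rip value of 0xfff0 set above, the CS base of
              * 0xffff0000 places the first instruction fetch at the
              * architectural reset vector, physical address 0xfffffff0.
              */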
2920 
2921         /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2922         desc.access = 0x0093;
2923         desc.base = 0;
2924         desc.limit = 0xffff;
2925         for (uint_t i = 0; i < nitems(data_segs); i++) {
2926                 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2927                 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2928         }
2929 
2930         /* GDTR, IDTR */
2931         desc.base = 0;
2932         desc.limit = 0xffff;
2933         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2934         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2935 
2936         /* LDTR: Present, LDT */
2937         desc.access = 0x0082;
2938         desc.base = 0;
2939         desc.limit = 0xffff;
2940         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2941         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2942 
2943         /* TR: Present, 32-bit TSS */
2944         desc.access = 0x008b;
2945         desc.base = 0;
2946         desc.limit = 0xffff;
2947         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2948         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2949 
2950         vlapic_reset(vm_lapic(vm, vcpuid));
2951 
2952         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2953 
2954         vcpu->exitintinfo = 0;
2955         vcpu->exception_pending = 0;
2956         vcpu->nmi_pending = 0;
2957         vcpu->extint_pending = 0;
2958 
2959         /*
2960          * A CPU reset caused by power-on or system reset clears more state than
2961          * one which is triggered by an INIT IPI.
2962          */
2963         if (!init_only) {
2964                 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2965                 fpu_save_area_reset(vcpu->guestfpu);
2966 
2967                 /* XXX: clear MSRs and other pieces */
2968         }
2969 
2970         return (0);
2971 }
2972 
2973 static int
2974 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2975 {
2976         struct seg_desc desc;
2977 
2978         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2979                 return (EINVAL);
2980 
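             /*
              * A SIPI vector of N directs the vCPU to begin execution at
              * real-mode physical address N << 12: %cs is loaded with selector
              * N << 8 (base N << 12) and %rip is cleared.
              */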
2981         /* CS: Present, R/W, Accessed */
2982         desc.access = 0x0093;
2983         desc.base = (uint64_t)vector << 12;
2984         desc.limit = 0xffff;
2985         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2986         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2987             (uint64_t)vector << 8));
2988 
2989         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2990 
2991         return (0);
2992 }
2993 
2994 int
2995 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2996 {
2997         if (vcpu < 0 || vcpu >= vm->maxcpus)
2998                 return (EINVAL);
2999 
3000         if (type < 0 || type >= VM_CAP_MAX)
3001                 return (EINVAL);
3002 
3003         return (VMGETCAP(vm->cookie, vcpu, type, retval));
3004 }
3005 
3006 int
3007 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3008 {
3009         if (vcpu < 0 || vcpu >= vm->maxcpus)
3010                 return (EINVAL);
3011 
3012         if (type < 0 || type >= VM_CAP_MAX)
3013                 return (EINVAL);
3014 
3015         return (VMSETCAP(vm->cookie, vcpu, type, val));
3016 }
3017 
3018 struct vlapic *
3019 vm_lapic(struct vm *vm, int cpu)
3020 {
3021         return (vm->vcpu[cpu].vlapic);
3022 }
3023 
3024 struct vioapic *
3025 vm_ioapic(struct vm *vm)
3026 {
3027 
3028         return (vm->vioapic);
3029 }
3030 
3031 struct vhpet *
3032 vm_hpet(struct vm *vm)
3033 {
3034 
3035         return (vm->vhpet);
3036 }
3037 
3038 void *
3039 vm_iommu_domain(struct vm *vm)
3040 {
3041 
3042         return (vm->iommu);
3043 }
3044 
3045 int
3046 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3047     bool from_idle)
3048 {
3049         int error;
3050         struct vcpu *vcpu;
3051 
3052         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3053                 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3054 
3055         vcpu = &vm->vcpu[vcpuid];
3056 
3057         vcpu_lock(vcpu);
3058         error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3059         vcpu_unlock(vcpu);
3060 
3061         return (error);
3062 }
3063 
3064 enum vcpu_state
3065 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3066 {
3067         struct vcpu *vcpu;
3068         enum vcpu_state state;
3069 
3070         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3071                 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3072 
3073         vcpu = &vm->vcpu[vcpuid];
3074 
3075         vcpu_lock(vcpu);
3076         state = vcpu->state;
3077         if (hostcpu != NULL)
3078                 *hostcpu = vcpu->hostcpu;
3079         vcpu_unlock(vcpu);
3080 
3081         return (state);
3082 }
3083 
3084 uint64_t
3085 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3086 {
3087         ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3088 
3089         uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3090 
3091         if (phys_adj) {
3092                 /* Include any offset for the current physical CPU too */
3093                 extern hrtime_t tsc_gethrtime_tick_delta(void);
3094                 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3095         }
3096 
3097         return (vcpu_off);
3098 }
3099 
3100 int
3101 vm_activate_cpu(struct vm *vm, int vcpuid)
3102 {
3103 
3104         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3105                 return (EINVAL);
3106 
3107         if (CPU_ISSET(vcpuid, &vm->active_cpus))
3108                 return (EBUSY);
3109 
3110         VCPU_CTR0(vm, vcpuid, "activated");
3111         CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3112         return (0);
3113 }
3114 
3115 int
3116 vm_suspend_cpu(struct vm *vm, int vcpuid)
3117 {
3118         int i;
3119 
3120         if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3121                 return (EINVAL);
3122 
3123         if (vcpuid == -1) {
3124                 vm->debug_cpus = vm->active_cpus;
3125                 for (i = 0; i < vm->maxcpus; i++) {
3126                         if (CPU_ISSET(i, &vm->active_cpus))
3127                                 vcpu_notify_event(vm, i);
3128                 }
3129         } else {
3130                 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3131                         return (EINVAL);
3132 
3133                 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3134                 vcpu_notify_event(vm, vcpuid);
3135         }
3136         return (0);
3137 }
3138 
3139 int
3140 vm_resume_cpu(struct vm *vm, int vcpuid)
3141 {
3142 
3143         if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3144                 return (EINVAL);
3145 
3146         if (vcpuid == -1) {
3147                 CPU_ZERO(&vm->debug_cpus);
3148         } else {
3149                 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3150                         return (EINVAL);
3151 
3152                 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3153         }
3154         return (0);
3155 }
3156 
3157 static bool
3158 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3159     uint64_t entry_rip)
3160 {
3161         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3162         struct vm_exit *vme = &vcpu->exitinfo;
3163         bool bail = false;
3164 
3165         ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3166 
3167         if (vm->suspend) {
3168                 if (on_entry) {
3169                         VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3170                             vm->suspend < VM_SUSPEND_LAST);
3171 
3172                         vme->exitcode = VM_EXITCODE_SUSPENDED;
3173                         vme->u.suspended.how = vm->suspend;
3174                 } else {
3175                         /*
3176                          * Handling VM suspend is complicated, so if that
3177                          * condition is detected outside of VM-entry itself,
3178                          * just emit a BOGUS exitcode so we take a lap to pick
3179                          * up the event during an entry and are directed into
3180                          * the vm_handle_suspend() logic.
3181                          */
3182                         vme->exitcode = VM_EXITCODE_BOGUS;
3183                 }
3184                 bail = true;
3185         }
3186         if (vcpu->reqidle) {
3187                 vme->exitcode = VM_EXITCODE_REQIDLE;
3188                 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3189 
3190                 if (!on_entry) {
3191                         /*
3192                          * A reqidle request detected outside of VM-entry can be
3193                          * handled directly by clearing the request (and taking
3194                          * a lap to userspace).
3195                          */
3196                         vcpu_assert_locked(vcpu);
3197                         vcpu->reqidle = 0;
3198                 }
3199                 bail = true;
3200         }
3201         if (vcpu_should_yield(vm, vcpuid)) {
3202                 vme->exitcode = VM_EXITCODE_BOGUS;
3203                 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3204                 bail = true;
3205         }
3206         if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3207                 vme->exitcode = VM_EXITCODE_DEBUG;
3208                 bail = true;
3209         }
3210 
3211         if (bail) {
3212                 if (on_entry) {
3213                         /*
3214                          * If bailing out during VM-entry, the current %rip must
3215                          * be recorded in the exitinfo.
3216                          */
3217                         vme->rip = entry_rip;
3218                 }
3219                 vme->inst_length = 0;
3220         }
3221         return (bail);
3222 }
3223 
3224 static bool
3225 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3226 {
3227         /*
3228          * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3229          * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3230          * structure, and that only the exitcode needs to be modified.
3231          */
3232         return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3233 }
3234 
3235 bool
3236 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3237 {
3238         /*
3239          * Bail-out checks done as part of VM entry require an updated %rip to
3240          * populate the vm_exit struct if any of the conditions of interest are
3241          * matched in the check.
3242          */
3243         return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3244 }
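
/*
 * Illustrative sketch (an assumption): a CPU backend run loop is expected to
 * consult the entry-time check immediately before entering the guest:
 *
 *	if (vcpu_entry_bailout_checks(vm, vcpuid, rip))
 *		return (0);
 *
 * where a 'true' result means exitinfo has been populated and the thread
 * should take a lap out to userspace rather than entering the guest.  The
 * sleep-time variant above is used from paths such as HLT emulation, where
 * %rip in the vm_exit structure is already current.
 */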
3245 
3246 cpuset_t
3247 vm_active_cpus(struct vm *vm)
3248 {
3249 
3250         return (vm->active_cpus);
3251 }
3252 
3253 cpuset_t
3254 vm_debug_cpus(struct vm *vm)
3255 {
3256 
3257         return (vm->debug_cpus);
3258 }
3259 
3260 cpuset_t
3261 vm_suspended_cpus(struct vm *vm)
3262 {
3263 
3264         return (vm->suspended_cpus);
3265 }
3266 
3267 void *
3268 vcpu_stats(struct vm *vm, int vcpuid)
3269 {
3270 
3271         return (vm->vcpu[vcpuid].stats);
3272 }
3273 
3274 int
3275 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3276 {
3277         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3278                 return (EINVAL);
3279 
3280         *state = vm->vcpu[vcpuid].x2apic_state;
3281 
3282         return (0);
3283 }
3284 
3285 int
3286 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3287 {
3288         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3289                 return (EINVAL);
3290 
3291         if (state >= X2APIC_STATE_LAST)
3292                 return (EINVAL);
3293 
3294         vm->vcpu[vcpuid].x2apic_state = state;
3295 
3296         vlapic_set_x2apic_state(vm, vcpuid, state);
3297 
3298         return (0);
3299 }
3300 
3301 /*
3302  * This function is called to ensure that a vcpu "sees" a pending event
3303  * as soon as possible:
3304  * - If the vcpu thread is sleeping then it is woken up.
3305  * - If the vcpu is running on a different host_cpu then an IPI will be directed
3306  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
3307  */
3308 static void
3309 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3310 {
3311         int hostcpu;
3312 
3313         ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3314 
3315         hostcpu = vcpu->hostcpu;
3316         if (vcpu->state == VCPU_RUNNING) {
3317                 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3318                 if (hostcpu != curcpu) {
3319                         if (ntype == VCPU_NOTIFY_APIC) {
3320                                 vlapic_post_intr(vcpu->vlapic, hostcpu,
3321                                     vmm_ipinum);
3322                         } else {
3323                                 ipi_cpu(hostcpu, vmm_ipinum);
3324                         }
3325                 } else {
3326                         /*
3327                          * If the 'vcpu' is running on 'curcpu' then it must
3328                          * be sending a notification to itself (e.g. SELF_IPI).
3329                          * The pending event will be picked up when the vcpu
3330                          * transitions back to guest context.
3331                          */
3332                 }
3333         } else {
3334                 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3335                     "with hostcpu %d", vcpu->state, hostcpu));
3336                 if (vcpu->state == VCPU_SLEEPING) {
3337                         cv_signal(&vcpu->vcpu_cv);
3338                 }
3339         }
3340 }
3341 
3342 void
3343 vcpu_notify_event(struct vm *vm, int vcpuid)
3344 {
3345         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3346 
3347         vcpu_lock(vcpu);
3348         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3349         vcpu_unlock(vcpu);
3350 }
3351 
3352 void
3353 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3354 {
3355         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3356 
3357         if (ntype == VCPU_NOTIFY_NONE) {
3358                 return;
3359         }
3360 
3361         vcpu_lock(vcpu);
3362         vcpu_notify_event_locked(vcpu, ntype);
3363         vcpu_unlock(vcpu);
3364 }
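
/*
 * Illustrative sketch (an assumption): after marking an interrupt pending in
 * a vCPU's vLAPIC, a caller would nudge that vCPU so the event is noticed
 * promptly:
 *
 *	vcpu_notify_event_type(vm, vcpuid, VCPU_NOTIFY_APIC);
 *
 * An APIC-type notification lets the vLAPIC use the most efficient delivery
 * available (such as a posted interrupt), while VCPU_NOTIFY_EXIT forces the
 * target vCPU out of guest context via an IPI.
 */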
3365 
3366 void
3367 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3368 {
3369         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3370         hrtime_t now = gethrtime();
3371 
3372         ASSERT3U(ustate, !=, vcpu->ustate);
3373         ASSERT3S(ustate, <, VU_MAX);
3374         ASSERT3S(ustate, >=, VU_INIT);
3375 
3376         hrtime_t delta = now - vcpu->ustate_when;
3377         vcpu->ustate_total[vcpu->ustate] += delta;
3378 
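        /*
         * Order the store of the accumulated time for the outgoing state
         * ahead of the stores which publish the new state and timestamp, so
         * that lock-free readers (such as the vCPU kstat updater) observe
         * the fields in a consistent order.
         */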
3379         membar_producer();
3380 
3381         vcpu->ustate_when = now;
3382         vcpu->ustate = ustate;
3383 }
3384 
3385 struct vmspace *
3386 vm_get_vmspace(struct vm *vm)
3387 {
3388 
3389         return (vm->vmspace);
3390 }
3391 
3392 int
3393 vm_apicid2vcpuid(struct vm *vm, int apicid)
3394 {
3395         /*
3396          * XXX apic id is assumed to be numerically identical to vcpu id
3397          */
3398         return (apicid);
3399 }
3400 
3401 struct vatpic *
3402 vm_atpic(struct vm *vm)
3403 {
3404         return (vm->vatpic);
3405 }
3406 
3407 struct vatpit *
3408 vm_atpit(struct vm *vm)
3409 {
3410         return (vm->vatpit);
3411 }
3412 
3413 struct vpmtmr *
3414 vm_pmtmr(struct vm *vm)
3415 {
3416 
3417         return (vm->vpmtmr);
3418 }
3419 
3420 struct vrtc *
3421 vm_rtc(struct vm *vm)
3422 {
3423 
3424         return (vm->vrtc);
3425 }
3426 
3427 enum vm_reg_name
3428 vm_segment_name(int seg)
3429 {
3430         static enum vm_reg_name seg_names[] = {
3431                 VM_REG_GUEST_ES,
3432                 VM_REG_GUEST_CS,
3433                 VM_REG_GUEST_SS,
3434                 VM_REG_GUEST_DS,
3435                 VM_REG_GUEST_FS,
3436                 VM_REG_GUEST_GS
3437         };
3438 
3439         KASSERT(seg >= 0 && seg < nitems(seg_names),
3440             ("%s: invalid segment encoding %d", __func__, seg));
3441         return (seg_names[seg]);
3442 }
3443 
3444 void
3445 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3446     int num_copyinfo)
3447 {
3448         int idx;
3449 
3450         for (idx = 0; idx < num_copyinfo; idx++) {
3451                 if (copyinfo[idx].cookie != NULL)
3452                         vm_gpa_release(copyinfo[idx].cookie);
3453         }
3454         bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3455 }
3456 
3457 int
3458 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3459     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3460     int num_copyinfo, int *fault)
3461 {
3462         int error, idx, nused;
3463         size_t n, off, remaining;
3464         void *hva, *cookie;
3465         uint64_t gpa;
3466 
3467         bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3468 
3469         nused = 0;
3470         remaining = len;
3471         while (remaining > 0) {
3472                 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3473                 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3474                 if (error || *fault)
3475                         return (error);
3476                 off = gpa & PAGE_MASK;
3477                 n = min(remaining, PAGE_SIZE - off);
3478                 copyinfo[nused].gpa = gpa;
3479                 copyinfo[nused].len = n;
3480                 remaining -= n;
3481                 gla += n;
3482                 nused++;
3483         }
3484 
3485         for (idx = 0; idx < nused; idx++) {
3486                 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3487                     copyinfo[idx].len, prot, &cookie);
3488                 if (hva == NULL)
3489                         break;
3490                 copyinfo[idx].hva = hva;
3491                 copyinfo[idx].cookie = cookie;
3492         }
3493 
3494         if (idx != nused) {
3495                 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3496                 return (EFAULT);
3497         } else {
3498                 *fault = 0;
3499                 return (0);
3500         }
3501 }
3502 
3503 void
3504 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3505     size_t len)
3506 {
3507         char *dst;
3508         int idx;
3509 
3510         dst = kaddr;
3511         idx = 0;
3512         while (len > 0) {
3513                 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3514                 len -= copyinfo[idx].len;
3515                 dst += copyinfo[idx].len;
3516                 idx++;
3517         }
3518 }
3519 
3520 void
3521 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3522     struct vm_copyinfo *copyinfo, size_t len)
3523 {
3524         const char *src;
3525         int idx;
3526 
3527         src = kaddr;
3528         idx = 0;
3529         while (len > 0) {
3530                 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3531                 len -= copyinfo[idx].len;
3532                 src += copyinfo[idx].len;
3533                 idx++;
3534         }
3535 }
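
/*
 * Illustrative sketch (an assumption): instruction emulation callers are
 * expected to use these routines as a set, mapping a guest-linear region
 * once and then copying through the held pages:
 *
 *	struct vm_copyinfo copyinfo[2];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
 *	    copyinfo, nitems(copyinfo), &fault);
 *	if (error == 0 && fault == 0) {
 *		vm_copyin(vm, vcpuid, copyinfo, buf, len);
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 *
 * Here 'paging', 'gla', 'len' and 'buf' stand in for caller-supplied values,
 * and two vm_copyinfo entries are enough only while the copy cannot span
 * more than two pages.
 */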
3536 
3537 /*
3538  * Return the amount of in-use and wired memory for the VM. Since
3539  * these are global stats, only return the values for vCPU 0.
3540  */
3541 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3542 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3543 
3544 static void
3545 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3546 {
3547 
3548         if (vcpu == 0) {
3549                 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3550                     PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3551         }
3552 }
3553 
3554 static void
3555 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3556 {
3557 
3558         if (vcpu == 0) {
3559                 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3560                     PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3561         }
3562 }
3563 
3564 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3565 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
3566 
3567 int
3568 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3569     uint8_t bytes, uint32_t *val)
3570 {
3571         return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3572 }
3573 
3574 /*
3575  * bhyve-internal interfaces to attach or detach IO port handlers.
3576  * Must be called with VM write lock held for safety.
3577  */
3578 int
3579 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3580     void **cookie)
3581 {
3582         int err;
3583         err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3584         if (err == 0) {
3585                 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3586         }
3587         return (err);
3588 }
3589 int
3590 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3591     void **old_arg)
3592 {
3593         uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3594         int err;
3595 
3596         err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3597         if (err == 0) {
3598                 *cookie = NULL;
3599         }
3600         return (err);
3601 }
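
/*
 * Illustrative sketch (an assumption): in-kernel device emulation would
 * attach a handler while holding the VM write lock and retain the opaque
 * cookie for a later detach:
 *
 *	void *cookie;
 *
 *	if (vm_ioport_attach(vm, port, handler, arg, &cookie) == 0) {
 *		...
 *		(void) vm_ioport_detach(vm, &cookie, &old_func, &old_arg);
 *	}
 *
 * Here 'port', 'handler', 'arg', 'old_func' and 'old_arg' are placeholders
 * for caller-provided values.
 */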
3602 
3603 /*
3604  * External driver interfaces to attach or detach IO port handlers.
3605  * Must be called with VM write lock held for safety.
3606  */
3607 int
3608 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3609     void *arg, void **cookie)
3610 {
3611         int err;
3612 
3613         if (port == 0) {
3614                 return (EINVAL);
3615         }
3616 
3617         err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3618         if (err == 0) {
3619                 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3620         }
3621         return (err);
3622 }
3623 void
3624 vm_ioport_unhook(struct vm *vm, void **cookie)
3625 {
3626         uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3627         ioport_handler_t old_func;
3628         void *old_arg;
3629         int err;
3630 
3631         err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3632 
3633         /* ioport-hook-using drivers are expected to be well-behaved */
3634         VERIFY0(err);
3635         VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3636 
3637         *cookie = NULL;
3638 }
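
/*
 * Illustrative sketch (an assumption): an external driver hooking a port is
 * expected to treat the returned cookie as opaque and to unhook that same
 * cookie when finished (hooking port 0 is rejected with EINVAL):
 *
 *	void *cookie;
 *
 *	if (vm_ioport_hook(vm, port, my_handler, my_arg, &cookie) == 0) {
 *		...
 *		vm_ioport_unhook(vm, &cookie);
 *	}
 *
 * Here 'port', 'my_handler' and 'my_arg' are hypothetical driver-supplied
 * values; vm_ioport_unhook() VERIFYs that the cookie matches the original
 * hook before clearing it.
 */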
3639 
3640 int
3641 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3642 {
3643         struct vm *vm = ksp->ks_private;
3644         vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3645         const int vcpuid = vvk->vvk_vcpu.value.ui32;
3646         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3647 
3648         ASSERT3U(vcpuid, <, VM_MAXCPU);
3649 
3650         vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3651         vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3652         vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3653         vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3654         vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3655         vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3656 
3657         return (0);
3658 }