1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2011 NetApp, Inc.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  *
  28  * $FreeBSD$
  29  */
  30 /*
  31  * This file and its contents are supplied under the terms of the
  32  * Common Development and Distribution License ("CDDL"), version 1.0.
  33  * You may only use this file in accordance with the terms of version
  34  * 1.0 of the CDDL.
  35  *
  36  * A full copy of the text of the CDDL should have accompanied this
  37  * source.  A copy of the CDDL is also available via the Internet at
  38  * http://www.illumos.org/license/CDDL.
  39  *
  40  * Copyright 2015 Pluribus Networks Inc.
  41  * Copyright 2021 Joyent, Inc.
  42  * Copyright 2021 Oxide Computer Company
  43  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
  44  */
  45 
  46 #include <sys/cdefs.h>
  47 __FBSDID("$FreeBSD$");
  48 
  49 #include <sys/param.h>
  50 #include <sys/systm.h>
  51 #include <sys/kernel.h>
  52 #include <sys/module.h>
  53 #include <sys/sysctl.h>
  54 #include <sys/malloc.h>
  55 #include <sys/pcpu.h>
  56 #include <sys/lock.h>
  57 #include <sys/mutex.h>
  58 #include <sys/proc.h>
  59 #include <sys/rwlock.h>
  60 #include <sys/sched.h>
  61 #include <sys/smp.h>
  63 
  64 #include <machine/pcb.h>
  65 #include <machine/smp.h>
  66 #include <machine/md_var.h>
  67 #include <x86/psl.h>
  68 #include <x86/apicreg.h>
  69 
  70 #include <machine/specialreg.h>
  71 #include <machine/vmm.h>
  72 #include <machine/vmm_dev.h>
  73 #include <machine/vmparam.h>
  74 #include <sys/vmm_instruction_emul.h>
  75 #include <sys/vmm_vm.h>
  76 
  77 #include "vmm_ioport.h"
  78 #include "vmm_ktr.h"
  79 #include "vmm_host.h"
  80 #include "vmm_mem.h"
  81 #include "vmm_util.h"
  82 #include "vatpic.h"
  83 #include "vatpit.h"
  84 #include "vhpet.h"
  85 #include "vioapic.h"
  86 #include "vlapic.h"
  87 #include "vpmtmr.h"
  88 #include "vrtc.h"
  89 #include "vmm_stat.h"
  90 #include "vmm_lapic.h"
  91 
  92 #include "io/ppt.h"
  93 #include "io/iommu.h"
  94 
  95 struct vlapic;
  96 
  97 /*
  98  * Initialization:
  99  * (a) allocated when vcpu is created
 100  * (i) initialized when vcpu is created and when it is reinitialized
 101  * (o) initialized the first time the vcpu is created
 102  * (x) initialized before use
 103  */
 104 struct vcpu {
 105         /* (o) protects state, run_state, hostcpu, sipi_vector */
 106         struct mtx      mtx;
 107 
 108         enum vcpu_state state;          /* (o) vcpu state */
 109         enum vcpu_run_state run_state;  /* (i) vcpu init/sipi/run state */
 110         kcondvar_t      vcpu_cv;        /* (o) cpu waiter cv */
 111         kcondvar_t      state_cv;       /* (o) IDLE-transition cv */
 112         int             hostcpu;        /* (o) vcpu's current host cpu */
 113         int             lastloccpu;     /* (o) last host cpu localized to */
 114         int             reqidle;        /* (i) request vcpu to idle */
 115         struct vlapic   *vlapic;        /* (i) APIC device model */
 116         enum x2apic_state x2apic_state; /* (i) APIC mode */
 117         uint64_t        exitintinfo;    /* (i) events pending at VM exit */
 118         int             nmi_pending;    /* (i) NMI pending */
 119         int             extint_pending; /* (i) INTR pending */
 120         int     exception_pending;      /* (i) exception pending */
 121         int     exc_vector;             /* (x) exception collateral */
 122         int     exc_errcode_valid;
 123         uint32_t exc_errcode;
 124         uint8_t         sipi_vector;    /* (i) SIPI vector */
 125         struct savefpu  *guestfpu;      /* (a,i) guest fpu state */
 126         uint64_t        guest_xcr0;     /* (i) guest %xcr0 register */
 127         void            *stats;         /* (a,i) statistics */
 128         struct vm_exit  exitinfo;       /* (x) exit reason and collateral */
 129         uint64_t        nextrip;        /* (x) next instruction to execute */
 130         struct vie      *vie_ctx;       /* (x) instruction emulation context */
 131         uint64_t        tsc_offset;     /* (x) offset from host TSC */
 132 
 133         enum vcpu_ustate ustate;        /* (i) microstate for the vcpu */
 134         hrtime_t        ustate_when;    /* (i) time of last ustate change */
 135         uint64_t ustate_total[VU_MAX];  /* (o) total time spent in ustates */
 136 };
 137 
 138 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 139 #define vcpu_lock_init(v)       mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 140 #define vcpu_lock(v)            mtx_lock_spin(&((v)->mtx))
 141 #define vcpu_unlock(v)          mtx_unlock_spin(&((v)->mtx))
 142 #define vcpu_assert_locked(v)   mtx_assert(&((v)->mtx), MA_OWNED)
 143 
 144 struct mem_seg {
 145         size_t  len;
 146         bool    sysmem;
 147         struct vm_object *object;
 148 };
 149 #define VM_MAX_MEMSEGS  4
 150 
 151 struct mem_map {
 152         vm_paddr_t      gpa;
 153         size_t          len;
 154         vm_ooffset_t    segoff;
 155         int             segid;
 156         int             prot;
 157         int             flags;
 158 };
 159 #define VM_MAX_MEMMAPS  8
 160 
 161 /*
 162  * Initialization:
 163  * (o) initialized the first time the VM is created
 164  * (i) initialized when VM is created and when it is reinitialized
 165  * (x) initialized before use
 166  */
 167 struct vm {
 168         void            *cookie;                /* (i) cpu-specific data */
 169         void            *iommu;                 /* (x) iommu-specific data */
 170         struct vhpet    *vhpet;                 /* (i) virtual HPET */
 171         struct vioapic  *vioapic;               /* (i) virtual ioapic */
 172         struct vatpic   *vatpic;                /* (i) virtual atpic */
 173         struct vatpit   *vatpit;                /* (i) virtual atpit */
 174         struct vpmtmr   *vpmtmr;                /* (i) virtual ACPI PM timer */
 175         struct vrtc     *vrtc;                  /* (o) virtual RTC */
 176         volatile cpuset_t active_cpus;          /* (i) active vcpus */
 177         volatile cpuset_t debug_cpus;           /* (i) vcpus stopped for dbg */
 178         int             suspend;                /* (i) stop VM execution */
 179         volatile cpuset_t suspended_cpus;       /* (i) suspended vcpus */
 180         volatile cpuset_t halted_cpus;          /* (x) cpus in a hard halt */
 181         struct mem_map  mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
 182         struct mem_seg  mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
 183         struct vmspace  *vmspace;               /* (o) guest's address space */
 184         char            name[VM_MAX_NAMELEN];   /* (o) virtual machine name */
 185         struct vcpu     vcpu[VM_MAXCPU];        /* (i) guest vcpus */
 186         /* The following describe the vm cpu topology */
 187         uint16_t        sockets;                /* (o) num of sockets */
 188         uint16_t        cores;                  /* (o) num of cores/socket */
 189         uint16_t        threads;                /* (o) num of threads/core */
 190         uint16_t        maxcpus;                /* (o) max pluggable cpus */
 191         uint64_t        boot_tsc_offset;        /* (i) TSC offset at VM boot */
  192         size_t          arc_resv;               /* # of pages taken from ARC */
 193 
 194         struct ioport_config ioports;           /* (o) ioport handling */
 195 };
 196 
 197 static int vmm_initialized;
 198 
 199 
 200 static void
 201 nullop_panic(void)
 202 {
 203         panic("null vmm operation call");
 204 }
 205 
 206 /* Do not allow use of an un-set `ops` to do anything but panic */
 207 static struct vmm_ops vmm_ops_null = {
 208         .init           = (vmm_init_func_t)nullop_panic,
 209         .cleanup        = (vmm_cleanup_func_t)nullop_panic,
 210         .resume         = (vmm_resume_func_t)nullop_panic,
 211         .vminit         = (vmi_init_func_t)nullop_panic,
 212         .vmrun          = (vmi_run_func_t)nullop_panic,
 213         .vmcleanup      = (vmi_cleanup_func_t)nullop_panic,
 214         .vmgetreg       = (vmi_get_register_t)nullop_panic,
 215         .vmsetreg       = (vmi_set_register_t)nullop_panic,
 216         .vmgetdesc      = (vmi_get_desc_t)nullop_panic,
 217         .vmsetdesc      = (vmi_set_desc_t)nullop_panic,
 218         .vmgetcap       = (vmi_get_cap_t)nullop_panic,
 219         .vmsetcap       = (vmi_set_cap_t)nullop_panic,
 220         .vmspace_alloc  = (vmi_vmspace_alloc)nullop_panic,
 221         .vmspace_free   = (vmi_vmspace_free)nullop_panic,
 222         .vlapic_init    = (vmi_vlapic_init)nullop_panic,
 223         .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
 224         .vmsavectx      = (vmi_savectx)nullop_panic,
 225         .vmrestorectx   = (vmi_restorectx)nullop_panic,
 226 };
 227 
 228 static struct vmm_ops *ops = &vmm_ops_null;
 229 
 230 #define VMM_INIT(num)                   ((*ops->init)(num))
 231 #define VMM_CLEANUP()                   ((*ops->cleanup)())
 232 #define VMM_RESUME()                    ((*ops->resume)())
 233 
 234 #define VMINIT(vm, pmap)                ((*ops->vminit)(vm, pmap))
 235 #define VMRUN(vmi, vcpu, rip, pmap) \
 236         ((*ops->vmrun)(vmi, vcpu, rip, pmap))
 237 #define VMCLEANUP(vmi)                  ((*ops->vmcleanup)(vmi))
 238 #define VMSPACE_ALLOC(min, max)         ((*ops->vmspace_alloc)(min, max))
 239 #define VMSPACE_FREE(vmspace)           ((*ops->vmspace_free)(vmspace))
 240 
 241 #define VMGETREG(vmi, vcpu, num, rv)    ((*ops->vmgetreg)(vmi, vcpu, num, rv))
 242 #define VMSETREG(vmi, vcpu, num, val)   ((*ops->vmsetreg)(vmi, vcpu, num, val))
 243 #define VMGETDESC(vmi, vcpu, num, dsc)  ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
 244 #define VMSETDESC(vmi, vcpu, num, dsc)  ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
 245 #define VMGETCAP(vmi, vcpu, num, rv)    ((*ops->vmgetcap)(vmi, vcpu, num, rv))
 246 #define VMSETCAP(vmi, vcpu, num, val)   ((*ops->vmsetcap)(vmi, vcpu, num, val))
 247 #define VLAPIC_INIT(vmi, vcpu)          ((*ops->vlapic_init)(vmi, vcpu))
 248 #define VLAPIC_CLEANUP(vmi, vlapic)     ((*ops->vlapic_cleanup)(vmi, vlapic))
 249 
 250 #define fpu_start_emulating()   load_cr0(rcr0() | CR0_TS)
 251 #define fpu_stop_emulating()    clts()
 252 
 253 SDT_PROVIDER_DEFINE(vmm);
 254 
 255 static MALLOC_DEFINE(M_VM, "vm", "vm");
 256 
 257 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 258     NULL);
 259 
 260 /*
 261  * Halt the guest if all vcpus are executing a HLT instruction with
 262  * interrupts disabled.
 263  */
 264 static int halt_detection_enabled = 1;
 265 
 266 /* IPI vector used for vcpu notifications */
 267 static int vmm_ipinum;
 268 
 269 /* Trap into hypervisor on all guest exceptions and reflect them back */
 270 static int trace_guest_exceptions;
 271 
 272 static void vm_free_memmap(struct vm *vm, int ident);
 273 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 274 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
 275 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
 276 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
 277 
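/* Reservation and release of guest memory pages against the ZFS ARC (illumos-specific). */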
 278 extern int arc_virt_machine_reserve(size_t);
 279 extern void arc_virt_machine_release(size_t);
 280 
 281 /* Flags for vtc_status */
 282 #define VTCS_FPU_RESTORED       1 /* guest FPU restored, host FPU saved */
 283 #define VTCS_FPU_CTX_CRITICAL   2 /* in ctx where FPU restore cannot be lazy */
 284 
 285 typedef struct vm_thread_ctx {
 286         struct vm       *vtc_vm;
 287         int             vtc_vcpuid;
 288         uint_t          vtc_status;
 289         enum vcpu_ustate vtc_ustate;
 290 } vm_thread_ctx_t;
 291 
 292 #ifdef KTR
 293 static const char *
 294 vcpu_state2str(enum vcpu_state state)
 295 {
 296 
 297         switch (state) {
 298         case VCPU_IDLE:
 299                 return ("idle");
 300         case VCPU_FROZEN:
 301                 return ("frozen");
 302         case VCPU_RUNNING:
 303                 return ("running");
 304         case VCPU_SLEEPING:
 305                 return ("sleeping");
 306         default:
 307                 return ("unknown");
 308         }
 309 }
 310 #endif
 311 
 312 static void
 313 vcpu_cleanup(struct vm *vm, int i, bool destroy)
 314 {
 315         struct vcpu *vcpu = &vm->vcpu[i];
 316 
 317         VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
 318         if (destroy) {
 319                 vmm_stat_free(vcpu->stats);
 320                 fpu_save_area_free(vcpu->guestfpu);
 321                 vie_free(vcpu->vie_ctx);
 322                 vcpu->vie_ctx = NULL;
 323         }
 324 }
 325 
 326 static void
 327 vcpu_init(struct vm *vm, int vcpu_id, bool create)
 328 {
 329         struct vcpu *vcpu;
 330 
 331         KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
 332             ("vcpu_init: invalid vcpu %d", vcpu_id));
 333 
 334         vcpu = &vm->vcpu[vcpu_id];
 335 
 336         if (create) {
 337                 vcpu_lock_init(vcpu);
 338                 vcpu->state = VCPU_IDLE;
 339                 vcpu->hostcpu = NOCPU;
 340                 vcpu->lastloccpu = NOCPU;
 341                 vcpu->guestfpu = fpu_save_area_alloc();
 342                 vcpu->stats = vmm_stat_alloc();
 343                 vcpu->vie_ctx = vie_alloc();
 344 
 345                 vcpu->ustate = VU_INIT;
 346                 vcpu->ustate_when = gethrtime();
 347         } else {
 348                 vie_reset(vcpu->vie_ctx);
 349                 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
 350                 if (vcpu->ustate != VU_INIT) {
 351                         vcpu_ustate_change(vm, vcpu_id, VU_INIT);
 352                 }
 353         }
 354 
 355         vcpu->run_state = VRS_HALT;
 356         vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 357         vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
 358         vcpu->reqidle = 0;
 359         vcpu->exitintinfo = 0;
 360         vcpu->nmi_pending = 0;
 361         vcpu->extint_pending = 0;
 362         vcpu->exception_pending = 0;
 363         vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
 364         fpu_save_area_reset(vcpu->guestfpu);
 365         vmm_stat_init(vcpu->stats);
 366         vcpu->tsc_offset = 0;
 367 }
 368 
 369 int
 370 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
 371 {
 372 
 373         return (trace_guest_exceptions);
 374 }
 375 
 376 struct vm_exit *
 377 vm_exitinfo(struct vm *vm, int cpuid)
 378 {
 379         struct vcpu *vcpu;
 380 
 381         if (cpuid < 0 || cpuid >= vm->maxcpus)
 382                 panic("vm_exitinfo: invalid cpuid %d", cpuid);
 383 
 384         vcpu = &vm->vcpu[cpuid];
 385 
 386         return (&vcpu->exitinfo);
 387 }
 388 
 389 struct vie *
 390 vm_vie_ctx(struct vm *vm, int cpuid)
 391 {
 392         if (cpuid < 0 || cpuid >= vm->maxcpus)
 393                 panic("vm_vie_ctx: invalid cpuid %d", cpuid);
 394 
 395         return (vm->vcpu[cpuid].vie_ctx);
 396 }
 397 
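/*
 * One-time module-wide initialization: capture host state, set up the vmm
 * memory backend, and select the Intel (VMX) or AMD (SVM) ops vector based
 * on the host CPU before invoking its init routine.
 */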
 398 static int
 399 vmm_init(void)
 400 {
 401         int error;
 402 
 403         vmm_host_state_init();
 404 
 405         /* We use cpu_poke() for IPIs */
 406         vmm_ipinum = 0;
 407 
 408         error = vmm_mem_init();
 409         if (error)
 410                 return (error);
 411 
 412         if (vmm_is_intel())
 413                 ops = &vmm_ops_intel;
 414         else if (vmm_is_svm())
 415                 ops = &vmm_ops_amd;
 416         else
 417                 return (ENXIO);
 418 
 419         return (VMM_INIT(vmm_ipinum));
 420 }
 421 
 422 int
  423 vmm_mod_load(void)
 424 {
 425         int     error;
 426 
 427         VERIFY(vmm_initialized == 0);
 428 
 429         error = vmm_init();
 430         if (error == 0)
 431                 vmm_initialized = 1;
 432 
 433         return (error);
 434 }
 435 
 436 int
  437 vmm_mod_unload(void)
 438 {
 439         int     error;
 440 
 441         VERIFY(vmm_initialized == 1);
 442 
 443         iommu_cleanup();
 444         error = VMM_CLEANUP();
 445         if (error)
 446                 return (error);
 447         vmm_initialized = 0;
 448 
 449         return (0);
 450 }
 451 
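/*
 * Initialize VM-wide state.  Called with create=true when the VM is first
 * created and with create=false on reinitialization, in which case per-boot
 * state is reset while objects that persist across reset (e.g. the vRTC)
 * are left in place.
 */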
 452 static void
 453 vm_init(struct vm *vm, bool create)
 454 {
 455         int i;
 456 
 457         vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
 458         vm->iommu = NULL;
 459         vm->vioapic = vioapic_init(vm);
 460         vm->vhpet = vhpet_init(vm);
 461         vm->vatpic = vatpic_init(vm);
 462         vm->vatpit = vatpit_init(vm);
 463         vm->vpmtmr = vpmtmr_init(vm);
 464         if (create)
 465                 vm->vrtc = vrtc_init(vm);
 466 
 467         vm_inout_init(vm, &vm->ioports);
 468 
 469         CPU_ZERO(&vm->active_cpus);
 470         CPU_ZERO(&vm->debug_cpus);
 471 
 472         vm->suspend = 0;
 473         CPU_ZERO(&vm->suspended_cpus);
 474 
 475         for (i = 0; i < vm->maxcpus; i++)
 476                 vcpu_init(vm, i, create);
 477 
 478         /*
 479          * Configure the VM-wide TSC offset so that the call to vm_init()
 480          * represents the boot time (when the TSC(s) read 0).  Each vCPU will
 481          * have its own offset from this, which is altered if/when the guest
 482          * writes to MSR_TSC.
 483          *
 484          * The TSC offsetting math is all unsigned, using overflow for negative
  485  * offsets.  A reading of the TSC is negated to form the boot offset.
 486          */
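        /*
         * Worked example with hypothetical numbers: if rdtsc_offset() reads
         * 1000 here, boot_tsc_offset wraps to 2^64 - 1000, so a host TSC
         * reading of 5000 adjusted by that offset appears to the guest as
         * 4000, i.e. the number of cycles elapsed since this point.
         */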
 487         vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
 488 }
 489 
 490 /*
 491  * The default CPU topology is a single thread per package.
 492  */
 493 uint_t cores_per_package = 1;
 494 uint_t threads_per_core = 1;
 495 
 496 int
 497 vm_create(const char *name, struct vm **retvm)
 498 {
 499         struct vm *vm;
 500         struct vmspace *vmspace;
 501 
 502         /*
  503  * If the vmm module could not be successfully initialized then don't attempt
 504          * to create the virtual machine.
 505          */
 506         if (!vmm_initialized)
 507                 return (ENXIO);
 508 
 509         if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
 510                 return (EINVAL);
 511 
 512         vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
 513         if (vmspace == NULL)
 514                 return (ENOMEM);
 515 
 516         vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
 517         strcpy(vm->name, name);
 518         vm->vmspace = vmspace;
 519 
 520         vm->sockets = 1;
 521         vm->cores = cores_per_package;       /* XXX backwards compatibility */
 522         vm->threads = threads_per_core;      /* XXX backwards compatibility */
 523         vm->maxcpus = VM_MAXCPU;     /* XXX temp to keep code working */
 524 
 525         vm_init(vm, true);
 526 
 527         *retvm = vm;
 528         return (0);
 529 }
 530 
 531 void
 532 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
 533     uint16_t *threads, uint16_t *maxcpus)
 534 {
 535         *sockets = vm->sockets;
 536         *cores = vm->cores;
 537         *threads = vm->threads;
 538         *maxcpus = vm->maxcpus;
 539 }
 540 
 541 uint16_t
 542 vm_get_maxcpus(struct vm *vm)
 543 {
 544         return (vm->maxcpus);
 545 }
 546 
 547 int
 548 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
 549     uint16_t threads, uint16_t maxcpus)
 550 {
 551         if (maxcpus != 0)
 552                 return (EINVAL);        /* XXX remove when supported */
 553         if ((sockets * cores * threads) > vm->maxcpus)
 554                 return (EINVAL);
 555         /* XXX need to check sockets * cores * threads == vCPU, how? */
 556         vm->sockets = sockets;
 557         vm->cores = cores;
 558         vm->threads = threads;
 559         vm->maxcpus = VM_MAXCPU;     /* XXX temp to keep code working */
 560         return (0);
 561 }
 562 
 563 static void
 564 vm_cleanup(struct vm *vm, bool destroy)
 565 {
 566         struct mem_map *mm;
 567         int i;
 568 
 569         ppt_unassign_all(vm);
 570 
 571         if (vm->iommu != NULL)
 572                 iommu_destroy_domain(vm->iommu);
 573 
 574         /*
 575          * Devices which attach their own ioport hooks should be cleaned up
 576          * first so they can tear down those registrations.
 577          */
 578         vpmtmr_cleanup(vm->vpmtmr);
 579 
 580         vm_inout_cleanup(vm, &vm->ioports);
 581 
 582         if (destroy)
 583                 vrtc_cleanup(vm->vrtc);
 584         else
 585                 vrtc_reset(vm->vrtc);
 586 
 587         vatpit_cleanup(vm->vatpit);
 588         vhpet_cleanup(vm->vhpet);
 589         vatpic_cleanup(vm->vatpic);
 590         vioapic_cleanup(vm->vioapic);
 591 
 592         for (i = 0; i < vm->maxcpus; i++)
 593                 vcpu_cleanup(vm, i, destroy);
 594 
 595         VMCLEANUP(vm->cookie);
 596 
 597         /*
 598          * System memory is removed from the guest address space only when
 599          * the VM is destroyed. This is because the mapping remains the same
 600          * across VM reset.
 601          *
 602          * Device memory can be relocated by the guest (e.g. using PCI BARs)
 603          * so those mappings are removed on a VM reset.
 604          */
 605         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 606                 mm = &vm->mem_maps[i];
 607                 if (destroy || !sysmem_mapping(vm, mm)) {
 608                         vm_free_memmap(vm, i);
 609                 } else {
 610                         /*
 611                          * We need to reset the IOMMU flag so this mapping can
 612                          * be reused when a VM is rebooted. Since the IOMMU
 613                          * domain has already been destroyed we can just reset
 614                          * the flag here.
 615                          */
 616                         mm->flags &= ~VM_MEMMAP_F_IOMMU;
 617                 }
 618         }
 619 
 620         if (destroy) {
 621                 for (i = 0; i < VM_MAX_MEMSEGS; i++)
 622                         vm_free_memseg(vm, i);
 623 
 624                 VMSPACE_FREE(vm->vmspace);
 625                 vm->vmspace = NULL;
 626 
 627                 arc_virt_machine_release(vm->arc_resv);
 628                 vm->arc_resv = 0;
 629         }
 630 }
 631 
 632 void
 633 vm_destroy(struct vm *vm)
 634 {
 635         vm_cleanup(vm, true);
 636         free(vm, M_VM);
 637 }
 638 
 639 int
 640 vm_reinit(struct vm *vm)
 641 {
 642         int error;
 643 
 644         /*
 645          * A virtual machine can be reset only if all vcpus are suspended.
 646          */
 647         if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
 648                 vm_cleanup(vm, false);
 649                 vm_init(vm, false);
 650                 error = 0;
 651         } else {
 652                 error = EBUSY;
 653         }
 654 
 655         return (error);
 656 }
 657 
 658 const char *
 659 vm_name(struct vm *vm)
 660 {
 661         return (vm->name);
 662 }
 663 
 664 int
 665 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 666 {
 667         vm_object_t obj;
 668 
 669         if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
 670                 return (ENOMEM);
 671         else
 672                 return (0);
 673 }
 674 
 675 int
 676 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
 677 {
 678         return (vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len));
 679 }
 680 
 681 /*
 682  * Return 'true' if 'gpa' is allocated in the guest address space.
 683  *
 684  * This function is called in the context of a running vcpu which acts as
 685  * an implicit lock on 'vm->mem_maps[]'.
 686  */
 687 bool
 688 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
 689 {
 690         struct mem_map *mm;
 691         int i;
 692 
 693 #ifdef INVARIANTS
 694         int hostcpu, state;
 695         state = vcpu_get_state(vm, vcpuid, &hostcpu);
 696         KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
 697             ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
 698 #endif
 699 
 700         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 701                 mm = &vm->mem_maps[i];
 702                 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
 703                         return (true);          /* 'gpa' is sysmem or devmem */
 704         }
 705 
 706         if (ppt_is_mmio(vm, gpa))
 707                 return (true);                  /* 'gpa' is pci passthru mmio */
 708 
 709         return (false);
 710 }
 711 
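/*
 * Allocate a backing object for memory segment 'ident'.  A segment may hold
 * system memory (guest RAM) or device memory; it is not visible to the
 * guest until mapped into the guest address space with vm_mmap_memseg().
 */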
 712 int
 713 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
 714 {
 715         struct mem_seg *seg;
 716         vm_object_t obj;
 717 
 718 #ifndef __FreeBSD__
 719         extern pgcnt_t get_max_page_get(void);
 720 #endif
 721 
 722         if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 723                 return (EINVAL);
 724 
 725         if (len == 0 || (len & PAGE_MASK))
 726                 return (EINVAL);
 727 
 728 #ifndef __FreeBSD__
 729         if (len > ptob(get_max_page_get()))
 730                 return (EINVAL);
 731 #endif
 732 
 733         seg = &vm->mem_segs[ident];
 734         if (seg->object != NULL) {
 735                 if (seg->len == len && seg->sysmem == sysmem)
 736                         return (EEXIST);
 737                 else
 738                         return (EINVAL);
 739         }
 740 
 741         obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
 742         if (obj == NULL)
 743                 return (ENOMEM);
 744 
 745         seg->len = len;
 746         seg->object = obj;
 747         seg->sysmem = sysmem;
 748         return (0);
 749 }
 750 
 751 int
 752 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
 753     vm_object_t *objptr)
 754 {
 755         struct mem_seg *seg;
 756 
 757         if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 758                 return (EINVAL);
 759 
 760         seg = &vm->mem_segs[ident];
 761         if (len)
 762                 *len = seg->len;
 763         if (sysmem)
 764                 *sysmem = seg->sysmem;
 765         if (objptr)
 766                 *objptr = seg->object;
 767         return (0);
 768 }
 769 
 770 void
 771 vm_free_memseg(struct vm *vm, int ident)
 772 {
 773         struct mem_seg *seg;
 774 
 775         KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
 776             ("%s: invalid memseg ident %d", __func__, ident));
 777 
 778         seg = &vm->mem_segs[ident];
 779         if (seg->object != NULL) {
 780                 vm_object_deallocate(seg->object);
 781                 bzero(seg, sizeof (struct mem_seg));
 782         }
 783 }
 784 
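/*
 * Map 'len' bytes of memory segment 'segid', starting at offset 'first',
 * into the guest address space at 'gpa'.  Only wired sysmem mappings
 * (VM_MEMMAP_F_WIRED) are later pushed into the IOMMU for PCI passthru
 * (see vm_iommu_modify()).
 */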
 785 int
 786 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
 787     size_t len, int prot, int flags)
 788 {
 789         struct mem_seg *seg;
 790         struct mem_map *m, *map;
 791         vm_ooffset_t last;
 792         int i, error;
 793 
 794         if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
 795                 return (EINVAL);
 796 
 797         if (flags & ~VM_MEMMAP_F_WIRED)
 798                 return (EINVAL);
 799 
 800         if (segid < 0 || segid >= VM_MAX_MEMSEGS)
 801                 return (EINVAL);
 802 
 803         seg = &vm->mem_segs[segid];
 804         if (seg->object == NULL)
 805                 return (EINVAL);
 806 
 807         last = first + len;
 808         if (first < 0 || first >= last || last > seg->len)
 809                 return (EINVAL);
 810 
 811         if ((gpa | first | last) & PAGE_MASK)
 812                 return (EINVAL);
 813 
 814         map = NULL;
 815         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 816                 m = &vm->mem_maps[i];
 817                 if (m->len == 0) {
 818                         map = m;
 819                         break;
 820                 }
 821         }
 822 
 823         if (map == NULL)
 824                 return (ENOSPC);
 825 
 826         error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
 827             len, 0, VMFS_NO_SPACE, prot, prot, 0);
 828         if (error != 0)
 829                 return (EFAULT);
 830 
 831         vm_object_reference(seg->object);
 832 
 833         if ((flags & VM_MEMMAP_F_WIRED) != 0) {
 834                 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
 835                     VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 836                 if (error != 0) {
 837                         vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
 838                         return (EFAULT);
 839                 }
 840         }
 841 
 842         map->gpa = gpa;
 843         map->len = len;
 844         map->segoff = first;
 845         map->segid = segid;
 846         map->prot = prot;
 847         map->flags = flags;
 848         return (0);
 849 }
 850 
 851 int
 852 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
 853 {
 854         struct mem_map *m;
 855         int i;
 856 
 857         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 858                 m = &vm->mem_maps[i];
 859                 if (m->gpa == gpa && m->len == len &&
 860                     (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
 861                         vm_free_memmap(vm, i);
 862                         return (0);
 863                 }
 864         }
 865 
 866         return (EINVAL);
 867 }
 868 
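/*
 * Iterator over guest memory mappings: return the mapping with the lowest
 * guest-physical address that is >= *gpa, or ENOENT if no such mapping
 * exists.
 */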
 869 int
 870 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
 871     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
 872 {
 873         struct mem_map *mm, *mmnext;
 874         int i;
 875 
 876         mmnext = NULL;
 877         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 878                 mm = &vm->mem_maps[i];
 879                 if (mm->len == 0 || mm->gpa < *gpa)
 880                         continue;
 881                 if (mmnext == NULL || mm->gpa < mmnext->gpa)
 882                         mmnext = mm;
 883         }
 884 
 885         if (mmnext != NULL) {
 886                 *gpa = mmnext->gpa;
 887                 if (segid)
 888                         *segid = mmnext->segid;
 889                 if (segoff)
 890                         *segoff = mmnext->segoff;
 891                 if (len)
 892                         *len = mmnext->len;
 893                 if (prot)
 894                         *prot = mmnext->prot;
 895                 if (flags)
 896                         *flags = mmnext->flags;
 897                 return (0);
 898         } else {
 899                 return (ENOENT);
 900         }
 901 }
 902 
 903 static void
 904 vm_free_memmap(struct vm *vm, int ident)
 905 {
 906         struct mem_map *mm;
 907         int error;
 908 
 909         mm = &vm->mem_maps[ident];
 910         if (mm->len) {
 911                 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
 912                     mm->gpa + mm->len);
 913                 KASSERT(error == 0, ("%s: vm_map_remove error %d",
 914                     __func__, error));
 915                 bzero(mm, sizeof (struct mem_map));
 916         }
 917 }
 918 
 919 static __inline bool
 920 sysmem_mapping(struct vm *vm, struct mem_map *mm)
 921 {
 922 
 923         if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
 924                 return (true);
 925         else
 926                 return (false);
 927 }
 928 
 929 vm_paddr_t
 930 vmm_sysmem_maxaddr(struct vm *vm)
 931 {
 932         struct mem_map *mm;
 933         vm_paddr_t maxaddr;
 934         int i;
 935 
 936         maxaddr = 0;
 937         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 938                 mm = &vm->mem_maps[i];
 939                 if (sysmem_mapping(vm, mm)) {
 940                         if (maxaddr < mm->gpa + mm->len)
 941                                 maxaddr = mm->gpa + mm->len;
 942                 }
 943         }
 944         return (maxaddr);
 945 }
 946 
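/*
 * Establish (map=true) or tear down (map=false) IOMMU translations covering
 * the wired system-memory mappings of the guest, translating each
 * guest-physical page to the host-physical page backing it.  Called when
 * PCI passthru devices are assigned to or removed from the VM.
 */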
 947 static void
 948 vm_iommu_modify(struct vm *vm, bool map)
 949 {
 950         int i, sz;
 951         vm_paddr_t gpa, hpa;
 952         struct mem_map *mm;
 953 #ifdef __FreeBSD__
 954         void *vp, *cookie, *host_domain;
 955 #else
 956         void *vp, *cookie, *host_domain __unused;
 957 #endif
 958 
 959         sz = PAGE_SIZE;
 960         host_domain = iommu_host_domain();
 961 
 962         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 963                 mm = &vm->mem_maps[i];
 964                 if (!sysmem_mapping(vm, mm))
 965                         continue;
 966 
 967                 if (map) {
 968                         KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
 969                             ("iommu map found invalid memmap %lx/%lx/%x",
 970                             mm->gpa, mm->len, mm->flags));
 971                         if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
 972                                 continue;
 973                         mm->flags |= VM_MEMMAP_F_IOMMU;
 974                 } else {
 975                         if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
 976                                 continue;
 977                         mm->flags &= ~VM_MEMMAP_F_IOMMU;
 978                         KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
 979                             ("iommu unmap found invalid memmap %lx/%lx/%x",
 980                             mm->gpa, mm->len, mm->flags));
 981                 }
 982 
 983                 gpa = mm->gpa;
 984                 while (gpa < mm->gpa + mm->len) {
 985                         vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, PROT_WRITE,
 986                             &cookie);
 987                         KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
 988                             vm_name(vm), gpa));
 989 
 990                         vm_gpa_release(cookie);
 991 
 992                         hpa = DMAP_TO_PHYS((uintptr_t)vp);
 993                         if (map) {
 994                                 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
 995 #ifdef __FreeBSD__
 996                                 iommu_remove_mapping(host_domain, hpa, sz);
 997 #endif
 998                         } else {
 999                                 iommu_remove_mapping(vm->iommu, gpa, sz);
1000 #ifdef __FreeBSD__
1001                                 iommu_create_mapping(host_domain, hpa, hpa, sz);
1002 #endif
1003                         }
1004 
1005                         gpa += PAGE_SIZE;
1006                 }
1007         }
1008 
1009         /*
1010          * Invalidate the cached translations associated with the domain
1011          * from which pages were removed.
1012          */
1013 #ifdef __FreeBSD__
1014         if (map)
1015                 iommu_invalidate_tlb(host_domain);
1016         else
1017                 iommu_invalidate_tlb(vm->iommu);
1018 #else
1019         iommu_invalidate_tlb(vm->iommu);
1020 #endif
1021 }
1022 
1023 #define vm_iommu_unmap(vm)      vm_iommu_modify((vm), false)
1024 #define vm_iommu_map(vm)        vm_iommu_modify((vm), true)
1025 
1026 int
1027 vm_unassign_pptdev(struct vm *vm, int pptfd)
1028 {
1029         int error;
1030 
1031         error = ppt_unassign_device(vm, pptfd);
1032         if (error)
1033                 return (error);
1034 
1035         if (ppt_assigned_devices(vm) == 0)
1036                 vm_iommu_unmap(vm);
1037 
1038         return (0);
1039 }
1040 
1041 int
1042 vm_assign_pptdev(struct vm *vm, int pptfd)
1043 {
1044         int error;
1045         vm_paddr_t maxaddr;
1046 
1047         /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1048         if (ppt_assigned_devices(vm) == 0) {
1049                 KASSERT(vm->iommu == NULL,
1050                     ("vm_assign_pptdev: iommu must be NULL"));
1051                 maxaddr = vmm_sysmem_maxaddr(vm);
1052                 vm->iommu = iommu_create_domain(maxaddr);
1053                 if (vm->iommu == NULL)
1054                         return (ENXIO);
1055                 vm_iommu_map(vm);
1056         }
1057 
1058         error = ppt_assign_device(vm, pptfd);
1059         return (error);
1060 }
1061 
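/*
 * Acquire a hold on the host page backing guest-physical address 'gpa' and
 * return a pointer to it via the direct map.  The requested range must not
 * cross a page boundary, and the hold must be dropped by passing the
 * returned cookie to vm_gpa_release().
 */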
1062 void *
1063 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
1064     void **cookie)
1065 {
1066         int i, count, pageoff;
1067         struct mem_map *mm;
1068         vm_page_t m;
1069 #ifdef INVARIANTS
1070         /*
1071          * All vcpus are frozen by ioctls that modify the memory map
 1072          * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->mem_maps[]' stability is
1073          * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
1074          */
1075         int state;
1076         KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
1077             __func__, vcpuid));
1078         for (i = 0; i < vm->maxcpus; i++) {
1079                 if (vcpuid != -1 && vcpuid != i)
1080                         continue;
1081                 state = vcpu_get_state(vm, i, NULL);
1082                 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1083                     __func__, state));
1084         }
1085 #endif
1086         pageoff = gpa & PAGE_MASK;
1087         if (len > PAGE_SIZE - pageoff)
1088                 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1089 
1090         count = 0;
1091         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1092                 mm = &vm->mem_maps[i];
1093                 if (mm->len == 0) {
1094                         continue;
1095                 }
1096                 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1097                         count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1098                             trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1099                         break;
1100                 }
1101         }
1102 
1103         if (count == 1) {
1104                 *cookie = m;
1105                 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1106         } else {
1107                 *cookie = NULL;
1108                 return (NULL);
1109         }
1110 }
1111 
1112 void
1113 vm_gpa_release(void *cookie)
1114 {
1115         vm_page_t m = cookie;
1116 
1117         vm_page_unwire(m, PQ_ACTIVE);
1118 }
1119 
1120 int
1121 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1122 {
1123 
1124         if (vcpu < 0 || vcpu >= vm->maxcpus)
1125                 return (EINVAL);
1126 
1127         if (reg >= VM_REG_LAST)
1128                 return (EINVAL);
1129 
1130         return (VMGETREG(vm->cookie, vcpu, reg, retval));
1131 }
1132 
1133 int
1134 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1135 {
1136         struct vcpu *vcpu;
1137         int error;
1138 
1139         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1140                 return (EINVAL);
1141 
1142         if (reg >= VM_REG_LAST)
1143                 return (EINVAL);
1144 
1145         error = VMSETREG(vm->cookie, vcpuid, reg, val);
1146         if (error || reg != VM_REG_GUEST_RIP)
1147                 return (error);
1148 
1149         /* Set 'nextrip' to match the value of %rip */
1150         VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1151         vcpu = &vm->vcpu[vcpuid];
1152         vcpu->nextrip = val;
1153         return (0);
1154 }
1155 
1156 static bool
1157 is_descriptor_table(int reg)
1158 {
1159         switch (reg) {
1160         case VM_REG_GUEST_IDTR:
1161         case VM_REG_GUEST_GDTR:
1162                 return (true);
1163         default:
1164                 return (false);
1165         }
1166 }
1167 
1168 static bool
1169 is_segment_register(int reg)
1170 {
1171         switch (reg) {
1172         case VM_REG_GUEST_ES:
1173         case VM_REG_GUEST_CS:
1174         case VM_REG_GUEST_SS:
1175         case VM_REG_GUEST_DS:
1176         case VM_REG_GUEST_FS:
1177         case VM_REG_GUEST_GS:
1178         case VM_REG_GUEST_TR:
1179         case VM_REG_GUEST_LDTR:
1180                 return (true);
1181         default:
1182                 return (false);
1183         }
1184 }
1185 
1186 int
1187 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1188 {
1189 
1190         if (vcpu < 0 || vcpu >= vm->maxcpus)
1191                 return (EINVAL);
1192 
1193         if (!is_segment_register(reg) && !is_descriptor_table(reg))
1194                 return (EINVAL);
1195 
1196         return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1197 }
1198 
1199 int
1200 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1201 {
1202         if (vcpu < 0 || vcpu >= vm->maxcpus)
1203                 return (EINVAL);
1204 
1205         if (!is_segment_register(reg) && !is_descriptor_table(reg))
1206                 return (EINVAL);
1207 
1208         return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1209 }
1210 
1211 int
1212 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1213 {
1214         struct vcpu *vcpu;
1215 
1216         if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1217                 return (EINVAL);
1218         }
1219 
1220         vcpu = &vm->vcpu[vcpuid];
1221 
1222         vcpu_lock(vcpu);
1223         *state = vcpu->run_state;
1224         *sipi_vec = vcpu->sipi_vector;
1225         vcpu_unlock(vcpu);
1226 
1227         return (0);
1228 }
1229 
1230 int
1231 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1232 {
1233         struct vcpu *vcpu;
1234 
1235         if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1236                 return (EINVAL);
1237         }
1238         if (!VRS_IS_VALID(state)) {
1239                 return (EINVAL);
1240         }
1241 
1242         vcpu = &vm->vcpu[vcpuid];
1243 
1244         vcpu_lock(vcpu);
1245         vcpu->run_state = state;
1246         vcpu->sipi_vector = sipi_vec;
1247         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1248         vcpu_unlock(vcpu);
1249 
1250         return (0);
1251 }
1252 
1253 
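/*
 * Swap the host FPU state out and the guest FPU state in.  CR0.TS is left
 * set afterwards so that any stray host use of the FPU traps while the
 * guest's state is live.
 */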
1254 static void
1255 restore_guest_fpustate(struct vcpu *vcpu)
1256 {
1257 
1258         /* flush host state to the pcb */
1259         fpuexit(curthread);
1260 
1261         /* restore guest FPU state */
1262         fpu_stop_emulating();
1263         fpurestore(vcpu->guestfpu);
1264 
1265         /* restore guest XCR0 if XSAVE is enabled in the host */
1266         if (rcr4() & CR4_XSAVE)
1267                 load_xcr(0, vcpu->guest_xcr0);
1268 
1269         /*
1270          * The FPU is now "dirty" with the guest's state so turn on emulation
1271          * to trap any access to the FPU by the host.
1272          */
1273         fpu_start_emulating();
1274 }
1275 
1276 static void
1277 save_guest_fpustate(struct vcpu *vcpu)
1278 {
1279 
1280         if ((rcr0() & CR0_TS) == 0)
1281                 panic("fpu emulation not enabled in host!");
1282 
1283         /* save guest XCR0 and restore host XCR0 */
1284         if (rcr4() & CR4_XSAVE) {
1285                 vcpu->guest_xcr0 = rxcr(0);
1286                 load_xcr(0, vmm_get_host_xcr0());
1287         }
1288 
1289         /* save guest FPU state */
1290         fpu_stop_emulating();
1291         fpusave(vcpu->guestfpu);
1292         /*
1293          * When the host state has been restored, we should not re-enable
1294          * CR0.TS on illumos for eager FPU.
1295          */
1296 }
1297 
1298 static int
1299 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1300     bool from_idle)
1301 {
1302         struct vcpu *vcpu;
1303         int error;
1304 
1305         vcpu = &vm->vcpu[vcpuid];
1306         vcpu_assert_locked(vcpu);
1307 
1308         /*
1309          * State transitions from the vmmdev_ioctl() must always begin from
1310          * the VCPU_IDLE state. This guarantees that there is only a single
1311          * ioctl() operating on a vcpu at any point.
1312          */
1313         if (from_idle) {
1314                 while (vcpu->state != VCPU_IDLE) {
1315                         vcpu->reqidle = 1;
1316                         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1317                         VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1318                             "idle requested", vcpu_state2str(vcpu->state));
1319                         cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1320                 }
1321         } else {
1322                 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1323                     "vcpu idle state"));
1324         }
1325 
1326         if (vcpu->state == VCPU_RUNNING) {
1327                 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1328                     "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1329         } else {
1330                 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1331                     "vcpu that is not running", vcpu->hostcpu));
1332         }
1333 
1334         /*
1335          * The following state transitions are allowed:
1336          * IDLE -> FROZEN -> IDLE
1337          * FROZEN -> RUNNING -> FROZEN
1338          * FROZEN -> SLEEPING -> FROZEN
1339          */
1340         switch (vcpu->state) {
1341         case VCPU_IDLE:
1342         case VCPU_RUNNING:
1343         case VCPU_SLEEPING:
1344                 error = (newstate != VCPU_FROZEN);
1345                 break;
1346         case VCPU_FROZEN:
1347                 error = (newstate == VCPU_FROZEN);
1348                 break;
1349         default:
1350                 error = 1;
1351                 break;
1352         }
1353 
1354         if (error)
1355                 return (EBUSY);
1356 
1357         VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1358             vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1359 
1360         vcpu->state = newstate;
1361         if (newstate == VCPU_RUNNING)
1362                 vcpu->hostcpu = curcpu;
1363         else
1364                 vcpu->hostcpu = NOCPU;
1365 
1366         if (newstate == VCPU_IDLE) {
1367                 cv_broadcast(&vcpu->state_cv);
1368         }
1369 
1370         return (0);
1371 }
1372 
1373 static void
1374 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1375 {
1376         int error;
1377 
1378         if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1379                 panic("Error %d setting state to %d\n", error, newstate);
1380 }
1381 
1382 static void
1383 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1384 {
1385         int error;
1386 
1387         if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1388                 panic("Error %d setting state to %d", error, newstate);
1389 }
1390 
1391 /*
1392  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1393  */
1394 static int
1395 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1396 {
1397         struct vcpu *vcpu;
1398         int vcpu_halted, vm_halted;
1399         bool userspace_exit = false;
1400 
1401         KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1402 
1403         vcpu = &vm->vcpu[vcpuid];
1404         vcpu_halted = 0;
1405         vm_halted = 0;
1406 
1407         vcpu_lock(vcpu);
1408         while (1) {
1409                 /*
1410                  * Do a final check for pending interrupts (including NMI and
1411                  * INIT) before putting this thread to sleep.
1412                  */
1413                 if (vm_nmi_pending(vm, vcpuid))
1414                         break;
1415                 if (vcpu_run_state_pending(vm, vcpuid))
1416                         break;
1417                 if (!intr_disabled) {
1418                         if (vm_extint_pending(vm, vcpuid) ||
1419                             vlapic_pending_intr(vcpu->vlapic, NULL)) {
1420                                 break;
1421                         }
1422                 }
1423 
1424                 /*
1425                  * Also check for software events which would cause a wake-up.
1426                  * This will set the appropriate exitcode directly, rather than
1427                  * requiring a trip through VM_RUN().
1428                  */
1429                 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1430                         userspace_exit = true;
1431                         break;
1432                 }
1433 
1434                 /*
1435                  * Some Linux guests implement "halt" by having all vcpus
1436                  * execute HLT with interrupts disabled. 'halted_cpus' keeps
1437                  * track of the vcpus that have entered this state. When all
1438                  * vcpus enter the halted state the virtual machine is halted.
1439                  */
1440                 if (intr_disabled) {
1441                         if (!vcpu_halted && halt_detection_enabled) {
1442                                 vcpu_halted = 1;
1443                                 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1444                         }
1445                         if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1446                                 vm_halted = 1;
1447                                 break;
1448                         }
1449                 }
1450 
1451                 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1452                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1453                 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1454                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1455                 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1456         }
1457 
1458         if (vcpu_halted)
1459                 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1460 
1461         vcpu_unlock(vcpu);
1462 
1463         if (vm_halted)
1464                 vm_suspend(vm, VM_SUSPEND_HALT);
1465 
1466         return (userspace_exit ? -1 : 0);
1467 }
1468 
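/*
 * Handle a nested page fault taken during guest execution: first attempt
 * accessed/dirty bit emulation on the faulting gpa, then fall back to a
 * full vm_fault() against the guest's vmspace.
 */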
1469 static int
1470 vm_handle_paging(struct vm *vm, int vcpuid)
1471 {
1472         int rv, ftype;
1473         struct vm_map *map;
1474         struct vcpu *vcpu;
1475         struct vm_exit *vme;
1476 
1477         vcpu = &vm->vcpu[vcpuid];
1478         vme = &vcpu->exitinfo;
1479 
1480         KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1481             __func__, vme->inst_length));
1482 
1483         ftype = vme->u.paging.fault_type;
1484         KASSERT(ftype == PROT_READ ||
1485             ftype == PROT_WRITE || ftype == PROT_EXEC,
1486             ("vm_handle_paging: invalid fault_type %d", ftype));
1487 
1488         if (ftype == PROT_READ || ftype == PROT_WRITE) {
1489                 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1490                     vme->u.paging.gpa, ftype);
1491                 if (rv == 0) {
1492                         VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
1493                             ftype == PROT_READ ? "accessed" : "dirty",
1494                             vme->u.paging.gpa);
1495                         goto done;
1496                 }
1497         }
1498 
1499         map = &vm->vmspace->vm_map;
1500         rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1501 
1502         VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1503             "ftype = %d", rv, vme->u.paging.gpa, ftype);
1504 
1505         if (rv != 0)
1506                 return (EFAULT);
1507 done:
1508         return (0);
1509 }
1510 
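/*
 * Dispatch an MMIO read or write to the devices emulated in-kernel (local
 * APIC, I/O APIC, HPET) based on the guest-physical address.  ESRCH means
 * no in-kernel device claims the address.
 */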
1511 int
1512 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1513     int rsize)
1514 {
1515         int err = ESRCH;
1516 
1517         if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1518                 err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1519         } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1520                 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1521         } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1522                 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1523         }
1524 
1525         return (err);
1526 }
1527 
1528 int
1529 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1530     int wsize)
1531 {
1532         int err = ESRCH;
1533 
1534         if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1535                 err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1536         } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1537                 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1538         } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1539                 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1540         }
1541 
1542         return (err);
1543 }
1544 
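/*
 * Emulate an instruction which faulted while accessing MMIO: fetch and
 * decode it if necessary, then emulate the access, bailing out to userspace
 * for addresses not handled by an in-kernel device.
 */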
1545 static int
1546 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1547 {
1548         struct vie *vie;
1549         struct vcpu *vcpu;
1550         struct vm_exit *vme;
1551         uint64_t inst_addr;
1552         int error, fault, cs_d;
1553 
1554         vcpu = &vm->vcpu[vcpuid];
1555         vme = &vcpu->exitinfo;
1556         vie = vcpu->vie_ctx;
1557 
1558         KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1559             __func__, vme->inst_length));
1560 
1561         inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1562         cs_d = vme->u.mmio_emul.cs_d;
1563 
1564         VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1565             vme->u.mmio_emul.gpa);
1566 
1567         /* Fetch the faulting instruction */
1568         if (vie_needs_fetch(vie)) {
1569                 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1570                     &fault);
1571                 if (error != 0) {
1572                         return (error);
1573                 } else if (fault) {
1574                         /*
1575                          * If a fault during instruction fetch was encountered,
1576                          * it will have asserted that the appropriate exception
1577                          * be injected at next entry.
1578                          * No further work is required.
1579                          */
1580                         return (0);
1581                 }
1582         }
1583 
1584         if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1585                 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1586                     inst_addr);
1587                 /* Dump (unrecognized) instruction bytes in userspace */
1588                 vie_fallback_exitinfo(vie, vme);
1589                 return (-1);
1590         }
1591         if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1592             vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1593                 /* Decoded GLA does not match GLA from VM exit state */
1594                 vie_fallback_exitinfo(vie, vme);
1595                 return (-1);
1596         }
1597 
1598 repeat:
1599         error = vie_emulate_mmio(vie, vm, vcpuid);
1600         if (error < 0) {
1601                 /*
1602                  * MMIO not handled by any of the in-kernel-emulated devices, so
1603                  * make a trip out to userspace for it.
1604                  */
1605                 vie_exitinfo(vie, vme);
1606         } else if (error == EAGAIN) {
1607                 /*
1608                  * Continue emulating the rep-prefixed instruction, which has
1609                  * not completed its iterations.
1610                  *
1611                  * Since this may be emulated entirely in-kernel with a
1612                  * high repetition count (causing a tight spin), it should
1613                  * defer to any pending yield conditions.
1614                  */
1615                 if (!vcpu_should_yield(vm, vcpuid)) {
1616                         goto repeat;
1617                 } else {
1618                         /*
1619                          * Defer to the contending load by making a trip to
1620                          * userspace with a no-op (BOGUS) exit reason.
1621                          */
1622                         vie_reset(vie);
1623                         vme->exitcode = VM_EXITCODE_BOGUS;
1624                         return (-1);
1625                 }
1626         } else if (error == 0) {
1627                 /* Update %rip now that instruction has been emulated */
1628                 vie_advance_pc(vie, &vcpu->nextrip);
1629         }
1630         return (error);
1631 }
1632 
1633 static int
1634 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1635 {
1636         struct vcpu *vcpu;
1637         struct vie *vie;
1638         int err;
1639 
1640         vcpu = &vm->vcpu[vcpuid];
1641         vie = vcpu->vie_ctx;
1642 
1643 repeat:
1644         err = vie_emulate_inout(vie, vm, vcpuid);
1645 
1646         if (err < 0) {
1647                 /*
1648                  * In/out not handled by any of the in-kernel-emulated devices,
1649                  * so make a trip out to userspace for it.
1650                  */
1651                 vie_exitinfo(vie, vme);
1652                 return (err);
1653         } else if (err == EAGAIN) {
1654                 /*
1655                  * Continue emulating the rep-prefixed ins/outs instruction,
1656                  * which has not completed its iterations.
1657                  *
1658                  * Since this may be emulated entirely in-kernel with a high
1659                  * repetition count (causing a tight spin), it should defer
1660                  * to any pending yield conditions.
1661                  */
1662                 if (!vcpu_should_yield(vm, vcpuid)) {
1663                         goto repeat;
1664                 } else {
1665                         /*
1666                          * Defer to the contending load by making a trip to
1667                          * userspace with a no-op (BOGUS) exit reason.
1668                          */
1669                         vie_reset(vie);
1670                         vme->exitcode = VM_EXITCODE_BOGUS;
1671                         return (-1);
1672                 }
1673         } else if (err != 0) {
1674                 /* Emulation failure.  Bail all the way out to userspace. */
1675                 vme->exitcode = VM_EXITCODE_INST_EMUL;
1676                 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1677                 return (-1);
1678         }
1679 
1680         vie_advance_pc(vie, &vcpu->nextrip);
1681         return (0);
1682 }
1683 
1684 static int
1685 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1686 {
1687         struct vie *vie;
1688         struct vcpu *vcpu;
1689         struct vm_exit *vme;
1690         uint64_t cs_base;
1691         int error, fault, cs_d;
1692 
1693         vcpu = &vm->vcpu[vcpuid];
1694         vme = &vcpu->exitinfo;
1695         vie = vcpu->vie_ctx;
1696 
1697         vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1698 
1699         /* Fetch the faulting instruction */
1700         ASSERT(vie_needs_fetch(vie));
1701         error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1702             &fault);
1703         if (error != 0) {
1704                 return (error);
1705         } else if (fault) {
1706                 /*
1707                  * If a fault during instruction fetch was encountered, it will
1708                  * have asserted that the appropriate exception be injected at
1709                  * next entry.  No further work is required.
1710                  */
1711                 return (0);
1712         }
1713 
1714         if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1715                 /* Dump (unrecognized) instruction bytes in userspace */
1716                 vie_fallback_exitinfo(vie, vme);
1717                 return (-1);
1718         }
1719 
1720         error = vie_emulate_other(vie, vm, vcpuid);
1721         if (error != 0) {
1722                 /*
1723                  * Instruction emulation was unable to complete successfully, so
1724                  * kick it out to userspace for handling.
1725                  */
1726                 vie_fallback_exitinfo(vie, vme);
1727         } else {
1728                 /* Update %rip now that instruction has been emulated */
1729                 vie_advance_pc(vie, &vcpu->nextrip);
1730         }
1731         return (error);
1732 }
1733 
1734 static int
1735 vm_handle_suspend(struct vm *vm, int vcpuid)
1736 {
1737         int i;
1738         struct vcpu *vcpu;
1739 
1740         vcpu = &vm->vcpu[vcpuid];
1741 
1742         CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1743 
1744         /*
1745          * Wait until all 'active_cpus' have suspended themselves.
1746          */
1747         vcpu_lock(vcpu);
1748         vcpu_ustate_change(vm, vcpuid, VU_INIT);
1749         while (1) {
1750                 int rc;
1751 
1752                 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1753                         VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1754                         break;
1755                 }
1756 
1757                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1758                 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1759                     TR_CLOCK_TICK);
1760                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1761 
1762                 /*
1763                  * If the userspace process driving the instance is killed, any
1764                  * vCPUs yet to be marked suspended (because they are not
1765                  * VM_RUN-ing in the kernel presently) will never reach that
1766                  * state.
1767                  *
1768                  * To avoid vm_handle_suspend() getting stuck in the kernel
1769                  * waiting for those vCPUs, offer a bail-out even though it
1770                  * means returning without all vCPUs in a suspended state.
1771                  */
1772                 if (rc <= 0) {
1773                         if ((curproc->p_flag & SEXITING) != 0) {
1774                                 break;
1775                         }
1776                 }
1777         }
1778         vcpu_unlock(vcpu);
1779 
1780         /*
1781          * Wake up the other sleeping vcpus and return to userspace.
1782          */
1783         for (i = 0; i < vm->maxcpus; i++) {
1784                 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1785                         vcpu_notify_event(vm, i);
1786                 }
1787         }
1788 
1789         return (-1);
1790 }
1791 
1792 static int
1793 vm_handle_reqidle(struct vm *vm, int vcpuid)
1794 {
1795         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1796 
1797         vcpu_lock(vcpu);
1798         KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1799         vcpu->reqidle = 0;
1800         vcpu_unlock(vcpu);
1801         return (-1);
1802 }
1803 
1804 static int
1805 vm_handle_run_state(struct vm *vm, int vcpuid)
1806 {
1807         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1808         bool handled = false;
1809 
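             /*
              * Process any pending INIT and/or SIPI requests for this vCPU,
              * sleeping until it becomes runnable or a bail-out condition
              * (such as a pending signal or VM suspend) is detected.
              */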
1810         vcpu_lock(vcpu);
1811         while (1) {
1812                 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1813                         vcpu_unlock(vcpu);
1814                         VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1815                         vcpu_lock(vcpu);
1816 
1817                         vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1818                         vcpu->run_state |= VRS_INIT;
1819                 }
1820 
1821                 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1822                     (VRS_INIT | VRS_PEND_SIPI)) {
1823                         const uint8_t vector = vcpu->sipi_vector;
1824 
1825                         vcpu_unlock(vcpu);
1826                         VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1827                         vcpu_lock(vcpu);
1828 
1829                         vcpu->run_state &= ~VRS_PEND_SIPI;
1830                         vcpu->run_state |= VRS_RUN;
1831                 }
1832 
1833                 /*
1834                  * If the vCPU is now in the running state, there is no need to
1835                  * wait for anything prior to re-entry.
1836                  */
1837                 if ((vcpu->run_state & VRS_RUN) != 0) {
1838                         handled = true;
1839                         break;
1840                 }
1841 
1842                 /*
1843                  * Also check for software events which would cause a wake-up.
1844                  * This will set the appropriate exitcode directly, rather than
1845                  * requiring a trip through VM_RUN().
1846                  */
1847                 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1848                         break;
1849                 }
1850 
1851                 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1852                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1853                 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1854                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1855                 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1856         }
1857         vcpu_unlock(vcpu);
1858 
1859         return (handled ? 0 : -1);
1860 }
1861 
1862 static int
1863 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1864 {
1865         const uint32_t code = vme->u.msr.code;
1866         uint64_t val = 0;
1867 
1868         switch (code) {
1869         case MSR_MCG_CAP:
1870         case MSR_MCG_STATUS:
1871                 val = 0;
1872                 break;
1873 
1874         case MSR_MTRRcap:
1875         case MSR_MTRRdefType:
1876         case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1877         case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1878         case MSR_MTRR64kBase:
1879                 val = 0;
1880                 break;
1881 
1882         case MSR_TSC:
1883                 /*
1884                  * In all likelihood, this should always be handled in guest
1885                  * context by VMX/SVM rather than taking an exit.  (Both VMX and
1886                  * SVM pass through read-only access to MSR_TSC to the guest.)
1887                  *
1888                  * No physical offset is requested of vcpu_tsc_offset() since
1889                  * rdtsc_offset() takes care of that instead.
1890                  */
1891                 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
1892                 break;
1893 
1894         default:
1895                 /*
1896                  * Anything not handled at this point will be kicked out to
1897                  * userspace for attempted processing there.
1898                  */
1899                 return (-1);
1900         }
1901 
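             /*
              * As with the RDMSR instruction itself, return the 64-bit value
              * split across %rdx:%rax (high and low halves, respectively).
              */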
1902         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
1903             val & 0xffffffff));
1904         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
1905             val >> 32));
1906         return (0);
1907 }
1908 
1909 static int
1910 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1911 {
1912         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1913         const uint32_t code = vme->u.msr.code;
1914         const uint64_t val = vme->u.msr.wval;
1915 
1916         switch (code) {
1917         case MSR_MCG_CAP:
1918         case MSR_MCG_STATUS:
1919                 /* Ignore writes */
1920                 break;
1921 
1922         case MSR_MTRRcap:
1923                 vm_inject_gp(vm, vcpuid);
1924                 break;
1925         case MSR_MTRRdefType:
1926         case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1927         case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1928         case MSR_MTRR64kBase:
1929                 /* Ignore writes */
1930                 break;
1931 
1932         case MSR_TSC:
1933                 /*
1934                  * The effect of writing the TSC MSR is that a subsequent read
1935                  * of the TSC would report that value written (plus any time
1936                  * elapsed between the write and the read).  The guest TSC value
1937                  * is calculated from a global offset for the guest (which
1938                  * effectively makes its TSC read 0 at guest boot) and a
1939                  * per-vCPU offset to handle these writes to the MSR.
1940                  *
1941                  * To calculate that per-vCPU offset, we can work backwards from
1942                  * the guest value at the time of write:
1943                  *
1944                  * value = host TSC + VM boot offset + vCPU offset
1945                  *
1946                  * so therefore:
1947                  *
1948                  * value - host TSC - VM boot offset = vCPU offset
1949                  */
1950                 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
1951                 break;
1952 
1953         default:
1954                 /*
1955                  * Anything not handled at this point will be kicked out to
1956                  * userspace for attempted processing there.
1957                  */
1958                 return (-1);
1959         }
1960 
1961         return (0);
1962 }
1963 
1964 int
1965 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1966 {
1967         int i;
1968 
1969         if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1970                 return (EINVAL);
1971 
1972         if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1973                 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1974                     vm->suspend, how);
1975                 return (EALREADY);
1976         }
1977 
1978         VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1979 
1980         /*
1981          * Notify all active vcpus that they are now suspended.
1982          */
1983         for (i = 0; i < vm->maxcpus; i++) {
1984                 if (CPU_ISSET(i, &vm->active_cpus))
1985                         vcpu_notify_event(vm, i);
1986         }
1987 
1988         return (0);
1989 }
1990 
1991 void
1992 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1993 {
1994         struct vm_exit *vmexit;
1995 
1996         vmexit = vm_exitinfo(vm, vcpuid);
1997         vmexit->rip = rip;
1998         vmexit->inst_length = 0;
1999         vmexit->exitcode = VM_EXITCODE_RUN_STATE;
2000         vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
2001 }
2002 
2003 /*
2004  * Some vmm resources, such as the lapic, may have CPU-specific resources
2005  * allocated to them which would benefit from migration onto the host CPU which
2006  * is processing the vcpu state.
2007  */
2008 static void
2009 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2010 {
2011         /*
2012          * Localizing cyclic resources requires acquisition of cpu_lock, and
2013          * doing so with kpreempt disabled is a recipe for deadlock disaster.
2014          */
2015         VERIFY(curthread->t_preempt == 0);
2016 
2017         /*
2018          * Do not bother with localization if this vCPU is about to return to
2019          * the host CPU it was last localized to.
2020          */
2021         if (vcpu->lastloccpu == curcpu)
2022                 return;
2023 
2024         /*
2025          * Localize system-wide resources to the primary boot vCPU.  While any
2026          * of the other vCPUs may access them, it keeps the potential interrupt
2027          * footprint constrained to CPUs involved with this instance.
2028          */
2029         if (vcpu == &vm->vcpu[0]) {
2030                 vhpet_localize_resources(vm->vhpet);
2031                 vrtc_localize_resources(vm->vrtc);
2032                 vatpit_localize_resources(vm->vatpit);
2033         }
2034 
2035         vlapic_localize_resources(vcpu->vlapic);
2036 
2037         vcpu->lastloccpu = curcpu;
2038 }
2039 
2040 static void
2041 vmm_savectx(void *arg)
2042 {
2043         vm_thread_ctx_t *vtc = arg;
2044         struct vm *vm = vtc->vtc_vm;
2045         const int vcpuid = vtc->vtc_vcpuid;
2046 
2047         if (ops->vmsavectx != NULL) {
2048                 ops->vmsavectx(vm->cookie, vcpuid);
2049         }
2050 
2051         /*
2052          * Account for going off-cpu, unless the vCPU is idled, in which
2053          * case being off-cpu is the entire point.
2054          */
2055         if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2056                 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2057                 vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2058         }
2059 
2060         /*
2061          * If the CPU holds the restored guest FPU state, save it and restore
2062          * the host FPU state before this thread goes off-cpu.
2063          */
2064         if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2065                 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2066 
2067                 save_guest_fpustate(vcpu);
2068                 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2069         }
2070 }
2071 
2072 static void
2073 vmm_restorectx(void *arg)
2074 {
2075         vm_thread_ctx_t *vtc = arg;
2076         struct vm *vm = vtc->vtc_vm;
2077         const int vcpuid = vtc->vtc_vcpuid;
2078 
2079         /* Complete microstate accounting for vCPU being off-cpu */
2080         if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2081                 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2082         }
2083 
2084         /*
2085          * When coming back on-cpu, only restore the guest FPU status if the
2086          * thread is in a context marked as requiring it.  This should be rare,
2087          * occurring only when a future logic error results in a voluntary
2088          * sleep during the VMRUN critical section.
2089          *
2090          * The common case will result in elision of the guest FPU state
2091          * restoration, deferring that action until it is clearly necessary
2092          * during vm_run.
2093          */
2094         VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2095         if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2096                 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2097 
2098                 restore_guest_fpustate(vcpu);
2099                 vtc->vtc_status |= VTCS_FPU_RESTORED;
2100         }
2101 
2102         if (ops->vmrestorectx != NULL) {
2103                 ops->vmrestorectx(vm->cookie, vcpuid);
2104         }
2106 }
2107 
2108 /*
2109  * If we're in removectx(), we might still have state to tidy up.
2110  */
2111 static void
2112 vmm_freectx(void *arg, int isexec)
2113 {
2114         vmm_savectx(arg);
2115 }
2116 
2117 static int
2118 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2119     struct vm_exit *vme)
2120 {
2121         struct vcpu *vcpu;
2122         struct vie *vie;
2123         int err;
2124 
2125         vcpu = &vm->vcpu[vcpuid];
2126         vie = vcpu->vie_ctx;
2127         err = 0;
2128 
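             /*
              * Apply any completion data supplied by userspace (fulfilled MMIO
              * or in/out results) to the pending instruction emulation before
              * re-entering guest context.
              */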
2129         switch (entry->cmd) {
2130         case VEC_DEFAULT:
2131                 return (0);
2132         case VEC_DISCARD_INSTR:
2133                 vie_reset(vie);
2134                 return (0);
2135         case VEC_FULFILL_MMIO:
2136                 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2137                 if (err == 0) {
2138                         err = vie_emulate_mmio(vie, vm, vcpuid);
2139                         if (err == 0) {
2140                                 vie_advance_pc(vie, &vcpu->nextrip);
2141                         } else if (err < 0) {
2142                                 vie_exitinfo(vie, vme);
2143                         } else if (err == EAGAIN) {
2144                                 /*
2145                                  * Clear the instruction emulation state in
2146                                  * order to re-enter VM context and continue
2147                                  * this 'rep <instruction>'
2148                                  */
2149                                 vie_reset(vie);
2150                                 err = 0;
2151                         }
2152                 }
2153                 break;
2154         case VEC_FULFILL_INOUT:
2155                 err = vie_fulfill_inout(vie, &entry->u.inout);
2156                 if (err == 0) {
2157                         err = vie_emulate_inout(vie, vm, vcpuid);
2158                         if (err == 0) {
2159                                 vie_advance_pc(vie, &vcpu->nextrip);
2160                         } else if (err < 0) {
2161                                 vie_exitinfo(vie, vme);
2162                         } else if (err == EAGAIN) {
2163                                 /*
2164                                  * Clear the instruction emulation state in
2165                                  * order to re-enter VM context and continue
2166                                  * this 'rep ins/outs'
2167                                  */
2168                                 vie_reset(vie);
2169                                 err = 0;
2170                         }
2171                 }
2172                 break;
2173         default:
2174                 return (EINVAL);
2175         }
2176         return (err);
2177 }
2178 
2179 static int
2180 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2181 {
2182         struct vie *vie;
2183 
2184         vie = vm->vcpu[vcpuid].vie_ctx;
2185 
2186         if (vie_pending(vie)) {
2187                 /*
2188                  * Userspace has not fulfilled the pending needs of the
2189                  * instruction emulation, so bail back out.
2190                  */
2191                 vie_exitinfo(vie, vme);
2192                 return (-1);
2193         }
2194 
2195         return (0);
2196 }
2197 
2198 int
2199 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2200 {
2201         int error;
2202         struct vcpu *vcpu;
2203         struct vm_exit *vme;
2204         bool intr_disabled;
2205         pmap_t pmap;
2206         vm_thread_ctx_t vtc;
2207         int affinity_type = CPU_CURRENT;
2208 
2209         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2210                 return (EINVAL);
2211         if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2212                 return (EINVAL);
2213         if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2214                 return (EINVAL);
2215 
2216         pmap = vmspace_pmap(vm->vmspace);
2217         vcpu = &vm->vcpu[vcpuid];
2218         vme = &vcpu->exitinfo;
2219 
2220         vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2221 
2222         vtc.vtc_vm = vm;
2223         vtc.vtc_vcpuid = vcpuid;
2224         vtc.vtc_status = 0;
2225         installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2226             NULL, vmm_freectx, NULL);
2227 
2228         error = vm_entry_actions(vm, vcpuid, entry, vme);
2229         if (error != 0) {
2230                 goto exit;
2231         }
2232 
2233 restart:
2234         error = vm_loop_checks(vm, vcpuid, vme);
2235         if (error != 0) {
2236                 goto exit;
2237         }
2238 
2239         thread_affinity_set(curthread, affinity_type);
2240         /*
2241          * Resource localization should happen after the CPU affinity for the
2242          * thread has been set to ensure that access from restricted contexts,
2243          * such as VMX-accelerated APIC operations, can occur without inducing
2244          * cyclic cross-calls.
2245          *
2246          * This must be done prior to disabling kpreempt via critical_enter().
2247          */
2248         vm_localize_resources(vm, vcpu);
2249         affinity_type = CPU_CURRENT;
2250         critical_enter();
2251 
2252         KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2253             ("vm_run: absurd pm_active"));
2254 
2255         /* Force a trip through update_sregs to reload %fs/%gs and friends */
2256         PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2257 
2258         if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2259                 restore_guest_fpustate(vcpu);
2260                 vtc.vtc_status |= VTCS_FPU_RESTORED;
2261         }
2262         vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2263 
2264         vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2265         error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2266         vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2267 
2268         /*
2269          * Once clear of the delicate contexts comprising the VM_RUN handler,
2270          * thread CPU affinity can be loosened while other processing occurs.
2271          */
2272         vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2273         thread_affinity_clear(curthread);
2274         critical_exit();
2275 
2276         if (error != 0) {
2277                 /* Communicate out any error from VMRUN() above */
2278                 goto exit;
2279         }
2280 
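             /*
              * Unless one of the handlers below adjusts it (e.g. via
              * vie_advance_pc()), the next entry resumes at the instruction
              * following this exit.
              */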
2281         vcpu->nextrip = vme->rip + vme->inst_length;
2282         switch (vme->exitcode) {
2283         case VM_EXITCODE_REQIDLE:
2284                 error = vm_handle_reqidle(vm, vcpuid);
2285                 break;
2286         case VM_EXITCODE_RUN_STATE:
2287                 error = vm_handle_run_state(vm, vcpuid);
2288                 break;
2289         case VM_EXITCODE_SUSPENDED:
2290                 error = vm_handle_suspend(vm, vcpuid);
2291                 break;
2292         case VM_EXITCODE_IOAPIC_EOI:
2293                 vioapic_process_eoi(vm, vcpuid,
2294                     vme->u.ioapic_eoi.vector);
2295                 break;
2296         case VM_EXITCODE_HLT:
2297                 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2298                 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2299                 break;
2300         case VM_EXITCODE_PAGING:
2301                 error = vm_handle_paging(vm, vcpuid);
2302                 break;
2303         case VM_EXITCODE_MMIO_EMUL:
2304                 error = vm_handle_mmio_emul(vm, vcpuid);
2305                 break;
2306         case VM_EXITCODE_INOUT:
2307                 error = vm_handle_inout(vm, vcpuid, vme);
2308                 break;
2309         case VM_EXITCODE_INST_EMUL:
2310                 error = vm_handle_inst_emul(vm, vcpuid);
2311                 break;
2312         case VM_EXITCODE_MONITOR:
2313         case VM_EXITCODE_MWAIT:
2314         case VM_EXITCODE_VMINSN:
2315                 vm_inject_ud(vm, vcpuid);
2316                 break;
2317         case VM_EXITCODE_RDMSR:
2318                 error = vm_handle_rdmsr(vm, vcpuid, vme);
2319                 break;
2320         case VM_EXITCODE_WRMSR:
2321                 error = vm_handle_wrmsr(vm, vcpuid, vme);
2322                 break;
2323         case VM_EXITCODE_HT:
2324                 affinity_type = CPU_BEST;
2325                 break;
2326         case VM_EXITCODE_MTRAP:
2327                 vm_suspend_cpu(vm, vcpuid);
2328                 error = -1;
2329                 break;
2330         default:
2331                 /* handled in userland */
2332                 error = -1;
2333                 break;
2334         }
2335 
2336         if (error == 0) {
2337                 /* VM exit conditions handled in-kernel, continue running */
2338                 goto restart;
2339         }
2340 
2341 exit:
2342         removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2343             NULL, vmm_freectx);
2344 
2345         VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2346 
2347         vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2348         return (error);
2349 }
2350 
2351 int
2352 vm_restart_instruction(void *arg, int vcpuid)
2353 {
2354         struct vm *vm;
2355         struct vcpu *vcpu;
2356         enum vcpu_state state;
2357         uint64_t rip;
2358         int error;
2359 
2360         vm = arg;
2361         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2362                 return (EINVAL);
2363 
2364         vcpu = &vm->vcpu[vcpuid];
2365         state = vcpu_get_state(vm, vcpuid, NULL);
2366         if (state == VCPU_RUNNING) {
2367                 /*
2368                  * When a vcpu is "running" the next instruction is determined
2369                  * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2370                  * Thus setting 'inst_length' to zero will cause the current
2371                  * instruction to be restarted.
2372                  */
2373                 vcpu->exitinfo.inst_length = 0;
2374                 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2375                     "setting inst_length to zero", vcpu->exitinfo.rip);
2376         } else if (state == VCPU_FROZEN) {
2377                 /*
2378                  * When a vcpu is "frozen" it is outside the critical section
2379                  * around VMRUN() and 'nextrip' points to the next instruction.
2380                  * Thus instruction restart is achieved by setting 'nextrip'
2381                  * to the vcpu's %rip.
2382                  */
2383                 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2384                 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2385                 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2386                     "nextrip from %lx to %lx", vcpu->nextrip, rip);
2387                 vcpu->nextrip = rip;
2388         } else {
2389                 panic("%s: invalid state %d", __func__, state);
2390         }
2391         return (0);
2392 }
2393 
2394 int
2395 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2396 {
2397         struct vcpu *vcpu;
2398         int type, vector;
2399 
2400         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2401                 return (EINVAL);
2402 
2403         vcpu = &vm->vcpu[vcpuid];
2404 
2405         if (info & VM_INTINFO_VALID) {
2406                 type = info & VM_INTINFO_TYPE;
2407                 vector = info & 0xff;
2408                 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2409                         return (EINVAL);
2410                 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2411                         return (EINVAL);
2412                 if (info & VM_INTINFO_RSVD)
2413                         return (EINVAL);
2414         } else {
2415                 info = 0;
2416         }
2417         VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2418         vcpu->exitintinfo = info;
2419         return (0);
2420 }
2421 
2422 enum exc_class {
2423         EXC_BENIGN,
2424         EXC_CONTRIBUTORY,
2425         EXC_PAGEFAULT
2426 };
2427 
2428 #define IDT_VE  20      /* Virtualization Exception (Intel specific) */
2429 
2430 static enum exc_class
2431 exception_class(uint64_t info)
2432 {
2433         int type, vector;
2434 
2435         KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2436         type = info & VM_INTINFO_TYPE;
2437         vector = info & 0xff;
2438 
2439         /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2440         switch (type) {
2441         case VM_INTINFO_HWINTR:
2442         case VM_INTINFO_SWINTR:
2443         case VM_INTINFO_NMI:
2444                 return (EXC_BENIGN);
2445         default:
2446                 /*
2447                  * Hardware exception.
2448                  *
2449                  * SVM and VT-x use identical type values to represent NMI,
2450                  * hardware interrupt and software interrupt.
2451                  *
2452                  * SVM uses type '3' for all exceptions. VT-x uses type '3'
2453                  * for exceptions except #BP and #OF. #BP and #OF use a type
2454                  * value of '5' or '6'. Therefore we don't check for explicit
2455                  * values of 'type' to classify 'intinfo' into a hardware
2456                  * exception.
2457                  */
2458                 break;
2459         }
2460 
2461         switch (vector) {
2462         case IDT_PF:
2463         case IDT_VE:
2464                 return (EXC_PAGEFAULT);
2465         case IDT_DE:
2466         case IDT_TS:
2467         case IDT_NP:
2468         case IDT_SS:
2469         case IDT_GP:
2470                 return (EXC_CONTRIBUTORY);
2471         default:
2472                 return (EXC_BENIGN);
2473         }
2474 }
2475 
2476 static int
2477 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2478     uint64_t *retinfo)
2479 {
2480         enum exc_class exc1, exc2;
2481         int type1, vector1;
2482 
2483         KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2484         KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2485 
2486         /*
2487          * If an exception occurs while attempting to call the double-fault
2488          * handler, the processor enters shutdown mode (aka triple fault).
2489          */
2490         type1 = info1 & VM_INTINFO_TYPE;
2491         vector1 = info1 & 0xff;
2492         if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2493                 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2494                     info1, info2);
2495                 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2496                 *retinfo = 0;
2497                 return (0);
2498         }
2499 
2500         /*
2501          * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2502          */
2503         exc1 = exception_class(info1);
2504         exc2 = exception_class(info2);
2505         if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2506             (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2507                 /* Convert nested fault into a double fault. */
2508                 *retinfo = IDT_DF;
2509                 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2510                 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2511         } else {
2512                 /* Handle exceptions serially */
2513                 *retinfo = info2;
2514         }
2515         return (1);
2516 }
2517 
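     /*
      * Pack a pending exception into the 64-bit interrupt-info format used by
      * the injection path: vector in bits 7:0, type and validity flags, and the
      * error code (when valid) in bits 63:32.
      */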
2518 static uint64_t
2519 vcpu_exception_intinfo(struct vcpu *vcpu)
2520 {
2521         uint64_t info = 0;
2522 
2523         if (vcpu->exception_pending) {
2524                 info = vcpu->exc_vector & 0xff;
2525                 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2526                 if (vcpu->exc_errcode_valid) {
2527                         info |= VM_INTINFO_DEL_ERRCODE;
2528                         info |= (uint64_t)vcpu->exc_errcode << 32;
2529                 }
2530         }
2531         return (info);
2532 }
2533 
2534 int
2535 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2536 {
2537         struct vcpu *vcpu;
2538         uint64_t info1, info2;
2539         int valid;
2540 
2541         KASSERT(vcpuid >= 0 &&
2542             vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2543 
2544         vcpu = &vm->vcpu[vcpuid];
2545 
2546         info1 = vcpu->exitintinfo;
2547         vcpu->exitintinfo = 0;
2548 
2549         info2 = 0;
2550         if (vcpu->exception_pending) {
2551                 info2 = vcpu_exception_intinfo(vcpu);
2552                 vcpu->exception_pending = 0;
2553                 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2554                     vcpu->exc_vector, info2);
2555         }
2556 
2557         if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2558                 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2559         } else if (info1 & VM_INTINFO_VALID) {
2560                 *retinfo = info1;
2561                 valid = 1;
2562         } else if (info2 & VM_INTINFO_VALID) {
2563                 *retinfo = info2;
2564                 valid = 1;
2565         } else {
2566                 valid = 0;
2567         }
2568 
2569         if (valid) {
2570                 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2571                     "retinfo(%lx)", __func__, info1, info2, *retinfo);
2572         }
2573 
2574         return (valid);
2575 }
2576 
2577 int
2578 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2579 {
2580         struct vcpu *vcpu;
2581 
2582         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2583                 return (EINVAL);
2584 
2585         vcpu = &vm->vcpu[vcpuid];
2586         *info1 = vcpu->exitintinfo;
2587         *info2 = vcpu_exception_intinfo(vcpu);
2588         return (0);
2589 }
2590 
2591 int
2592 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2593     uint32_t errcode, int restart_instruction)
2594 {
2595         struct vcpu *vcpu;
2596         uint64_t regval;
2597         int error;
2598 
2599         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2600                 return (EINVAL);
2601 
2602         if (vector < 0 || vector >= 32)
2603                 return (EINVAL);
2604 
2605         /*
2606          * NMIs (which bear an exception vector of 2) are to be injected via
2607          * their own specialized path using vm_inject_nmi().
2608          */
2609         if (vector == 2) {
2610                 return (EINVAL);
2611         }
2612 
2613         /*
2614          * A double fault exception should never be injected directly into
2615          * the guest. It is a derived exception that results from specific
2616          * combinations of nested faults.
2617          */
2618         if (vector == IDT_DF)
2619                 return (EINVAL);
2620 
2621         vcpu = &vm->vcpu[vcpuid];
2622 
2623         if (vcpu->exception_pending) {
2624                 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2625                     "pending exception %d", vector, vcpu->exc_vector);
2626                 return (EBUSY);
2627         }
2628 
2629         if (errcode_valid) {
2630                 /*
2631                  * Exceptions don't deliver an error code in real mode.
2632                  */
2633                 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2634                 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2635                 if (!(regval & CR0_PE))
2636                         errcode_valid = 0;
2637         }
2638 
2639         /*
2640          * From section 26.6.1 "Interruptibility State" in Intel SDM:
2641          *
2642          * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2643          * one instruction or incurs an exception.
2644          */
2645         error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2646         KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2647             __func__, error));
2648 
2649         if (restart_instruction)
2650                 vm_restart_instruction(vm, vcpuid);
2651 
2652         vcpu->exception_pending = 1;
2653         vcpu->exc_vector = vector;
2654         vcpu->exc_errcode = errcode;
2655         vcpu->exc_errcode_valid = errcode_valid;
2656         VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2657         return (0);
2658 }
2659 
2660 void
2661 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2662     int errcode)
2663 {
2664         int error;
2665 
2666         error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2667             errcode, 1);
2668         KASSERT(error == 0, ("vm_inject_exception error %d", error));
2669 }
2670 
2671 void
2672 vm_inject_ud(struct vm *vm, int vcpuid)
2673 {
2674         vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2675 }
2676 
2677 void
2678 vm_inject_gp(struct vm *vm, int vcpuid)
2679 {
2680         vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2681 }
2682 
2683 void
2684 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2685 {
2686         vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2687 }
2688 
2689 void
2690 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2691 {
2692         vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2693 }
2694 
2695 void
2696 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2697 {
2698         int error;
2699 
2700         VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2701             error_code, cr2);
2702 
2703         error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2704         KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2705 
2706         vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2707 }
2708 
2709 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2710 
2711 int
2712 vm_inject_nmi(struct vm *vm, int vcpuid)
2713 {
2714         struct vcpu *vcpu;
2715 
2716         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2717                 return (EINVAL);
2718 
2719         vcpu = &vm->vcpu[vcpuid];
2720 
2721         vcpu->nmi_pending = 1;
2722         vcpu_notify_event(vm, vcpuid);
2723         return (0);
2724 }
2725 
2726 int
2727 vm_nmi_pending(struct vm *vm, int vcpuid)
2728 {
2729         struct vcpu *vcpu;
2730 
2731         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2732                 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2733 
2734         vcpu = &vm->vcpu[vcpuid];
2735 
2736         return (vcpu->nmi_pending);
2737 }
2738 
2739 void
2740 vm_nmi_clear(struct vm *vm, int vcpuid)
2741 {
2742         struct vcpu *vcpu;
2743 
2744         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2745                 panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
2746 
2747         vcpu = &vm->vcpu[vcpuid];
2748 
2749         if (vcpu->nmi_pending == 0)
2750                 panic("vm_nmi_clear: inconsistent nmi_pending state");
2751 
2752         vcpu->nmi_pending = 0;
2753         vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2754 }
2755 
2756 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2757 
2758 int
2759 vm_inject_extint(struct vm *vm, int vcpuid)
2760 {
2761         struct vcpu *vcpu;
2762 
2763         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2764                 return (EINVAL);
2765 
2766         vcpu = &vm->vcpu[vcpuid];
2767 
2768         vcpu->extint_pending = 1;
2769         vcpu_notify_event(vm, vcpuid);
2770         return (0);
2771 }
2772 
2773 int
2774 vm_extint_pending(struct vm *vm, int vcpuid)
2775 {
2776         struct vcpu *vcpu;
2777 
2778         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2779                 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2780 
2781         vcpu = &vm->vcpu[vcpuid];
2782 
2783         return (vcpu->extint_pending);
2784 }
2785 
2786 void
2787 vm_extint_clear(struct vm *vm, int vcpuid)
2788 {
2789         struct vcpu *vcpu;
2790 
2791         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2792                 panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
2793 
2794         vcpu = &vm->vcpu[vcpuid];
2795 
2796         if (vcpu->extint_pending == 0)
2797                 panic("vm_extint_clear: inconsistent extint_pending state");
2798 
2799         vcpu->extint_pending = 0;
2800         vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2801 }
2802 
2803 int
2804 vm_inject_init(struct vm *vm, int vcpuid)
2805 {
2806         struct vcpu *vcpu;
2807 
2808         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2809                 return (EINVAL);
2810 
2811         vcpu = &vm->vcpu[vcpuid];
2812         vcpu_lock(vcpu);
2813         vcpu->run_state |= VRS_PEND_INIT;
2814         /*
2815          * As part of queuing the INIT request, clear any pending SIPI.  It
2816          * would not otherwise survive across the reset of the vCPU when it
2817          * undergoes the requested INIT.  We would not want it to linger when it
2818          * could be mistaken for a subsequent (post-INIT) SIPI request.
2819          */
2820         vcpu->run_state &= ~VRS_PEND_SIPI;
2821         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2822 
2823         vcpu_unlock(vcpu);
2824         return (0);
2825 }
2826 
2827 int
2828 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2829 {
2830         struct vcpu *vcpu;
2831 
2832         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2833                 return (EINVAL);
2834 
2835         vcpu = &vm->vcpu[vcpuid];
2836         vcpu_lock(vcpu);
2837         vcpu->run_state |= VRS_PEND_SIPI;
2838         vcpu->sipi_vector = vector;
2839         /* SIPI is only actionable if the CPU is waiting in INIT state */
2840         if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2841                 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2842         }
2843         vcpu_unlock(vcpu);
2844         return (0);
2845 }
2846 
2847 bool
2848 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2849 {
2850         struct vcpu *vcpu;
2851 
2852         ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2853         vcpu = &vm->vcpu[vcpuid];
2854 
2855         /* Of interest: vCPU not in running state or with pending INIT */
2856         return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2857 }
2858 
2859 int
2860 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2861 {
2862         struct seg_desc desc;
2863         const enum vm_reg_name clear_regs[] = {
2864                 VM_REG_GUEST_CR2,
2865                 VM_REG_GUEST_CR3,
2866                 VM_REG_GUEST_CR4,
2867                 VM_REG_GUEST_RAX,
2868                 VM_REG_GUEST_RBX,
2869                 VM_REG_GUEST_RCX,
2870                 VM_REG_GUEST_RSI,
2871                 VM_REG_GUEST_RDI,
2872                 VM_REG_GUEST_RBP,
2873                 VM_REG_GUEST_RSP,
2874                 VM_REG_GUEST_R8,
2875                 VM_REG_GUEST_R9,
2876                 VM_REG_GUEST_R10,
2877                 VM_REG_GUEST_R11,
2878                 VM_REG_GUEST_R12,
2879                 VM_REG_GUEST_R13,
2880                 VM_REG_GUEST_R14,
2881                 VM_REG_GUEST_R15,
2882                 VM_REG_GUEST_DR0,
2883                 VM_REG_GUEST_DR1,
2884                 VM_REG_GUEST_DR2,
2885                 VM_REG_GUEST_DR3,
2886                 VM_REG_GUEST_EFER,
2887         };
2888         const enum vm_reg_name data_segs[] = {
2889                 VM_REG_GUEST_SS,
2890                 VM_REG_GUEST_DS,
2891                 VM_REG_GUEST_ES,
2892                 VM_REG_GUEST_FS,
2893                 VM_REG_GUEST_GS,
2894         };
2895         struct vcpu *vcpu = &vm->vcpu[vcpuid];
2896 
2897         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2898                 return (EINVAL);
2899 
2900         for (uint_t i = 0; i < nitems(clear_regs); i++) {
2901                 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2902         }
2903 
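             /*
              * Architectural reset values: %rflags carries only its reserved
              * bit (bit 1), %rip holds the reset-vector offset (0xfff0), and
              * %cr0 is 0x60000010 (CD and NW asserted, ET set, protection
              * disabled).
              */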
2904         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2905         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2906         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2907 
2908         /*
2909          * The prescribed contents of %rdx differ slightly between the Intel and
2910          * AMD architectural definitions.  The former expects the Extended Model
2911          * in bits 16-19, while the latter expects all of the Family, Model,
2912          * and Stepping to be there.  Common boot ROMs appear to disregard
2913          * this anyway, so we stick with a compromise value similar to what is
2914          * spelled out in the Intel SDM.
2915          */
2916         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2917 
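             /* Debug registers assume their architectural reset values */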
2918         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2919         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2920 
2921         /* CS: Present, R/W, Accessed */
2922         desc.access = 0x0093;
2923         desc.base = 0xffff0000;
2924         desc.limit = 0xffff;
2925         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2926         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2927 
2928         /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2929         desc.access = 0x0093;
2930         desc.base = 0;
2931         desc.limit = 0xffff;
2932         for (uint_t i = 0; i < nitems(data_segs); i++) {
2933                 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2934                 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2935         }
2936 
2937         /* GDTR, IDTR */
2938         desc.base = 0;
2939         desc.limit = 0xffff;
2940         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2941         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2942 
2943         /* LDTR: Present, LDT */
2944         desc.access = 0x0082;
2945         desc.base = 0;
2946         desc.limit = 0xffff;
2947         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2948         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2949 
2950         /* TR: Present, 32-bit TSS */
2951         desc.access = 0x008b;
2952         desc.base = 0;
2953         desc.limit = 0xffff;
2954         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2955         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2956 
2957         vlapic_reset(vm_lapic(vm, vcpuid));
2958 
2959         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2960 
2961         vcpu->exitintinfo = 0;
2962         vcpu->exception_pending = 0;
2963         vcpu->nmi_pending = 0;
2964         vcpu->extint_pending = 0;
2965 
2966         /*
2967          * A CPU reset caused by power-on or system reset clears more state than
2968          * one which is triggered by an INIT IPI.
2969          */
2970         if (!init_only) {
2971                 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2972                 fpu_save_area_reset(vcpu->guestfpu);
2973 
2974                 /* XXX: clear MSRs and other pieces */
2975         }
2976 
2977         return (0);
2978 }
2979 
2980 static int
2981 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2982 {
2983         struct seg_desc desc;
2984 
2985         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2986                 return (EINVAL);
2987 
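             /*
              * A SIPI directs the vCPU to begin real-mode execution at
              * physical address (vector << 12): CS base = vector << 12,
              * CS selector = vector << 8, and %rip = 0.
              */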
2988         /* CS: Present, R/W, Accessed */
2989         desc.access = 0x0093;
2990         desc.base = (uint64_t)vector << 12;
2991         desc.limit = 0xffff;
2992         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2993         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2994             (uint64_t)vector << 8));
2995 
2996         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2997 
2998         return (0);
2999 }
3000 
3001 int
3002 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
3003 {
3004         if (vcpu < 0 || vcpu >= vm->maxcpus)
3005                 return (EINVAL);
3006 
3007         if (type < 0 || type >= VM_CAP_MAX)
3008                 return (EINVAL);
3009 
3010         return (VMGETCAP(vm->cookie, vcpu, type, retval));
3011 }
3012 
3013 int
3014 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3015 {
3016         if (vcpu < 0 || vcpu >= vm->maxcpus)
3017                 return (EINVAL);
3018 
3019         if (type < 0 || type >= VM_CAP_MAX)
3020                 return (EINVAL);
3021 
3022         return (VMSETCAP(vm->cookie, vcpu, type, val));
3023 }
3024 
3025 struct vlapic *
3026 vm_lapic(struct vm *vm, int cpu)
3027 {
3028         return (vm->vcpu[cpu].vlapic);
3029 }
3030 
3031 struct vioapic *
3032 vm_ioapic(struct vm *vm)
3033 {
3034 
3035         return (vm->vioapic);
3036 }
3037 
3038 struct vhpet *
3039 vm_hpet(struct vm *vm)
3040 {
3041 
3042         return (vm->vhpet);
3043 }
3044 
3045 void *
3046 vm_iommu_domain(struct vm *vm)
3047 {
3048 
3049         return (vm->iommu);
3050 }
3051 
3052 int
3053 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3054     bool from_idle)
3055 {
3056         int error;
3057         struct vcpu *vcpu;
3058 
3059         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3060                 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3061 
3062         vcpu = &vm->vcpu[vcpuid];
3063 
3064         vcpu_lock(vcpu);
3065         error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3066         vcpu_unlock(vcpu);
3067 
3068         return (error);
3069 }
3070 
3071 enum vcpu_state
3072 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3073 {
3074         struct vcpu *vcpu;
3075         enum vcpu_state state;
3076 
3077         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3078                 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3079 
3080         vcpu = &vm->vcpu[vcpuid];
3081 
3082         vcpu_lock(vcpu);
3083         state = vcpu->state;
3084         if (hostcpu != NULL)
3085                 *hostcpu = vcpu->hostcpu;
3086         vcpu_unlock(vcpu);
3087 
3088         return (state);
3089 }
3090 
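     /*
      * Compute the TSC offset for a vCPU: the VM-wide boot offset plus any
      * per-vCPU adjustment from guest writes to MSR_TSC, optionally including
      * the delta for the current physical CPU.
      */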
3091 uint64_t
3092 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3093 {
3094         ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3095 
3096         uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3097 
3098         if (phys_adj) {
3099                 /* Include any offset for the current physical CPU too */
3100                 extern hrtime_t tsc_gethrtime_tick_delta(void);
3101                 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3102         }
3103 
3104         return (vcpu_off);
3105 }
3106 
3107 int
3108 vm_activate_cpu(struct vm *vm, int vcpuid)
3109 {
3110 
3111         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3112                 return (EINVAL);
3113 
3114         if (CPU_ISSET(vcpuid, &vm->active_cpus))
3115                 return (EBUSY);
3116 
3117         VCPU_CTR0(vm, vcpuid, "activated");
3118         CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3119         return (0);
3120 }
3121 
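     /*
      * Place a vCPU (or, if vcpuid is -1, every active vCPU) into the debug
      * hold set and kick it so that it exits to userspace with
      * VM_EXITCODE_DEBUG.
      */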
3122 int
3123 vm_suspend_cpu(struct vm *vm, int vcpuid)
3124 {
3125         int i;
3126 
3127         if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3128                 return (EINVAL);
3129 
3130         if (vcpuid == -1) {
3131                 vm->debug_cpus = vm->active_cpus;
3132                 for (i = 0; i < vm->maxcpus; i++) {
3133                         if (CPU_ISSET(i, &vm->active_cpus))
3134                                 vcpu_notify_event(vm, i);
3135                 }
3136         } else {
3137                 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3138                         return (EINVAL);
3139 
3140                 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3141                 vcpu_notify_event(vm, vcpuid);
3142         }
3143         return (0);
3144 }
3145 
3146 int
3147 vm_resume_cpu(struct vm *vm, int vcpuid)
3148 {
3149 
3150         if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3151                 return (EINVAL);
3152 
3153         if (vcpuid == -1) {
3154                 CPU_ZERO(&vm->debug_cpus);
3155         } else {
3156                 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3157                         return (EINVAL);
3158 
3159                 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3160         }
3161         return (0);
3162 }
3163 
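     /*
      * Check for conditions (VM suspend, reqidle request, pending yield, or
      * debug hold) which require bailing out to userspace instead of
      * (re)entering the guest.
      */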
3164 static bool
3165 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3166     uint64_t entry_rip)
3167 {
3168         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3169         struct vm_exit *vme = &vcpu->exitinfo;
3170         bool bail = false;
3171 
3172         ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3173 
3174         if (vm->suspend) {
3175                 if (on_entry) {
3176                         VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3177                             vm->suspend < VM_SUSPEND_LAST);
3178 
3179                         vme->exitcode = VM_EXITCODE_SUSPENDED;
3180                         vme->u.suspended.how = vm->suspend;
3181                 } else {
3182                         /*
3183                          * Handling VM suspend is complicated, so if that
3184                          * condition is detected outside of VM-entry itself,
3185                          * just emit a BOGUS exitcode so we take a lap to pick
3186                          * up the event during an entry and are directed into
3187                          * the vm_handle_suspend() logic.
3188                          */
3189                         vme->exitcode = VM_EXITCODE_BOGUS;
3190                 }
3191                 bail = true;
3192         }
3193         if (vcpu->reqidle) {
3194                 vme->exitcode = VM_EXITCODE_REQIDLE;
3195                 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3196 
3197                 if (!on_entry) {
3198                         /*
3199                          * A reqidle request detected outside of VM-entry can be
3200                          * handled directly by clearing the request (and taking
3201                          * a lap to userspace).
3202                          */
3203                         vcpu_assert_locked(vcpu);
3204                         vcpu->reqidle = 0;
3205                 }
3206                 bail = true;
3207         }
3208         if (vcpu_should_yield(vm, vcpuid)) {
3209                 vme->exitcode = VM_EXITCODE_BOGUS;
3210                 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3211                 bail = true;
3212         }
3213         if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3214                 vme->exitcode = VM_EXITCODE_DEBUG;
3215                 bail = true;
3216         }
3217 
3218         if (bail) {
3219                 if (on_entry) {
3220                         /*
3221                          * If bailing out during VM-entry, the current %rip must
3222                          * be recorded in the exitinfo.
3223                          */
3224                         vme->rip = entry_rip;
3225                 }
3226                 vme->inst_length = 0;
3227         }
3228         return (bail);
3229 }
3230 
3231 static bool
3232 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3233 {
	/*
	 * Bail-out checks done prior to sleeping (in vCPU contexts such as
	 * HLT or wait-for-SIPI) expect that %rip is already populated in the
	 * vm_exit structure, so only the exitcode needs to be modified here.
	 */
3239         return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3240 }
3241 
3242 bool
3243 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3244 {
3245         /*
3246          * Bail-out checks done as part of VM entry require an updated %rip to
3247          * populate the vm_exit struct if any of the conditions of interest are
3248          * matched in the check.
3249          */
3250         return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3251 }
3252 
3253 cpuset_t
3254 vm_active_cpus(struct vm *vm)
3255 {
3256 
3257         return (vm->active_cpus);
3258 }
3259 
3260 cpuset_t
3261 vm_debug_cpus(struct vm *vm)
3262 {
3263 
3264         return (vm->debug_cpus);
3265 }
3266 
3267 cpuset_t
3268 vm_suspended_cpus(struct vm *vm)
3269 {
3270 
3271         return (vm->suspended_cpus);
3272 }
3273 
3274 void *
3275 vcpu_stats(struct vm *vm, int vcpuid)
3276 {
3277 
3278         return (vm->vcpu[vcpuid].stats);
3279 }
3280 
3281 int
3282 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3283 {
3284         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3285                 return (EINVAL);
3286 
3287         *state = vm->vcpu[vcpuid].x2apic_state;
3288 
3289         return (0);
3290 }
3291 
3292 int
3293 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3294 {
3295         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3296                 return (EINVAL);
3297 
3298         if (state >= X2APIC_STATE_LAST)
3299                 return (EINVAL);
3300 
3301         vm->vcpu[vcpuid].x2apic_state = state;
3302 
3303         vlapic_set_x2apic_state(vm, vcpuid, state);
3304 
3305         return (0);
3306 }
3307 
3308 /*
3309  * This function is called to ensure that a vcpu "sees" a pending event
3310  * as soon as possible:
3311  * - If the vcpu thread is sleeping then it is woken up.
3312  * - If the vcpu is running on a different host_cpu then an IPI will be directed
3313  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
3314  */
3315 static void
3316 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3317 {
3318         int hostcpu;
3319 
	ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3321 
3322         hostcpu = vcpu->hostcpu;
3323         if (vcpu->state == VCPU_RUNNING) {
3324                 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3325                 if (hostcpu != curcpu) {
3326                         if (ntype == VCPU_NOTIFY_APIC) {
3327                                 vlapic_post_intr(vcpu->vlapic, hostcpu,
3328                                     vmm_ipinum);
3329                         } else {
3330                                 ipi_cpu(hostcpu, vmm_ipinum);
3331                         }
3332                 } else {
3333                         /*
3334                          * If the 'vcpu' is running on 'curcpu' then it must
3335                          * be sending a notification to itself (e.g. SELF_IPI).
3336                          * The pending event will be picked up when the vcpu
3337                          * transitions back to guest context.
3338                          */
3339                 }
3340         } else {
3341                 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3342                     "with hostcpu %d", vcpu->state, hostcpu));
3343                 if (vcpu->state == VCPU_SLEEPING) {
3344                         cv_signal(&vcpu->vcpu_cv);
3345                 }
3346         }
3347 }
3348 
3349 void
3350 vcpu_notify_event(struct vm *vm, int vcpuid)
3351 {
3352         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3353 
3354         vcpu_lock(vcpu);
3355         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3356         vcpu_unlock(vcpu);
3357 }
3358 
3359 void
3360 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3361 {
3362         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3363 
3364         if (ntype == VCPU_NOTIFY_NONE) {
3365                 return;
3366         }
3367 
3368         vcpu_lock(vcpu);
3369         vcpu_notify_event_locked(vcpu, ntype);
3370         vcpu_unlock(vcpu);
3371 }
3372 
3373 void
3374 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3375 {
3376         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3377         hrtime_t now = gethrtime();
3378 
3379         ASSERT3U(ustate, !=, vcpu->ustate);
3380         ASSERT3S(ustate, <, VU_MAX);
3381         ASSERT3S(ustate, >=, VU_INIT);
3382 
3383         hrtime_t delta = now - vcpu->ustate_when;
3384         vcpu->ustate_total[vcpu->ustate] += delta;
3385 
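	/*
	 * Order the store of the accumulated time ahead of the updates to
	 * ustate_when and ustate below.
	 */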
3386         membar_producer();
3387 
3388         vcpu->ustate_when = now;
3389         vcpu->ustate = ustate;
3390 }
3391 
3392 struct vmspace *
3393 vm_get_vmspace(struct vm *vm)
3394 {
3395 
3396         return (vm->vmspace);
3397 }
3398 
3399 int
3400 vm_apicid2vcpuid(struct vm *vm, int apicid)
3401 {
3402         /*
3403          * XXX apic id is assumed to be numerically identical to vcpu id
3404          */
3405         return (apicid);
3406 }
3407 
3408 struct vatpic *
3409 vm_atpic(struct vm *vm)
3410 {
3411         return (vm->vatpic);
3412 }
3413 
3414 struct vatpit *
3415 vm_atpit(struct vm *vm)
3416 {
3417         return (vm->vatpit);
3418 }
3419 
3420 struct vpmtmr *
3421 vm_pmtmr(struct vm *vm)
3422 {
3423 
3424         return (vm->vpmtmr);
3425 }
3426 
3427 struct vrtc *
3428 vm_rtc(struct vm *vm)
3429 {
3430 
3431         return (vm->vrtc);
3432 }
3433 
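/*
 * Map an x86 segment-register encoding (0 = %es through 5 = %gs) to the
 * corresponding vm_reg_name value.
 */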
3434 enum vm_reg_name
3435 vm_segment_name(int seg)
3436 {
3437         static enum vm_reg_name seg_names[] = {
3438                 VM_REG_GUEST_ES,
3439                 VM_REG_GUEST_CS,
3440                 VM_REG_GUEST_SS,
3441                 VM_REG_GUEST_DS,
3442                 VM_REG_GUEST_FS,
3443                 VM_REG_GUEST_GS
3444         };
3445 
3446         KASSERT(seg >= 0 && seg < nitems(seg_names),
3447             ("%s: invalid segment encoding %d", __func__, seg));
3448         return (seg_names[seg]);
3449 }
3450 
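/*
 * The vm_copyinfo routines below support copying data between guest linear
 * addresses and the kernel when emulating instructions which access guest
 * memory.  A caller first maps the (possibly page-spanning) guest region with
 * vm_copy_setup(), then moves data with vm_copyin()/vm_copyout(), and finally
 * releases the held pages with vm_copy_teardown().
 *
 * Illustrative sketch of a guest-memory read (error handling abbreviated and
 * the surrounding caller context hypothetical):
 *
 *	struct vm_copyinfo copyinfo[2];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vm, vcpuid, &paging, gla, len, PROT_READ,
 *	    copyinfo, nitems(copyinfo), &fault);
 *	if (error == 0 && fault == 0) {
 *		vm_copyin(vm, vcpuid, copyinfo, buf, len);
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 */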
3451 void
3452 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3453     int num_copyinfo)
3454 {
3455         int idx;
3456 
3457         for (idx = 0; idx < num_copyinfo; idx++) {
3458                 if (copyinfo[idx].cookie != NULL)
3459                         vm_gpa_release(copyinfo[idx].cookie);
3460         }
3461         bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3462 }
3463 
3464 int
3465 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3466     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3467     int num_copyinfo, int *fault)
3468 {
3469         int error, idx, nused;
3470         size_t n, off, remaining;
3471         void *hva, *cookie;
3472         uint64_t gpa;
3473 
3474         bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3475 
3476         nused = 0;
3477         remaining = len;
3478         while (remaining > 0) {
3479                 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3480                 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3481                 if (error || *fault)
3482                         return (error);
3483                 off = gpa & PAGE_MASK;
3484                 n = min(remaining, PAGE_SIZE - off);
3485                 copyinfo[nused].gpa = gpa;
3486                 copyinfo[nused].len = n;
3487                 remaining -= n;
3488                 gla += n;
3489                 nused++;
3490         }
3491 
3492         for (idx = 0; idx < nused; idx++) {
3493                 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3494                     copyinfo[idx].len, prot, &cookie);
3495                 if (hva == NULL)
3496                         break;
3497                 copyinfo[idx].hva = hva;
3498                 copyinfo[idx].cookie = cookie;
3499         }
3500 
3501         if (idx != nused) {
3502                 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3503                 return (EFAULT);
3504         } else {
3505                 *fault = 0;
3506                 return (0);
3507         }
3508 }
3509 
3510 void
3511 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3512     size_t len)
3513 {
3514         char *dst;
3515         int idx;
3516 
3517         dst = kaddr;
3518         idx = 0;
3519         while (len > 0) {
3520                 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3521                 len -= copyinfo[idx].len;
3522                 dst += copyinfo[idx].len;
3523                 idx++;
3524         }
3525 }
3526 
3527 void
3528 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3529     struct vm_copyinfo *copyinfo, size_t len)
3530 {
3531         const char *src;
3532         int idx;
3533 
3534         src = kaddr;
3535         idx = 0;
3536         while (len > 0) {
3537                 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3538                 len -= copyinfo[idx].len;
3539                 src += copyinfo[idx].len;
3540                 idx++;
3541         }
3542 }
3543 
/*
 * Return the amount of in-use and wired memory for the VM.  Since these are
 * global stats, only return the values for vCPU 0.
 */
3548 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3549 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3550 
3551 static void
3552 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3553 {
3554 
3555         if (vcpu == 0) {
3556                 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3557                     PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3558         }
3559 }
3560 
3561 static void
3562 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3563 {
3564 
3565         if (vcpu == 0) {
3566                 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3567                     PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3568         }
3569 }
3570 
3571 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3572 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
3573 
3574 int
3575 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3576     uint8_t bytes, uint32_t *val)
3577 {
3578         return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3579 }
3580 
3581 /*
3582  * bhyve-internal interfaces to attach or detach IO port handlers.
3583  * Must be called with VM write lock held for safety.
3584  */
3585 int
3586 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3587     void **cookie)
3588 {
	int err;

	err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3591         if (err == 0) {
3592                 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3593         }
3594         return (err);
}

int
3597 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3598     void **old_arg)
3599 {
3600         uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3601         int err;
3602 
3603         err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3604         if (err == 0) {
3605                 *cookie = NULL;
3606         }
3607         return (err);
3608 }
3609 
3610 /*
3611  * External driver interfaces to attach or detach IO port handlers.
3612  * Must be called with VM write lock held for safety.
3613  */
3614 int
3615 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3616     void *arg, void **cookie)
3617 {
3618         int err;
3619 
3620         if (port == 0) {
3621                 return (EINVAL);
3622         }
3623 
3624         err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3625         if (err == 0) {
3626                 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3627         }
3628         return (err);
}

void
3631 vm_ioport_unhook(struct vm *vm, void **cookie)
3632 {
3633         uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3634         ioport_handler_t old_func;
3635         void *old_arg;
3636         int err;
3637 
3638         err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3639 
3640         /* ioport-hook-using drivers are expected to be well-behaved */
3641         VERIFY0(err);
3642         VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3643 
3644         *cookie = NULL;
3645 }
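
/*
 * Illustrative sketch of driver usage of the hook interfaces above (the
 * handler, argument, and port number are hypothetical):
 *
 *	void *ioport_cookie;
 *
 *	error = vm_ioport_hook(vm, 0x2f8, my_handler, my_arg, &ioport_cookie);
 *	...
 *	vm_ioport_unhook(vm, &ioport_cookie);
 */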
3646 
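/*
 * kstat update handler which publishes the per-vCPU microstate accounting
 * totals maintained by vcpu_ustate_change().
 */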
3647 int
3648 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3649 {
3650         struct vm *vm = ksp->ks_private;
3651         vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3652         const int vcpuid = vvk->vvk_vcpu.value.ui32;
3653         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3654 
3655         ASSERT3U(vcpuid, <, VM_MAXCPU);
3656 
3657         vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3658         vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3659         vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3660         vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3661         vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3662         vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3663 
3664         return (0);
3665 }
3666 
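/*
 * Round the requested length up to whole pages, reserve that amount via
 * arc_virt_machine_reserve(), and record the reservation against the VM.
 */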
3667 int
3668 vm_arc_resv(struct vm *vm, uint64_t len)
3669 {
3670         /* Since we already have the compat macros included, we use those */
3671         size_t pages = (size_t)roundup2(len, PAGE_SIZE) >> PAGE_SHIFT;
3672         int err = 0;
3673 
3674         err = arc_virt_machine_reserve(pages);
3675         if (err != 0)
3676                 return (err);
3677 
3678         vm->arc_resv += pages;
3679         return (0);
3680 }