/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#ifdef __FreeBSD__
#include <machine/cpu.h>
#endif
#include <machine/pcb.h>
#include <machine/smp.h>
#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <sys/vmm_instruction_emul.h>

#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
        /* (o) protects state, run_state, hostcpu, sipi_vector */
        struct mtx      mtx;

        enum vcpu_state state;          /* (o) vcpu state */
        enum vcpu_run_state run_state;  /* (i) vcpu init/sipi/run state */
        kcondvar_t      vcpu_cv;        /* (o) cpu waiter cv */
        kcondvar_t      state_cv;       /* (o) IDLE-transition cv */
        int             hostcpu;        /* (o) vcpu's current host cpu */
        int             lastloccpu;     /* (o) last host cpu localized to */
        int             reqidle;        /* (i) request vcpu to idle */
        struct vlapic   *vlapic;        /* (i) APIC device model */
        enum x2apic_state x2apic_state; /* (i) APIC mode */
        uint64_t        exitintinfo;    /* (i) events pending at VM exit */
        int             nmi_pending;    /* (i) NMI pending */
        int             extint_pending; /* (i) INTR pending */
        int     exception_pending;      /* (i) exception pending */
        int     exc_vector;             /* (x) exception collateral */
        int     exc_errcode_valid;
        uint32_t exc_errcode;
        uint8_t         sipi_vector;    /* (i) SIPI vector */
        struct savefpu  *guestfpu;      /* (a,i) guest fpu state */
        uint64_t        guest_xcr0;     /* (i) guest %xcr0 register */
        void            *stats;         /* (a,i) statistics */
        struct vm_exit  exitinfo;       /* (x) exit reason and collateral */
        uint64_t        nextrip;        /* (x) next instruction to execute */
        struct vie      *vie_ctx;       /* (x) instruction emulation context */
#ifndef __FreeBSD__
        uint64_t        tsc_offset;     /* (x) offset from host TSC */
#endif
};

#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define vcpu_lock_init(v)       mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v)            mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v)          mtx_unlock_spin(&((v)->mtx))
#define vcpu_assert_locked(v)   mtx_assert(&((v)->mtx), MA_OWNED)
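
/*
 * Typical usage of the locking macros above (an illustrative sketch, not
 * taken verbatim from any particular caller):
 *
 *	vcpu_lock(vcpu);
 *	... examine or update vcpu->state / run_state / hostcpu ...
 *	vcpu_unlock(vcpu);
 */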

struct mem_seg {
        size_t  len;
        bool    sysmem;
        struct vm_object *object;
};
#ifdef __FreeBSD__
#define VM_MAX_MEMSEGS  3
#else
#define VM_MAX_MEMSEGS  4
#endif

struct mem_map {
        vm_paddr_t      gpa;
        size_t          len;
        vm_ooffset_t    segoff;
        int             segid;
        int             prot;
        int             flags;
};
#define VM_MAX_MEMMAPS  8

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
        void            *cookie;                /* (i) cpu-specific data */
        void            *iommu;                 /* (x) iommu-specific data */
        struct vhpet    *vhpet;                 /* (i) virtual HPET */
        struct vioapic  *vioapic;               /* (i) virtual ioapic */
        struct vatpic   *vatpic;                /* (i) virtual atpic */
        struct vatpit   *vatpit;                /* (i) virtual atpit */
        struct vpmtmr   *vpmtmr;                /* (i) virtual ACPI PM timer */
        struct vrtc     *vrtc;                  /* (o) virtual RTC */
        volatile cpuset_t active_cpus;          /* (i) active vcpus */
        volatile cpuset_t debug_cpus;           /* (i) vcpus stopped for dbg */
        int             suspend;                /* (i) stop VM execution */
        volatile cpuset_t suspended_cpus;       /* (i) suspended vcpus */
        volatile cpuset_t halted_cpus;          /* (x) cpus in a hard halt */
        struct mem_map  mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
        struct mem_seg  mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
        struct vmspace  *vmspace;               /* (o) guest's address space */
        char            name[VM_MAX_NAMELEN];   /* (o) virtual machine name */
        struct vcpu     vcpu[VM_MAXCPU];        /* (i) guest vcpus */
        /* The following describe the vm cpu topology */
        uint16_t        sockets;                /* (o) num of sockets */
        uint16_t        cores;                  /* (o) num of cores/socket */
        uint16_t        threads;                /* (o) num of threads/core */
        uint16_t        maxcpus;                /* (o) max pluggable cpus */

        struct ioport_config ioports;           /* (o) ioport handling */
};

static int vmm_initialized;


static void
nullop_panic(void)
{
        panic("null vmm operation call");
}

/* Do not allow use of an un-set `ops` to do anything but panic */
static struct vmm_ops vmm_ops_null = {
        .init           = (vmm_init_func_t)nullop_panic,
        .cleanup        = (vmm_cleanup_func_t)nullop_panic,
        .resume         = (vmm_resume_func_t)nullop_panic,
        .vminit         = (vmi_init_func_t)nullop_panic,
        .vmrun          = (vmi_run_func_t)nullop_panic,
        .vmcleanup      = (vmi_cleanup_func_t)nullop_panic,
        .vmgetreg       = (vmi_get_register_t)nullop_panic,
        .vmsetreg       = (vmi_set_register_t)nullop_panic,
        .vmgetdesc      = (vmi_get_desc_t)nullop_panic,
        .vmsetdesc      = (vmi_set_desc_t)nullop_panic,
        .vmgetcap       = (vmi_get_cap_t)nullop_panic,
        .vmsetcap       = (vmi_set_cap_t)nullop_panic,
        .vmspace_alloc  = (vmi_vmspace_alloc)nullop_panic,
        .vmspace_free   = (vmi_vmspace_free)nullop_panic,
        .vlapic_init    = (vmi_vlapic_init)nullop_panic,
        .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
        .vmsavectx      = (vmi_savectx)nullop_panic,
        .vmrestorectx   = (vmi_restorectx)nullop_panic,
};

static struct vmm_ops *ops = &vmm_ops_null;

#define VMM_INIT(num)                   ((*ops->init)(num))
#define VMM_CLEANUP()                   ((*ops->cleanup)())
#define VMM_RESUME()                    ((*ops->resume)())

#define VMINIT(vm, pmap)                ((*ops->vminit)(vm, pmap))
#define VMRUN(vmi, vcpu, rip, pmap) \
        ((*ops->vmrun)(vmi, vcpu, rip, pmap))
#define VMCLEANUP(vmi)                  ((*ops->vmcleanup)(vmi))
#define VMSPACE_ALLOC(min, max)         ((*ops->vmspace_alloc)(min, max))
#define VMSPACE_FREE(vmspace)           ((*ops->vmspace_free)(vmspace))

#define VMGETREG(vmi, vcpu, num, rv)    ((*ops->vmgetreg)(vmi, vcpu, num, rv))
#define VMSETREG(vmi, vcpu, num, val)   ((*ops->vmsetreg)(vmi, vcpu, num, val))
#define VMGETDESC(vmi, vcpu, num, dsc)  ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
#define VMSETDESC(vmi, vcpu, num, dsc)  ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
#define VMGETCAP(vmi, vcpu, num, rv)    ((*ops->vmgetcap)(vmi, vcpu, num, rv))
#define VMSETCAP(vmi, vcpu, num, val)   ((*ops->vmsetcap)(vmi, vcpu, num, val))
#define VLAPIC_INIT(vmi, vcpu)          ((*ops->vlapic_init)(vmi, vcpu))
#define VLAPIC_CLEANUP(vmi, vlapic)     ((*ops->vlapic_cleanup)(vmi, vlapic))

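/*
 * CR0.TS drives lazy-FPU trapping: while TS is set, the next FPU/SIMD
 * instruction raises #NM, letting the kernel intercept stray host FPU use
 * while guest state is loaded in the registers.  clts() clears TS again.
 */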
#define fpu_start_emulating()   load_cr0(rcr0() | CR0_TS)
#define fpu_stop_emulating()    clts()

SDT_PROVIDER_DEFINE(vmm);

static MALLOC_DEFINE(M_VM, "vm", "vm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;

/* IPI vector used for vcpu notifications */
static int vmm_ipinum;

/* Trap into hypervisor on all guest exceptions and reflect them back */
static int trace_guest_exceptions;

static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);

#ifndef __FreeBSD__
static void vm_clear_memseg(struct vm *, int);

/* Flags for vtc_status */
#define VTCS_FPU_RESTORED       1 /* guest FPU restored, host FPU saved */
#define VTCS_FPU_CTX_CRITICAL   2 /* in ctx where FPU restore cannot be lazy */

typedef struct vm_thread_ctx {
        struct vm       *vtc_vm;
        int             vtc_vcpuid;
        uint_t          vtc_status;
} vm_thread_ctx_t;
#endif /* __FreeBSD__ */

#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{

        switch (state) {
        case VCPU_IDLE:
                return ("idle");
        case VCPU_FROZEN:
                return ("frozen");
        case VCPU_RUNNING:
                return ("running");
        case VCPU_SLEEPING:
                return ("sleeping");
        default:
                return ("unknown");
        }
}
#endif

static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
        struct vcpu *vcpu = &vm->vcpu[i];

        VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
        if (destroy) {
                vmm_stat_free(vcpu->stats);
                fpu_save_area_free(vcpu->guestfpu);
                vie_free(vcpu->vie_ctx);
                vcpu->vie_ctx = NULL;
        }
}

static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
        struct vcpu *vcpu;

        KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
            ("vcpu_init: invalid vcpu %d", vcpu_id));

        vcpu = &vm->vcpu[vcpu_id];

        if (create) {
#ifdef __FreeBSD__
                KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
                    "initialized", vcpu_id));
#endif
                vcpu_lock_init(vcpu);
                vcpu->state = VCPU_IDLE;
                vcpu->hostcpu = NOCPU;
#ifndef __FreeBSD__
                vcpu->lastloccpu = NOCPU;
#endif
                vcpu->guestfpu = fpu_save_area_alloc();
                vcpu->stats = vmm_stat_alloc();
                vcpu->vie_ctx = vie_alloc();
        } else {
                vie_reset(vcpu->vie_ctx);
                bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
        }

        vcpu->run_state = VRS_HALT;
        vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
        vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
        vcpu->reqidle = 0;
        vcpu->exitintinfo = 0;
        vcpu->nmi_pending = 0;
        vcpu->extint_pending = 0;
        vcpu->exception_pending = 0;
        vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
        fpu_save_area_reset(vcpu->guestfpu);
        vmm_stat_init(vcpu->stats);
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{

        return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
        struct vcpu *vcpu;

        if (cpuid < 0 || cpuid >= vm->maxcpus)
                panic("vm_exitinfo: invalid cpuid %d", cpuid);

        vcpu = &vm->vcpu[cpuid];

        return (&vcpu->exitinfo);
}

struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
        if (cpuid < 0 || cpuid >= vm->maxcpus)
                panic("vm_vie_ctx: invalid cpuid %d", cpuid);

        return (vm->vcpu[cpuid].vie_ctx);
}

static int
vmm_init(void)
{
        int error;

        vmm_host_state_init();

#ifdef __FreeBSD__
        vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
            &IDTVEC(justreturn));
        if (vmm_ipinum < 0)
                vmm_ipinum = IPI_AST;
#else
        /* We use cpu_poke() for IPIs */
        vmm_ipinum = 0;
#endif

        error = vmm_mem_init();
        if (error)
                return (error);

        if (vmm_is_intel())
                ops = &vmm_ops_intel;
        else if (vmm_is_svm())
                ops = &vmm_ops_amd;
        else
                return (ENXIO);

#ifdef __FreeBSD__
        vmm_resume_p = vmm_resume;
#endif

        return (VMM_INIT(vmm_ipinum));
}

int
vmm_mod_load()
{
        int     error;

        VERIFY(vmm_initialized == 0);

        error = vmm_init();
        if (error == 0)
                vmm_initialized = 1;

        return (error);
}

int
vmm_mod_unload()
{
        int     error;

        VERIFY(vmm_initialized == 1);

        iommu_cleanup();
        error = VMM_CLEANUP();
        if (error)
                return (error);
        vmm_initialized = 0;

        return (0);
}

static void
vm_init(struct vm *vm, bool create)
{
        int i;
#ifndef __FreeBSD__
        uint64_t tsc_off;
#endif

        vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
        vm->iommu = NULL;
        vm->vioapic = vioapic_init(vm);
        vm->vhpet = vhpet_init(vm);
        vm->vatpic = vatpic_init(vm);
        vm->vatpit = vatpit_init(vm);
        vm->vpmtmr = vpmtmr_init(vm);
        if (create)
                vm->vrtc = vrtc_init(vm);

        vm_inout_init(vm, &vm->ioports);

        CPU_ZERO(&vm->active_cpus);
        CPU_ZERO(&vm->debug_cpus);

        vm->suspend = 0;
        CPU_ZERO(&vm->suspended_cpus);

        for (i = 0; i < vm->maxcpus; i++)
                vcpu_init(vm, i, create);

#ifndef __FreeBSD__
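        /*
         * The guest's effective TSC is the host TSC plus this offset, so
         * seeding the offset with the negated current host TSC makes each
         * vcpu's TSC read as (approximately) zero at VM creation.
         */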
        tsc_off = (uint64_t)(-(int64_t)rdtsc());
        for (i = 0; i < vm->maxcpus; i++) {
                vm->vcpu[i].tsc_offset = tsc_off;
        }
#endif /* __FreeBSD__ */
}

/*
 * The default CPU topology is a single thread per package.
 */
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;

int
vm_create(const char *name, struct vm **retvm)
{
        struct vm *vm;
        struct vmspace *vmspace;

        /*
         * If vmm.ko could not be successfully initialized then don't attempt
         * to create the virtual machine.
         */
        if (!vmm_initialized)
                return (ENXIO);

        if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
                return (EINVAL);

        vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
        if (vmspace == NULL)
                return (ENOMEM);

        vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
        strcpy(vm->name, name);
        vm->vmspace = vmspace;

        vm->sockets = 1;
        vm->cores = cores_per_package;       /* XXX backwards compatibility */
        vm->threads = threads_per_core;      /* XXX backwards compatibility */
        vm->maxcpus = VM_MAXCPU;     /* XXX temp to keep code working */

        vm_init(vm, true);

        *retvm = vm;
        return (0);
}
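
/*
 * Create/destroy lifecycle sketch (illustrative only; the VM name is a
 * placeholder and callers normally reach these paths via vmm device ioctls):
 *
 *	struct vm *vm;
 *	if (vm_create("exampleguest", &vm) == 0) {
 *		... configure memory segments, run vcpus ...
 *		vm_destroy(vm);
 *	}
 */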

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
        *sockets = vm->sockets;
        *cores = vm->cores;
        *threads = vm->threads;
        *maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
        return (vm->maxcpus);
}

int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
        if (maxcpus != 0)
                return (EINVAL);        /* XXX remove when supported */
        if ((sockets * cores * threads) > vm->maxcpus)
                return (EINVAL);
        /* XXX need to check sockets * cores * threads == vCPU, how? */
        vm->sockets = sockets;
        vm->cores = cores;
        vm->threads = threads;
        vm->maxcpus = VM_MAXCPU;     /* XXX temp to keep code working */
        return (0);
}
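
/*
 * Example (sketch): vm_set_topology(vm, 2, 2, 1, 0) advertises two sockets
 * with two cores each and one thread per core.  Per the checks above,
 * 'maxcpus' must currently be passed as 0 and the product of the other
 * three parameters may not exceed vm->maxcpus.
 */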

static void
vm_cleanup(struct vm *vm, bool destroy)
{
        struct mem_map *mm;
        int i;

        ppt_unassign_all(vm);

        if (vm->iommu != NULL)
                iommu_destroy_domain(vm->iommu);

        /*
         * Devices which attach their own ioport hooks should be cleaned up
         * first so they can tear down those registrations.
         */
        vpmtmr_cleanup(vm->vpmtmr);

        vm_inout_cleanup(vm, &vm->ioports);

        if (destroy)
                vrtc_cleanup(vm->vrtc);
        else
                vrtc_reset(vm->vrtc);

        vatpit_cleanup(vm->vatpit);
        vhpet_cleanup(vm->vhpet);
        vatpic_cleanup(vm->vatpic);
        vioapic_cleanup(vm->vioapic);

        for (i = 0; i < vm->maxcpus; i++)
                vcpu_cleanup(vm, i, destroy);

        VMCLEANUP(vm->cookie);

        /*
         * System memory is removed from the guest address space only when
         * the VM is destroyed. This is because the mapping remains the same
         * across VM reset.
         *
         * Device memory can be relocated by the guest (e.g. using PCI BARs)
         * so those mappings are removed on a VM reset.
         */
        for (i = 0; i < VM_MAX_MEMMAPS; i++) {
                mm = &vm->mem_maps[i];
                if (destroy || !sysmem_mapping(vm, mm))
                        vm_free_memmap(vm, i);
#ifndef __FreeBSD__
                else {
                        /*
                         * We need to reset the IOMMU flag so this mapping can
                         * be reused when a VM is rebooted. Since the IOMMU
                         * domain has already been destroyed we can just reset
                         * the flag here.
                         */
                        mm->flags &= ~VM_MEMMAP_F_IOMMU;
                }
#endif
        }

        if (destroy) {
                for (i = 0; i < VM_MAX_MEMSEGS; i++)
                        vm_free_memseg(vm, i);

                VMSPACE_FREE(vm->vmspace);
                vm->vmspace = NULL;
        }
#ifndef __FreeBSD__
        else {
                /*
                 * Clear the first memory segment (low mem), old memory contents
                 * could confuse the UEFI firmware.
                 */
                vm_clear_memseg(vm, 0);
        }
#endif
}

void
vm_destroy(struct vm *vm)
{
        vm_cleanup(vm, true);
        free(vm, M_VM);
}

int
vm_reinit(struct vm *vm)
{
        int error;

        /*
         * A virtual machine can be reset only if all vcpus are suspended.
         */
        if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
                vm_cleanup(vm, false);
                vm_init(vm, false);
                error = 0;
        } else {
                error = EBUSY;
        }

        return (error);
}

const char *
vm_name(struct vm *vm)
{
        return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
        vm_object_t obj;

        if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
                return (ENOMEM);
        else
                return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

        vmm_mmio_free(vm->vmspace, gpa, len);
        return (0);
}

/*
 * Return 'true' if 'gpa' is allocated in the guest address space.
 *
 * This function is called in the context of a running vcpu which acts as
 * an implicit lock on 'vm->mem_maps[]'.
 */
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
        struct mem_map *mm;
        int i;

#ifdef INVARIANTS
        int hostcpu, state;
        state = vcpu_get_state(vm, vcpuid, &hostcpu);
        KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
            ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif

        for (i = 0; i < VM_MAX_MEMMAPS; i++) {
                mm = &vm->mem_maps[i];
                if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
                        return (true);          /* 'gpa' is sysmem or devmem */
        }

        if (ppt_is_mmio(vm, gpa))
                return (true);                  /* 'gpa' is pci passthru mmio */

        return (false);
}

int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
        struct mem_seg *seg;
        vm_object_t obj;

#ifndef __FreeBSD__
        extern pgcnt_t get_max_page_get(void);
#endif

        if (ident < 0 || ident >= VM_MAX_MEMSEGS)
                return (EINVAL);

        if (len == 0 || (len & PAGE_MASK))
                return (EINVAL);

#ifndef __FreeBSD__
        if (len > ptob(get_max_page_get()))
                return (EINVAL);
#endif

        seg = &vm->mem_segs[ident];
        if (seg->object != NULL) {
                if (seg->len == len && seg->sysmem == sysmem)
                        return (EEXIST);
                else
                        return (EINVAL);
        }

        obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
        if (obj == NULL)
                return (ENOMEM);

        seg->len = len;
        seg->object = obj;
        seg->sysmem = sysmem;
        return (0);
}

int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
    vm_object_t *objptr)
{
        struct mem_seg *seg;

        if (ident < 0 || ident >= VM_MAX_MEMSEGS)
                return (EINVAL);

        seg = &vm->mem_segs[ident];
        if (len)
                *len = seg->len;
        if (sysmem)
                *sysmem = seg->sysmem;
        if (objptr)
                *objptr = seg->object;
        return (0);
}

#ifndef __FreeBSD__
static void
vm_clear_memseg(struct vm *vm, int ident)
{
        struct mem_seg *seg;

        KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
            ("%s: invalid memseg ident %d", __func__, ident));

        seg = &vm->mem_segs[ident];

        if (seg->object != NULL)
                vm_object_clear(seg->object);
}
#endif

void
vm_free_memseg(struct vm *vm, int ident)
{
        struct mem_seg *seg;

        KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
            ("%s: invalid memseg ident %d", __func__, ident));

        seg = &vm->mem_segs[ident];
        if (seg->object != NULL) {
                vm_object_deallocate(seg->object);
                bzero(seg, sizeof (struct mem_seg));
        }
}

int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
    size_t len, int prot, int flags)
{
        struct mem_seg *seg;
        struct mem_map *m, *map;
        vm_ooffset_t last;
        int i, error;

        if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
                return (EINVAL);

        if (flags & ~VM_MEMMAP_F_WIRED)
                return (EINVAL);

        if (segid < 0 || segid >= VM_MAX_MEMSEGS)
                return (EINVAL);

        seg = &vm->mem_segs[segid];
        if (seg->object == NULL)
                return (EINVAL);

        last = first + len;
        if (first < 0 || first >= last || last > seg->len)
                return (EINVAL);

        if ((gpa | first | last) & PAGE_MASK)
                return (EINVAL);

        map = NULL;
        for (i = 0; i < VM_MAX_MEMMAPS; i++) {
                m = &vm->mem_maps[i];
                if (m->len == 0) {
                        map = m;
                        break;
                }
        }

        if (map == NULL)
                return (ENOSPC);

        error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
            len, 0, VMFS_NO_SPACE, prot, prot, 0);
        if (error != KERN_SUCCESS)
                return (EFAULT);

        vm_object_reference(seg->object);

        if ((flags & VM_MEMMAP_F_WIRED) != 0) {
                error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
                    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
                if (error != KERN_SUCCESS) {
                        vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
                        return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
                            EFAULT);
                }
        }

        map->gpa = gpa;
        map->len = len;
        map->segoff = first;
        map->segid = segid;
        map->prot = prot;
        map->flags = flags;
        return (0);
}
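
/*
 * Sketch of the memseg/memmap pairing (illustrative, not from a specific
 * caller): allocate a sysmem segment and map all of it into the guest
 * physical address space starting at GPA 0:
 *
 *	error = vm_alloc_memseg(vm, 0, len, true);
 *	if (error == 0)
 *		error = vm_mmap_memseg(vm, 0, 0, 0, len, VM_PROT_ALL, 0);
 */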

int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
        struct mem_map *mm, *mmnext;
        int i;

        mmnext = NULL;
        for (i = 0; i < VM_MAX_MEMMAPS; i++) {
                mm = &vm->mem_maps[i];
                if (mm->len == 0 || mm->gpa < *gpa)
                        continue;
                if (mmnext == NULL || mm->gpa < mmnext->gpa)
                        mmnext = mm;
        }

        if (mmnext != NULL) {
                *gpa = mmnext->gpa;
                if (segid)
                        *segid = mmnext->segid;
                if (segoff)
                        *segoff = mmnext->segoff;
                if (len)
                        *len = mmnext->len;
                if (prot)
                        *prot = mmnext->prot;
                if (flags)
                        *flags = mmnext->flags;
                return (0);
        } else {
                return (ENOENT);
        }
}
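
/*
 * To enumerate all mappings, callers start with *gpa = 0 and, after each
 * successful call, advance *gpa past the returned mapping (gpa + len);
 * ENOENT indicates the walk is complete.
 */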

static void
vm_free_memmap(struct vm *vm, int ident)
{
        struct mem_map *mm;
        int error;

        mm = &vm->mem_maps[ident];
        if (mm->len) {
                error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
                    mm->gpa + mm->len);
                KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
                    __func__, error));
                bzero(mm, sizeof (struct mem_map));
        }
}

static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{

        if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
                return (true);
        else
                return (false);
}

vm_paddr_t
vmm_sysmem_maxaddr(struct vm *vm)
{
        struct mem_map *mm;
        vm_paddr_t maxaddr;
        int i;

        maxaddr = 0;
        for (i = 0; i < VM_MAX_MEMMAPS; i++) {
                mm = &vm->mem_maps[i];
                if (sysmem_mapping(vm, mm)) {
                        if (maxaddr < mm->gpa + mm->len)
                                maxaddr = mm->gpa + mm->len;
                }
        }
        return (maxaddr);
}

static void
vm_iommu_modify(struct vm *vm, bool map)
{
        int i, sz;
        vm_paddr_t gpa, hpa;
        struct mem_map *mm;
#ifdef __FreeBSD__
        void *vp, *cookie, *host_domain;
#else
        void *vp, *cookie, *host_domain __unused;
#endif

        sz = PAGE_SIZE;
        host_domain = iommu_host_domain();

        for (i = 0; i < VM_MAX_MEMMAPS; i++) {
                mm = &vm->mem_maps[i];
                if (!sysmem_mapping(vm, mm))
                        continue;

                if (map) {
                        KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
                            ("iommu map found invalid memmap %lx/%lx/%x",
                            mm->gpa, mm->len, mm->flags));
                        if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
                                continue;
                        mm->flags |= VM_MEMMAP_F_IOMMU;
                } else {
                        if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
                                continue;
                        mm->flags &= ~VM_MEMMAP_F_IOMMU;
                        KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
                            ("iommu unmap found invalid memmap %lx/%lx/%x",
                            mm->gpa, mm->len, mm->flags));
                }

                gpa = mm->gpa;
                while (gpa < mm->gpa + mm->len) {
                        vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
                            &cookie);
                        KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
                            vm_name(vm), gpa));

                        vm_gpa_release(cookie);

                        hpa = DMAP_TO_PHYS((uintptr_t)vp);
                        if (map) {
                                iommu_create_mapping(vm->iommu, gpa, hpa, sz);
#ifdef __FreeBSD__
                                iommu_remove_mapping(host_domain, hpa, sz);
#endif
                        } else {
                                iommu_remove_mapping(vm->iommu, gpa, sz);
#ifdef __FreeBSD__
                                iommu_create_mapping(host_domain, hpa, hpa, sz);
#endif
                        }

                        gpa += PAGE_SIZE;
                }
        }

        /*
         * Invalidate the cached translations associated with the domain
         * from which pages were removed.
         */
#ifdef __FreeBSD__
        if (map)
                iommu_invalidate_tlb(host_domain);
        else
                iommu_invalidate_tlb(vm->iommu);
#else
        iommu_invalidate_tlb(vm->iommu);
#endif
}

#define vm_iommu_unmap(vm)      vm_iommu_modify((vm), false)
#define vm_iommu_map(vm)        vm_iommu_modify((vm), true)

int
vm_unassign_pptdev(struct vm *vm, int pptfd)
{
        int error;

        error = ppt_unassign_device(vm, pptfd);
        if (error)
                return (error);

        if (ppt_assigned_devices(vm) == 0)
                vm_iommu_unmap(vm);

        return (0);
}

int
vm_assign_pptdev(struct vm *vm, int pptfd)
{
        int error;
        vm_paddr_t maxaddr;

        /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
        if (ppt_assigned_devices(vm) == 0) {
                KASSERT(vm->iommu == NULL,
                    ("vm_assign_pptdev: iommu must be NULL"));
                maxaddr = vmm_sysmem_maxaddr(vm);
                vm->iommu = iommu_create_domain(maxaddr);
                if (vm->iommu == NULL)
                        return (ENXIO);
                vm_iommu_map(vm);
        }

        error = ppt_assign_device(vm, pptfd);
        return (error);
}

void *
vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
    void **cookie)
{
        int i, count, pageoff;
        struct mem_map *mm;
        vm_page_t m;
#ifdef INVARIANTS
        /*
         * All vcpus are frozen by ioctls that modify the memory map
         * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->mem_maps[]' stability is
         * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
         */
        int state;
        KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
            __func__, vcpuid));
        for (i = 0; i < vm->maxcpus; i++) {
                if (vcpuid != -1 && vcpuid != i)
                        continue;
                state = vcpu_get_state(vm, i, NULL);
                KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
                    __func__, state));
        }
#endif
        pageoff = gpa & PAGE_MASK;
        if (len > PAGE_SIZE - pageoff)
                panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

        count = 0;
        for (i = 0; i < VM_MAX_MEMMAPS; i++) {
                mm = &vm->mem_maps[i];
                if (mm->len == 0) {
                        continue;
                }
                if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
                        count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
                            trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
                        break;
                }
        }

        if (count == 1) {
                *cookie = m;
                return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
        } else {
                *cookie = NULL;
                return (NULL);
        }
}

void
vm_gpa_release(void *cookie)
{
        vm_page_t m = cookie;

        vm_page_unwire(m, PQ_ACTIVE);
}
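
/*
 * Usage sketch for the hold/release pair above:
 *
 *	void *cookie;
 *	void *va = vm_gpa_hold(vm, vcpuid, gpa, len, VM_PROT_READ, &cookie);
 *	if (va != NULL) {
 *		... access up to 'len' bytes (within one page) at 'va' ...
 *		vm_gpa_release(cookie);
 *	}
 */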

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

        if (vcpu < 0 || vcpu >= vm->maxcpus)
                return (EINVAL);

        if (reg >= VM_REG_LAST)
                return (EINVAL);

        return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
        struct vcpu *vcpu;
        int error;

        if (vcpuid < 0 || vcpuid >= vm->maxcpus)
                return (EINVAL);

        if (reg >= VM_REG_LAST)
                return (EINVAL);

        error = VMSETREG(vm->cookie, vcpuid, reg, val);
        if (error || reg != VM_REG_GUEST_RIP)
                return (error);

        /* Set 'nextrip' to match the value of %rip */
        VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
        vcpu = &vm->vcpu[vcpuid];
        vcpu->nextrip = val;
        return (0);
}

static bool
is_descriptor_table(int reg)
{
        switch (reg) {
        case VM_REG_GUEST_IDTR:
        case VM_REG_GUEST_GDTR:
                return (true);
        default:
                return (false);
        }
}

static bool
is_segment_register(int reg)
{
        switch (reg) {
        case VM_REG_GUEST_ES:
        case VM_REG_GUEST_CS:
        case VM_REG_GUEST_SS:
        case VM_REG_GUEST_DS:
        case VM_REG_GUEST_FS:
        case VM_REG_GUEST_GS:
        case VM_REG_GUEST_TR:
        case VM_REG_GUEST_LDTR:
                return (true);
        default:
                return (false);
        }
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
{

        if (vcpu < 0 || vcpu >= vm->maxcpus)
                return (EINVAL);

        if (!is_segment_register(reg) && !is_descriptor_table(reg))
                return (EINVAL);

        return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
{
        if (vcpu < 0 || vcpu >= vm->maxcpus)
                return (EINVAL);

        if (!is_segment_register(reg) && !is_descriptor_table(reg))
                return (EINVAL);

        return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
{
        struct vcpu *vcpu;

        if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
                return (EINVAL);
        }

        vcpu = &vm->vcpu[vcpuid];

        vcpu_lock(vcpu);
        *state = vcpu->run_state;
        *sipi_vec = vcpu->sipi_vector;
        vcpu_unlock(vcpu);

        return (0);
}

int
vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
{
        struct vcpu *vcpu;

        if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
                return (EINVAL);
        }
        if (!VRS_IS_VALID(state)) {
                return (EINVAL);
        }

        vcpu = &vm->vcpu[vcpuid];

        vcpu_lock(vcpu);
        vcpu->run_state = state;
        vcpu->sipi_vector = sipi_vec;
        vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
        vcpu_unlock(vcpu);

        return (0);
}


static void
restore_guest_fpustate(struct vcpu *vcpu)
{

        /* flush host state to the pcb */
        fpuexit(curthread);

        /* restore guest FPU state */
        fpu_stop_emulating();
        fpurestore(vcpu->guestfpu);

        /* restore guest XCR0 if XSAVE is enabled in the host */
        if (rcr4() & CR4_XSAVE)
                load_xcr(0, vcpu->guest_xcr0);

        /*
         * The FPU is now "dirty" with the guest's state so turn on emulation
         * to trap any access to the FPU by the host.
         */
        fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

        if ((rcr0() & CR0_TS) == 0)
                panic("fpu emulation not enabled in host!");

        /* save guest XCR0 and restore host XCR0 */
        if (rcr4() & CR4_XSAVE) {
                vcpu->guest_xcr0 = rxcr(0);
                load_xcr(0, vmm_get_host_xcr0());
        }

        /* save guest FPU state */
        fpu_stop_emulating();
        fpusave(vcpu->guestfpu);
#ifdef __FreeBSD__
        fpu_start_emulating();
#else
        /*
         * When the host state has been restored, we should not re-enable
         * CR0.TS on illumos for eager FPU.
         */
#endif
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
        struct vcpu *vcpu;
        int error;

        vcpu = &vm->vcpu[vcpuid];
        vcpu_assert_locked(vcpu);

        /*
         * State transitions from the vmmdev_ioctl() must always begin from
         * the VCPU_IDLE state. This guarantees that there is only a single
         * ioctl() operating on a vcpu at any point.
         */
        if (from_idle) {
                while (vcpu->state != VCPU_IDLE) {
                        vcpu->reqidle = 1;
                        vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
                        VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
                            "idle requested", vcpu_state2str(vcpu->state));
#ifdef __FreeBSD__
                        msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
#else
                        cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
#endif
                }
        } else {
                KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
                    "vcpu idle state"));
        }

        if (vcpu->state == VCPU_RUNNING) {
                KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
                    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
        } else {
                KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
                    "vcpu that is not running", vcpu->hostcpu));
        }

        /*
         * The following state transitions are allowed:
         * IDLE -> FROZEN -> IDLE
         * FROZEN -> RUNNING -> FROZEN
         * FROZEN -> SLEEPING -> FROZEN
         */
        switch (vcpu->state) {
        case VCPU_IDLE:
        case VCPU_RUNNING:
        case VCPU_SLEEPING:
                error = (newstate != VCPU_FROZEN);
                break;
        case VCPU_FROZEN:
                error = (newstate == VCPU_FROZEN);
                break;
        default:
                error = 1;
                break;
        }

        if (error)
                return (EBUSY);

        VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
            vcpu_state2str(vcpu->state), vcpu_state2str(newstate));

        vcpu->state = newstate;
        if (newstate == VCPU_RUNNING)
                vcpu->hostcpu = curcpu;
        else
                vcpu->hostcpu = NOCPU;

        if (newstate == VCPU_IDLE) {
#ifdef __FreeBSD__
                wakeup(&vcpu->state);
#else
                cv_broadcast(&vcpu->state_cv);
#endif
        }

        return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
        int error;

        if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
                panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
        int error;

        if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
                panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
{
        struct vcpu *vcpu;
        int t, vcpu_halted, vm_halted;
        bool userspace_exit = false;

        KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

        vcpu = &vm->vcpu[vcpuid];
        vcpu_halted = 0;
        vm_halted = 0;

        vcpu_lock(vcpu);
        while (1) {
                /*
                 * Do a final check for pending interrupts (including NMI and
                 * INIT) before putting this thread to sleep.
                 */
                if (vm_nmi_pending(vm, vcpuid))
                        break;
                if (vcpu_run_state_pending(vm, vcpuid))
                        break;
                if (!intr_disabled) {
                        if (vm_extint_pending(vm, vcpuid) ||
                            vlapic_pending_intr(vcpu->vlapic, NULL)) {
                                break;
                        }
                }

                /*
                 * Also check for software events which would cause a wake-up.
                 * This will set the appropriate exitcode directly, rather than
                 * requiring a trip through VM_RUN().
                 */
                if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
                        userspace_exit = true;
                        break;
                }

                /*
                 * Some Linux guests implement "halt" by having all vcpus
                 * execute HLT with interrupts disabled. 'halted_cpus' keeps
                 * track of the vcpus that have entered this state. When all
                 * vcpus enter the halted state the virtual machine is halted.
                 */
                if (intr_disabled) {
                        if (!vcpu_halted && halt_detection_enabled) {
                                vcpu_halted = 1;
                                CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
                        }
                        if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
                                vm_halted = 1;
                                break;
                        }
                }

                t = ticks;
                vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
                (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
                vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
                vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
        }

        if (vcpu_halted)
                CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

        vcpu_unlock(vcpu);

        if (vm_halted)
                vm_suspend(vm, VM_SUSPEND_HALT);

        return (userspace_exit ? -1 : 0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid)
{
        int rv, ftype;
        struct vm_map *map;
        struct vcpu *vcpu;
        struct vm_exit *vme;

        vcpu = &vm->vcpu[vcpuid];
        vme = &vcpu->exitinfo;

        KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
            __func__, vme->inst_length));

        ftype = vme->u.paging.fault_type;
        KASSERT(ftype == VM_PROT_READ ||
            ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
            ("vm_handle_paging: invalid fault_type %d", ftype));

        if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
                rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
                    vme->u.paging.gpa, ftype);
                if (rv == 0) {
                        VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
                            ftype == VM_PROT_READ ? "accessed" : "dirty",
                            vme->u.paging.gpa);
                        goto done;
                }
        }

        map = &vm->vmspace->vm_map;
        rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

        VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
            "ftype = %d", rv, vme->u.paging.gpa, ftype);

        if (rv != KERN_SUCCESS)
                return (EFAULT);
done:
        return (0);
}

int
vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize)
{
        int err = ESRCH;

        if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
                err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
        } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
                err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
        } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
                err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
        }

        return (err);
}

int
vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize)
{
        int err = ESRCH;

        if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
                err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
        } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
                err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
        } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
                err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
        }

        return (err);
}

static int
vm_handle_mmio_emul(struct vm *vm, int vcpuid)
{
        struct vie *vie;
        struct vcpu *vcpu;
        struct vm_exit *vme;
        uint64_t inst_addr;
        int error, fault, cs_d;

        vcpu = &vm->vcpu[vcpuid];
        vme = &vcpu->exitinfo;
        vie = vcpu->vie_ctx;

        KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
            __func__, vme->inst_length));

        inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
        cs_d = vme->u.mmio_emul.cs_d;

        VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
            vme->u.mmio_emul.gpa);

        /* Fetch the faulting instruction */
        if (vie_needs_fetch(vie)) {
                error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
                    &fault);
                if (error != 0) {
                        return (error);
                } else if (fault) {
                        /*
                         * If a fault was encountered during instruction
                         * fetch, it will have asserted that the appropriate
                         * exception be injected at next entry.  No further
                         * work is required.
                         */
1621                         return (0);
1622                 }
1623         }
1624 
1625         if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1626                 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1627                     inst_addr);
1628                 /* Dump (unrecognized) instruction bytes in userspace */
1629                 vie_fallback_exitinfo(vie, vme);
1630                 return (-1);
1631         }
1632         if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1633             vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1634                 /* Decoded GLA does not match GLA from VM exit state */
1635                 vie_fallback_exitinfo(vie, vme);
1636                 return (-1);
1637         }
1638 
1639 repeat:
1640         error = vie_emulate_mmio(vie, vm, vcpuid);
1641         if (error < 0) {
1642                 /*
1643                  * MMIO not handled by any of the in-kernel-emulated devices, so
1644                  * make a trip out to userspace for it.
1645                  */
1646                 vie_exitinfo(vie, vme);
1647         } else if (error == EAGAIN) {
1648                 /*
1649                  * Continue emulating the rep-prefixed instruction, which has
1650                  * not completed its iterations.
1651                  *
1652                  * In case this can be emulated in-kernel and has a high
1653                  * repetition count (causing a tight spin), it should be
1654                  * deferential to yield conditions.
1655                  */
1656                 if (!vcpu_should_yield(vm, vcpuid)) {
1657                         goto repeat;
1658                 } else {
1659                         /*
1660                          * Defer to the contending load by making a trip to
1661                          * userspace with a no-op (BOGUS) exit reason.
1662                          */
1663                         vie_reset(vie);
1664                         vme->exitcode = VM_EXITCODE_BOGUS;
1665                         return (-1);
1666                 }
1667         } else if (error == 0) {
1668                 /* Update %rip now that instruction has been emulated */
1669                 vie_advance_pc(vie, &vcpu->nextrip);
1670         }
1671         return (error);
1672 }
1673 
1674 static int
1675 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1676 {
1677         struct vcpu *vcpu;
1678         struct vie *vie;
1679         int err;
1680 
1681         vcpu = &vm->vcpu[vcpuid];
1682         vie = vcpu->vie_ctx;
1683 
1684 repeat:
1685         err = vie_emulate_inout(vie, vm, vcpuid);
1686 
1687         if (err < 0) {
1688                 /*
1689                  * In/out not handled by any of the in-kernel-emulated devices,
1690                  * so make a trip out to userspace for it.
1691                  */
1692                 vie_exitinfo(vie, vme);
1693                 return (err);
1694         } else if (err == EAGAIN) {
1695                 /*
1696                  * Continue emulating the rep-prefixed ins/outs, which has not
1697                  * completed its iterations.
1698                  *
1699                  * In case this can be emulated in-kernel and has a high
1700                  * repetition count (causing a tight spin), it should be
1701                  * deferential to yield conditions.
1702                  */
1703                 if (!vcpu_should_yield(vm, vcpuid)) {
1704                         goto repeat;
1705                 } else {
1706                         /*
1707                          * Defer to the contending load by making a trip to
1708                          * userspace with a no-op (BOGUS) exit reason.
1709                          */
1710                         vie_reset(vie);
1711                         vme->exitcode = VM_EXITCODE_BOGUS;
1712                         return (-1);
1713                 }
1714         } else if (err != 0) {
1715                 /* Emulation failure.  Bail all the way out to userspace. */
1716                 vme->exitcode = VM_EXITCODE_INST_EMUL;
1717                 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1718                 return (-1);
1719         }
1720 
1721         vie_advance_pc(vie, &vcpu->nextrip);
1722         return (0);
1723 }
1724 
1725 static int
1726 vm_handle_suspend(struct vm *vm, int vcpuid)
1727 {
1728 #ifdef __FreeBSD__
1729         int error, i;
1730         struct vcpu *vcpu;
1731         struct thread *td;
1732 
1733         error = 0;
1734         vcpu = &vm->vcpu[vcpuid];
1735         td = curthread;
1736 #else
1737         int i;
1738         struct vcpu *vcpu;
1739 
1740         vcpu = &vm->vcpu[vcpuid];
1741 #endif
1742 
1743         CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1744 
1745 #ifdef __FreeBSD__
1746         /*
1747          * Wait until all 'active_cpus' have suspended themselves.
1748          *
1749          * Since a VM may be suspended at any time including when one or
1750          * more vcpus are doing a rendezvous we need to call the rendezvous
1751          * handler while we are waiting to prevent a deadlock.
1752          */
1753         vcpu_lock(vcpu);
1754         while (error == 0) {
1755                 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1756                         VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1757                         break;
1758                 }
1759 
1760                 if (vm->rendezvous_func == NULL) {
1761                         VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1762                         vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1763                         msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1764                         vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1765                         if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
1766                                 vcpu_unlock(vcpu);
1767                                 error = thread_check_susp(td, false);
1768                                 vcpu_lock(vcpu);
1769                         }
1770                 } else {
1771                         VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1772                         vcpu_unlock(vcpu);
1773                         error = vm_handle_rendezvous(vm, vcpuid);
1774                         vcpu_lock(vcpu);
1775                 }
1776         }
1777         vcpu_unlock(vcpu);
1778 #else
1779         vcpu_lock(vcpu);
1780         while (1) {
1781                 int rc;
1782 
1783                 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1784                         VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1785                         break;
1786                 }
1787 
1788                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1789                 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1790                     TR_CLOCK_TICK);
1791                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1792 
1793                 /*
1794                  * If the userspace process driving the instance is killed, any
1795                  * vCPUs yet to be marked suspended (because they are not
1796                  * VM_RUN-ing in the kernel presently) will never reach that
1797                  * state.
1798                  *
1799                  * To avoid vm_handle_suspend() getting stuck in the kernel
1800                  * waiting for those vCPUs, offer a bail-out even though it
1801                  * means returning without all vCPUs in a suspended state.
1802                  */
1803                 if (rc <= 0) {
1804                         if ((curproc->p_flag & SEXITING) != 0) {
1805                                 break;
1806                         }
1807                 }
1808         }
1809         vcpu_unlock(vcpu);
1810 
1811 #endif
1812 
1813         /*
1814          * Wakeup the other sleeping vcpus and return to userspace.
1815          */
1816         for (i = 0; i < vm->maxcpus; i++) {
1817                 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1818                         vcpu_notify_event(vm, i);
1819                 }
1820         }
1821 
1822         return (-1);
1823 }
1824 
1825 static int
1826 vm_handle_reqidle(struct vm *vm, int vcpuid)
1827 {
1828         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1829 
1830         vcpu_lock(vcpu);
1831         KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1832         vcpu->reqidle = 0;
1833         vcpu_unlock(vcpu);
1834         return (-1);
1835 }
1836 
1837 static int
1838 vm_handle_run_state(struct vm *vm, int vcpuid)
1839 {
1840         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1841         bool handled = false;
1842 
1843         vcpu_lock(vcpu);
1844         while (1) {
1845                 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1846                         vcpu_unlock(vcpu);
1847                         VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1848                         vcpu_lock(vcpu);
1849 
1850                         vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1851                         vcpu->run_state |= VRS_INIT;
1852                 }
1853 
1854                 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1855                     (VRS_INIT | VRS_PEND_SIPI)) {
1856                         const uint8_t vector = vcpu->sipi_vector;
1857 
1858                         vcpu_unlock(vcpu);
1859                         VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1860                         vcpu_lock(vcpu);
1861 
1862                         vcpu->run_state &= ~VRS_PEND_SIPI;
1863                         vcpu->run_state |= VRS_RUN;
1864                 }
1865 
1866                 /*
1867                  * If the vCPU is now in the running state, there is no need to
1868                  * wait for anything prior to re-entry.
1869                  */
1870                 if ((vcpu->run_state & VRS_RUN) != 0) {
1871                         handled = true;
1872                         break;
1873                 }
1874 
1875                 /*
1876                  * Also check for software events which would cause a wake-up.
1877                  * This will set the appropriate exitcode directly, rather than
1878                  * requiring a trip through VM_RUN().
1879                  */
1880                 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1881                         break;
1882                 }
1883 
1884                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1885                 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1886                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1887         }
1888         vcpu_unlock(vcpu);
1889 
1890         return (handled ? 0 : -1);
1891 }
1892 
1893 #ifndef __FreeBSD__
1894 static int
1895 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1896 {
1897         struct vcpu *cpu = &vm->vcpu[vcpuid];
1898         const uint32_t code = vme->u.msr.code;
1899         const uint64_t val = vme->u.msr.wval;
1900 
1901         switch (code) {
1902         case MSR_TSC:
1903                 cpu->tsc_offset = val - rdtsc();
1904                 return (0);
1905         }
1906 
1907         return (-1);
1908 }
1909 #endif /* __FreeBSD__ */
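
/*
 * A worked example of the offset arithmetic above (values hypothetical):
 * if the guest writes 0 to MSR_TSC while the host TSC reads 1000000, then
 * tsc_offset becomes -1000000.  A guest-visible TSC computed later as
 * rdtsc() + tsc_offset thus appears to have been zeroed at the time of
 * the write.
 */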
1910 
1911 int
1912 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1913 {
1914         int i;
1915 
1916         if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1917                 return (EINVAL);
1918 
1919         if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1920                 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1921                     vm->suspend, how);
1922                 return (EALREADY);
1923         }
1924 
1925         VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1926 
1927         /*
1928          * Notify all active vcpus that they are now suspended.
1929          */
1930         for (i = 0; i < vm->maxcpus; i++) {
1931                 if (CPU_ISSET(i, &vm->active_cpus))
1932                         vcpu_notify_event(vm, i);
1933         }
1934 
1935         return (0);
1936 }
1937 
1938 void
1939 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1940 {
1941         struct vm_exit *vmexit;
1942 
1943         vmexit = vm_exitinfo(vm, vcpuid);
1944         vmexit->rip = rip;
1945         vmexit->inst_length = 0;
1946         vmexit->exitcode = VM_EXITCODE_RUN_STATE;
1947         vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
1948 }
1949 
1950 
1951 #ifndef __FreeBSD__
1952 /*
1953  * Some vmm resources, such as the lapic, may have CPU-specific resources
1954  * allocated to them which would benefit from migration onto the host CPU which
1955  * is processing the vcpu state.
1956  */
1957 static void
1958 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
1959 {
1960         /*
1961          * Localizing cyclic resources requires acquisition of cpu_lock, and
1962          * doing so with kpreempt disabled is a recipe for deadlock disaster.
1963          */
1964         VERIFY(curthread->t_preempt == 0);
1965 
1966         /*
1967          * Do not bother with localization if this vCPU is about to return to
1968          * the host CPU it was last localized to.
1969          */
1970         if (vcpu->lastloccpu == curcpu)
1971                 return;
1972 
1973         /*
1974          * Localize system-wide resources to the primary boot vCPU.  While any
1975          * of the other vCPUs may access them, it keeps the potential interrupt
1976          * footprint constrained to CPUs involved with this instance.
1977          */
1978         if (vcpu == &vm->vcpu[0]) {
1979                 vhpet_localize_resources(vm->vhpet);
1980                 vrtc_localize_resources(vm->vrtc);
1981                 vatpit_localize_resources(vm->vatpit);
1982         }
1983 
1984         vlapic_localize_resources(vcpu->vlapic);
1985 
1986         vcpu->lastloccpu = curcpu;
1987 }
1988 
1989 static void
1990 vmm_savectx(void *arg)
1991 {
1992         vm_thread_ctx_t *vtc = arg;
1993         struct vm *vm = vtc->vtc_vm;
1994         const int vcpuid = vtc->vtc_vcpuid;
1995 
1996         if (ops->vmsavectx != NULL) {
1997                 ops->vmsavectx(vm->cookie, vcpuid);
1998         }
1999 
2000         /*
2001          * If the CPU holds the restored guest FPU state, save it and restore
2002          * the host FPU state before this thread goes off-cpu.
2003          */
2004         if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2005                 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2006 
2007                 save_guest_fpustate(vcpu);
2008                 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2009         }
2010 }
2011 
2012 static void
2013 vmm_restorectx(void *arg)
2014 {
2015         vm_thread_ctx_t *vtc = arg;
2016         struct vm *vm = vtc->vtc_vm;
2017         const int vcpuid = vtc->vtc_vcpuid;
2018 
2019         /*
2020          * When coming back on-cpu, only restore the guest FPU status if the
2021          * thread is in a context marked as requiring it.  This should be rare,
2022          * occurring only when a future logic error results in a voluntary
2023          * sleep during the VMRUN critical section.
2024          *
2025          * The common case will result in elision of the guest FPU state
2026          * restoration, deferring that action until it is clearly necessary
2027          * during vm_run.
2028          */
2029         VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2030         if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2031                 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2032 
2033                 restore_guest_fpustate(vcpu);
2034                 vtc->vtc_status |= VTCS_FPU_RESTORED;
2035         }
2036 
2037         if (ops->vmrestorectx != NULL) {
2038                 ops->vmrestorectx(vm->cookie, vcpuid);
2039         }
2040 
2041 }
2042 
2043 /*
2044  * If we're in removectx(), we might still have state to tidy up.
2045  */
2046 static void
2047 vmm_freectx(void *arg, int isexec)
2048 {
2049         vmm_savectx(arg);
2050 }
2051 
#endif /* __FreeBSD__ */
2053 
2054 static int
2055 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2056     struct vm_exit *vme)
2057 {
2058         struct vcpu *vcpu;
2059         struct vie *vie;
2060         int err;
2061 
2062         vcpu = &vm->vcpu[vcpuid];
2063         vie = vcpu->vie_ctx;
2064         err = 0;
2065 
2066         switch (entry->cmd) {
2067         case VEC_DEFAULT:
2068                 return (0);
2069         case VEC_DISCARD_INSTR:
2070                 vie_reset(vie);
2071                 return (0);
2072         case VEC_FULFILL_MMIO:
2073                 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2074                 if (err == 0) {
2075                         err = vie_emulate_mmio(vie, vm, vcpuid);
2076                         if (err == 0) {
2077                                 vie_advance_pc(vie, &vcpu->nextrip);
2078                         } else if (err < 0) {
2079                                 vie_exitinfo(vie, vme);
2080                         } else if (err == EAGAIN) {
2081                                 /*
2082                                  * Clear the instruction emulation state in
2083                                  * order to re-enter VM context and continue
2084                                  * this 'rep <instruction>'
2085                                  */
2086                                 vie_reset(vie);
2087                                 err = 0;
2088                         }
2089                 }
2090                 break;
2091         case VEC_FULFILL_INOUT:
2092                 err = vie_fulfill_inout(vie, &entry->u.inout);
2093                 if (err == 0) {
2094                         err = vie_emulate_inout(vie, vm, vcpuid);
2095                         if (err == 0) {
2096                                 vie_advance_pc(vie, &vcpu->nextrip);
2097                         } else if (err < 0) {
2098                                 vie_exitinfo(vie, vme);
2099                         } else if (err == EAGAIN) {
2100                                 /*
2101                                  * Clear the instruction emulation state in
2102                                  * order to re-enter VM context and continue
2103                                  * this 'rep ins/outs'
2104                                  */
2105                                 vie_reset(vie);
2106                                 err = 0;
2107                         }
2108                 }
2109                 break;
2110         default:
2111                 return (EINVAL);
2112         }
2113         return (err);
2114 }
2115 
2116 static int
2117 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2118 {
2119         struct vie *vie;
2120 
2121         vie = vm->vcpu[vcpuid].vie_ctx;
2122 
2123         if (vie_pending(vie)) {
2124                 /*
2125                  * Userspace has not fulfilled the pending needs of the
2126                  * instruction emulation, so bail back out.
2127                  */
2128                 vie_exitinfo(vie, vme);
2129                 return (-1);
2130         }
2131 
2132         return (0);
2133 }
2134 
2135 int
2136 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2137 {
2138         int error;
2139         struct vcpu *vcpu;
2140 #ifdef  __FreeBSD__
2141         struct pcb *pcb;
2142 #endif
2143         uint64_t tscval;
2144         struct vm_exit *vme;
2145         bool intr_disabled;
2146         pmap_t pmap;
2147 #ifndef __FreeBSD__
2148         vm_thread_ctx_t vtc;
2149         int affinity_type = CPU_CURRENT;
2150 #endif
2151 
2152         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2153                 return (EINVAL);
2154 
2155         if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2156                 return (EINVAL);
2157 
2158         if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2159                 return (EINVAL);
2160 
2161         pmap = vmspace_pmap(vm->vmspace);
2162         vcpu = &vm->vcpu[vcpuid];
2163         vme = &vcpu->exitinfo;
2164 
2165 #ifndef __FreeBSD__
2166         vtc.vtc_vm = vm;
2167         vtc.vtc_vcpuid = vcpuid;
2168         vtc.vtc_status = 0;
2169 
2170         installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2171             NULL, vmm_freectx);
2172 #endif
2173 
2174         error = vm_entry_actions(vm, vcpuid, entry, vme);
2175         if (error != 0) {
2176                 goto exit;
2177         }
2178 
2179 restart:
2180         error = vm_loop_checks(vm, vcpuid, vme);
2181         if (error != 0) {
2182                 goto exit;
2183         }
2184 
2185 #ifndef __FreeBSD__
2186         thread_affinity_set(curthread, affinity_type);
2187         /*
2188          * Resource localization should happen after the CPU affinity for the
2189          * thread has been set to ensure that access from restricted contexts,
2190          * such as VMX-accelerated APIC operations, can occur without inducing
2191          * cyclic cross-calls.
2192          *
2193          * This must be done prior to disabling kpreempt via critical_enter().
2194          */
2195         vm_localize_resources(vm, vcpu);
2196 
2197         affinity_type = CPU_CURRENT;
2198 #endif
2199 
2200         critical_enter();
2201 
2202         KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2203             ("vm_run: absurd pm_active"));
2204 
2205         tscval = rdtsc();
2206 
2207 #ifdef  __FreeBSD__
2208         pcb = PCPU_GET(curpcb);
2209         set_pcb_flags(pcb, PCB_FULL_IRET);
2210 #else
2211         /* Force a trip through update_sregs to reload %fs/%gs and friends */
2212         PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2213 #endif
2214 
2215 #ifdef  __FreeBSD__
2216         restore_guest_fpustate(vcpu);
2217 #else
2218         if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2219                 restore_guest_fpustate(vcpu);
2220                 vtc.vtc_status |= VTCS_FPU_RESTORED;
2221         }
2222         vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2223 #endif
2224 
2225         vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2226         error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2227         vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2228 
2229 #ifdef  __FreeBSD__
2230         save_guest_fpustate(vcpu);
2231 #else
2232         vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2233 #endif
2234 
2235 #ifndef __FreeBSD__
2236         /*
2237          * Once clear of the delicate contexts comprising the VM_RUN handler,
2238          * thread CPU affinity can be loosened while other processing occurs.
2239          */
2240         thread_affinity_clear(curthread);
2241 #endif
2242 
2243         vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
2244 
2245         critical_exit();
2246 
2247         if (error != 0) {
2248                 /* Communicate out any error from VMRUN() above */
2249                 goto exit;
2250         }
2251 
2252         vcpu->nextrip = vme->rip + vme->inst_length;
2253         switch (vme->exitcode) {
2254         case VM_EXITCODE_REQIDLE:
2255                 error = vm_handle_reqidle(vm, vcpuid);
2256                 break;
2257         case VM_EXITCODE_RUN_STATE:
2258                 error = vm_handle_run_state(vm, vcpuid);
2259                 break;
2260         case VM_EXITCODE_SUSPENDED:
2261                 error = vm_handle_suspend(vm, vcpuid);
2262                 break;
2263         case VM_EXITCODE_IOAPIC_EOI:
2264                 vioapic_process_eoi(vm, vcpuid,
2265                     vme->u.ioapic_eoi.vector);
2266                 break;
2267         case VM_EXITCODE_HLT:
2268                 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2269                 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2270                 break;
2271         case VM_EXITCODE_PAGING:
2272                 error = vm_handle_paging(vm, vcpuid);
2273                 break;
2274         case VM_EXITCODE_MMIO_EMUL:
2275                 error = vm_handle_mmio_emul(vm, vcpuid);
2276                 break;
2277         case VM_EXITCODE_INOUT:
2278                 error = vm_handle_inout(vm, vcpuid, vme);
2279                 break;
2280         case VM_EXITCODE_MONITOR:
2281         case VM_EXITCODE_MWAIT:
2282         case VM_EXITCODE_VMINSN:
2283                 vm_inject_ud(vm, vcpuid);
2284                 break;
2285 #ifndef __FreeBSD__
2286         case VM_EXITCODE_WRMSR:
2287                 if (vm_handle_wrmsr(vm, vcpuid, vme) != 0) {
2288                         error = -1;
2289                 }
2290                 break;
2291 
2292         case VM_EXITCODE_HT: {
2293                 affinity_type = CPU_BEST;
2294                 break;
2295         }
2296 #endif
2297 
2298         case VM_EXITCODE_MTRAP:
2299                 vm_suspend_cpu(vm, vcpuid);
2300                 error = -1;
2301                 break;
2302         default:
2303                 /* handled in userland */
2304                 error = -1;
2305                 break;
2306         }
2307 
2308         if (error == 0) {
2309                 /* VM exit conditions handled in-kernel, continue running */
2310                 goto restart;
2311         }
2312 
2313 exit:
2314 #ifndef __FreeBSD__
2315         removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2316             NULL, vmm_freectx);
2317 #endif
2318 
2319         VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2320 
2321         return (error);
2322 }
2323 
2324 int
2325 vm_restart_instruction(void *arg, int vcpuid)
2326 {
2327         struct vm *vm;
2328         struct vcpu *vcpu;
2329         enum vcpu_state state;
2330         uint64_t rip;
2331         int error;
2332 
2333         vm = arg;
2334         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2335                 return (EINVAL);
2336 
2337         vcpu = &vm->vcpu[vcpuid];
2338         state = vcpu_get_state(vm, vcpuid, NULL);
2339         if (state == VCPU_RUNNING) {
2340                 /*
2341                  * When a vcpu is "running" the next instruction is determined
2342                  * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2343                  * Thus setting 'inst_length' to zero will cause the current
2344                  * instruction to be restarted.
2345                  */
2346                 vcpu->exitinfo.inst_length = 0;
2347                 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2348                     "setting inst_length to zero", vcpu->exitinfo.rip);
2349         } else if (state == VCPU_FROZEN) {
2350                 /*
2351                  * When a vcpu is "frozen" it is outside the critical section
2352                  * around VMRUN() and 'nextrip' points to the next instruction.
2353                  * Thus instruction restart is achieved by setting 'nextrip'
2354                  * to the vcpu's %rip.
2355                  */
2356                 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2357                 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2358                 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2359                     "nextrip from %lx to %lx", vcpu->nextrip, rip);
2360                 vcpu->nextrip = rip;
2361         } else {
2362                 panic("%s: invalid state %d", __func__, state);
2363         }
2364         return (0);
2365 }
2366 
2367 int
2368 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2369 {
2370         struct vcpu *vcpu;
2371         int type, vector;
2372 
2373         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2374                 return (EINVAL);
2375 
2376         vcpu = &vm->vcpu[vcpuid];
2377 
2378         if (info & VM_INTINFO_VALID) {
2379                 type = info & VM_INTINFO_TYPE;
2380                 vector = info & 0xff;
2381                 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2382                         return (EINVAL);
2383                 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2384                         return (EINVAL);
2385                 if (info & VM_INTINFO_RSVD)
2386                         return (EINVAL);
2387         } else {
2388                 info = 0;
2389         }
2390         VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2391         vcpu->exitintinfo = info;
2392         return (0);
2393 }
2394 
2395 enum exc_class {
2396         EXC_BENIGN,
2397         EXC_CONTRIBUTORY,
2398         EXC_PAGEFAULT
2399 };
2400 
2401 #define IDT_VE  20      /* Virtualization Exception (Intel specific) */
2402 
2403 static enum exc_class
2404 exception_class(uint64_t info)
2405 {
2406         int type, vector;
2407 
2408         KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2409         type = info & VM_INTINFO_TYPE;
2410         vector = info & 0xff;
2411 
2412         /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2413         switch (type) {
2414         case VM_INTINFO_HWINTR:
2415         case VM_INTINFO_SWINTR:
2416         case VM_INTINFO_NMI:
2417                 return (EXC_BENIGN);
2418         default:
2419                 /*
2420                  * Hardware exception.
2421                  *
2422                  * SVM and VT-x use identical type values to represent NMI,
2423                  * hardware interrupt and software interrupt.
2424                  *
2425                  * SVM uses type '3' for all exceptions. VT-x uses type '3'
2426                  * for exceptions except #BP and #OF. #BP and #OF use a type
2427                  * value of '5' or '6'. Therefore we don't check for explicit
2428                  * values of 'type' to classify 'intinfo' into a hardware
2429                  * exception.
2430                  */
2431                 break;
2432         }
2433 
2434         switch (vector) {
2435         case IDT_PF:
2436         case IDT_VE:
2437                 return (EXC_PAGEFAULT);
2438         case IDT_DE:
2439         case IDT_TS:
2440         case IDT_NP:
2441         case IDT_SS:
2442         case IDT_GP:
2443                 return (EXC_CONTRIBUTORY);
2444         default:
2445                 return (EXC_BENIGN);
2446         }
2447 }
2448 
2449 static int
2450 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2451     uint64_t *retinfo)
2452 {
2453         enum exc_class exc1, exc2;
2454         int type1, vector1;
2455 
2456         KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2457         KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2458 
2459         /*
2460          * If an exception occurs while attempting to call the double-fault
2461          * handler the processor enters shutdown mode (aka triple fault).
2462          */
2463         type1 = info1 & VM_INTINFO_TYPE;
2464         vector1 = info1 & 0xff;
2465         if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2466                 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2467                     info1, info2);
2468                 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2469                 *retinfo = 0;
2470                 return (0);
2471         }
2472 
2473         /*
2474          * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2475          */
2476         exc1 = exception_class(info1);
2477         exc2 = exception_class(info2);
2478         if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2479             (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2480                 /* Convert nested fault into a double fault. */
2481                 *retinfo = IDT_DF;
2482                 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2483                 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2484         } else {
2485                 /* Handle exceptions serially */
2486                 *retinfo = info2;
2487         }
2488         return (1);
2489 }
2490 
2491 static uint64_t
2492 vcpu_exception_intinfo(struct vcpu *vcpu)
2493 {
2494         uint64_t info = 0;
2495 
2496         if (vcpu->exception_pending) {
2497                 info = vcpu->exc_vector & 0xff;
2498                 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2499                 if (vcpu->exc_errcode_valid) {
2500                         info |= VM_INTINFO_DEL_ERRCODE;
2501                         info |= (uint64_t)vcpu->exc_errcode << 32;
2502                 }
2503         }
2504         return (info);
2505 }
2506 
2507 int
2508 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2509 {
2510         struct vcpu *vcpu;
2511         uint64_t info1, info2;
2512         int valid;
2513 
2514         KASSERT(vcpuid >= 0 &&
2515             vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2516 
2517         vcpu = &vm->vcpu[vcpuid];
2518 
2519         info1 = vcpu->exitintinfo;
2520         vcpu->exitintinfo = 0;
2521 
2522         info2 = 0;
2523         if (vcpu->exception_pending) {
2524                 info2 = vcpu_exception_intinfo(vcpu);
2525                 vcpu->exception_pending = 0;
2526                 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2527                     vcpu->exc_vector, info2);
2528         }
2529 
2530         if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2531                 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2532         } else if (info1 & VM_INTINFO_VALID) {
2533                 *retinfo = info1;
2534                 valid = 1;
2535         } else if (info2 & VM_INTINFO_VALID) {
2536                 *retinfo = info2;
2537                 valid = 1;
2538         } else {
2539                 valid = 0;
2540         }
2541 
2542         if (valid) {
2543                 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2544                     "retinfo(%lx)", __func__, info1, info2, *retinfo);
2545         }
2546 
2547         return (valid);
2548 }
2549 
2550 int
2551 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2552 {
2553         struct vcpu *vcpu;
2554 
2555         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2556                 return (EINVAL);
2557 
2558         vcpu = &vm->vcpu[vcpuid];
2559         *info1 = vcpu->exitintinfo;
2560         *info2 = vcpu_exception_intinfo(vcpu);
2561         return (0);
2562 }
2563 
2564 int
2565 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2566     uint32_t errcode, int restart_instruction)
2567 {
2568         struct vcpu *vcpu;
2569         uint64_t regval;
2570         int error;
2571 
2572         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2573                 return (EINVAL);
2574 
2575         if (vector < 0 || vector >= 32)
2576                 return (EINVAL);
2577 
2578         /*
2579          * NMIs (which bear an exception vector of 2) are to be injected via
2580          * their own specialized path using vm_inject_nmi().
2581          */
2582         if (vector == 2) {
2583                 return (EINVAL);
2584         }
2585 
2586         /*
2587          * A double fault exception should never be injected directly into
2588          * the guest. It is a derived exception that results from specific
2589          * combinations of nested faults.
2590          */
2591         if (vector == IDT_DF)
2592                 return (EINVAL);
2593 
2594         vcpu = &vm->vcpu[vcpuid];
2595 
2596         if (vcpu->exception_pending) {
2597                 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2598                     "pending exception %d", vector, vcpu->exc_vector);
2599                 return (EBUSY);
2600         }
2601 
2602         if (errcode_valid) {
2603                 /*
2604                  * Exceptions don't deliver an error code in real mode.
2605                  */
2606                 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2607                 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2608                 if (!(regval & CR0_PE))
2609                         errcode_valid = 0;
2610         }
2611 
2612         /*
2613          * From section 26.6.1 "Interruptibility State" in Intel SDM:
2614          *
2615          * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2616          * one instruction or incurs an exception.
2617          */
2618         error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2619         KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2620             __func__, error));
2621 
2622         if (restart_instruction)
2623                 vm_restart_instruction(vm, vcpuid);
2624 
2625         vcpu->exception_pending = 1;
2626         vcpu->exc_vector = vector;
2627         vcpu->exc_errcode = errcode;
2628         vcpu->exc_errcode_valid = errcode_valid;
2629         VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2630         return (0);
2631 }
2632 
2633 void
2634 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2635     int errcode)
2636 {
2637         int error;
2638 
2639         error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2640             errcode, 1);
2641         KASSERT(error == 0, ("vm_inject_exception error %d", error));
2642 }
2643 
2644 void
2645 vm_inject_ud(struct vm *vm, int vcpuid)
2646 {
2647         vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2648 }
2649 
2650 void
2651 vm_inject_gp(struct vm *vm, int vcpuid)
2652 {
2653         vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2654 }
2655 
2656 void
2657 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2658 {
2659         vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2660 }
2661 
2662 void
2663 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2664 {
2665         vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2666 }
2667 
2668 void
2669 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2670 {
2671         int error;
2672 
2673         VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2674             error_code, cr2);
2675 
2676         error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2677         KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2678 
2679         vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2680 }
2681 
2682 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2683 
2684 int
2685 vm_inject_nmi(struct vm *vm, int vcpuid)
2686 {
2687         struct vcpu *vcpu;
2688 
2689         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2690                 return (EINVAL);
2691 
2692         vcpu = &vm->vcpu[vcpuid];
2693 
2694         vcpu->nmi_pending = 1;
2695         vcpu_notify_event(vm, vcpuid);
2696         return (0);
2697 }
2698 
2699 int
2700 vm_nmi_pending(struct vm *vm, int vcpuid)
2701 {
2702         struct vcpu *vcpu;
2703 
2704         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2705                 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2706 
2707         vcpu = &vm->vcpu[vcpuid];
2708 
2709         return (vcpu->nmi_pending);
2710 }
2711 
2712 void
2713 vm_nmi_clear(struct vm *vm, int vcpuid)
2714 {
2715         struct vcpu *vcpu;
2716 
2717         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2718                 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2719 
2720         vcpu = &vm->vcpu[vcpuid];
2721 
2722         if (vcpu->nmi_pending == 0)
2723                 panic("vm_nmi_clear: inconsistent nmi_pending state");
2724 
2725         vcpu->nmi_pending = 0;
2726         vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2727 }
2728 
2729 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2730 
2731 int
2732 vm_inject_extint(struct vm *vm, int vcpuid)
2733 {
2734         struct vcpu *vcpu;
2735 
2736         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2737                 return (EINVAL);
2738 
2739         vcpu = &vm->vcpu[vcpuid];
2740 
2741         vcpu->extint_pending = 1;
2742         vcpu_notify_event(vm, vcpuid);
2743         return (0);
2744 }
2745 
2746 int
2747 vm_extint_pending(struct vm *vm, int vcpuid)
2748 {
2749         struct vcpu *vcpu;
2750 
2751         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2752                 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2753 
2754         vcpu = &vm->vcpu[vcpuid];
2755 
2756         return (vcpu->extint_pending);
2757 }
2758 
2759 void
2760 vm_extint_clear(struct vm *vm, int vcpuid)
2761 {
2762         struct vcpu *vcpu;
2763 
2764         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2765                 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2766 
2767         vcpu = &vm->vcpu[vcpuid];
2768 
2769         if (vcpu->extint_pending == 0)
2770                 panic("vm_extint_clear: inconsistent extint_pending state");
2771 
2772         vcpu->extint_pending = 0;
2773         vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2774 }
2775 
2776 int
2777 vm_inject_init(struct vm *vm, int vcpuid)
2778 {
2779         struct vcpu *vcpu;
2780 
2781         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2782                 return (EINVAL);
2783 
2784         vcpu = &vm->vcpu[vcpuid];
2785         vcpu_lock(vcpu);
2786         vcpu->run_state |= VRS_PEND_INIT;
2787         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2788         vcpu_unlock(vcpu);
2789         return (0);
2790 }
2791 
2792 int
2793 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2794 {
2795         struct vcpu *vcpu;
2796 
2797         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2798                 return (EINVAL);
2799 
2800         vcpu = &vm->vcpu[vcpuid];
2801         vcpu_lock(vcpu);
2802         vcpu->run_state |= VRS_PEND_SIPI;
2803         vcpu->sipi_vector = vector;
2804         /* SIPI is only actionable if the CPU is waiting in INIT state */
2805         if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2806                 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2807         }
2808         vcpu_unlock(vcpu);
2809         return (0);
2810 }
2811 
2812 bool
2813 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2814 {
2815         struct vcpu *vcpu;
2816 
2817         ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2818         vcpu = &vm->vcpu[vcpuid];
2819 
2820         /* Of interest: vCPU not in running state or with pending INIT */
2821         return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2822 }
2823 
2824 int
2825 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2826 {
2827         struct seg_desc desc;
2828         const enum vm_reg_name clear_regs[] = {
2829                 VM_REG_GUEST_CR2,
2830                 VM_REG_GUEST_CR3,
2831                 VM_REG_GUEST_CR4,
2832                 VM_REG_GUEST_RAX,
2833                 VM_REG_GUEST_RBX,
2834                 VM_REG_GUEST_RCX,
2835                 VM_REG_GUEST_RSI,
2836                 VM_REG_GUEST_RDI,
2837                 VM_REG_GUEST_RBP,
2838                 VM_REG_GUEST_RSP,
2839                 VM_REG_GUEST_R8,
2840                 VM_REG_GUEST_R9,
2841                 VM_REG_GUEST_R10,
2842                 VM_REG_GUEST_R11,
2843                 VM_REG_GUEST_R12,
2844                 VM_REG_GUEST_R13,
2845                 VM_REG_GUEST_R14,
2846                 VM_REG_GUEST_R15,
2847                 VM_REG_GUEST_DR0,
2848                 VM_REG_GUEST_DR1,
2849                 VM_REG_GUEST_DR2,
2850                 VM_REG_GUEST_DR3,
2851                 VM_REG_GUEST_EFER,
2852         };
2853         const enum vm_reg_name data_segs[] = {
2854                 VM_REG_GUEST_SS,
2855                 VM_REG_GUEST_DS,
2856                 VM_REG_GUEST_ES,
2857                 VM_REG_GUEST_FS,
2858                 VM_REG_GUEST_GS,
2859         };
2860         struct vcpu *vcpu = &vm->vcpu[vcpuid];
2861 
2862         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2863                 return (EINVAL);
2864 
2865         for (uint_t i = 0; i < nitems(clear_regs); i++) {
2866                 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2867         }
2868 
2869         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2870         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2871         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2872 
2873         /*
2874          * The prescribed contents of %rdx differ slightly between the Intel and
2875          * AMD architectural definitions.  The former expects the Extended Model
2876          * in bits 16-19 where the latter expects all the Family, Model, and
2877          * Stepping be there.  Common boot ROMs appear to disregard this
2878          * anyways, so we stick with a compromise value similar to what is
2879          * spelled out in the Intel SDM.
2880          */
2881         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2882 
2883         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2884         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2885 
2886         /* CS: Present, R/W, Accessed */
2887         desc.access = 0x0093;
2888         desc.base = 0xffff0000;
2889         desc.limit = 0xffff;
2890         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2891         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2892 
2893         /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2894         desc.access = 0x0093;
2895         desc.base = 0;
2896         desc.limit = 0xffff;
2897         for (uint_t i = 0; i < nitems(data_segs); i++) {
2898                 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2899                 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2900         }
2901 
2902         /* GDTR, IDTR */
2903         desc.base = 0;
2904         desc.limit = 0xffff;
2905         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2906         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2907 
2908         /* LDTR: Present, LDT */
2909         desc.access = 0x0082;
2910         desc.base = 0;
2911         desc.limit = 0xffff;
2912         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2913         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2914 
2915         /* TR: Present, 32-bit TSS */
2916         desc.access = 0x008b;
2917         desc.base = 0;
2918         desc.limit = 0xffff;
2919         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2920         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2921 
2922         vlapic_reset(vm_lapic(vm, vcpuid));
2923 
2924         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2925 
2926         vcpu->exitintinfo = 0;
2927         vcpu->exception_pending = 0;
2928         vcpu->nmi_pending = 0;
2929         vcpu->extint_pending = 0;
2930 
2931         /*
2932          * A CPU reset caused by power-on or system reset clears more state than
2933          * one which is trigged from an INIT IPI.
2934          */
2935         if (!init_only) {
2936                 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2937                 fpu_save_area_reset(vcpu->guestfpu);
2938 
2939                 /* XXX: clear MSRs and other pieces */
2940         }
2941 
2942         return (0);
2943 }
2944 
2945 static int
2946 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2947 {
2948         struct seg_desc desc;
2949 
2950         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2951                 return (EINVAL);
2952 
2953         /* CS: Present, R/W, Accessed */
2954         desc.access = 0x0093;
2955         desc.base = (uint64_t)vector << 12;
2956         desc.limit = 0xffff;
2957         VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2958         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2959             (uint64_t)vector << 8));
2960 
2961         VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2962 
2963         return (0);
2964 }
2965 
2966 int
2967 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2968 {
2969         if (vcpu < 0 || vcpu >= vm->maxcpus)
2970                 return (EINVAL);
2971 
2972         if (type < 0 || type >= VM_CAP_MAX)
2973                 return (EINVAL);
2974 
2975         return (VMGETCAP(vm->cookie, vcpu, type, retval));
2976 }
2977 
2978 int
2979 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
2980 {
2981         if (vcpu < 0 || vcpu >= vm->maxcpus)
2982                 return (EINVAL);
2983 
2984         if (type < 0 || type >= VM_CAP_MAX)
2985                 return (EINVAL);
2986 
2987         return (VMSETCAP(vm->cookie, vcpu, type, val));
2988 }
2989 
2990 struct vlapic *
2991 vm_lapic(struct vm *vm, int cpu)
2992 {
2993         return (vm->vcpu[cpu].vlapic);
2994 }
2995 
2996 struct vioapic *
2997 vm_ioapic(struct vm *vm)
2998 {
2999 
3000         return (vm->vioapic);
3001 }
3002 
3003 struct vhpet *
3004 vm_hpet(struct vm *vm)
3005 {
3006 
3007         return (vm->vhpet);
3008 }
3009 
3010 #ifdef  __FreeBSD__
3011 bool
3012 vmm_is_pptdev(int bus, int slot, int func)
3013 {
3014         int b, f, i, n, s;
3015         char *val, *cp, *cp2;
3016         bool found;
3017 
3018         /*
3019          * XXX
3020          * The length of an environment variable is limited to 128 bytes which
3021          * puts an upper limit on the number of passthru devices that may be
3022          * specified using a single environment variable.
3023          *
3024          * Work around this by scanning multiple environment variable
3025          * names instead of a single one - yuck!
3026          */
3027         const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
3028 
3029         /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
3030         found = false;
3031         for (i = 0; names[i] != NULL && !found; i++) {
3032                 cp = val = kern_getenv(names[i]);
3033                 while (cp != NULL && *cp != '\0') {
3034                         if ((cp2 = strchr(cp, ' ')) != NULL)
3035                                 *cp2 = '\0';
3036 
3037                         n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
3038                         if (n == 3 && bus == b && slot == s && func == f) {
3039                                 found = true;
3040                                 break;
3041                         }
3042 
3043                         if (cp2 != NULL)
3044                                 *cp2++ = ' ';
3045 
3046                         cp = cp2;
3047                 }
3048                 freeenv(val);
3049         }
3050         return (found);
3051 }
3052 #endif
3053 
3054 void *
3055 vm_iommu_domain(struct vm *vm)
3056 {
3057 
3058         return (vm->iommu);
3059 }
3060 
3061 int
3062 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3063     bool from_idle)
3064 {
3065         int error;
3066         struct vcpu *vcpu;
3067 
3068         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3069                 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3070 
3071         vcpu = &vm->vcpu[vcpuid];
3072 
3073         vcpu_lock(vcpu);
3074         error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3075         vcpu_unlock(vcpu);
3076 
3077         return (error);
3078 }
3079 
3080 enum vcpu_state
3081 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3082 {
3083         struct vcpu *vcpu;
3084         enum vcpu_state state;
3085 
3086         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3087                 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3088 
3089         vcpu = &vm->vcpu[vcpuid];
3090 
3091         vcpu_lock(vcpu);
3092         state = vcpu->state;
3093         if (hostcpu != NULL)
3094                 *hostcpu = vcpu->hostcpu;
3095         vcpu_unlock(vcpu);
3096 
3097         return (state);
3098 }
3099 
3100 #ifndef __FreeBSD__
3101 uint64_t
3102 vcpu_tsc_offset(struct vm *vm, int vcpuid)
3103 {
3104         return (vm->vcpu[vcpuid].tsc_offset);
3105 }
3106 #endif /* __FreeBSD__ */
3107 
3108 int
3109 vm_activate_cpu(struct vm *vm, int vcpuid)
3110 {
3111 
3112         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3113                 return (EINVAL);
3114 
3115         if (CPU_ISSET(vcpuid, &vm->active_cpus))
3116                 return (EBUSY);
3117 
3118         VCPU_CTR0(vm, vcpuid, "activated");
3119         CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3120         return (0);
3121 }
3122 
3123 int
3124 vm_suspend_cpu(struct vm *vm, int vcpuid)
3125 {
3126         int i;
3127 
3128         if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3129                 return (EINVAL);
3130 
3131         if (vcpuid == -1) {
3132                 vm->debug_cpus = vm->active_cpus;
3133                 for (i = 0; i < vm->maxcpus; i++) {
3134                         if (CPU_ISSET(i, &vm->active_cpus))
3135                                 vcpu_notify_event(vm, i);
3136                 }
3137         } else {
3138                 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3139                         return (EINVAL);
3140 
3141                 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3142                 vcpu_notify_event(vm, vcpuid);
3143         }
3144         return (0);
3145 }
3146 
3147 int
3148 vm_resume_cpu(struct vm *vm, int vcpuid)
3149 {
3150 
3151         if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3152                 return (EINVAL);
3153 
3154         if (vcpuid == -1) {
3155                 CPU_ZERO(&vm->debug_cpus);
3156         } else {
3157                 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3158                         return (EINVAL);
3159 
3160                 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3161         }
3162         return (0);
3163 }
3164 
3165 static bool
3166 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3167     uint64_t entry_rip)
3168 {
3169         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3170         struct vm_exit *vme = &vcpu->exitinfo;
3171         bool bail = false;
3172 
3173         ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3174 
3175         if (vm->suspend) {
3176                 if (on_entry) {
3177                         VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3178                             vm->suspend < VM_SUSPEND_LAST);
3179 
3180                         vme->exitcode = VM_EXITCODE_SUSPENDED;
3181                         vme->u.suspended.how = vm->suspend;
3182                 } else {
3183                         /*
3184                          * Handling VM suspend is complicated, so if that
3185                          * condition is detected outside of VM-entry itself,
3186                          * just emit a BOGUS exitcode so we take a lap to pick
3187                          * up the event during an entry and are directed into
3188                          * the vm_handle_suspend() logic.
3189                          */
3190                         vme->exitcode = VM_EXITCODE_BOGUS;
3191                 }
3192                 bail = true;
3193         }
3194         if (vcpu->reqidle) {
3195                 vme->exitcode = VM_EXITCODE_REQIDLE;
3196                 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3197 
3198                 if (!on_entry) {
3199                         /*
3200                          * A reqidle request detected outside of VM-entry can be
3201                          * handled directly by clearing the request (and taking
3202                          * a lap to userspace).
3203                          */
3204                         vcpu_assert_locked(vcpu);
3205                         vcpu->reqidle = 0;
3206                 }
3207                 bail = true;
3208         }
3209         if (vcpu_should_yield(vm, vcpuid)) {
3210                 vme->exitcode = VM_EXITCODE_BOGUS;
3211                 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3212                 bail = true;
3213         }
3214         if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3215                 vme->exitcode = VM_EXITCODE_DEBUG;
3216                 bail = true;
3217         }
3218 
3219         if (bail) {
3220                 if (on_entry) {
3221                         /*
3222                          * If bailing out during VM-entry, the current %rip must
3223                          * be recorded in the exitinfo.
3224                          */
3225                         vme->rip = entry_rip;
3226                 }
3227                 vme->inst_length = 0;
3228         }
3229         return (bail);
3230 }
3231 
3232 static bool
3233 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3234 {
3235         /*
3236          * Bail-out check done prior to sleeping (in vCPU contexts like HLT or
3237          * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3238          * structure, and we would only modify the exitcode.
3239          */
3240         return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3241 }
3242 
3243 bool
3244 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3245 {
3246         /*
3247          * Bail-out checks done as part of VM entry require an updated %rip to
3248          * populate the vm_exit struct if any of the conditions of interest are
3249          * matched in the check.
3250          */
3251         return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3252 }
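
/*
 * A minimal sketch (not a verbatim caller) of how the entry-time check is
 * expected to be used by the VM-run loop, bailing out before guest entry
 * whenever it returns true:
 *
 *	if (vcpu_entry_bailout_checks(vm, vcpuid, rip)) {
 *		break;
 *	}
 */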
3253 
3254 cpuset_t
3255 vm_active_cpus(struct vm *vm)
3256 {
3257 
3258         return (vm->active_cpus);
3259 }
3260 
3261 cpuset_t
3262 vm_debug_cpus(struct vm *vm)
3263 {
3264 
3265         return (vm->debug_cpus);
3266 }
3267 
3268 cpuset_t
3269 vm_suspended_cpus(struct vm *vm)
3270 {
3271 
3272         return (vm->suspended_cpus);
3273 }
3274 
3275 void *
3276 vcpu_stats(struct vm *vm, int vcpuid)
3277 {
3278 
3279         return (vm->vcpu[vcpuid].stats);
3280 }
3281 
3282 int
3283 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3284 {
3285         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3286                 return (EINVAL);
3287 
3288         *state = vm->vcpu[vcpuid].x2apic_state;
3289 
3290         return (0);
3291 }
3292 
3293 int
3294 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3295 {
3296         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3297                 return (EINVAL);
3298 
3299         if (state >= X2APIC_STATE_LAST)
3300                 return (EINVAL);
3301 
3302         vm->vcpu[vcpuid].x2apic_state = state;
3303 
3304         vlapic_set_x2apic_state(vm, vcpuid, state);
3305 
3306         return (0);
3307 }
3308 
3309 /*
3310  * This function is called to ensure that a vcpu "sees" a pending event
3311  * as soon as possible:
3312  * - If the vcpu thread is sleeping then it is woken up.
3313  * - If the vcpu is running on a different host_cpu then an IPI will be directed
3314  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
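 *
 * The caller is expected to hold the vcpu lock (hence the '_locked'
 * suffix); vcpu_notify_event() and vcpu_notify_event_type() below are
 * wrappers which acquire it around this call.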
3315  */
3316 static void
3317 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3318 {
3319         int hostcpu;
3320 
3321         ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3322 
3323         hostcpu = vcpu->hostcpu;
3324         if (vcpu->state == VCPU_RUNNING) {
3325                 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3326                 if (hostcpu != curcpu) {
3327                         if (ntype == VCPU_NOTIFY_APIC) {
3328                                 vlapic_post_intr(vcpu->vlapic, hostcpu,
3329                                     vmm_ipinum);
3330                         } else {
3331                                 ipi_cpu(hostcpu, vmm_ipinum);
3332                         }
3333                 } else {
3334                         /*
3335                          * If the 'vcpu' is running on 'curcpu' then it must
3336                          * be sending a notification to itself (e.g. SELF_IPI).
3337                          * The pending event will be picked up when the vcpu
3338                          * transitions back to guest context.
3339                          */
3340                 }
3341         } else {
3342                 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3343                     "with hostcpu %d", vcpu->state, hostcpu));
3344                 if (vcpu->state == VCPU_SLEEPING) {
3345 #ifdef __FreeBSD__
3346                         wakeup_one(vcpu);
3347 #else
3348                         cv_signal(&vcpu->vcpu_cv);
3349 #endif
3350                 }
3351         }
3352 }
3353 
3354 void
3355 vcpu_notify_event(struct vm *vm, int vcpuid)
3356 {
3357         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3358 
3359         vcpu_lock(vcpu);
3360         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3361         vcpu_unlock(vcpu);
3362 }
3363 
3364 void
3365 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3366 {
3367         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3368 
3369         if (ntype == VCPU_NOTIFY_NONE) {
3370                 return;
3371         }
3372 
3373         vcpu_lock(vcpu);
3374         vcpu_notify_event_locked(vcpu, ntype);
3375         vcpu_unlock(vcpu);
3376 }
3377 
3378 struct vmspace *
3379 vm_get_vmspace(struct vm *vm)
3380 {
3381 
3382         return (vm->vmspace);
3383 }
3384 
3385 int
3386 vm_apicid2vcpuid(struct vm *vm, int apicid)
3387 {
3388         /*
3389          * XXX apic id is assumed to be numerically identical to vcpu id
3390          */
3391         return (apicid);
3392 }
3393 
3394 struct vatpic *
3395 vm_atpic(struct vm *vm)
3396 {
3397         return (vm->vatpic);
3398 }
3399 
3400 struct vatpit *
3401 vm_atpit(struct vm *vm)
3402 {
3403         return (vm->vatpit);
3404 }
3405 
3406 struct vpmtmr *
3407 vm_pmtmr(struct vm *vm)
3408 {
3409 
3410         return (vm->vpmtmr);
3411 }
3412 
3413 struct vrtc *
3414 vm_rtc(struct vm *vm)
3415 {
3416 
3417         return (vm->vrtc);
3418 }
3419 
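/*
 * Translate an x86 segment-register encoding (ES=0, CS=1, SS=2, DS=3,
 * FS=4, GS=5, matching the segment-register field ordering used in
 * instruction encodings) into the corresponding VM_REG_GUEST_* name.
 */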
3420 enum vm_reg_name
3421 vm_segment_name(int seg)
3422 {
3423         static enum vm_reg_name seg_names[] = {
3424                 VM_REG_GUEST_ES,
3425                 VM_REG_GUEST_CS,
3426                 VM_REG_GUEST_SS,
3427                 VM_REG_GUEST_DS,
3428                 VM_REG_GUEST_FS,
3429                 VM_REG_GUEST_GS
3430         };
3431 
3432         KASSERT(seg >= 0 && seg < nitems(seg_names),
3433             ("%s: invalid segment encoding %d", __func__, seg));
3434         return (seg_names[seg]);
3435 }
3436 
3437 void
3438 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3439     int num_copyinfo)
3440 {
3441         int idx;
3442 
3443         for (idx = 0; idx < num_copyinfo; idx++) {
3444                 if (copyinfo[idx].cookie != NULL)
3445                         vm_gpa_release(copyinfo[idx].cookie);
3446         }
3447         bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3448 }
3449 
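/*
 * Translate the guest-linear range [gla, gla + len) into a series of held
 * guest-physical page mappings, recorded in 'copyinfo'.  Returns 0 with
 * *fault clear on success; if address translation fails, the error from
 * vm_gla2gpa() is returned (with *fault set for guest-visible faults);
 * EFAULT is returned if a held mapping cannot be established.
 */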
3450 int
3451 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3452     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3453     int num_copyinfo, int *fault)
3454 {
3455         int error, idx, nused;
3456         size_t n, off, remaining;
3457         void *hva, *cookie;
3458         uint64_t gpa;
3459 
3460         bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3461 
3462         nused = 0;
3463         remaining = len;
3464         while (remaining > 0) {
3465                 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3466                 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3467                 if (error || *fault)
3468                         return (error);
3469                 off = gpa & PAGE_MASK;
3470                 n = min(remaining, PAGE_SIZE - off);
3471                 copyinfo[nused].gpa = gpa;
3472                 copyinfo[nused].len = n;
3473                 remaining -= n;
3474                 gla += n;
3475                 nused++;
3476         }
3477 
3478         for (idx = 0; idx < nused; idx++) {
3479                 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3480                     copyinfo[idx].len, prot, &cookie);
3481                 if (hva == NULL)
3482                         break;
3483                 copyinfo[idx].hva = hva;
3484                 copyinfo[idx].cookie = cookie;
3485         }
3486 
3487         if (idx != nused) {
3488                 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3489                 return (EFAULT);
3490         } else {
3491                 *fault = 0;
3492                 return (0);
3493         }
3494 }
3495 
3496 void
3497 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3498     size_t len)
3499 {
3500         char *dst;
3501         int idx;
3502 
3503         dst = kaddr;
3504         idx = 0;
3505         while (len > 0) {
3506                 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3507                 len -= copyinfo[idx].len;
3508                 dst += copyinfo[idx].len;
3509                 idx++;
3510         }
3511 }
3512 
3513 void
3514 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3515     struct vm_copyinfo *copyinfo, size_t len)
3516 {
3517         const char *src;
3518         int idx;
3519 
3520         src = kaddr;
3521         idx = 0;
3522         while (len > 0) {
3523                 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3524                 len -= copyinfo[idx].len;
3525                 src += copyinfo[idx].len;
3526                 idx++;
3527         }
3528 }
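
/*
 * A minimal sketch of the intended use of the copy interfaces above
 * (buffer size and protection are illustrative only):
 *
 *	struct vm_copyinfo copyinfo[2];
 *	uint64_t buf;
 *	int error, fault;
 *
 *	error = vm_copy_setup(vm, vcpuid, paging, gla, sizeof (buf),
 *	    PROT_READ, copyinfo, nitems(copyinfo), &fault);
 *	if (error == 0 && fault == 0) {
 *		vm_copyin(vm, vcpuid, copyinfo, &buf, sizeof (buf));
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 */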
3529 
3530 /*
3531  * Return the amount of in-use and wired memory for the VM. Since
3532  * these are global stats, only return the values for vCPU 0.
3533  */
3534 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3535 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3536 
3537 static void
3538 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3539 {
3540 
3541         if (vcpu == 0) {
3542                 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3543                     PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3544         }
3545 }
3546 
3547 static void
3548 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3549 {
3550 
3551         if (vcpu == 0) {
3552                 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3553                     PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3554         }
3555 }
3556 
3557 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3558 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
3559 
3560 int
3561 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3562     uint8_t bytes, uint32_t *val)
3563 {
3564         return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3565 }
3566 
3567 /*
3568  * bhyve-internal interfaces to attach or detach IO port handlers.
3569  * Must be called with VM write lock held for safety.
3570  */
3571 int
3572 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3573     void **cookie)
3574 {
3575         int err;

3576         err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3577         if (err == 0) {
3578                 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3579         }
3580         return (err);
3581 }

3582 int
3583 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3584     void **old_arg)
3585 {
3586         uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3587         int err;
3588 
3589         err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3590         if (err == 0) {
3591                 *cookie = NULL;
3592         }
3593         return (err);
3594 }
3595 
3596 /*
3597  * External driver interfaces to attach or detach IO port handlers.
3598  * Must be called with VM write lock held for safety.
3599  */
3600 int
3601 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3602     void *arg, void **cookie)
3603 {
3604         int err;
3605 
3606         if (port == 0) {
3607                 return (EINVAL);
3608         }
3609 
3610         err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3611         if (err == 0) {
3612                 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3613         }
3614         return (err);
3615 }

3616 void
3617 vm_ioport_unhook(struct vm *vm, void **cookie)
3618 {
3619         uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3620         ioport_handler_t old_func;
3621         void *old_arg;
3622         int err;
3623 
3624         err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3625 
3626         /* ioport-hook-using drivers are expected to be well-behaved */
3627         VERIFY0(err);
3628         VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3629 
3630         *cookie = NULL;
3631 }
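
/*
 * Illustrative driver-side use of the hook interfaces above (the port,
 * handler, and argument names are hypothetical); per the note above, the
 * VM write lock must be held across these calls:
 *
 *	void *cookie;
 *
 *	error = vm_ioport_hook(vm, my_port, my_handler, my_arg, &cookie);
 *	if (error == 0) {
 *		...
 *		vm_ioport_unhook(vm, &cookie);
 *	}
 */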