1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2011 NetApp, Inc.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  *
  28  * $FreeBSD$
  29  */
  30 /*
  31  * This file and its contents are supplied under the terms of the
  32  * Common Development and Distribution License ("CDDL"), version 1.0.
  33  * You may only use this file in accordance with the terms of version
  34  * 1.0 of the CDDL.
  35  *
  36  * A full copy of the text of the CDDL should have accompanied this
  37  * source.  A copy of the CDDL is also available via the Internet at
  38  * http://www.illumos.org/license/CDDL.
  39  *
  40  * Copyright 2015 Pluribus Networks Inc.
  41  * Copyright 2018 Joyent, Inc.
  42  * Copyright 2020 Oxide Computer Company
  43  */
  44 
  45 #include <sys/cdefs.h>
  46 __FBSDID("$FreeBSD$");
  47 
  48 #include <sys/param.h>
  49 #include <sys/systm.h>
  50 #include <sys/kernel.h>
  51 #include <sys/module.h>
  52 #include <sys/sysctl.h>
  53 #include <sys/malloc.h>
  54 #include <sys/pcpu.h>
  55 #include <sys/lock.h>
  56 #include <sys/mutex.h>
  57 #include <sys/proc.h>
  58 #include <sys/rwlock.h>
  59 #include <sys/sched.h>
  60 #include <sys/smp.h>
  62 
  63 #include <vm/vm.h>
  64 #include <vm/vm_object.h>
  65 #include <vm/vm_map.h>
  66 #include <vm/vm_page.h>
  67 #include <vm/pmap.h>
  68 #include <vm/vm_extern.h>
  69 #include <vm/vm_param.h>
  70 
  71 #ifdef __FreeBSD__
  72 #include <machine/cpu.h>
  73 #endif
  74 #include <machine/pcb.h>
  75 #include <machine/smp.h>
  76 #include <machine/md_var.h>
  77 #include <x86/psl.h>
  78 #include <x86/apicreg.h>
  79 
  80 #include <machine/vmm.h>
  81 #include <machine/vmm_dev.h>
  82 #include <sys/vmm_instruction_emul.h>
  83 
  84 #include "vmm_ioport.h"
  85 #include "vmm_ktr.h"
  86 #include "vmm_host.h"
  87 #include "vmm_mem.h"
  88 #include "vmm_util.h"
  89 #include "vatpic.h"
  90 #include "vatpit.h"
  91 #include "vhpet.h"
  92 #include "vioapic.h"
  93 #include "vlapic.h"
  94 #include "vpmtmr.h"
  95 #include "vrtc.h"
  96 #include "vmm_stat.h"
  97 #include "vmm_lapic.h"
  98 
  99 #include "io/ppt.h"
 100 #include "io/iommu.h"
 101 
 102 struct vlapic;
 103 
 104 /*
 105  * Initialization:
 106  * (a) allocated when vcpu is created
 107  * (i) initialized when vcpu is created and when it is reinitialized
 108  * (o) initialized the first time the vcpu is created
 109  * (x) initialized before use
 110  */
 111 struct vcpu {
 112         struct mtx      mtx;            /* (o) protects 'state' and 'hostcpu' */
 113         enum vcpu_state state;          /* (o) vcpu state */
 114 #ifndef __FreeBSD__
 115         kcondvar_t      vcpu_cv;        /* (o) cpu waiter cv */
 116         kcondvar_t      state_cv;       /* (o) IDLE-transition cv */
 117 #endif /* __FreeBSD__ */
 118         int             hostcpu;        /* (o) vcpu's current host cpu */
 119 #ifndef __FreeBSD__
 120         int             lastloccpu;     /* (o) last host cpu localized to */
 121 #endif
 122         uint_t          runblock;       /* (i) block vcpu from run state */
 123         int             reqidle;        /* (i) request vcpu to idle */
 124         struct vlapic   *vlapic;        /* (i) APIC device model */
 125         enum x2apic_state x2apic_state; /* (i) APIC mode */
 126         uint64_t        exitintinfo;    /* (i) events pending at VM exit */
 127         int             nmi_pending;    /* (i) NMI pending */
 128         int             extint_pending; /* (i) INTR pending */
 129         int     exception_pending;      /* (i) exception pending */
 130         int     exc_vector;             /* (x) exception collateral */
 131         int     exc_errcode_valid;
 132         uint32_t exc_errcode;
 133         struct savefpu  *guestfpu;      /* (a,i) guest fpu state */
 134         uint64_t        guest_xcr0;     /* (i) guest %xcr0 register */
 135         void            *stats;         /* (a,i) statistics */
 136         struct vm_exit  exitinfo;       /* (x) exit reason and collateral */
 137         uint64_t        nextrip;        /* (x) next instruction to execute */
 138         struct vie      *vie_ctx;       /* (x) instruction emulation context */
 139 #ifndef __FreeBSD__
 140         uint64_t        tsc_offset;     /* (x) offset from host TSC */
 141 #endif
 142 };
 143 
 144 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 145 #define vcpu_lock_init(v)       mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 146 #define vcpu_lock(v)            mtx_lock_spin(&((v)->mtx))
 147 #define vcpu_unlock(v)          mtx_unlock_spin(&((v)->mtx))
 148 #define vcpu_assert_locked(v)   mtx_assert(&((v)->mtx), MA_OWNED)
 149 
 150 struct mem_seg {
 151         size_t  len;
 152         bool    sysmem;
 153         struct vm_object *object;
 154 };
 155 #ifdef __FreeBSD__
 156 #define VM_MAX_MEMSEGS  3
 157 #else
 158 #define VM_MAX_MEMSEGS  4
 159 #endif
 160 
 161 struct mem_map {
 162         vm_paddr_t      gpa;
 163         size_t          len;
 164         vm_ooffset_t    segoff;
 165         int             segid;
 166         int             prot;
 167         int             flags;
 168 };
 169 #define VM_MAX_MEMMAPS  8
 170 
 171 /*
 172  * Initialization:
 173  * (o) initialized the first time the VM is created
 174  * (i) initialized when VM is created and when it is reinitialized
 175  * (x) initialized before use
 176  */
 177 struct vm {
 178         void            *cookie;                /* (i) cpu-specific data */
 179         void            *iommu;                 /* (x) iommu-specific data */
 180         struct vhpet    *vhpet;                 /* (i) virtual HPET */
 181         struct vioapic  *vioapic;               /* (i) virtual ioapic */
 182         struct vatpic   *vatpic;                /* (i) virtual atpic */
 183         struct vatpit   *vatpit;                /* (i) virtual atpit */
 184         struct vpmtmr   *vpmtmr;                /* (i) virtual ACPI PM timer */
 185         struct vrtc     *vrtc;                  /* (o) virtual RTC */
 186         volatile cpuset_t active_cpus;          /* (i) active vcpus */
 187         volatile cpuset_t debug_cpus;           /* (i) vcpus stopped for dbg */
 188         int             suspend;                /* (i) stop VM execution */
 189         volatile cpuset_t suspended_cpus;       /* (i) suspended vcpus */
 190         volatile cpuset_t halted_cpus;          /* (x) cpus in a hard halt */
 191         struct mem_map  mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
 192         struct mem_seg  mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
 193         struct vmspace  *vmspace;               /* (o) guest's address space */
 194         char            name[VM_MAX_NAMELEN];   /* (o) virtual machine name */
 195         struct vcpu     vcpu[VM_MAXCPU];        /* (i) guest vcpus */
 196         /* The following describe the vm cpu topology */
 197         uint16_t        sockets;                /* (o) num of sockets */
 198         uint16_t        cores;                  /* (o) num of cores/socket */
 199         uint16_t        threads;                /* (o) num of threads/core */
 200         uint16_t        maxcpus;                /* (o) max pluggable cpus */
 201 
 202         struct ioport_config ioports;           /* (o) ioport handling */
 203 
 204         bool            sipi_req;               /* (i) SIPI requested */
 205         int             sipi_req_vcpu;          /* (i) SIPI destination */
 206         uint64_t        sipi_req_rip;           /* (i) SIPI start %rip */
 207 
 208         /* Miscellaneous VM-wide statistics and counters */
 209         struct vm_wide_stats {
 210                 uint64_t sipi_supersede;
 211         } stats;
 212 };
 213 
 214 static int vmm_initialized;
 215 
 216 
 217 static void
 218 nullop_panic(void)
 219 {
 220         panic("null vmm operation call");
 221 }
 222 
 223 /* Do not allow use of an un-set `ops` to do anything but panic */
 224 static struct vmm_ops vmm_ops_null = {
 225         .init           = (vmm_init_func_t)nullop_panic,
 226         .cleanup        = (vmm_cleanup_func_t)nullop_panic,
 227         .resume         = (vmm_resume_func_t)nullop_panic,
 228         .vminit         = (vmi_init_func_t)nullop_panic,
 229         .vmrun          = (vmi_run_func_t)nullop_panic,
 230         .vmcleanup      = (vmi_cleanup_func_t)nullop_panic,
 231         .vmgetreg       = (vmi_get_register_t)nullop_panic,
 232         .vmsetreg       = (vmi_set_register_t)nullop_panic,
 233         .vmgetdesc      = (vmi_get_desc_t)nullop_panic,
 234         .vmsetdesc      = (vmi_set_desc_t)nullop_panic,
 235         .vmgetcap       = (vmi_get_cap_t)nullop_panic,
 236         .vmsetcap       = (vmi_set_cap_t)nullop_panic,
 237         .vmspace_alloc  = (vmi_vmspace_alloc)nullop_panic,
 238         .vmspace_free   = (vmi_vmspace_free)nullop_panic,
 239         .vlapic_init    = (vmi_vlapic_init)nullop_panic,
 240         .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
 241         .vmsavectx      = (vmi_savectx)nullop_panic,
 242         .vmrestorectx   = (vmi_restorectx)nullop_panic,
 243 };
 244 
 245 static struct vmm_ops *ops = &vmm_ops_null;
 246 
 247 #define VMM_INIT(num)                   ((*ops->init)(num))
 248 #define VMM_CLEANUP()                   ((*ops->cleanup)())
 249 #define VMM_RESUME()                    ((*ops->resume)())
 250 
 251 #define VMINIT(vm, pmap)                ((*ops->vminit)(vm, pmap))
 252 #define VMRUN(vmi, vcpu, rip, pmap, evinfo) \
 253         ((*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo))
 254 #define VMCLEANUP(vmi)                  ((*ops->vmcleanup)(vmi))
 255 #define VMSPACE_ALLOC(min, max)         ((*ops->vmspace_alloc)(min, max))
 256 #define VMSPACE_FREE(vmspace)           ((*ops->vmspace_free)(vmspace))
 257 
 258 #define VMGETREG(vmi, vcpu, num, rv)    ((*ops->vmgetreg)(vmi, vcpu, num, rv))
 259 #define VMSETREG(vmi, vcpu, num, val)   ((*ops->vmsetreg)(vmi, vcpu, num, val))
 260 #define VMGETDESC(vmi, vcpu, num, dsc)  ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
 261 #define VMSETDESC(vmi, vcpu, num, dsc)  ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
 262 #define VMGETCAP(vmi, vcpu, num, rv)    ((*ops->vmgetcap)(vmi, vcpu, num, rv))
 263 #define VMSETCAP(vmi, vcpu, num, val)   ((*ops->vmsetcap)(vmi, vcpu, num, val))
 264 #define VLAPIC_INIT(vmi, vcpu)          ((*ops->vlapic_init)(vmi, vcpu))
 265 #define VLAPIC_CLEANUP(vmi, vlapic)     ((*ops->vlapic_cleanup)(vmi, vlapic))
 266 
 267 #define fpu_start_emulating()   load_cr0(rcr0() | CR0_TS)
 268 #define fpu_stop_emulating()    clts()
 269 
 270 SDT_PROVIDER_DEFINE(vmm);
 271 
 272 static MALLOC_DEFINE(M_VM, "vm", "vm");
 273 
 274 /* statistics */
 275 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
 276 
 277 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 278     NULL);
 279 
 280 /*
 281  * Halt the guest if all vcpus are executing a HLT instruction with
 282  * interrupts disabled.
 283  */
 284 static int halt_detection_enabled = 1;
 285 
 286 /* IPI vector used for vcpu notifications */
 287 static int vmm_ipinum;
 288 
 289 /* Trap into hypervisor on all guest exceptions and reflect them back */
 290 static int trace_guest_exceptions;
 291 
 292 static void vm_free_memmap(struct vm *vm, int ident);
 293 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 294 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
 295 
 296 #ifndef __FreeBSD__
 297 static void vm_clear_memseg(struct vm *, int);
 298 
 299 /* Flags for vtc_status */
 300 #define VTCS_FPU_RESTORED       1 /* guest FPU restored, host FPU saved */
 301 #define VTCS_FPU_CTX_CRITICAL   2 /* in ctx where FPU restore cannot be lazy */
 302 
 303 typedef struct vm_thread_ctx {
 304         struct vm       *vtc_vm;
 305         int             vtc_vcpuid;
 306         uint_t          vtc_status;
 307 } vm_thread_ctx_t;
 308 #endif /* __FreeBSD__ */
 309 
 310 #ifdef KTR
 311 static const char *
 312 vcpu_state2str(enum vcpu_state state)
 313 {
 314 
 315         switch (state) {
 316         case VCPU_IDLE:
 317                 return ("idle");
 318         case VCPU_FROZEN:
 319                 return ("frozen");
 320         case VCPU_RUNNING:
 321                 return ("running");
 322         case VCPU_SLEEPING:
 323                 return ("sleeping");
 324         default:
 325                 return ("unknown");
 326         }
 327 }
 328 #endif
 329 
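     /*
      * Tear down a vcpu's vlapic state.  When 'destroy' is set, also free
      * the statistics, FPU save area, and instruction emulation context
      * allocated at vcpu creation.
      */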
 330 static void
 331 vcpu_cleanup(struct vm *vm, int i, bool destroy)
 332 {
 333         struct vcpu *vcpu = &vm->vcpu[i];
 334 
 335         VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
 336         if (destroy) {
 337                 vmm_stat_free(vcpu->stats);
 338                 fpu_save_area_free(vcpu->guestfpu);
 339                 vie_free(vcpu->vie_ctx);
 340                 vcpu->vie_ctx = NULL;
 341         }
 342 }
 343 
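     /*
      * Initialize (or reinitialize) a vcpu.  The 'create' path sets up the
      * lock, FPU save area, statistics, and emulation context; the reinit
      * path only resets the emulation context and exit information before
      * the common per-vcpu state below is reinitialized.
      */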
 344 static void
 345 vcpu_init(struct vm *vm, int vcpu_id, bool create)
 346 {
 347         struct vcpu *vcpu;
 348 
 349         KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
 350             ("vcpu_init: invalid vcpu %d", vcpu_id));
 351 
 352         vcpu = &vm->vcpu[vcpu_id];
 353 
 354         if (create) {
 355 #ifdef __FreeBSD__
 356                 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
 357                     "initialized", vcpu_id));
 358 #endif
 359                 vcpu_lock_init(vcpu);
 360                 vcpu->state = VCPU_IDLE;
 361                 vcpu->hostcpu = NOCPU;
 362 #ifndef __FreeBSD__
 363                 vcpu->lastloccpu = NOCPU;
 364 #endif
 365                 vcpu->guestfpu = fpu_save_area_alloc();
 366                 vcpu->stats = vmm_stat_alloc();
 367                 vcpu->vie_ctx = vie_alloc();
 368         } else {
 369                 vie_reset(vcpu->vie_ctx);
 370                 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
 371         }
 372 
 373         vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 374         vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
 375         vcpu->runblock = 0;
 376         vcpu->reqidle = 0;
 377         vcpu->exitintinfo = 0;
 378         vcpu->nmi_pending = 0;
 379         vcpu->extint_pending = 0;
 380         vcpu->exception_pending = 0;
 381         vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
 382         fpu_save_area_reset(vcpu->guestfpu);
 383         vmm_stat_init(vcpu->stats);
 384 }
 385 
 386 int
 387 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
 388 {
 389 
 390         return (trace_guest_exceptions);
 391 }
 392 
 393 struct vm_exit *
 394 vm_exitinfo(struct vm *vm, int cpuid)
 395 {
 396         struct vcpu *vcpu;
 397 
 398         if (cpuid < 0 || cpuid >= vm->maxcpus)
 399                 panic("vm_exitinfo: invalid cpuid %d", cpuid);
 400 
 401         vcpu = &vm->vcpu[cpuid];
 402 
 403         return (&vcpu->exitinfo);
 404 }
 405 
 406 struct vie *
 407 vm_vie_ctx(struct vm *vm, int cpuid)
 408 {
 409         if (cpuid < 0 || cpuid >= vm->maxcpus)
 410                 panic("vm_vie_ctx: invalid cpuid %d", cpuid);
 411 
 412         return (vm->vcpu[cpuid].vie_ctx);
 413 }
 414 
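     /*
      * One-time module initialization: set up host state, choose a vcpu
      * notification IPI (FreeBSD) or rely on cpu_poke() (illumos), select
      * the Intel or AMD backend, and invoke its init routine.
      */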
 415 static int
 416 vmm_init(void)
 417 {
 418         int error;
 419 
 420         vmm_host_state_init();
 421 
 422 #ifdef __FreeBSD__
 423         vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
 424             &IDTVEC(justreturn));
 425         if (vmm_ipinum < 0)
 426                 vmm_ipinum = IPI_AST;
 427 #else
 428         /* We use cpu_poke() for IPIs */
 429         vmm_ipinum = 0;
 430 #endif
 431 
 432         error = vmm_mem_init();
 433         if (error)
 434                 return (error);
 435 
 436         if (vmm_is_intel())
 437                 ops = &vmm_ops_intel;
 438         else if (vmm_is_svm())
 439                 ops = &vmm_ops_amd;
 440         else
 441                 return (ENXIO);
 442 
 443 #ifdef __FreeBSD__
 444         vmm_resume_p = vmm_resume;
 445 #endif
 446 
 447         return (VMM_INIT(vmm_ipinum));
 448 }
 449 
 450 int
 451 vmm_mod_load()
 452 {
 453         int     error;
 454 
 455         VERIFY(vmm_initialized == 0);
 456 
 457         error = vmm_init();
 458         if (error == 0)
 459                 vmm_initialized = 1;
 460 
 461         return (error);
 462 }
 463 
 464 int
 465 vmm_mod_unload()
 466 {
 467         int     error;
 468 
 469         VERIFY(vmm_initialized == 1);
 470 
 471         iommu_cleanup();
 472         error = VMM_CLEANUP();
 473         if (error)
 474                 return (error);
 475         vmm_initialized = 0;
 476 
 477         return (0);
 478 }
 479 
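     /*
      * Set up (or reset) the VM-wide state: backend cookie, emulated
      * devices, ioport handling, cpu sets, and per-vcpu state.  On illumos
      * an initial TSC offset is also recorded for every vcpu.
      */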
 480 static void
 481 vm_init(struct vm *vm, bool create)
 482 {
 483         int i;
 484 #ifndef __FreeBSD__
 485         uint64_t tsc_off;
 486 #endif
 487 
 488         vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
 489         vm->iommu = NULL;
 490         vm->vioapic = vioapic_init(vm);
 491         vm->vhpet = vhpet_init(vm);
 492         vm->vatpic = vatpic_init(vm);
 493         vm->vatpit = vatpit_init(vm);
 494         vm->vpmtmr = vpmtmr_init(vm);
 495         if (create)
 496                 vm->vrtc = vrtc_init(vm);
 497 
 498         vm_inout_init(vm, &vm->ioports);
 499 
 500         CPU_ZERO(&vm->active_cpus);
 501         CPU_ZERO(&vm->debug_cpus);
 502 
 503         vm->suspend = 0;
 504         CPU_ZERO(&vm->suspended_cpus);
 505 
 506         for (i = 0; i < vm->maxcpus; i++)
 507                 vcpu_init(vm, i, create);
 508 
 509 #ifndef __FreeBSD__
 510         tsc_off = (uint64_t)(-(int64_t)rdtsc());
 511         for (i = 0; i < vm->maxcpus; i++) {
 512                 vm->vcpu[i].tsc_offset = tsc_off;
 513         }
 514 #endif /* __FreeBSD__ */
 515 }
 516 
 517 /*
 518  * The default CPU topology is a single thread per package.
 519  */
 520 uint_t cores_per_package = 1;
 521 uint_t threads_per_core = 1;
 522 
 523 int
 524 vm_create(const char *name, struct vm **retvm)
 525 {
 526         struct vm *vm;
 527         struct vmspace *vmspace;
 528 
 529         /*
 530          * If vmm.ko could not be successfully initialized then don't attempt
 531          * to create the virtual machine.
 532          */
 533         if (!vmm_initialized)
 534                 return (ENXIO);
 535 
 536         if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
 537                 return (EINVAL);
 538 
 539         vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
 540         if (vmspace == NULL)
 541                 return (ENOMEM);
 542 
 543         vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
 544         strcpy(vm->name, name);
 545         vm->vmspace = vmspace;
 546 
 547         vm->sockets = 1;
 548         vm->cores = cores_per_package;       /* XXX backwards compatibility */
 549         vm->threads = threads_per_core;      /* XXX backwards compatibility */
 550         vm->maxcpus = VM_MAXCPU;     /* XXX temp to keep code working */
 551 
 552         vm_init(vm, true);
 553 
 554         *retvm = vm;
 555         return (0);
 556 }
 557 
 558 void
 559 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
 560     uint16_t *threads, uint16_t *maxcpus)
 561 {
 562         *sockets = vm->sockets;
 563         *cores = vm->cores;
 564         *threads = vm->threads;
 565         *maxcpus = vm->maxcpus;
 566 }
 567 
 568 uint16_t
 569 vm_get_maxcpus(struct vm *vm)
 570 {
 571         return (vm->maxcpus);
 572 }
 573 
 574 int
 575 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
 576     uint16_t threads, uint16_t maxcpus)
 577 {
 578         if (maxcpus != 0)
 579                 return (EINVAL);        /* XXX remove when supported */
 580         if ((sockets * cores * threads) > vm->maxcpus)
 581                 return (EINVAL);
 582         /* XXX need to check sockets * cores * threads == vCPU, how? */
 583         vm->sockets = sockets;
 584         vm->cores = cores;
 585         vm->threads = threads;
 586         vm->maxcpus = VM_MAXCPU;     /* XXX temp to keep code working */
 587         return (0);
 588 }
 589 
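     /*
      * Tear down the state set up by vm_init().  When 'destroy' is false
      * (VM reset), system memory mappings and memory segments are preserved
      * so the guest can be restarted in place.
      */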
 590 static void
 591 vm_cleanup(struct vm *vm, bool destroy)
 592 {
 593         struct mem_map *mm;
 594         int i;
 595 
 596         ppt_unassign_all(vm);
 597 
 598         if (vm->iommu != NULL)
 599                 iommu_destroy_domain(vm->iommu);
 600 
 601         /*
 602          * Devices which attach their own ioport hooks should be cleaned up
 603          * first so they can tear down those registrations.
 604          */
 605         vpmtmr_cleanup(vm->vpmtmr);
 606 
 607         vm_inout_cleanup(vm, &vm->ioports);
 608 
 609         if (destroy)
 610                 vrtc_cleanup(vm->vrtc);
 611         else
 612                 vrtc_reset(vm->vrtc);
 613 
 614         vatpit_cleanup(vm->vatpit);
 615         vhpet_cleanup(vm->vhpet);
 616         vatpic_cleanup(vm->vatpic);
 617         vioapic_cleanup(vm->vioapic);
 618 
 619         for (i = 0; i < vm->maxcpus; i++)
 620                 vcpu_cleanup(vm, i, destroy);
 621 
 622         VMCLEANUP(vm->cookie);
 623 
 624         /*
 625          * System memory is removed from the guest address space only when
 626          * the VM is destroyed. This is because the mapping remains the same
 627          * across VM reset.
 628          *
 629          * Device memory can be relocated by the guest (e.g. using PCI BARs)
 630          * so those mappings are removed on a VM reset.
 631          */
 632         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 633                 mm = &vm->mem_maps[i];
 634                 if (destroy || !sysmem_mapping(vm, mm))
 635                         vm_free_memmap(vm, i);
 636 #ifndef __FreeBSD__
 637                 else {
 638                         /*
 639                          * We need to reset the IOMMU flag so this mapping can
 640                          * be reused when a VM is rebooted. Since the IOMMU
 641                          * domain has already been destroyed we can just reset
 642                          * the flag here.
 643                          */
 644                         mm->flags &= ~VM_MEMMAP_F_IOMMU;
 645                 }
 646 #endif
 647         }
 648 
 649         if (destroy) {
 650                 for (i = 0; i < VM_MAX_MEMSEGS; i++)
 651                         vm_free_memseg(vm, i);
 652 
 653                 VMSPACE_FREE(vm->vmspace);
 654                 vm->vmspace = NULL;
 655         }
 656 #ifndef __FreeBSD__
 657         else {
 658                 /*
 659                  * Clear the first memory segment (low mem), since stale memory
 660                  * contents could otherwise confuse the UEFI firmware.
 661                  */
 662                 vm_clear_memseg(vm, 0);
 663         }
 664 #endif
 665 }
 666 
 667 void
 668 vm_destroy(struct vm *vm)
 669 {
 670         vm_cleanup(vm, true);
 671         free(vm, M_VM);
 672 }
 673 
 674 int
 675 vm_reinit(struct vm *vm)
 676 {
 677         int error;
 678 
 679         /*
 680          * A virtual machine can be reset only if all vcpus are suspended.
 681          */
 682         if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
 683                 vm_cleanup(vm, false);
 684                 vm_init(vm, false);
 685                 error = 0;
 686         } else {
 687                 error = EBUSY;
 688         }
 689 
 690         return (error);
 691 }
 692 
 693 const char *
 694 vm_name(struct vm *vm)
 695 {
 696         return (vm->name);
 697 }
 698 
 699 int
 700 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 701 {
 702         vm_object_t obj;
 703 
 704         if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
 705                 return (ENOMEM);
 706         else
 707                 return (0);
 708 }
 709 
 710 int
 711 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
 712 {
 713 
 714         vmm_mmio_free(vm->vmspace, gpa, len);
 715         return (0);
 716 }
 717 
 718 /*
 719  * Return 'true' if 'gpa' is allocated in the guest address space.
 720  *
 721  * This function is called in the context of a running vcpu which acts as
 722  * an implicit lock on 'vm->mem_maps[]'.
 723  */
 724 bool
 725 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
 726 {
 727         struct mem_map *mm;
 728         int i;
 729 
 730 #ifdef INVARIANTS
 731         int hostcpu, state;
 732         state = vcpu_get_state(vm, vcpuid, &hostcpu);
 733         KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
 734             ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
 735 #endif
 736 
 737         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 738                 mm = &vm->mem_maps[i];
 739                 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
 740                         return (true);          /* 'gpa' is sysmem or devmem */
 741         }
 742 
 743         if (ppt_is_mmio(vm, gpa))
 744                 return (true);                  /* 'gpa' is pci passthru mmio */
 745 
 746         return (false);
 747 }
 748 
 749 int
 750 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
 751 {
 752         struct mem_seg *seg;
 753         vm_object_t obj;
 754 
 755 #ifndef __FreeBSD__
 756         extern pgcnt_t get_max_page_get(void);
 757 #endif
 758 
 759         if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 760                 return (EINVAL);
 761 
 762         if (len == 0 || (len & PAGE_MASK))
 763                 return (EINVAL);
 764 
 765 #ifndef __FreeBSD__
 766         if (len > ptob(get_max_page_get()))
 767                 return (EINVAL);
 768 #endif
 769 
 770         seg = &vm->mem_segs[ident];
 771         if (seg->object != NULL) {
 772                 if (seg->len == len && seg->sysmem == sysmem)
 773                         return (EEXIST);
 774                 else
 775                         return (EINVAL);
 776         }
 777 
 778         obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
 779         if (obj == NULL)
 780                 return (ENOMEM);
 781 
 782         seg->len = len;
 783         seg->object = obj;
 784         seg->sysmem = sysmem;
 785         return (0);
 786 }
 787 
 788 int
 789 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
 790     vm_object_t *objptr)
 791 {
 792         struct mem_seg *seg;
 793 
 794         if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 795                 return (EINVAL);
 796 
 797         seg = &vm->mem_segs[ident];
 798         if (len)
 799                 *len = seg->len;
 800         if (sysmem)
 801                 *sysmem = seg->sysmem;
 802         if (objptr)
 803                 *objptr = seg->object;
 804         return (0);
 805 }
 806 
 807 #ifndef __FreeBSD__
 808 static void
 809 vm_clear_memseg(struct vm *vm, int ident)
 810 {
 811         struct mem_seg *seg;
 812 
 813         KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
 814             ("%s: invalid memseg ident %d", __func__, ident));
 815 
 816         seg = &vm->mem_segs[ident];
 817 
 818         if (seg->object != NULL)
 819                 vm_object_clear(seg->object);
 820 }
 821 #endif
 822 
 823 void
 824 vm_free_memseg(struct vm *vm, int ident)
 825 {
 826         struct mem_seg *seg;
 827 
 828         KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
 829             ("%s: invalid memseg ident %d", __func__, ident));
 830 
 831         seg = &vm->mem_segs[ident];
 832         if (seg->object != NULL) {
 833                 vm_object_deallocate(seg->object);
 834                 bzero(seg, sizeof (struct mem_seg));
 835         }
 836 }
 837 
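     /*
      * Map 'len' bytes at offset 'first' of memory segment 'segid' into the
      * guest physical address space at 'gpa', recording the mapping in the
      * first free slot of 'vm->mem_maps[]'.
      *
      * Illustrative (hypothetical) caller sequence for wiring guest RAM:
      *
      *	vm_alloc_memseg(vm, 0, len, true);
      *	vm_mmap_memseg(vm, 0, 0, 0, len, VM_PROT_ALL, VM_MEMMAP_F_WIRED);
      */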
 838 int
 839 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
 840     size_t len, int prot, int flags)
 841 {
 842         struct mem_seg *seg;
 843         struct mem_map *m, *map;
 844         vm_ooffset_t last;
 845         int i, error;
 846 
 847         if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
 848                 return (EINVAL);
 849 
 850         if (flags & ~VM_MEMMAP_F_WIRED)
 851                 return (EINVAL);
 852 
 853         if (segid < 0 || segid >= VM_MAX_MEMSEGS)
 854                 return (EINVAL);
 855 
 856         seg = &vm->mem_segs[segid];
 857         if (seg->object == NULL)
 858                 return (EINVAL);
 859 
 860         last = first + len;
 861         if (first < 0 || first >= last || last > seg->len)
 862                 return (EINVAL);
 863 
 864         if ((gpa | first | last) & PAGE_MASK)
 865                 return (EINVAL);
 866 
 867         map = NULL;
 868         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 869                 m = &vm->mem_maps[i];
 870                 if (m->len == 0) {
 871                         map = m;
 872                         break;
 873                 }
 874         }
 875 
 876         if (map == NULL)
 877                 return (ENOSPC);
 878 
 879         error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
 880             len, 0, VMFS_NO_SPACE, prot, prot, 0);
 881         if (error != KERN_SUCCESS)
 882                 return (EFAULT);
 883 
 884         vm_object_reference(seg->object);
 885 
 886         if ((flags & VM_MEMMAP_F_WIRED) != 0) {
 887                 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
 888                     VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 889                 if (error != KERN_SUCCESS) {
 890                         vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
 891                         return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
 892                             EFAULT);
 893                 }
 894         }
 895 
 896         map->gpa = gpa;
 897         map->len = len;
 898         map->segoff = first;
 899         map->segid = segid;
 900         map->prot = prot;
 901         map->flags = flags;
 902         return (0);
 903 }
 904 
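     /*
      * Iterator over the populated guest memory mappings: find the mapping
      * with the lowest guest physical address that is >= '*gpa' and return
      * its attributes, or ENOENT when no further mappings exist.
      */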
 905 int
 906 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
 907     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
 908 {
 909         struct mem_map *mm, *mmnext;
 910         int i;
 911 
 912         mmnext = NULL;
 913         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 914                 mm = &vm->mem_maps[i];
 915                 if (mm->len == 0 || mm->gpa < *gpa)
 916                         continue;
 917                 if (mmnext == NULL || mm->gpa < mmnext->gpa)
 918                         mmnext = mm;
 919         }
 920 
 921         if (mmnext != NULL) {
 922                 *gpa = mmnext->gpa;
 923                 if (segid)
 924                         *segid = mmnext->segid;
 925                 if (segoff)
 926                         *segoff = mmnext->segoff;
 927                 if (len)
 928                         *len = mmnext->len;
 929                 if (prot)
 930                         *prot = mmnext->prot;
 931                 if (flags)
 932                         *flags = mmnext->flags;
 933                 return (0);
 934         } else {
 935                 return (ENOENT);
 936         }
 937 }
 938 
 939 static void
 940 vm_free_memmap(struct vm *vm, int ident)
 941 {
 942         struct mem_map *mm;
 943         int error;
 944 
 945         mm = &vm->mem_maps[ident];
 946         if (mm->len) {
 947                 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
 948                     mm->gpa + mm->len);
 949                 KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
 950                     __func__, error));
 951                 bzero(mm, sizeof (struct mem_map));
 952         }
 953 }
 954 
 955 static __inline bool
 956 sysmem_mapping(struct vm *vm, struct mem_map *mm)
 957 {
 958 
 959         if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
 960                 return (true);
 961         else
 962                 return (false);
 963 }
 964 
 965 vm_paddr_t
 966 vmm_sysmem_maxaddr(struct vm *vm)
 967 {
 968         struct mem_map *mm;
 969         vm_paddr_t maxaddr;
 970         int i;
 971 
 972         maxaddr = 0;
 973         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 974                 mm = &vm->mem_maps[i];
 975                 if (sysmem_mapping(vm, mm)) {
 976                         if (maxaddr < mm->gpa + mm->len)
 977                                 maxaddr = mm->gpa + mm->len;
 978                 }
 979         }
 980         return (maxaddr);
 981 }
 982 
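     /*
      * Walk all wired system-memory mappings and establish (map == true) or
      * remove (map == false) the corresponding gpa->hpa translations in the
      * VM's IOMMU domain, invalidating cached translations afterwards.
      */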
 983 static void
 984 vm_iommu_modify(struct vm *vm, bool map)
 985 {
 986         int i, sz;
 987         vm_paddr_t gpa, hpa;
 988         struct mem_map *mm;
 989 #ifdef __FreeBSD__
 990         void *vp, *cookie, *host_domain;
 991 #else
 992         void *vp, *cookie, *host_domain __unused;
 993 #endif
 994 
 995         sz = PAGE_SIZE;
 996         host_domain = iommu_host_domain();
 997 
 998         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 999                 mm = &vm->mem_maps[i];
1000                 if (!sysmem_mapping(vm, mm))
1001                         continue;
1002 
1003                 if (map) {
1004                         KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
1005                             ("iommu map found invalid memmap %lx/%lx/%x",
1006                             mm->gpa, mm->len, mm->flags));
1007                         if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
1008                                 continue;
1009                         mm->flags |= VM_MEMMAP_F_IOMMU;
1010                 } else {
1011                         if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
1012                                 continue;
1013                         mm->flags &= ~VM_MEMMAP_F_IOMMU;
1014                         KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
1015                             ("iommu unmap found invalid memmap %lx/%lx/%x",
1016                             mm->gpa, mm->len, mm->flags));
1017                 }
1018 
1019                 gpa = mm->gpa;
1020                 while (gpa < mm->gpa + mm->len) {
1021                         vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
1022                             &cookie);
1023                         KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
1024                             vm_name(vm), gpa));
1025 
1026                         vm_gpa_release(cookie);
1027 
1028                         hpa = DMAP_TO_PHYS((uintptr_t)vp);
1029                         if (map) {
1030                                 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
1031 #ifdef __FreeBSD__
1032                                 iommu_remove_mapping(host_domain, hpa, sz);
1033 #endif
1034                         } else {
1035                                 iommu_remove_mapping(vm->iommu, gpa, sz);
1036 #ifdef __FreeBSD__
1037                                 iommu_create_mapping(host_domain, hpa, hpa, sz);
1038 #endif
1039                         }
1040 
1041                         gpa += PAGE_SIZE;
1042                 }
1043         }
1044 
1045         /*
1046          * Invalidate the cached translations associated with the domain
1047          * from which pages were removed.
1048          */
1049 #ifdef __FreeBSD__
1050         if (map)
1051                 iommu_invalidate_tlb(host_domain);
1052         else
1053                 iommu_invalidate_tlb(vm->iommu);
1054 #else
1055         iommu_invalidate_tlb(vm->iommu);
1056 #endif
1057 }
1058 
1059 #define vm_iommu_unmap(vm)      vm_iommu_modify((vm), false)
1060 #define vm_iommu_map(vm)        vm_iommu_modify((vm), true)
1061 
1062 int
1063 vm_unassign_pptdev(struct vm *vm, int pptfd)
1064 {
1065         int error;
1066 
1067         error = ppt_unassign_device(vm, pptfd);
1068         if (error)
1069                 return (error);
1070 
1071         if (ppt_assigned_devices(vm) == 0)
1072                 vm_iommu_unmap(vm);
1073 
1074         return (0);
1075 }
1076 
1077 int
1078 vm_assign_pptdev(struct vm *vm, int pptfd)
1079 {
1080         int error;
1081         vm_paddr_t maxaddr;
1082 
1083         /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1084         if (ppt_assigned_devices(vm) == 0) {
1085                 KASSERT(vm->iommu == NULL,
1086                     ("vm_assign_pptdev: iommu must be NULL"));
1087                 maxaddr = vmm_sysmem_maxaddr(vm);
1088                 vm->iommu = iommu_create_domain(maxaddr);
1089                 if (vm->iommu == NULL)
1090                         return (ENXIO);
1091                 vm_iommu_map(vm);
1092         }
1093 
1094         error = ppt_assign_device(vm, pptfd);
1095         return (error);
1096 }
1097 
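     /*
      * Wire the guest page backing 'gpa' and return a host virtual address
      * for it, along with a cookie used to release the hold.  The access
      * must not cross a page boundary.
      *
      * Sketch of the expected hold/release pairing (illustrative only):
      *
      *	void *cookie, *ptr;
      *	ptr = vm_gpa_hold(vm, vcpuid, gpa, size, VM_PROT_READ, &cookie);
      *	if (ptr != NULL) {
      *		... access the page through 'ptr' ...
      *		vm_gpa_release(cookie);
      *	}
      */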
1098 void *
1099 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
1100     void **cookie)
1101 {
1102         int i, count, pageoff;
1103         struct mem_map *mm;
1104         vm_page_t m;
1105 #ifdef INVARIANTS
1106         /*
1107          * All vcpus are frozen by ioctls that modify the memory map
1108          * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->mem_maps[]' stability is
1109          * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
1110          */
1111         int state;
1112         KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
1113             __func__, vcpuid));
1114         for (i = 0; i < vm->maxcpus; i++) {
1115                 if (vcpuid != -1 && vcpuid != i)
1116                         continue;
1117                 state = vcpu_get_state(vm, i, NULL);
1118                 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1119                     __func__, state));
1120         }
1121 #endif
1122         pageoff = gpa & PAGE_MASK;
1123         if (len > PAGE_SIZE - pageoff)
1124                 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1125 
1126         count = 0;
1127         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1128                 mm = &vm->mem_maps[i];
1129                 if (mm->len == 0) {
1130                         continue;
1131                 }
1132                 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1133                         count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1134                             trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1135                         break;
1136                 }
1137         }
1138 
1139         if (count == 1) {
1140                 *cookie = m;
1141                 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1142         } else {
1143                 *cookie = NULL;
1144                 return (NULL);
1145         }
1146 }
1147 
1148 void
1149 vm_gpa_release(void *cookie)
1150 {
1151         vm_page_t m = cookie;
1152 
1153         vm_page_unwire(m, PQ_ACTIVE);
1154 }
1155 
1156 int
1157 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1158 {
1159 
1160         if (vcpu < 0 || vcpu >= vm->maxcpus)
1161                 return (EINVAL);
1162 
1163         if (reg >= VM_REG_LAST)
1164                 return (EINVAL);
1165 
1166         return (VMGETREG(vm->cookie, vcpu, reg, retval));
1167 }
1168 
1169 int
1170 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1171 {
1172         struct vcpu *vcpu;
1173         int error;
1174 
1175         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1176                 return (EINVAL);
1177 
1178         if (reg >= VM_REG_LAST)
1179                 return (EINVAL);
1180 
1181         error = VMSETREG(vm->cookie, vcpuid, reg, val);
1182         if (error || reg != VM_REG_GUEST_RIP)
1183                 return (error);
1184 
1185         /* Set 'nextrip' to match the value of %rip */
1186         VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1187         vcpu = &vm->vcpu[vcpuid];
1188         vcpu->nextrip = val;
1189         return (0);
1190 }
1191 
1192 static bool
1193 is_descriptor_table(int reg)
1194 {
1195         switch (reg) {
1196         case VM_REG_GUEST_IDTR:
1197         case VM_REG_GUEST_GDTR:
1198                 return (true);
1199         default:
1200                 return (false);
1201         }
1202 }
1203 
1204 static bool
1205 is_segment_register(int reg)
1206 {
1207         switch (reg) {
1208         case VM_REG_GUEST_ES:
1209         case VM_REG_GUEST_CS:
1210         case VM_REG_GUEST_SS:
1211         case VM_REG_GUEST_DS:
1212         case VM_REG_GUEST_FS:
1213         case VM_REG_GUEST_GS:
1214         case VM_REG_GUEST_TR:
1215         case VM_REG_GUEST_LDTR:
1216                 return (true);
1217         default:
1218                 return (false);
1219         }
1220 }
1221 
1222 int
1223 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1224 {
1225 
1226         if (vcpu < 0 || vcpu >= vm->maxcpus)
1227                 return (EINVAL);
1228 
1229         if (!is_segment_register(reg) && !is_descriptor_table(reg))
1230                 return (EINVAL);
1231 
1232         return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1233 }
1234 
1235 int
1236 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1237 {
1238         if (vcpu < 0 || vcpu >= vm->maxcpus)
1239                 return (EINVAL);
1240 
1241         if (!is_segment_register(reg) && !is_descriptor_table(reg))
1242                 return (EINVAL);
1243 
1244         return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1245 }
1246 
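     /*
      * Swap the host FPU state out and the guest FPU/xsave state in prior
      * to entering the guest.
      */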
1247 static void
1248 restore_guest_fpustate(struct vcpu *vcpu)
1249 {
1250 
1251         /* flush host state to the pcb */
1252         fpuexit(curthread);
1253 
1254         /* restore guest FPU state */
1255         fpu_stop_emulating();
1256         fpurestore(vcpu->guestfpu);
1257 
1258         /* restore guest XCR0 if XSAVE is enabled in the host */
1259         if (rcr4() & CR4_XSAVE)
1260                 load_xcr(0, vcpu->guest_xcr0);
1261 
1262         /*
1263          * The FPU is now "dirty" with the guest's state so turn on emulation
1264          * to trap any access to the FPU by the host.
1265          */
1266         fpu_start_emulating();
1267 }
1268 
1269 static void
1270 save_guest_fpustate(struct vcpu *vcpu)
1271 {
1272 
1273         if ((rcr0() & CR0_TS) == 0)
1274                 panic("fpu emulation not enabled in host!");
1275 
1276         /* save guest XCR0 and restore host XCR0 */
1277         if (rcr4() & CR4_XSAVE) {
1278                 vcpu->guest_xcr0 = rxcr(0);
1279                 load_xcr(0, vmm_get_host_xcr0());
1280         }
1281 
1282         /* save guest FPU state */
1283         fpu_stop_emulating();
1284         fpusave(vcpu->guestfpu);
1285 #ifdef __FreeBSD__
1286         fpu_start_emulating();
1287 #else
1288         /*
1289          * When the host state has been restored, we should not re-enable
1290          * CR0.TS on illumos for eager FPU.
1291          */
1292 #endif
1293 }
1294 
1295 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
1296 
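     /*
      * Attempt to move a vcpu between the IDLE, FROZEN, RUNNING, and
      * SLEEPING states, enforcing the legal transitions documented below
      * and waking any waiters when the vcpu returns to IDLE.
      */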
1297 static int
1298 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1299     bool from_idle)
1300 {
1301         struct vcpu *vcpu;
1302         int error;
1303 
1304         vcpu = &vm->vcpu[vcpuid];
1305         vcpu_assert_locked(vcpu);
1306 
1307         /*
1308          * State transitions from the vmmdev_ioctl() must always begin from
1309          * the VCPU_IDLE state. This guarantees that there is only a single
1310          * ioctl() operating on a vcpu at any point.
1311          */
1312         if (from_idle) {
1313                 while (vcpu->state != VCPU_IDLE) {
1314                         vcpu->reqidle = 1;
1315                         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1316                         VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1317                             "idle requested", vcpu_state2str(vcpu->state));
1318 #ifdef __FreeBSD__
1319                         msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
1320 #else
1321                         cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1322 #endif
1323                 }
1324         } else {
1325                 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1326                     "vcpu idle state"));
1327         }
1328 
1329         if (vcpu->state == VCPU_RUNNING) {
1330                 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1331                     "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1332         } else {
1333                 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1334                     "vcpu that is not running", vcpu->hostcpu));
1335         }
1336 
1337         /*
1338          * The following state transitions are allowed:
1339          * IDLE -> FROZEN -> IDLE
1340          * FROZEN -> RUNNING -> FROZEN
1341          * FROZEN -> SLEEPING -> FROZEN
1342          */
1343         switch (vcpu->state) {
1344         case VCPU_IDLE:
1345         case VCPU_RUNNING:
1346         case VCPU_SLEEPING:
1347                 error = (newstate != VCPU_FROZEN);
1348                 break;
1349         case VCPU_FROZEN:
1350                 error = (newstate == VCPU_FROZEN);
1351                 break;
1352         default:
1353                 error = 1;
1354                 break;
1355         }
1356 
1357         if (newstate == VCPU_RUNNING) {
1358                 while (vcpu->runblock != 0) {
1359 #ifdef __FreeBSD__
1360                         msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
1361 #else
1362                         cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1363 #endif
1364                 }
1365         }
1366 
1367         if (error)
1368                 return (EBUSY);
1369 
1370         VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1371             vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1372 
1373         vcpu->state = newstate;
1374         if (newstate == VCPU_RUNNING)
1375                 vcpu->hostcpu = curcpu;
1376         else
1377                 vcpu->hostcpu = NOCPU;
1378 
1379         if (newstate == VCPU_IDLE ||
1380             (newstate == VCPU_FROZEN && vcpu->runblock != 0)) {
1381 #ifdef __FreeBSD__
1382                 wakeup(&vcpu->state);
1383 #else
1384                 cv_broadcast(&vcpu->state_cv);
1385 #endif
1386         }
1387 
1388         return (0);
1389 }
1390 
1391 static void
1392 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1393 {
1394         int error;
1395 
1396         if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1397                 panic("Error %d setting state to %d\n", error, newstate);
1398 }
1399 
1400 static void
1401 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1402 {
1403         int error;
1404 
1405         if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1406                 panic("Error %d setting state to %d", error, newstate);
1407 }
1408 
1409 /*
1410  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1411  */
1412 static int
1413 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1414 {
1415         struct vcpu *vcpu;
1416 #ifdef __FreeBSD__
1417         const char *wmesg;
1418 #else
1419         const char *wmesg __unused;
1420 #endif
1421         int t, vcpu_halted, vm_halted;
1422 
1423         KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1424 
1425         vcpu = &vm->vcpu[vcpuid];
1426         vcpu_halted = 0;
1427         vm_halted = 0;
1428 
1429         vcpu_lock(vcpu);
1430         while (1) {
1431                 /*
1432                  * Do a final check for pending NMI or interrupts before
1433                  * really putting this thread to sleep. Also check for
1434                  * software events that would cause this vcpu to wake up.
1435                  *
1436                  * These interrupts/events could have happened after the
1437                  * vcpu returned from VMRUN() and before it acquired the
1438                  * vcpu lock above.
1439                  */
1440                 if (vm->suspend || vcpu->reqidle)
1441                         break;
1442                 if (vm_nmi_pending(vm, vcpuid))
1443                         break;
1444                 if (!intr_disabled) {
1445                         if (vm_extint_pending(vm, vcpuid) ||
1446                             vlapic_pending_intr(vcpu->vlapic, NULL)) {
1447                                 break;
1448                         }
1449                 }
1450 
1451                 /* Don't go to sleep if the vcpu thread needs to yield */
1452                 if (vcpu_should_yield(vm, vcpuid))
1453                         break;
1454 
1455                 if (vcpu_debugged(vm, vcpuid))
1456                         break;
1457 
1458                 /*
1459                  * Some Linux guests implement "halt" by having all vcpus
1460                  * execute HLT with interrupts disabled. 'halted_cpus' keeps
1461                  * track of the vcpus that have entered this state. When all
1462                  * vcpus enter the halted state the virtual machine is halted.
1463                  */
1464                 if (intr_disabled) {
1465                         wmesg = "vmhalt";
1466                         VCPU_CTR0(vm, vcpuid, "Halted");
1467                         if (!vcpu_halted && halt_detection_enabled) {
1468                                 vcpu_halted = 1;
1469                                 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1470                         }
1471                         if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1472                                 vm_halted = 1;
1473                                 break;
1474                         }
1475                 } else {
1476                         wmesg = "vmidle";
1477                 }
1478 
1479                 t = ticks;
1480                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1481 #ifdef __FreeBSD__
1482                 /*
1483                  * XXX msleep_spin() cannot be interrupted by signals so
1484                  * wake up periodically to check pending signals.
1485                  */
1486                 msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
1487 #else
1488                 /*
1489                  * Fortunately, cv_wait_sig can be interrupted by signals, so
1490                  * there is no need to periodically wake up.
1491                  */
1492                 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1493 #endif
1494                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1495                 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1496         }
1497 
1498         if (vcpu_halted)
1499                 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1500 
1501         vcpu_unlock(vcpu);
1502 
1503         if (vm_halted)
1504                 vm_suspend(vm, VM_SUSPEND_HALT);
1505 
1506         return (0);
1507 }
1508 
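     /*
      * Handle a nested page fault taken during guest execution: first try
      * accessed/dirty bit emulation and otherwise fault the page in via
      * the vmspace backing the guest.
      */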
1509 static int
1510 vm_handle_paging(struct vm *vm, int vcpuid)
1511 {
1512         int rv, ftype;
1513         struct vm_map *map;
1514         struct vcpu *vcpu;
1515         struct vm_exit *vme;
1516 
1517         vcpu = &vm->vcpu[vcpuid];
1518         vme = &vcpu->exitinfo;
1519 
1520         KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1521             __func__, vme->inst_length));
1522 
1523         ftype = vme->u.paging.fault_type;
1524         KASSERT(ftype == VM_PROT_READ ||
1525             ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1526             ("vm_handle_paging: invalid fault_type %d", ftype));
1527 
1528         if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1529                 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1530                     vme->u.paging.gpa, ftype);
1531                 if (rv == 0) {
1532                         VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
1533                             ftype == VM_PROT_READ ? "accessed" : "dirty",
1534                             vme->u.paging.gpa);
1535                         goto done;
1536                 }
1537         }
1538 
1539         map = &vm->vmspace->vm_map;
1540         rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1541 
1542         VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1543             "ftype = %d", rv, vme->u.paging.gpa, ftype);
1544 
1545         if (rv != KERN_SUCCESS)
1546                 return (EFAULT);
1547 done:
1548         return (0);
1549 }
1550 
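     /*
      * Dispatch MMIO reads (and, in the write routine below, MMIO writes)
      * for the in-kernel emulated devices: local APIC, IOAPIC, and HPET.
      * ESRCH is returned for addresses none of them claim.
      */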
1551 int
1552 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1553     int rsize)
1554 {
1555         int err = ESRCH;
1556 
1557         if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1558                 err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1559         } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1560                 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1561         } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1562                 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1563         }
1564 
1565         return (err);
1566 }
1567 
1568 int
1569 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1570     int wsize)
1571 {
1572         int err = ESRCH;
1573 
1574         if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1575                 err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1576         } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1577                 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1578         } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1579                 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1580         }
1581 
1582         return (err);
1583 }
1584 
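     /*
      * Handle an exit caused by a guest access to emulated MMIO: fetch and
      * decode the faulting instruction, then emulate it against the
      * in-kernel devices, falling back to userspace when necessary.
      */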
1585 static int
1586 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1587 {
1588         struct vie *vie;
1589         struct vcpu *vcpu;
1590         struct vm_exit *vme;
1591         uint64_t inst_addr;
1592         int error, fault, cs_d;
1593 
1594         vcpu = &vm->vcpu[vcpuid];
1595         vme = &vcpu->exitinfo;
1596         vie = vcpu->vie_ctx;
1597 
1598         KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1599             __func__, vme->inst_length));
1600 
1601         inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1602         cs_d = vme->u.mmio_emul.cs_d;
1603 
1604         VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1605             vme->u.mmio_emul.gpa);
1606 
1607         /* Fetch the faulting instruction */
1608         if (vie_needs_fetch(vie)) {
1609                 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1610                     &fault);
1611                 if (error != 0) {
1612                         return (error);
1613                 } else if (fault) {
1614                         /*
1615                          * If a fault was encountered during the instruction fetch,
1616                          * the appropriate exception will already have been asserted
1617                          * for injection at the next entry.  No further work is required.
1618                          */
1619                         return (0);
1620                 }
1621         }
1622 
1623         if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1624                 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1625                     inst_addr);
1626                 /* Dump (unrecognized) instruction bytes in userspace */
1627                 vie_fallback_exitinfo(vie, vme);
1628                 return (-1);
1629         }
1630         if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1631             vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1632                 /* Decoded GLA does not match GLA from VM exit state */
1633                 vie_fallback_exitinfo(vie, vme);
1634                 return (-1);
1635         }
1636 
1637 repeat:
1638         error = vie_emulate_mmio(vie, vm, vcpuid);
1639         if (error < 0) {
1640                 /*
1641                  * MMIO not handled by any of the in-kernel-emulated devices, so
1642                  * make a trip out to userspace for it.
1643                  */
1644                 vie_exitinfo(vie, vme);
1645         } else if (error == EAGAIN) {
1646                 /*
1647                  * Continue emulating the rep-prefixed instruction, which has
1648                  * not yet completed all of its iterations.
1649                  *
1650                  * Since the instruction may be emulated entirely in-kernel
1651                  * with a high repetition count (causing a tight spin), it
1652                  * must defer to any pending yield conditions.
1653                  */
1654                 if (!vcpu_should_yield(vm, vcpuid)) {
1655                         goto repeat;
1656                 } else {
1657                         /*
1658                          * Defer to the contending load by making a trip to
1659                          * userspace with a no-op (BOGUS) exit reason.
1660                          */
1661                         vie_reset(vie);
1662                         vme->exitcode = VM_EXITCODE_BOGUS;
1663                         return (-1);
1664                 }
1665         } else if (error == 0) {
1666                 /* Update %rip now that instruction has been emulated */
1667                 vie_advance_pc(vie, &vcpu->nextrip);
1668         }
1669         return (error);
1670 }
1671 
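/*
 * Handle an exit for an in/out instruction: emulate the port access against
 * the in-kernel handlers where possible, otherwise exit to userspace with
 * the state required to complete it there.
 */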
1672 static int
1673 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1674 {
1675         struct vcpu *vcpu;
1676         struct vie *vie;
1677         int err;
1678 
1679         vcpu = &vm->vcpu[vcpuid];
1680         vie = vcpu->vie_ctx;
1681 
1682 repeat:
1683         err = vie_emulate_inout(vie, vm, vcpuid);
1684 
1685         if (err < 0) {
1686                 /*
1687                  * In/out not handled by any of the in-kernel-emulated devices,
1688                  * so make a trip out to userspace for it.
1689                  */
1690                 vie_exitinfo(vie, vme);
1691                 return (err);
1692         } else if (err == EAGAIN) {
1693                 /*
1694                  * Continue emulating the rep-prefixed ins/outs instruction,
1695                  * which has not yet completed all of its iterations.
1696                  *
1697                  * Since the instruction may be emulated entirely in-kernel
1698                  * with a high repetition count (causing a tight spin), it
1699                  * must defer to any pending yield conditions.
1700                  */
1701                 if (!vcpu_should_yield(vm, vcpuid)) {
1702                         goto repeat;
1703                 } else {
1704                         /*
1705                          * Defer to the contending load by making a trip to
1706                          * userspace with a no-op (BOGUS) exit reason.
1707                          */
1708                         vie_reset(vie);
1709                         vme->exitcode = VM_EXITCODE_BOGUS;
1710                         return (-1);
1711                 }
1712         } else if (err != 0) {
1713                 /* Emulation failure.  Bail all the way out to userspace. */
1714                 vme->exitcode = VM_EXITCODE_INST_EMUL;
1715                 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1716                 return (-1);
1717         }
1718 
1719         vie_advance_pc(vie, &vcpu->nextrip);
1720         return (0);
1721 }
1722 
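/*
 * Handle an exit taken because the VM is being suspended: mark this vCPU as
 * suspended, wait for the remaining active vCPUs to do the same, and then
 * return to userspace.
 */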
1723 static int
1724 vm_handle_suspend(struct vm *vm, int vcpuid)
1725 {
1726 #ifdef __FreeBSD__
1727         int error, i;
1728         struct vcpu *vcpu;
1729         struct thread *td;
1730 
1731         error = 0;
1732         vcpu = &vm->vcpu[vcpuid];
1733         td = curthread;
1734 #else
1735         int i;
1736         struct vcpu *vcpu;
1737 
1738         vcpu = &vm->vcpu[vcpuid];
1739 #endif
1740 
1741         CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1742 
1743 #ifdef __FreeBSD__
1744         /*
1745          * Wait until all 'active_cpus' have suspended themselves.
1746          *
1747          * Since a VM may be suspended at any time including when one or
1748          * more vcpus are doing a rendezvous we need to call the rendezvous
1749          * handler while we are waiting to prevent a deadlock.
1750          */
1751         vcpu_lock(vcpu);
1752         while (error == 0) {
1753                 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1754                         VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1755                         break;
1756                 }
1757 
1758                 if (vm->rendezvous_func == NULL) {
1759                         VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1760                         vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1761                         msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1762                         vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1763                         if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
1764                                 vcpu_unlock(vcpu);
1765                                 error = thread_check_susp(td, false);
1766                                 vcpu_lock(vcpu);
1767                         }
1768                 } else {
1769                         VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1770                         vcpu_unlock(vcpu);
1771                         error = vm_handle_rendezvous(vm, vcpuid);
1772                         vcpu_lock(vcpu);
1773                 }
1774         }
1775         vcpu_unlock(vcpu);
1776 #else
1777         vcpu_lock(vcpu);
1778         while (1) {
1779                 int rc;
1780 
1781                 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1782                         VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1783                         break;
1784                 }
1785 
1786                 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1787                 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1788                     TR_CLOCK_TICK);
1789                 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1790 
1791                 /*
1792                  * If the userspace process driving the instance is killed, any
1793                  * vCPUs yet to be marked suspended (because they are not
1794                  * VM_RUN-ing in the kernel presently) will never reach that
1795                  * state.
1796                  *
1797                  * To avoid vm_handle_suspend() getting stuck in the kernel
1798                  * waiting for those vCPUs, offer a bail-out even though it
1799                  * means returning without all vCPUs in a suspended state.
1800                  */
1801                 if (rc <= 0) {
1802                         if ((curproc->p_flag & SEXITING) != 0) {
1803                                 break;
1804                         }
1805                 }
1806         }
1807         vcpu_unlock(vcpu);
1808 
1809 #endif
1810 
1811         /*
1812          * Wakeup the other sleeping vcpus and return to userspace.
1813          */
1814         for (i = 0; i < vm->maxcpus; i++) {
1815                 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1816                         vcpu_notify_event(vm, i);
1817                 }
1818         }
1819 
1820         return (-1);
1821 }
1822 
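/*
 * Acknowledge a request for this vCPU to idle and exit to userspace so that
 * it can be quiesced.
 */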
1823 static int
1824 vm_handle_reqidle(struct vm *vm, int vcpuid)
1825 {
1826         struct vcpu *vcpu = &vm->vcpu[vcpuid];
1827 
1828         vcpu_lock(vcpu);
1829         KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1830         vcpu->reqidle = 0;
1831         vcpu_unlock(vcpu);
1832         return (-1);
1833 }
1834 
1835 #ifndef __FreeBSD__
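/*
 * Handle guest MSR writes which can be serviced in-kernel.  A write to the
 * TSC is recorded as an offset from the current host TSC; any other MSR is
 * left to userspace to handle.
 */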
1836 static int
1837 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1838 {
1839         struct vcpu *cpu = &vm->vcpu[vcpuid];
1840         const uint32_t code = vme->u.msr.code;
1841         const uint64_t val = vme->u.msr.wval;
1842 
1843         switch (code) {
1844         case MSR_TSC:
1845                 cpu->tsc_offset = val - rdtsc();
1846                 return (0);
1847         }
1848 
1849         return (-1);
1850 }
1851 #endif /* __FreeBSD__ */
1852 
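/*
 * Latch a request to spin up an AP at the given %rip.  The request is later
 * reported to userspace as a VM_EXITCODE_SPINUP_AP exit on the boot vCPU by
 * vm_loop_checks().
 */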
1853 void
1854 vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip)
1855 {
1856         if (vm->sipi_req) {
1857                 /* This should never occur if userspace is doing its job. */
1858                 vm->stats.sipi_supersede++;
1859         }
1860         vm->sipi_req = true;
1861         vm->sipi_req_vcpu = req_vcpuid;
1862         vm->sipi_req_rip = req_rip;
1863 }
1864 
1865 int
1866 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1867 {
1868         int i;
1869 
1870         if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1871                 return (EINVAL);
1872 
1873         if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1874                 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1875                     vm->suspend, how);
1876                 return (EALREADY);
1877         }
1878 
1879         VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1880 
1881         /*
1882          * Notify all active vcpus that they are now suspended.
1883          */
1884         for (i = 0; i < vm->maxcpus; i++) {
1885                 if (CPU_ISSET(i, &vm->active_cpus))
1886                         vcpu_notify_event(vm, i);
1887         }
1888 
1889         return (0);
1890 }
1891 
1892 void
1893 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1894 {
1895         struct vm_exit *vmexit;
1896 
1897         KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1898             ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1899 
1900         vmexit = vm_exitinfo(vm, vcpuid);
1901         vmexit->rip = rip;
1902         vmexit->inst_length = 0;
1903         vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1904         vmexit->u.suspended.how = vm->suspend;
1905 }
1906 
1907 void
1908 vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
1909 {
1910         struct vm_exit *vmexit;
1911 
1912         vmexit = vm_exitinfo(vm, vcpuid);
1913         vmexit->rip = rip;
1914         vmexit->inst_length = 0;
1915         vmexit->exitcode = VM_EXITCODE_DEBUG;
1916 }
1917 
1918 void
1919 vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip)
1920 {
1921         struct vm_exit *vmexit;
1922 
1923         vmexit = vm_exitinfo(vm, vcpuid);
1924         vmexit->rip = rip;
1925         vmexit->inst_length = 0;
1926         vmexit->exitcode = VM_EXITCODE_RUNBLOCK;
1927         vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1);
1928 }
1929 
1930 void
1931 vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
1932 {
1933         struct vm_exit *vmexit;
1934 
1935         vmexit = vm_exitinfo(vm, vcpuid);
1936         vmexit->rip = rip;
1937         vmexit->inst_length = 0;
1938         vmexit->exitcode = VM_EXITCODE_REQIDLE;
1939         vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
1940 }
1941 
1942 void
1943 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
1944 {
1945         struct vm_exit *vmexit;
1946 
1947         vmexit = vm_exitinfo(vm, vcpuid);
1948         vmexit->rip = rip;
1949         vmexit->inst_length = 0;
1950         vmexit->exitcode = VM_EXITCODE_BOGUS;
1951         vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
1952 }
1953 
1954 #ifndef __FreeBSD__
1955 /*
1956  * Some vmm resources, such as the lapic, may have CPU-specific resources
1957  * allocated to them which would benefit from migration onto the host CPU which
1958  * is processing the vcpu state.
1959  */
1960 static void
1961 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
1962 {
1963         /*
1964          * Localizing cyclic resources requires acquisition of cpu_lock, and
1965          * doing so with kpreempt disabled is a recipe for deadlock disaster.
1966          */
1967         VERIFY(curthread->t_preempt == 0);
1968 
1969         /*
1970          * Do not bother with localization if this vCPU is about to return to
1971          * the host CPU it was last localized to.
1972          */
1973         if (vcpu->lastloccpu == curcpu)
1974                 return;
1975 
1976         /*
1977          * Localize system-wide resources to the primary boot vCPU.  While any
1978          * of the other vCPUs may access them, it keeps the potential interrupt
1979          * footprint constrained to CPUs involved with this instance.
1980          */
1981         if (vcpu == &vm->vcpu[0]) {
1982                 vhpet_localize_resources(vm->vhpet);
1983                 vrtc_localize_resources(vm->vrtc);
1984                 vatpit_localize_resources(vm->vatpit);
1985         }
1986 
1987         vlapic_localize_resources(vcpu->vlapic);
1988 
1989         vcpu->lastloccpu = curcpu;
1990 }
1991 
1992 static void
1993 vmm_savectx(void *arg)
1994 {
1995         vm_thread_ctx_t *vtc = arg;
1996         struct vm *vm = vtc->vtc_vm;
1997         const int vcpuid = vtc->vtc_vcpuid;
1998 
1999         if (ops->vmsavectx != NULL) {
2000                 ops->vmsavectx(vm->cookie, vcpuid);
2001         }
2002 
2003         /*
2004          * If the CPU holds the restored guest FPU state, save it and restore
2005          * the host FPU state before this thread goes off-cpu.
2006          */
2007         if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2008                 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2009 
2010                 save_guest_fpustate(vcpu);
2011                 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2012         }
2013 }
2014 
2015 static void
2016 vmm_restorectx(void *arg)
2017 {
2018         vm_thread_ctx_t *vtc = arg;
2019         struct vm *vm = vtc->vtc_vm;
2020         const int vcpuid = vtc->vtc_vcpuid;
2021 
2022         /*
2023          * When coming back on-cpu, only restore the guest FPU status if the
2024          * thread is in a context marked as requiring it.  This should be rare,
2025          * occurring only when a future logic error results in a voluntary
2026          * sleep during the VMRUN critical section.
2027          *
2028          * The common case will result in elision of the guest FPU state
2029          * restoration, deferring that action until it is clearly necessary
2030          * during vm_run.
2031          */
2032         VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2033         if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2034                 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2035 
2036                 restore_guest_fpustate(vcpu);
2037                 vtc->vtc_status |= VTCS_FPU_RESTORED;
2038         }
2039 
2040         if (ops->vmrestorectx != NULL) {
2041                 ops->vmrestorectx(vm->cookie, vcpuid);
2042         }
2044 }
2045 
2046 /*
2047  * If we're in removectx(), we might still have state to tidy up.
2048  */
2049 static void
2050 vmm_freectx(void *arg, int isexec)
2051 {
2052         vmm_savectx(arg);
2053 }
2054 
2055 #endif /* __FreeBSD__ */
2056 
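/*
 * Perform actions requested by userspace prior to (re)entering the guest,
 * such as completing an MMIO or in/out operation whose data was fulfilled in
 * userspace, or discarding in-progress instruction emulation state.
 */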
2057 static int
2058 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2059     struct vm_exit *vme)
2060 {
2061         struct vcpu *vcpu;
2062         struct vie *vie;
2063         int err;
2064 
2065         vcpu = &vm->vcpu[vcpuid];
2066         vie = vcpu->vie_ctx;
2067         err = 0;
2068 
2069         switch (entry->cmd) {
2070         case VEC_DEFAULT:
2071                 return (0);
2072         case VEC_DISCARD_INSTR:
2073                 vie_reset(vie);
2074                 return (0);
2075         case VEC_COMPLETE_MMIO:
2076                 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2077                 if (err == 0) {
2078                         err = vie_emulate_mmio(vie, vm, vcpuid);
2079                         if (err == 0) {
2080                                 vie_advance_pc(vie, &vcpu->nextrip);
2081                         } else if (err < 0) {
2082                                 vie_exitinfo(vie, vme);
2083                         } else if (err == EAGAIN) {
2084                                 /*
2085                                  * Clear the instruction emulation state in
2086                                  * order to re-enter VM context and continue
2087                                  * this 'rep <instruction>'
2088                                  */
2089                                 vie_reset(vie);
2090                                 err = 0;
2091                         }
2092                 }
2093                 break;
2094         case VEC_COMPLETE_INOUT:
2095                 err = vie_fulfill_inout(vie, &entry->u.inout);
2096                 if (err == 0) {
2097                         err = vie_emulate_inout(vie, vm, vcpuid);
2098                         if (err == 0) {
2099                                 vie_advance_pc(vie, &vcpu->nextrip);
2100                         } else if (err < 0) {
2101                                 vie_exitinfo(vie, vme);
2102                         } else if (err == EAGAIN) {
2103                                 /*
2104                                  * Clear the instruction emulation state in
2105                                  * order to re-enter VM context and continue
2106                                  * this 'rep ins/outs'
2107                                  */
2108                                 vie_reset(vie);
2109                                 err = 0;
2110                         }
2111                 }
2112                 break;
2113         default:
2114                 return (EINVAL);
2115         }
2116         return (err);
2117 }
2118 
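/*
 * Checks performed at the top of each vm_run() loop iteration: exit to
 * userspace if instruction emulation is still awaiting fulfillment there, or
 * if a latched SIPI request must be reported as a VM_EXITCODE_SPINUP_AP exit.
 */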
2119 static int
2120 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2121 {
2122         struct vie *vie;
2123 
2124         vie = vm->vcpu[vcpuid].vie_ctx;
2125 
2126         if (vie_pending(vie)) {
2127                 /*
2128                  * Userspace has not fulfilled the pending needs of the
2129                  * instruction emulation, so bail back out.
2130                  */
2131                 vie_exitinfo(vie, vme);
2132                 return (-1);
2133         }
2134 
2135         if (vcpuid == 0 && vm->sipi_req) {
2136                 /* The boot vCPU has sent a SIPI to one of the other CPUs */
2137                 vme->exitcode = VM_EXITCODE_SPINUP_AP;
2138                 vme->u.spinup_ap.vcpu = vm->sipi_req_vcpu;
2139                 vme->u.spinup_ap.rip = vm->sipi_req_rip;
2140 
2141                 vm->sipi_req = false;
2142                 vm->sipi_req_vcpu = 0;
2143                 vm->sipi_req_rip = 0;
2144                 return (-1);
2145         }
2146 
2147         return (0);
2148 }
2149 
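/*
 * Run a vCPU: apply any entry actions, enter guest context via VMRUN, and
 * handle the resulting exits in-kernel where possible, looping until an exit
 * must be serviced by userspace or an error occurs.
 */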
2150 int
2151 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2152 {
2153         struct vm_eventinfo evinfo;
2154         int error;
2155         struct vcpu *vcpu;
2156 #ifdef  __FreeBSD__
2157         struct pcb *pcb;
2158 #endif
2159         uint64_t tscval;
2160         struct vm_exit *vme;
2161         bool intr_disabled;
2162         pmap_t pmap;
2163 #ifndef __FreeBSD__
2164         vm_thread_ctx_t vtc;
2165         int affinity_type = CPU_CURRENT;
2166 #endif
2167 
2168         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2169                 return (EINVAL);
2170 
2171         if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2172                 return (EINVAL);
2173 
2174         if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2175                 return (EINVAL);
2176 
2177         pmap = vmspace_pmap(vm->vmspace);
2178         vcpu = &vm->vcpu[vcpuid];
2179         vme = &vcpu->exitinfo;
2180         evinfo.rptr = &vcpu->runblock;
2181         evinfo.sptr = &vm->suspend;
2182         evinfo.iptr = &vcpu->reqidle;
2183 
2184 #ifndef __FreeBSD__
2185         vtc.vtc_vm = vm;
2186         vtc.vtc_vcpuid = vcpuid;
2187         vtc.vtc_status = 0;
2188 
2189         installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2190             NULL, vmm_freectx);
2191 #endif
2192 
2193         error = vm_entry_actions(vm, vcpuid, entry, vme);
2194         if (error != 0) {
2195                 goto exit;
2196         }
2197 
2198 restart:
2199         error = vm_loop_checks(vm, vcpuid, vme);
2200         if (error != 0) {
2201                 goto exit;
2202         }
2203 
2204 #ifndef __FreeBSD__
2205         thread_affinity_set(curthread, affinity_type);
2206         /*
2207          * Resource localization should happen after the CPU affinity for the
2208          * thread has been set to ensure that access from restricted contexts,
2209          * such as VMX-accelerated APIC operations, can occur without inducing
2210          * cyclic cross-calls.
2211          *
2212          * This must be done prior to disabling kpreempt via critical_enter().
2213          */
2214         vm_localize_resources(vm, vcpu);
2215 
2216         affinity_type = CPU_CURRENT;
2217 #endif
2218 
2219         critical_enter();
2220 
2221         KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2222             ("vm_run: absurd pm_active"));
2223 
2224         tscval = rdtsc();
2225 
2226 #ifdef  __FreeBSD__
2227         pcb = PCPU_GET(curpcb);
2228         set_pcb_flags(pcb, PCB_FULL_IRET);
2229 #else
2230         /* Force a trip through update_sregs to reload %fs/%gs and friends */
2231         PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2232 #endif
2233 
2234 #ifdef  __FreeBSD__
2235         restore_guest_fpustate(vcpu);
2236 #else
2237         if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2238                 restore_guest_fpustate(vcpu);
2239                 vtc.vtc_status |= VTCS_FPU_RESTORED;
2240         }
2241         vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2242 #endif
2243 
2244         vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2245         error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
2246         vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2247 
2248 #ifdef  __FreeBSD__
2249         save_guest_fpustate(vcpu);
2250 #else
2251         vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2252 #endif
2253 
2254 #ifndef __FreeBSD__
2255         /*
2256          * Once clear of the delicate contexts comprising the VM_RUN handler,
2257          * thread CPU affinity can be loosened while other processing occurs.
2258          */
2259         thread_affinity_clear(curthread);
2260 #endif
2261 
2262         vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
2263 
2264         critical_exit();
2265 
2266         if (error != 0) {
2267                 /* Communicate out any error from VMRUN() above */
2268                 goto exit;
2269         }
2270 
2271         vcpu->nextrip = vme->rip + vme->inst_length;
2272         switch (vme->exitcode) {
2273         case VM_EXITCODE_REQIDLE:
2274                 error = vm_handle_reqidle(vm, vcpuid);
2275                 break;
2276         case VM_EXITCODE_SUSPENDED:
2277                 error = vm_handle_suspend(vm, vcpuid);
2278                 break;
2279         case VM_EXITCODE_IOAPIC_EOI:
2280                 vioapic_process_eoi(vm, vcpuid,
2281                     vme->u.ioapic_eoi.vector);
2282                 break;
2283         case VM_EXITCODE_RUNBLOCK:
2284                 break;
2285         case VM_EXITCODE_HLT:
2286                 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2287                 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2288                 break;
2289         case VM_EXITCODE_PAGING:
2290                 error = vm_handle_paging(vm, vcpuid);
2291                 break;
2292         case VM_EXITCODE_MMIO_EMUL:
2293                 error = vm_handle_mmio_emul(vm, vcpuid);
2294                 break;
2295         case VM_EXITCODE_INOUT:
2296                 error = vm_handle_inout(vm, vcpuid, vme);
2297                 break;
2298         case VM_EXITCODE_MONITOR:
2299         case VM_EXITCODE_MWAIT:
2300         case VM_EXITCODE_VMINSN:
2301                 vm_inject_ud(vm, vcpuid);
2302                 break;
2303 #ifndef __FreeBSD__
2304         case VM_EXITCODE_WRMSR:
2305                 if (vm_handle_wrmsr(vm, vcpuid, vme) != 0) {
2306                         error = -1;
2307                 }
2308                 break;
2309 
2310         case VM_EXITCODE_HT: {
2311                 affinity_type = CPU_BEST;
2312                 break;
2313         }
2314 #endif
2315 
2316         case VM_EXITCODE_MTRAP:
2317                 vm_suspend_cpu(vm, vcpuid);
2318                 error = -1;
2319                 break;
2320         default:
2321                 /* handled in userland */
2322                 error = -1;
2323                 break;
2324         }
2325 
2326         if (error == 0) {
2327                 /* VM exit conditions handled in-kernel, continue running */
2328                 goto restart;
2329         }
2330 
2331 exit:
2332 #ifndef __FreeBSD__
2333         removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2334             NULL, vmm_freectx);
2335 #endif
2336 
2337         VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2338 
2339         return (error);
2340 }
2341 
2342 int
2343 vm_restart_instruction(void *arg, int vcpuid)
2344 {
2345         struct vm *vm;
2346         struct vcpu *vcpu;
2347         enum vcpu_state state;
2348         uint64_t rip;
2349         int error;
2350 
2351         vm = arg;
2352         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2353                 return (EINVAL);
2354 
2355         vcpu = &vm->vcpu[vcpuid];
2356         state = vcpu_get_state(vm, vcpuid, NULL);
2357         if (state == VCPU_RUNNING) {
2358                 /*
2359                  * When a vcpu is "running" the next instruction is determined
2360                  * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2361                  * Thus setting 'inst_length' to zero will cause the current
2362                  * instruction to be restarted.
2363                  */
2364                 vcpu->exitinfo.inst_length = 0;
2365                 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2366                     "setting inst_length to zero", vcpu->exitinfo.rip);
2367         } else if (state == VCPU_FROZEN) {
2368                 /*
2369                  * When a vcpu is "frozen" it is outside the critical section
2370                  * around VMRUN() and 'nextrip' points to the next instruction.
2371                  * Thus instruction restart is achieved by setting 'nextrip'
2372                  * to the vcpu's %rip.
2373                  */
2374                 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2375                 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2376                 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2377                     "nextrip from %lx to %lx", vcpu->nextrip, rip);
2378                 vcpu->nextrip = rip;
2379         } else {
2380                 panic("%s: invalid state %d", __func__, state);
2381         }
2382         return (0);
2383 }
2384 
2385 int
2386 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2387 {
2388         struct vcpu *vcpu;
2389         int type, vector;
2390 
2391         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2392                 return (EINVAL);
2393 
2394         vcpu = &vm->vcpu[vcpuid];
2395 
2396         if (info & VM_INTINFO_VALID) {
2397                 type = info & VM_INTINFO_TYPE;
2398                 vector = info & 0xff;
2399                 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2400                         return (EINVAL);
2401                 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2402                         return (EINVAL);
2403                 if (info & VM_INTINFO_RSVD)
2404                         return (EINVAL);
2405         } else {
2406                 info = 0;
2407         }
2408         VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2409         vcpu->exitintinfo = info;
2410         return (0);
2411 }
2412 
2413 enum exc_class {
2414         EXC_BENIGN,
2415         EXC_CONTRIBUTORY,
2416         EXC_PAGEFAULT
2417 };
2418 
2419 #define IDT_VE  20      /* Virtualization Exception (Intel specific) */
2420 
2421 static enum exc_class
2422 exception_class(uint64_t info)
2423 {
2424         int type, vector;
2425 
2426         KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2427         type = info & VM_INTINFO_TYPE;
2428         vector = info & 0xff;
2429 
2430         /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2431         switch (type) {
2432         case VM_INTINFO_HWINTR:
2433         case VM_INTINFO_SWINTR:
2434         case VM_INTINFO_NMI:
2435                 return (EXC_BENIGN);
2436         default:
2437                 /*
2438                  * Hardware exception.
2439                  *
2440                  * SVM and VT-x use identical type values to represent NMI,
2441                  * hardware interrupt and software interrupt.
2442                  *
2443                  * SVM uses type '3' for all exceptions. VT-x uses type '3'
2444                  * for exceptions except #BP and #OF. #BP and #OF use a type
2445                  * value of '5' or '6'. Therefore we don't check for explicit
2446                  * values of 'type' to classify 'intinfo' into a hardware
2447                  * exception.
2448                  */
2449                 break;
2450         }
2451 
2452         switch (vector) {
2453         case IDT_PF:
2454         case IDT_VE:
2455                 return (EXC_PAGEFAULT);
2456         case IDT_DE:
2457         case IDT_TS:
2458         case IDT_NP:
2459         case IDT_SS:
2460         case IDT_GP:
2461                 return (EXC_CONTRIBUTORY);
2462         default:
2463                 return (EXC_BENIGN);
2464         }
2465 }
2466 
2467 static int
2468 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2469     uint64_t *retinfo)
2470 {
2471         enum exc_class exc1, exc2;
2472         int type1, vector1;
2473 
2474         KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2475         KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2476 
2477         /*
2478          * If an exception occurs while attempting to call the double-fault
2479          * handler the processor enters shutdown mode (aka triple fault).
2480          */
2481         type1 = info1 & VM_INTINFO_TYPE;
2482         vector1 = info1 & 0xff;
2483         if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2484                 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2485                     info1, info2);
2486                 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2487                 *retinfo = 0;
2488                 return (0);
2489         }
2490 
2491         /*
2492          * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2493          */
2494         exc1 = exception_class(info1);
2495         exc2 = exception_class(info2);
2496         if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2497             (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2498                 /* Convert nested fault into a double fault. */
2499                 *retinfo = IDT_DF;
2500                 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2501                 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2502         } else {
2503                 /* Handle exceptions serially */
2504                 *retinfo = info2;
2505         }
2506         return (1);
2507 }
2508 
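/*
 * Pack a vCPU's pending exception into the VM_INTINFO format used for event
 * injection.
 */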
2509 static uint64_t
2510 vcpu_exception_intinfo(struct vcpu *vcpu)
2511 {
2512         uint64_t info = 0;
2513 
2514         if (vcpu->exception_pending) {
2515                 info = vcpu->exc_vector & 0xff;
2516                 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2517                 if (vcpu->exc_errcode_valid) {
2518                         info |= VM_INTINFO_DEL_ERRCODE;
2519                         info |= (uint64_t)vcpu->exc_errcode << 32;
2520                 }
2521         }
2522         return (info);
2523 }
2524 
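/*
 * Gather the events (exit intinfo and/or a pending exception) to be injected
 * on the next guest entry, collapsing them into a double fault (or a
 * triple-fault suspend) when the nesting rules require it.
 */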
2525 int
2526 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2527 {
2528         struct vcpu *vcpu;
2529         uint64_t info1, info2;
2530         int valid;
2531 
2532         KASSERT(vcpuid >= 0 &&
2533             vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2534 
2535         vcpu = &vm->vcpu[vcpuid];
2536 
2537         info1 = vcpu->exitintinfo;
2538         vcpu->exitintinfo = 0;
2539 
2540         info2 = 0;
2541         if (vcpu->exception_pending) {
2542                 info2 = vcpu_exception_intinfo(vcpu);
2543                 vcpu->exception_pending = 0;
2544                 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2545                     vcpu->exc_vector, info2);
2546         }
2547 
2548         if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2549                 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2550         } else if (info1 & VM_INTINFO_VALID) {
2551                 *retinfo = info1;
2552                 valid = 1;
2553         } else if (info2 & VM_INTINFO_VALID) {
2554                 *retinfo = info2;
2555                 valid = 1;
2556         } else {
2557                 valid = 0;
2558         }
2559 
2560         if (valid) {
2561                 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2562                     "retinfo(%lx)", __func__, info1, info2, *retinfo);
2563         }
2564 
2565         return (valid);
2566 }
2567 
2568 int
2569 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2570 {
2571         struct vcpu *vcpu;
2572 
2573         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2574                 return (EINVAL);
2575 
2576         vcpu = &vm->vcpu[vcpuid];
2577         *info1 = vcpu->exitintinfo;
2578         *info2 = vcpu_exception_intinfo(vcpu);
2579         return (0);
2580 }
2581 
2582 int
2583 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2584     uint32_t errcode, int restart_instruction)
2585 {
2586         struct vcpu *vcpu;
2587         uint64_t regval;
2588         int error;
2589 
2590         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2591                 return (EINVAL);
2592 
2593         if (vector < 0 || vector >= 32)
2594                 return (EINVAL);
2595 
2596         /*
2597          * NMIs (which bear an exception vector of 2) are to be injected via
2598          * their own specialized path using vm_inject_nmi().
2599          */
2600         if (vector == 2) {
2601                 return (EINVAL);
2602         }
2603 
2604         /*
2605          * A double fault exception should never be injected directly into
2606          * the guest. It is a derived exception that results from specific
2607          * combinations of nested faults.
2608          */
2609         if (vector == IDT_DF)
2610                 return (EINVAL);
2611 
2612         vcpu = &vm->vcpu[vcpuid];
2613 
2614         if (vcpu->exception_pending) {
2615                 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2616                     "pending exception %d", vector, vcpu->exc_vector);
2617                 return (EBUSY);
2618         }
2619 
2620         if (errcode_valid) {
2621                 /*
2622                  * Exceptions don't deliver an error code in real mode.
2623                  */
2624                 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2625                 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2626                 if (!(regval & CR0_PE))
2627                         errcode_valid = 0;
2628         }
2629 
2630         /*
2631          * From section 26.6.1 "Interruptibility State" in Intel SDM:
2632          *
2633          * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2634          * one instruction or incurs an exception.
2635          */
2636         error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2637         KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2638             __func__, error));
2639 
2640         if (restart_instruction)
2641                 vm_restart_instruction(vm, vcpuid);
2642 
2643         vcpu->exception_pending = 1;
2644         vcpu->exc_vector = vector;
2645         vcpu->exc_errcode = errcode;
2646         vcpu->exc_errcode_valid = errcode_valid;
2647         VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2648         return (0);
2649 }
2650 
2651 void
2652 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2653     int errcode)
2654 {
2655         int error;
2656 
2657         error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2658             errcode, 1);
2659         KASSERT(error == 0, ("vm_inject_exception error %d", error));
2660 }
2661 
2662 void
2663 vm_inject_ud(struct vm *vm, int vcpuid)
2664 {
2665         vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2666 }
2667 
2668 void
2669 vm_inject_gp(struct vm *vm, int vcpuid)
2670 {
2671         vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2672 }
2673 
2674 void
2675 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2676 {
2677         vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2678 }
2679 
2680 void
2681 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2682 {
2683         vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2684 }
2685 
2686 void
2687 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2688 {
2689         int error;
2690 
2691         VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2692             error_code, cr2);
2693 
2694         error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2695         KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2696 
2697         vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2698 }
2699 
2700 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2701 
2702 int
2703 vm_inject_nmi(struct vm *vm, int vcpuid)
2704 {
2705         struct vcpu *vcpu;
2706 
2707         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2708                 return (EINVAL);
2709 
2710         vcpu = &vm->vcpu[vcpuid];
2711 
2712         vcpu->nmi_pending = 1;
2713         vcpu_notify_event(vm, vcpuid);
2714         return (0);
2715 }
2716 
2717 int
2718 vm_nmi_pending(struct vm *vm, int vcpuid)
2719 {
2720         struct vcpu *vcpu;
2721 
2722         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2723                 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2724 
2725         vcpu = &vm->vcpu[vcpuid];
2726 
2727         return (vcpu->nmi_pending);
2728 }
2729 
2730 void
2731 vm_nmi_clear(struct vm *vm, int vcpuid)
2732 {
2733         struct vcpu *vcpu;
2734 
2735         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2736                 panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
2737 
2738         vcpu = &vm->vcpu[vcpuid];
2739 
2740         if (vcpu->nmi_pending == 0)
2741                 panic("vm_nmi_clear: inconsistent nmi_pending state");
2742 
2743         vcpu->nmi_pending = 0;
2744         vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2745 }
2746 
2747 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2748 
2749 int
2750 vm_inject_extint(struct vm *vm, int vcpuid)
2751 {
2752         struct vcpu *vcpu;
2753 
2754         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2755                 return (EINVAL);
2756 
2757         vcpu = &vm->vcpu[vcpuid];
2758 
2759         vcpu->extint_pending = 1;
2760         vcpu_notify_event(vm, vcpuid);
2761         return (0);
2762 }
2763 
2764 int
2765 vm_extint_pending(struct vm *vm, int vcpuid)
2766 {
2767         struct vcpu *vcpu;
2768 
2769         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2770                 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2771 
2772         vcpu = &vm->vcpu[vcpuid];
2773 
2774         return (vcpu->extint_pending);
2775 }
2776 
2777 void
2778 vm_extint_clear(struct vm *vm, int vcpuid)
2779 {
2780         struct vcpu *vcpu;
2781 
2782         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2783                 panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
2784 
2785         vcpu = &vm->vcpu[vcpuid];
2786 
2787         if (vcpu->extint_pending == 0)
2788                 panic("vm_extint_clear: inconsistent extint_pending state");
2789 
2790         vcpu->extint_pending = 0;
2791         vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2792 }
2793 
2794 int
2795 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2796 {
2797         if (vcpu < 0 || vcpu >= vm->maxcpus)
2798                 return (EINVAL);
2799 
2800         if (type < 0 || type >= VM_CAP_MAX)
2801                 return (EINVAL);
2802 
2803         return (VMGETCAP(vm->cookie, vcpu, type, retval));
2804 }
2805 
2806 int
2807 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
2808 {
2809         if (vcpu < 0 || vcpu >= vm->maxcpus)
2810                 return (EINVAL);
2811 
2812         if (type < 0 || type >= VM_CAP_MAX)
2813                 return (EINVAL);
2814 
2815         return (VMSETCAP(vm->cookie, vcpu, type, val));
2816 }
2817 
2818 struct vlapic *
2819 vm_lapic(struct vm *vm, int cpu)
2820 {
2821         return (vm->vcpu[cpu].vlapic);
2822 }
2823 
2824 struct vioapic *
2825 vm_ioapic(struct vm *vm)
2826 {
2827 
2828         return (vm->vioapic);
2829 }
2830 
2831 struct vhpet *
2832 vm_hpet(struct vm *vm)
2833 {
2834 
2835         return (vm->vhpet);
2836 }
2837 
2838 #ifdef  __FreeBSD__
2839 bool
2840 vmm_is_pptdev(int bus, int slot, int func)
2841 {
2842         int b, f, i, n, s;
2843         char *val, *cp, *cp2;
2844         bool found;
2845 
2846         /*
2847          * XXX
2848          * The length of an environment variable is limited to 128 bytes which
2849          * puts an upper limit on the number of passthru devices that may be
2850          * specified using a single environment variable.
2851          *
2852          * Work around this by scanning multiple environment variable
2853          * names instead of a single one - yuck!
2854          */
2855         const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
2856 
2857         /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
2858         found = false;
2859         for (i = 0; names[i] != NULL && !found; i++) {
2860                 cp = val = kern_getenv(names[i]);
2861                 while (cp != NULL && *cp != '\0') {
2862                         if ((cp2 = strchr(cp, ' ')) != NULL)
2863                                 *cp2 = '\0';
2864 
2865                         n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
2866                         if (n == 3 && bus == b && slot == s && func == f) {
2867                                 found = true;
2868                                 break;
2869                         }
2870 
2871                         if (cp2 != NULL)
2872                                 *cp2++ = ' ';
2873 
2874                         cp = cp2;
2875                 }
2876                 freeenv(val);
2877         }
2878         return (found);
2879 }
2880 #endif
2881 
2882 void *
2883 vm_iommu_domain(struct vm *vm)
2884 {
2885 
2886         return (vm->iommu);
2887 }
2888 
2889 int
2890 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
2891     bool from_idle)
2892 {
2893         int error;
2894         struct vcpu *vcpu;
2895 
2896         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2897                 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
2898 
2899         vcpu = &vm->vcpu[vcpuid];
2900 
2901         vcpu_lock(vcpu);
2902         error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
2903         vcpu_unlock(vcpu);
2904 
2905         return (error);
2906 }
2907 
2908 enum vcpu_state
2909 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
2910 {
2911         struct vcpu *vcpu;
2912         enum vcpu_state state;
2913 
2914         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2915                 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
2916 
2917         vcpu = &vm->vcpu[vcpuid];
2918 
2919         vcpu_lock(vcpu);
2920         state = vcpu->state;
2921         if (hostcpu != NULL)
2922                 *hostcpu = vcpu->hostcpu;
2923         vcpu_unlock(vcpu);
2924 
2925         return (state);
2926 }
2927 
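/*
 * Force a running vCPU out of guest context and wait until it is no longer
 * in the VCPU_RUNNING state.  Paired with vcpu_unblock_run().
 */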
2928 void
2929 vcpu_block_run(struct vm *vm, int vcpuid)
2930 {
2931         struct vcpu *vcpu;
2932 
2933         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2934                 panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
2935 
2936         vcpu = &vm->vcpu[vcpuid];
2937 
2938         vcpu_lock(vcpu);
2939         vcpu->runblock++;
2940         if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) {
2941                 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2942         }
2943         while (vcpu->state == VCPU_RUNNING) {
2944 #ifdef __FreeBSD__
2945                 msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
2946 #else
2947                 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
2948 #endif
2949         }
2950         vcpu_unlock(vcpu);
2951 }
2952 
2953 void
2954 vcpu_unblock_run(struct vm *vm, int vcpuid)
2955 {
2956         struct vcpu *vcpu;
2957 
2958         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2959                 panic("vcpu_unblock_run: invalid vcpuid %d", vcpuid);
2960 
2961         vcpu = &vm->vcpu[vcpuid];
2962 
2963         vcpu_lock(vcpu);
2964         KASSERT(vcpu->runblock != 0, ("expected non-zero runblock"));
2965         vcpu->runblock--;
2966         if (vcpu->runblock == 0) {
2967 #ifdef __FreeBSD__
2968                 wakeup(&vcpu->state);
2969 #else
2970                 cv_broadcast(&vcpu->state_cv);
2971 #endif
2972         }
2973         vcpu_unlock(vcpu);
2974 }
2975 
2976 #ifndef __FreeBSD__
2977 uint64_t
2978 vcpu_tsc_offset(struct vm *vm, int vcpuid)
2979 {
2980         return (vm->vcpu[vcpuid].tsc_offset);
2981 }
2982 #endif /* __FreeBSD__ */
2983 
2984 int
2985 vm_activate_cpu(struct vm *vm, int vcpuid)
2986 {
2987 
2988         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2989                 return (EINVAL);
2990 
2991         if (CPU_ISSET(vcpuid, &vm->active_cpus))
2992                 return (EBUSY);
2993 
2994         VCPU_CTR0(vm, vcpuid, "activated");
2995         CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
2996         return (0);
2997 }
2998 
2999 int
3000 vm_suspend_cpu(struct vm *vm, int vcpuid)
3001 {
3002         int i;
3003 
3004         if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3005                 return (EINVAL);
3006 
3007         if (vcpuid == -1) {
3008                 vm->debug_cpus = vm->active_cpus;
3009                 for (i = 0; i < vm->maxcpus; i++) {
3010                         if (CPU_ISSET(i, &vm->active_cpus))
3011                                 vcpu_notify_event(vm, i);
3012                 }
3013         } else {
3014                 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3015                         return (EINVAL);
3016 
3017                 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3018                 vcpu_notify_event(vm, vcpuid);
3019         }
3020         return (0);
3021 }
3022 
3023 int
3024 vm_resume_cpu(struct vm *vm, int vcpuid)
3025 {
3026 
3027         if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3028                 return (EINVAL);
3029 
3030         if (vcpuid == -1) {
3031                 CPU_ZERO(&vm->debug_cpus);
3032         } else {
3033                 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3034                         return (EINVAL);
3035 
3036                 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3037         }
3038         return (0);
3039 }
3040 
3041 int
3042 vcpu_debugged(struct vm *vm, int vcpuid)
3043 {
3044 
3045         return (CPU_ISSET(vcpuid, &vm->debug_cpus));
3046 }
3047 
3048 cpuset_t
3049 vm_active_cpus(struct vm *vm)
3050 {
3051 
3052         return (vm->active_cpus);
3053 }
3054 
3055 cpuset_t
3056 vm_debug_cpus(struct vm *vm)
3057 {
3058 
3059         return (vm->debug_cpus);
3060 }
3061 
3062 cpuset_t
3063 vm_suspended_cpus(struct vm *vm)
3064 {
3065 
3066         return (vm->suspended_cpus);
3067 }
3068 
3069 void *
3070 vcpu_stats(struct vm *vm, int vcpuid)
3071 {
3072 
3073         return (vm->vcpu[vcpuid].stats);
3074 }
3075 
3076 int
3077 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3078 {
3079         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3080                 return (EINVAL);
3081 
3082         *state = vm->vcpu[vcpuid].x2apic_state;
3083 
3084         return (0);
3085 }
3086 
3087 int
3088 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3089 {
3090         if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3091                 return (EINVAL);
3092 
3093         if (state >= X2APIC_STATE_LAST)
3094                 return (EINVAL);
3095 
3096         vm->vcpu[vcpuid].x2apic_state = state;
3097 
3098         vlapic_set_x2apic_state(vm, vcpuid, state);
3099 
3100         return (0);
3101 }
3102 
3103 /*
3104  * This function is called to ensure that a vcpu "sees" a pending event
3105  * as soon as possible:
3106  * - If the vcpu thread is sleeping then it is woken up.
3107  * - If the vcpu is running on a different host_cpu then an IPI will be directed
3108  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
3109  */
3110 static void
3111 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3112 {
3113         int hostcpu;
3114 
3115         ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3116 
3117         hostcpu = vcpu->hostcpu;
3118         if (vcpu->state == VCPU_RUNNING) {
3119                 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3120                 if (hostcpu != curcpu) {
3121                         if (ntype == VCPU_NOTIFY_APIC) {
3122                                 vlapic_post_intr(vcpu->vlapic, hostcpu,
3123                                     vmm_ipinum);
3124                         } else {
3125                                 ipi_cpu(hostcpu, vmm_ipinum);
3126                         }
3127                 } else {
3128                         /*
3129                          * If the 'vcpu' is running on 'curcpu' then it must
3130                          * be sending a notification to itself (e.g. SELF_IPI).
3131                          * The pending event will be picked up when the vcpu
3132                          * transitions back to guest context.
3133                          */
3134                 }
3135         } else {
3136                 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3137                     "with hostcpu %d", vcpu->state, hostcpu));
3138                 if (vcpu->state == VCPU_SLEEPING) {
3139 #ifdef __FreeBSD__
3140                         wakeup_one(vcpu);
3141 #else
3142                         cv_signal(&vcpu->vcpu_cv);
3143 #endif
3144                 }
3145         }
3146 }
3147 
3148 void
3149 vcpu_notify_event(struct vm *vm, int vcpuid)
3150 {
3151         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3152 
3153         vcpu_lock(vcpu);
3154         vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3155         vcpu_unlock(vcpu);
3156 }
3157 
3158 void
3159 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3160 {
3161         struct vcpu *vcpu = &vm->vcpu[vcpuid];
3162 
3163         if (ntype == VCPU_NOTIFY_NONE) {
3164                 return;
3165         }
3166 
3167         vcpu_lock(vcpu);
3168         vcpu_notify_event_locked(vcpu, ntype);
3169         vcpu_unlock(vcpu);
3170 }
3171 
3172 struct vmspace *
3173 vm_get_vmspace(struct vm *vm)
3174 {
3175 
3176         return (vm->vmspace);
3177 }
3178 
3179 int
3180 vm_apicid2vcpuid(struct vm *vm, int apicid)
3181 {
3182         /*
3183          * XXX apic id is assumed to be numerically identical to vcpu id
3184          */
3185         return (apicid);
3186 }
3187 
3188 struct vatpic *
3189 vm_atpic(struct vm *vm)
3190 {
3191         return (vm->vatpic);
3192 }
3193 
3194 struct vatpit *
3195 vm_atpit(struct vm *vm)
3196 {
3197         return (vm->vatpit);
3198 }
3199 
3200 struct vpmtmr *
3201 vm_pmtmr(struct vm *vm)
3202 {
3203 
3204         return (vm->vpmtmr);
3205 }
3206 
3207 struct vrtc *
3208 vm_rtc(struct vm *vm)
3209 {
3210 
3211         return (vm->vrtc);
3212 }
3213 
3214 enum vm_reg_name
3215 vm_segment_name(int seg)
3216 {
3217         static enum vm_reg_name seg_names[] = {
3218                 VM_REG_GUEST_ES,
3219                 VM_REG_GUEST_CS,
3220                 VM_REG_GUEST_SS,
3221                 VM_REG_GUEST_DS,
3222                 VM_REG_GUEST_FS,
3223                 VM_REG_GUEST_GS
3224         };
3225 
3226         KASSERT(seg >= 0 && seg < nitems(seg_names),
3227             ("%s: invalid segment encoding %d", __func__, seg));
3228         return (seg_names[seg]);
3229 }
3230 
3231 void
3232 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3233     int num_copyinfo)
3234 {
3235         int idx;
3236 
3237         for (idx = 0; idx < num_copyinfo; idx++) {
3238                 if (copyinfo[idx].cookie != NULL)
3239                         vm_gpa_release(copyinfo[idx].cookie);
3240         }
3241         bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3242 }
3243 
3244 int
3245 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3246     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3247     int num_copyinfo, int *fault)
3248 {
3249         int error, idx, nused;
3250         size_t n, off, remaining;
3251         void *hva, *cookie;
3252         uint64_t gpa;
3253 
3254         bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3255 
3256         nused = 0;
3257         remaining = len;
3258         while (remaining > 0) {
3259                 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3260                 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3261                 if (error || *fault)
3262                         return (error);
3263                 off = gpa & PAGE_MASK;
3264                 n = min(remaining, PAGE_SIZE - off);
3265                 copyinfo[nused].gpa = gpa;
3266                 copyinfo[nused].len = n;
3267                 remaining -= n;
3268                 gla += n;
3269                 nused++;
3270         }
3271 
3272         for (idx = 0; idx < nused; idx++) {
3273                 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3274                     copyinfo[idx].len, prot, &cookie);
3275                 if (hva == NULL)
3276                         break;
3277                 copyinfo[idx].hva = hva;
3278                 copyinfo[idx].cookie = cookie;
3279         }
3280 
3281         if (idx != nused) {
3282                 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3283                 return (EFAULT);
3284         } else {
3285                 *fault = 0;
3286                 return (0);
3287         }
3288 }
3289 
3290 void
3291 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3292     size_t len)
3293 {
3294         char *dst;
3295         int idx;
3296 
3297         dst = kaddr;
3298         idx = 0;
3299         while (len > 0) {
3300                 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3301                 len -= copyinfo[idx].len;
3302                 dst += copyinfo[idx].len;
3303                 idx++;
3304         }
3305 }
3306 
3307 void
3308 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3309     struct vm_copyinfo *copyinfo, size_t len)
3310 {
3311         const char *src;
3312         int idx;
3313 
3314         src = kaddr;
3315         idx = 0;
3316         while (len > 0) {
3317                 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3318                 len -= copyinfo[idx].len;
3319                 src += copyinfo[idx].len;
3320                 idx++;
3321         }
3322 }
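
/*
 * Illustrative sketch of how the copy helpers above fit together (error
 * handling is abbreviated; the two-entry 'copyinfo' assumes the access spans
 * at most two pages):
 *
 *	struct vm_copyinfo copyinfo[2];
 *	uint64_t val;
 *	int error, fault;
 *
 *	error = vm_copy_setup(vm, vcpuid, paging, gla, sizeof (val), PROT_READ,
 *	    copyinfo, nitems(copyinfo), &fault);
 *	if (error == 0 && fault == 0) {
 *		vm_copyin(vm, vcpuid, copyinfo, &val, sizeof (val));
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 */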
3323 
3324 /*
3325  * Return the amount of in-use and wired memory for the VM. Since
3326  * these are global stats, only return the values for vCPU 0.
3327  */
3328 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3329 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3330 
3331 static void
3332 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3333 {
3334 
3335         if (vcpu == 0) {
3336                 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3337                     PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3338         }
3339 }
3340 
3341 static void
3342 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3343 {
3344 
3345         if (vcpu == 0) {
3346                 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3347                     PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3348         }
3349 }
3350 
3351 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3352 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
3353 
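/*
 * Handle an in/out access from the guest by dispatching it to the handler,
 * if any, registered for 'port'.
 */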
3354 int
3355 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3356     uint8_t bytes, uint32_t *val)
3357 {
3358         return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3359 }
3360 
3361 /*
3362  * bhyve-internal interfaces to attach or detach IO port handlers.
3363  * Must be called with VM write lock held for safety.
3364  */
3365 int
3366 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3367     void **cookie)
3368 {
3369         int err;

3370         err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3371         if (err == 0) {
3372                 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3373         }
3374         return (err);
3375 }

3376 int
3377 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3378     void **old_arg)
3379 {
3380         uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3381         int err;
3382 
3383         err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3384         if (err == 0) {
3385                 *cookie = NULL;
3386         }
3387         return (err);
3388 }
3389 
3390 /*
3391  * External driver interfaces to attach or detach IO port handlers.
3392  * Must be called with VM write lock held for safety.
3393  */
3394 int
3395 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3396     void *arg, void **cookie)
3397 {
3398         int err;
3399 
3400         if (port == 0) {
3401                 return (EINVAL);
3402         }
3403 
3404         err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3405         if (err == 0) {
3406                 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3407         }
3408         return (err);
3409 }

3410 void
3411 vm_ioport_unhook(struct vm *vm, void **cookie)
3412 {
3413         uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3414         ioport_handler_t old_func;
3415         void *old_arg;
3416         int err;
3417 
3418         err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3419 
3420         /* ioport-hook-using drivers are expected to be well-behaved */
3421         VERIFY0(err);
3422         VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3423 
3424         *cookie = NULL;
3425 }
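
/*
 * Illustrative sketch of the driver hook interface (the handler, argument,
 * and port number below are hypothetical):
 *
 *	void *cookie;
 *	int err;
 *
 *	err = vm_ioport_hook(vm, 0x400, example_handler, example_arg, &cookie);
 *	if (err == 0) {
 *		...
 *		vm_ioport_unhook(vm, &cookie);
 *	}
 */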