1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD$
29 */
30 /*
31 * This file and its contents are supplied under the terms of the
32 * Common Development and Distribution License ("CDDL"), version 1.0.
33 * You may only use this file in accordance with the terms of version
34 * 1.0 of the CDDL.
35 *
36 * A full copy of the text of the CDDL should have accompanied this
37 * source. A copy of the CDDL is also available via the Internet at
38 * http://www.illumos.org/license/CDDL.
39 *
40 * Copyright 2015 Pluribus Networks Inc.
41 * Copyright 2018 Joyent, Inc.
42 * Copyright 2020 Oxide Computer Company
43 */
44
45 #include <sys/cdefs.h>
46 __FBSDID("$FreeBSD$");
47
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/kernel.h>
51 #include <sys/module.h>
52 #include <sys/sysctl.h>
53 #include <sys/malloc.h>
54 #include <sys/pcpu.h>
55 #include <sys/lock.h>
56 #include <sys/mutex.h>
57 #include <sys/proc.h>
58 #include <sys/rwlock.h>
59 #include <sys/sched.h>
60 #include <sys/smp.h>
62
63 #include <vm/vm.h>
64 #include <vm/vm_object.h>
65 #include <vm/vm_map.h>
66 #include <vm/vm_page.h>
67 #include <vm/pmap.h>
68 #include <vm/vm_extern.h>
69 #include <vm/vm_param.h>
70
71 #ifdef __FreeBSD__
72 #include <machine/cpu.h>
73 #endif
74 #include <machine/pcb.h>
75 #include <machine/smp.h>
76 #include <machine/md_var.h>
77 #include <x86/psl.h>
78 #include <x86/apicreg.h>
79
80 #include <machine/vmm.h>
81 #include <machine/vmm_dev.h>
82 #include <sys/vmm_instruction_emul.h>
83
84 #include "vmm_ioport.h"
85 #include "vmm_ktr.h"
86 #include "vmm_host.h"
87 #include "vmm_mem.h"
88 #include "vmm_util.h"
89 #include "vatpic.h"
90 #include "vatpit.h"
91 #include "vhpet.h"
92 #include "vioapic.h"
93 #include "vlapic.h"
94 #include "vpmtmr.h"
95 #include "vrtc.h"
96 #include "vmm_stat.h"
97 #include "vmm_lapic.h"
98
99 #include "io/ppt.h"
100 #include "io/iommu.h"
101
102 struct vlapic;
103
104 /*
105 * Initialization:
106 * (a) allocated when vcpu is created
107 * (i) initialized when vcpu is created and when it is reinitialized
108 * (o) initialized the first time the vcpu is created
109 * (x) initialized before use
110 */
111 struct vcpu {
112 /* (o) protects state, run_state, hostcpu, sipi_vector */
113 struct mtx mtx;
114
115 enum vcpu_state state; /* (o) vcpu state */
116 enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */
117 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */
118 kcondvar_t state_cv; /* (o) IDLE-transition cv */
119 int hostcpu; /* (o) vcpu's current host cpu */
120 int lastloccpu; /* (o) last host cpu localized to */
121 int reqidle; /* (i) request vcpu to idle */
122 struct vlapic *vlapic; /* (i) APIC device model */
123 enum x2apic_state x2apic_state; /* (i) APIC mode */
124 uint64_t exitintinfo; /* (i) events pending at VM exit */
125 int nmi_pending; /* (i) NMI pending */
126 int extint_pending; /* (i) INTR pending */
127 int exception_pending; /* (i) exception pending */
128 int exc_vector; /* (x) exception collateral */
129 int exc_errcode_valid;
130 uint32_t exc_errcode;
131 uint8_t sipi_vector; /* (i) SIPI vector */
132 struct savefpu *guestfpu; /* (a,i) guest fpu state */
133 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
134 void *stats; /* (a,i) statistics */
135 struct vm_exit exitinfo; /* (x) exit reason and collateral */
136 uint64_t nextrip; /* (x) next instruction to execute */
137 struct vie *vie_ctx; /* (x) instruction emulation context */
138 #ifndef __FreeBSD__
139 uint64_t tsc_offset; /* (x) offset from host TSC */
140 #endif
141 };
142
143 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
144 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
145 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
146 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
147 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
148
149 struct mem_seg {
150 size_t len;
151 bool sysmem;
152 struct vm_object *object;
153 };
154 #ifdef __FreeBSD__
155 #define VM_MAX_MEMSEGS 3
156 #else
157 #define VM_MAX_MEMSEGS 4
158 #endif
159
160 struct mem_map {
161 vm_paddr_t gpa;
162 size_t len;
163 vm_ooffset_t segoff;
164 int segid;
165 int prot;
166 int flags;
167 };
168 #define VM_MAX_MEMMAPS 8
169
170 /*
171 * Initialization:
172 * (o) initialized the first time the VM is created
173 * (i) initialized when VM is created and when it is reinitialized
174 * (x) initialized before use
175 */
176 struct vm {
177 void *cookie; /* (i) cpu-specific data */
178 void *iommu; /* (x) iommu-specific data */
179 struct vhpet *vhpet; /* (i) virtual HPET */
180 struct vioapic *vioapic; /* (i) virtual ioapic */
181 struct vatpic *vatpic; /* (i) virtual atpic */
182 struct vatpit *vatpit; /* (i) virtual atpit */
183 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
184 struct vrtc *vrtc; /* (o) virtual RTC */
185 volatile cpuset_t active_cpus; /* (i) active vcpus */
186 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */
187 int suspend; /* (i) stop VM execution */
188 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
189 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
190 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
191 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
192 struct vmspace *vmspace; /* (o) guest's address space */
193 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
194 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
195 /* The following describe the vm cpu topology */
196 uint16_t sockets; /* (o) num of sockets */
197 uint16_t cores; /* (o) num of cores/socket */
198 uint16_t threads; /* (o) num of threads/core */
199 uint16_t maxcpus; /* (o) max pluggable cpus */
200
201 struct ioport_config ioports; /* (o) ioport handling */
202 };
203
204 static int vmm_initialized;
205
206
207 static void
208 nullop_panic(void)
209 {
210 panic("null vmm operation call");
211 }
212
213 /* Do not allow use of an un-set `ops` to do anything but panic */
214 static struct vmm_ops vmm_ops_null = {
215 .init = (vmm_init_func_t)nullop_panic,
216 .cleanup = (vmm_cleanup_func_t)nullop_panic,
217 .resume = (vmm_resume_func_t)nullop_panic,
218 .vminit = (vmi_init_func_t)nullop_panic,
219 .vmrun = (vmi_run_func_t)nullop_panic,
220 .vmcleanup = (vmi_cleanup_func_t)nullop_panic,
221 .vmgetreg = (vmi_get_register_t)nullop_panic,
222 .vmsetreg = (vmi_set_register_t)nullop_panic,
223 .vmgetdesc = (vmi_get_desc_t)nullop_panic,
224 .vmsetdesc = (vmi_set_desc_t)nullop_panic,
225 .vmgetcap = (vmi_get_cap_t)nullop_panic,
226 .vmsetcap = (vmi_set_cap_t)nullop_panic,
227 .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic,
228 .vmspace_free = (vmi_vmspace_free)nullop_panic,
229 .vlapic_init = (vmi_vlapic_init)nullop_panic,
230 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
231 .vmsavectx = (vmi_savectx)nullop_panic,
232 .vmrestorectx = (vmi_restorectx)nullop_panic,
233 };
234
235 static struct vmm_ops *ops = &vmm_ops_null;
236
237 #define VMM_INIT(num) ((*ops->init)(num))
238 #define VMM_CLEANUP() ((*ops->cleanup)())
239 #define VMM_RESUME() ((*ops->resume)())
240
241 #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
242 #define VMRUN(vmi, vcpu, rip, pmap) \
243 ((*ops->vmrun)(vmi, vcpu, rip, pmap))
244 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
245 #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
246 #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
247
248 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
249 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
250 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
251 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
252 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv))
253 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val))
254 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu))
255 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic))
256
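/*
 * fpu_start_emulating() sets CR0.TS so that any host FPU access traps (#NM)
 * while the guest FPU state is resident on the CPU; fpu_stop_emulating()
 * clears it again via clts().
 */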
257 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
258 #define fpu_stop_emulating() clts()
259
260 SDT_PROVIDER_DEFINE(vmm);
261
262 static MALLOC_DEFINE(M_VM, "vm", "vm");
263
264 /* statistics */
265 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
266
267 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
268 NULL);
269
270 /*
271 * Halt the guest if all vcpus are executing a HLT instruction with
272 * interrupts disabled.
273 */
274 static int halt_detection_enabled = 1;
275
276 /* IPI vector used for vcpu notifications */
277 static int vmm_ipinum;
278
279 /* Trap into hypervisor on all guest exceptions and reflect them back */
280 static int trace_guest_exceptions;
281
282 static void vm_free_memmap(struct vm *vm, int ident);
283 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
284 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
285 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
286 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
287
288 #ifndef __FreeBSD__
289 static void vm_clear_memseg(struct vm *, int);
290
291 /* Flags for vtc_status */
292 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */
293 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */
294
295 typedef struct vm_thread_ctx {
296 struct vm *vtc_vm;
297 int vtc_vcpuid;
298 uint_t vtc_status;
299 } vm_thread_ctx_t;
300 #endif /* __FreeBSD__ */
301
302 #ifdef KTR
303 static const char *
304 vcpu_state2str(enum vcpu_state state)
305 {
306
307 switch (state) {
308 case VCPU_IDLE:
309 return ("idle");
310 case VCPU_FROZEN:
311 return ("frozen");
312 case VCPU_RUNNING:
313 return ("running");
314 case VCPU_SLEEPING:
315 return ("sleeping");
316 default:
317 return ("unknown");
318 }
319 }
320 #endif
321
322 static void
323 vcpu_cleanup(struct vm *vm, int i, bool destroy)
324 {
325 struct vcpu *vcpu = &vm->vcpu[i];
326
327 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
328 if (destroy) {
329 vmm_stat_free(vcpu->stats);
330 fpu_save_area_free(vcpu->guestfpu);
331 vie_free(vcpu->vie_ctx);
332 vcpu->vie_ctx = NULL;
333 }
334 }
335
336 static void
337 vcpu_init(struct vm *vm, int vcpu_id, bool create)
338 {
339 struct vcpu *vcpu;
340
341 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
342 ("vcpu_init: invalid vcpu %d", vcpu_id));
343
344 vcpu = &vm->vcpu[vcpu_id];
345
346 if (create) {
347 #ifdef __FreeBSD__
348 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
349 "initialized", vcpu_id));
350 #endif
351 vcpu_lock_init(vcpu);
352 vcpu->state = VCPU_IDLE;
353 vcpu->hostcpu = NOCPU;
354 #ifndef __FreeBSD__
355 vcpu->lastloccpu = NOCPU;
356 #endif
357 vcpu->guestfpu = fpu_save_area_alloc();
358 vcpu->stats = vmm_stat_alloc();
359 vcpu->vie_ctx = vie_alloc();
360 } else {
361 vie_reset(vcpu->vie_ctx);
362 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
363 }
364
365 vcpu->run_state = VRS_HALT;
366 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
367 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
368 vcpu->reqidle = 0;
369 vcpu->exitintinfo = 0;
370 vcpu->nmi_pending = 0;
371 vcpu->extint_pending = 0;
372 vcpu->exception_pending = 0;
373 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
374 fpu_save_area_reset(vcpu->guestfpu);
375 vmm_stat_init(vcpu->stats);
376 }
377
378 int
379 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
380 {
381
382 return (trace_guest_exceptions);
383 }
384
385 struct vm_exit *
386 vm_exitinfo(struct vm *vm, int cpuid)
387 {
388 struct vcpu *vcpu;
389
390 if (cpuid < 0 || cpuid >= vm->maxcpus)
391 panic("vm_exitinfo: invalid cpuid %d", cpuid);
392
393 vcpu = &vm->vcpu[cpuid];
394
395 return (&vcpu->exitinfo);
396 }
397
398 struct vie *
399 vm_vie_ctx(struct vm *vm, int cpuid)
400 {
401 if (cpuid < 0 || cpuid >= vm->maxcpus)
402 panic("vm_vie_ctx: invalid cpuid %d", cpuid);
403
404 return (vm->vcpu[cpuid].vie_ctx);
405 }
406
407 static int
408 vmm_init(void)
409 {
410 int error;
411
412 vmm_host_state_init();
413
414 #ifdef __FreeBSD__
415 vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
416 &IDTVEC(justreturn));
417 if (vmm_ipinum < 0)
418 vmm_ipinum = IPI_AST;
419 #else
420 /* We use cpu_poke() for IPIs */
421 vmm_ipinum = 0;
422 #endif
423
424 error = vmm_mem_init();
425 if (error)
426 return (error);
427
428 if (vmm_is_intel())
429 ops = &vmm_ops_intel;
430 else if (vmm_is_svm())
431 ops = &vmm_ops_amd;
432 else
433 return (ENXIO);
434
435 #ifdef __FreeBSD__
436 vmm_resume_p = vmm_resume;
437 #endif
438
439 return (VMM_INIT(vmm_ipinum));
440 }
441
442 int
443 vmm_mod_load()
444 {
445 int error;
446
447 VERIFY(vmm_initialized == 0);
448
449 error = vmm_init();
450 if (error == 0)
451 vmm_initialized = 1;
452
453 return (error);
454 }
455
456 int
457 vmm_mod_unload()
458 {
459 int error;
460
461 VERIFY(vmm_initialized == 1);
462
463 iommu_cleanup();
464 error = VMM_CLEANUP();
465 if (error)
466 return (error);
467 vmm_initialized = 0;
468
469 return (0);
470 }
471
472 static void
473 vm_init(struct vm *vm, bool create)
474 {
475 int i;
476 #ifndef __FreeBSD__
477 uint64_t tsc_off;
478 #endif
479
480 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
481 vm->iommu = NULL;
482 vm->vioapic = vioapic_init(vm);
483 vm->vhpet = vhpet_init(vm);
484 vm->vatpic = vatpic_init(vm);
485 vm->vatpit = vatpit_init(vm);
486 vm->vpmtmr = vpmtmr_init(vm);
487 if (create)
488 vm->vrtc = vrtc_init(vm);
489
490 vm_inout_init(vm, &vm->ioports);
491
492 CPU_ZERO(&vm->active_cpus);
493 CPU_ZERO(&vm->debug_cpus);
494
495 vm->suspend = 0;
496 CPU_ZERO(&vm->suspended_cpus);
497
498 for (i = 0; i < vm->maxcpus; i++)
499 vcpu_init(vm, i, create);
500
501 #ifndef __FreeBSD__
502 tsc_off = (uint64_t)(-(int64_t)rdtsc());
503 for (i = 0; i < vm->maxcpus; i++) {
504 vm->vcpu[i].tsc_offset = tsc_off;
505 }
506 #endif /* __FreeBSD__ */
507 }
508
509 /*
510 * The default CPU topology is a single thread per package.
511 */
512 uint_t cores_per_package = 1;
513 uint_t threads_per_core = 1;
514
515 int
516 vm_create(const char *name, struct vm **retvm)
517 {
518 struct vm *vm;
519 struct vmspace *vmspace;
520
521 /*
522 * If vmm.ko could not be successfully initialized then don't attempt
523 * to create the virtual machine.
524 */
525 if (!vmm_initialized)
526 return (ENXIO);
527
528 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
529 return (EINVAL);
530
531 vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
532 if (vmspace == NULL)
533 return (ENOMEM);
534
535 vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
536 strcpy(vm->name, name);
537 vm->vmspace = vmspace;
538
539 vm->sockets = 1;
540 vm->cores = cores_per_package; /* XXX backwards compatibility */
541 vm->threads = threads_per_core; /* XXX backwards compatibility */
542 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
543
544 vm_init(vm, true);
545
546 *retvm = vm;
547 return (0);
548 }
549
550 void
551 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
552 uint16_t *threads, uint16_t *maxcpus)
553 {
554 *sockets = vm->sockets;
555 *cores = vm->cores;
556 *threads = vm->threads;
557 *maxcpus = vm->maxcpus;
558 }
559
560 uint16_t
561 vm_get_maxcpus(struct vm *vm)
562 {
563 return (vm->maxcpus);
564 }
565
566 int
567 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
568 uint16_t threads, uint16_t maxcpus)
569 {
570 if (maxcpus != 0)
571 return (EINVAL); /* XXX remove when supported */
572 if ((sockets * cores * threads) > vm->maxcpus)
573 return (EINVAL);
574 /* XXX need to check sockets * cores * threads == vCPU, how? */
575 vm->sockets = sockets;
576 vm->cores = cores;
577 vm->threads = threads;
578 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
579 return (0);
580 }
581
582 static void
583 vm_cleanup(struct vm *vm, bool destroy)
584 {
585 struct mem_map *mm;
586 int i;
587
588 ppt_unassign_all(vm);
589
590 if (vm->iommu != NULL)
591 iommu_destroy_domain(vm->iommu);
592
593 /*
594 * Devices which attach their own ioport hooks should be cleaned up
595 * first so they can tear down those registrations.
596 */
597 vpmtmr_cleanup(vm->vpmtmr);
598
599 vm_inout_cleanup(vm, &vm->ioports);
600
601 if (destroy)
602 vrtc_cleanup(vm->vrtc);
603 else
604 vrtc_reset(vm->vrtc);
605
606 vatpit_cleanup(vm->vatpit);
607 vhpet_cleanup(vm->vhpet);
608 vatpic_cleanup(vm->vatpic);
609 vioapic_cleanup(vm->vioapic);
610
611 for (i = 0; i < vm->maxcpus; i++)
612 vcpu_cleanup(vm, i, destroy);
613
614 VMCLEANUP(vm->cookie);
615
616 /*
617 * System memory is removed from the guest address space only when
618 * the VM is destroyed. This is because the mapping remains the same
619 * across VM reset.
620 *
621 * Device memory can be relocated by the guest (e.g. using PCI BARs)
622 * so those mappings are removed on a VM reset.
623 */
624 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
625 mm = &vm->mem_maps[i];
626 if (destroy || !sysmem_mapping(vm, mm))
627 vm_free_memmap(vm, i);
628 #ifndef __FreeBSD__
629 else {
630 /*
631 * We need to reset the IOMMU flag so this mapping can
632 * be reused when a VM is rebooted. Since the IOMMU
633 * domain has already been destroyed we can just reset
634 * the flag here.
635 */
636 mm->flags &= ~VM_MEMMAP_F_IOMMU;
637 }
638 #endif
639 }
640
641 if (destroy) {
642 for (i = 0; i < VM_MAX_MEMSEGS; i++)
643 vm_free_memseg(vm, i);
644
645 VMSPACE_FREE(vm->vmspace);
646 vm->vmspace = NULL;
647 }
648 #ifndef __FreeBSD__
649 else {
650 /*
651 * Clear the first memory segment (low mem), since stale memory
652 * contents could confuse the UEFI firmware.
653 */
654 vm_clear_memseg(vm, 0);
655 }
656 #endif
657 }
658
659 void
660 vm_destroy(struct vm *vm)
661 {
662 vm_cleanup(vm, true);
663 free(vm, M_VM);
664 }
665
666 int
667 vm_reinit(struct vm *vm)
668 {
669 int error;
670
671 /*
672 * A virtual machine can be reset only if all vcpus are suspended.
673 */
674 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
675 vm_cleanup(vm, false);
676 vm_init(vm, false);
677 error = 0;
678 } else {
679 error = EBUSY;
680 }
681
682 return (error);
683 }
684
685 const char *
686 vm_name(struct vm *vm)
687 {
688 return (vm->name);
689 }
690
691 int
692 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
693 {
694 vm_object_t obj;
695
696 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
697 return (ENOMEM);
698 else
699 return (0);
700 }
701
702 int
703 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
704 {
705
706 vmm_mmio_free(vm->vmspace, gpa, len);
707 return (0);
708 }
709
710 /*
711 * Return 'true' if 'gpa' is allocated in the guest address space.
712 *
713 * This function is called in the context of a running vcpu which acts as
714 * an implicit lock on 'vm->mem_maps[]'.
715 */
716 bool
717 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
718 {
719 struct mem_map *mm;
720 int i;
721
722 #ifdef INVARIANTS
723 int hostcpu, state;
724 state = vcpu_get_state(vm, vcpuid, &hostcpu);
725 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
726 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
727 #endif
728
729 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
730 mm = &vm->mem_maps[i];
731 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
732 return (true); /* 'gpa' is sysmem or devmem */
733 }
734
735 if (ppt_is_mmio(vm, gpa))
736 return (true); /* 'gpa' is pci passthru mmio */
737
738 return (false);
739 }
740
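/*
 * Allocate an anonymous VM object of 'len' bytes to back memory segment
 * 'ident'.  Re-creating an identical, already-present segment returns
 * EEXIST; conflicting parameters return EINVAL.
 */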
741 int
742 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
743 {
744 struct mem_seg *seg;
745 vm_object_t obj;
746
747 #ifndef __FreeBSD__
748 extern pgcnt_t get_max_page_get(void);
749 #endif
750
751 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
752 return (EINVAL);
753
754 if (len == 0 || (len & PAGE_MASK))
755 return (EINVAL);
756
757 #ifndef __FreeBSD__
758 if (len > ptob(get_max_page_get()))
759 return (EINVAL);
760 #endif
761
762 seg = &vm->mem_segs[ident];
763 if (seg->object != NULL) {
764 if (seg->len == len && seg->sysmem == sysmem)
765 return (EEXIST);
766 else
767 return (EINVAL);
768 }
769
770 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
771 if (obj == NULL)
772 return (ENOMEM);
773
774 seg->len = len;
775 seg->object = obj;
776 seg->sysmem = sysmem;
777 return (0);
778 }
779
780 int
781 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
782 vm_object_t *objptr)
783 {
784 struct mem_seg *seg;
785
786 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
787 return (EINVAL);
788
789 seg = &vm->mem_segs[ident];
790 if (len)
791 *len = seg->len;
792 if (sysmem)
793 *sysmem = seg->sysmem;
794 if (objptr)
795 *objptr = seg->object;
796 return (0);
797 }
798
799 #ifndef __FreeBSD__
800 static void
801 vm_clear_memseg(struct vm *vm, int ident)
802 {
803 struct mem_seg *seg;
804
805 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
806 ("%s: invalid memseg ident %d", __func__, ident));
807
808 seg = &vm->mem_segs[ident];
809
810 if (seg->object != NULL)
811 vm_object_clear(seg->object);
812 }
813 #endif
814
815 void
816 vm_free_memseg(struct vm *vm, int ident)
817 {
818 struct mem_seg *seg;
819
820 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
821 ("%s: invalid memseg ident %d", __func__, ident));
822
823 seg = &vm->mem_segs[ident];
824 if (seg->object != NULL) {
825 vm_object_deallocate(seg->object);
826 bzero(seg, sizeof (struct mem_seg));
827 }
828 }
829
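/*
 * Map [first, first + len) of memory segment 'segid' into the guest address
 * space at 'gpa', using the first free slot in vm->mem_maps[].  The range is
 * optionally wired when VM_MEMMAP_F_WIRED is requested.
 */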
830 int
831 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
832 size_t len, int prot, int flags)
833 {
834 struct mem_seg *seg;
835 struct mem_map *m, *map;
836 vm_ooffset_t last;
837 int i, error;
838
839 if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
840 return (EINVAL);
841
842 if (flags & ~VM_MEMMAP_F_WIRED)
843 return (EINVAL);
844
845 if (segid < 0 || segid >= VM_MAX_MEMSEGS)
846 return (EINVAL);
847
848 seg = &vm->mem_segs[segid];
849 if (seg->object == NULL)
850 return (EINVAL);
851
852 last = first + len;
853 if (first < 0 || first >= last || last > seg->len)
854 return (EINVAL);
855
856 if ((gpa | first | last) & PAGE_MASK)
857 return (EINVAL);
858
859 map = NULL;
860 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
861 m = &vm->mem_maps[i];
862 if (m->len == 0) {
863 map = m;
864 break;
865 }
866 }
867
868 if (map == NULL)
869 return (ENOSPC);
870
871 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
872 len, 0, VMFS_NO_SPACE, prot, prot, 0);
873 if (error != KERN_SUCCESS)
874 return (EFAULT);
875
876 vm_object_reference(seg->object);
877
878 if ((flags & VM_MEMMAP_F_WIRED) != 0) {
879 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
880 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
881 if (error != KERN_SUCCESS) {
882 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
883 return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
884 EFAULT);
885 }
886 }
887
888 map->gpa = gpa;
889 map->len = len;
890 map->segoff = first;
891 map->segid = segid;
892 map->prot = prot;
893 map->flags = flags;
894 return (0);
895 }
896
897 int
898 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
899 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
900 {
901 struct mem_map *mm, *mmnext;
902 int i;
903
904 mmnext = NULL;
905 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
906 mm = &vm->mem_maps[i];
907 if (mm->len == 0 || mm->gpa < *gpa)
908 continue;
909 if (mmnext == NULL || mm->gpa < mmnext->gpa)
910 mmnext = mm;
911 }
912
913 if (mmnext != NULL) {
914 *gpa = mmnext->gpa;
915 if (segid)
916 *segid = mmnext->segid;
917 if (segoff)
918 *segoff = mmnext->segoff;
919 if (len)
920 *len = mmnext->len;
921 if (prot)
922 *prot = mmnext->prot;
923 if (flags)
924 *flags = mmnext->flags;
925 return (0);
926 } else {
927 return (ENOENT);
928 }
929 }
930
931 static void
932 vm_free_memmap(struct vm *vm, int ident)
933 {
934 struct mem_map *mm;
935 int error;
936
937 mm = &vm->mem_maps[ident];
938 if (mm->len) {
939 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
940 mm->gpa + mm->len);
941 KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
942 __func__, error));
943 bzero(mm, sizeof (struct mem_map));
944 }
945 }
946
947 static __inline bool
948 sysmem_mapping(struct vm *vm, struct mem_map *mm)
949 {
950
951 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
952 return (true);
953 else
954 return (false);
955 }
956
957 vm_paddr_t
958 vmm_sysmem_maxaddr(struct vm *vm)
959 {
960 struct mem_map *mm;
961 vm_paddr_t maxaddr;
962 int i;
963
964 maxaddr = 0;
965 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
966 mm = &vm->mem_maps[i];
967 if (sysmem_mapping(vm, mm)) {
968 if (maxaddr < mm->gpa + mm->len)
969 maxaddr = mm->gpa + mm->len;
970 }
971 }
972 return (maxaddr);
973 }
974
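/*
 * Establish (map == true) or tear down (map == false) IOMMU translations for
 * every wired sysmem mapping in the guest, one page at a time, and then
 * invalidate the cached translations of the affected domain.  This is used
 * when PCI passthru devices are assigned to or removed from the VM.
 */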
975 static void
976 vm_iommu_modify(struct vm *vm, bool map)
977 {
978 int i, sz;
979 vm_paddr_t gpa, hpa;
980 struct mem_map *mm;
981 #ifdef __FreeBSD__
982 void *vp, *cookie, *host_domain;
983 #else
984 void *vp, *cookie, *host_domain __unused;
985 #endif
986
987 sz = PAGE_SIZE;
988 host_domain = iommu_host_domain();
989
990 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
991 mm = &vm->mem_maps[i];
992 if (!sysmem_mapping(vm, mm))
993 continue;
994
995 if (map) {
996 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
997 ("iommu map found invalid memmap %lx/%lx/%x",
998 mm->gpa, mm->len, mm->flags));
999 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
1000 continue;
1001 mm->flags |= VM_MEMMAP_F_IOMMU;
1002 } else {
1003 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
1004 continue;
1005 mm->flags &= ~VM_MEMMAP_F_IOMMU;
1006 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
1007 ("iommu unmap found invalid memmap %lx/%lx/%x",
1008 mm->gpa, mm->len, mm->flags));
1009 }
1010
1011 gpa = mm->gpa;
1012 while (gpa < mm->gpa + mm->len) {
1013 vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
1014 &cookie);
1015 KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
1016 vm_name(vm), gpa));
1017
1018 vm_gpa_release(cookie);
1019
1020 hpa = DMAP_TO_PHYS((uintptr_t)vp);
1021 if (map) {
1022 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
1023 #ifdef __FreeBSD__
1024 iommu_remove_mapping(host_domain, hpa, sz);
1025 #endif
1026 } else {
1027 iommu_remove_mapping(vm->iommu, gpa, sz);
1028 #ifdef __FreeBSD__
1029 iommu_create_mapping(host_domain, hpa, hpa, sz);
1030 #endif
1031 }
1032
1033 gpa += PAGE_SIZE;
1034 }
1035 }
1036
1037 /*
1038 * Invalidate the cached translations associated with the domain
1039 * from which pages were removed.
1040 */
1041 #ifdef __FreeBSD__
1042 if (map)
1043 iommu_invalidate_tlb(host_domain);
1044 else
1045 iommu_invalidate_tlb(vm->iommu);
1046 #else
1047 iommu_invalidate_tlb(vm->iommu);
1048 #endif
1049 }
1050
1051 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false)
1052 #define vm_iommu_map(vm) vm_iommu_modify((vm), true)
1053
1054 int
1055 vm_unassign_pptdev(struct vm *vm, int pptfd)
1056 {
1057 int error;
1058
1059 error = ppt_unassign_device(vm, pptfd);
1060 if (error)
1061 return (error);
1062
1063 if (ppt_assigned_devices(vm) == 0)
1064 vm_iommu_unmap(vm);
1065
1066 return (0);
1067 }
1068
1069 int
1070 vm_assign_pptdev(struct vm *vm, int pptfd)
1071 {
1072 int error;
1073 vm_paddr_t maxaddr;
1074
1075 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1076 if (ppt_assigned_devices(vm) == 0) {
1077 KASSERT(vm->iommu == NULL,
1078 ("vm_assign_pptdev: iommu must be NULL"));
1079 maxaddr = vmm_sysmem_maxaddr(vm);
1080 vm->iommu = iommu_create_domain(maxaddr);
1081 if (vm->iommu == NULL)
1082 return (ENXIO);
1083 vm_iommu_map(vm);
1084 }
1085
1086 error = ppt_assign_device(vm, pptfd);
1087 return (error);
1088 }
1089
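/*
 * Hold a single page of guest physical memory: the backing page is wired via
 * vm_fault_quick_hold_pages() and a pointer into the host direct map is
 * returned.  '*cookie' receives the underlying page, which the caller must
 * hand back to vm_gpa_release() when done.  'gpa'/'len' may not cross a page
 * boundary.
 *
 * Usage sketch:
 *	ptr = vm_gpa_hold(vm, vcpuid, gpa, len, VM_PROT_READ, &cookie);
 *	if (ptr != NULL) {
 *		... access the guest page through 'ptr' ...
 *		vm_gpa_release(cookie);
 *	}
 */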
1090 void *
1091 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
1092 void **cookie)
1093 {
1094 int i, count, pageoff;
1095 struct mem_map *mm;
1096 vm_page_t m;
1097 #ifdef INVARIANTS
1098 /*
1099 * All vcpus are frozen by ioctls that modify the memory map
1100 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->mem_maps[]' stability is
1101 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
1102 */
1103 int state;
1104 KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
1105 __func__, vcpuid));
1106 for (i = 0; i < vm->maxcpus; i++) {
1107 if (vcpuid != -1 && vcpuid != i)
1108 continue;
1109 state = vcpu_get_state(vm, i, NULL);
1110 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1111 __func__, state));
1112 }
1113 #endif
1114 pageoff = gpa & PAGE_MASK;
1115 if (len > PAGE_SIZE - pageoff)
1116 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1117
1118 count = 0;
1119 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1120 mm = &vm->mem_maps[i];
1121 if (mm->len == 0) {
1122 continue;
1123 }
1124 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1125 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1126 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1127 break;
1128 }
1129 }
1130
1131 if (count == 1) {
1132 *cookie = m;
1133 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1134 } else {
1135 *cookie = NULL;
1136 return (NULL);
1137 }
1138 }
1139
1140 void
1141 vm_gpa_release(void *cookie)
1142 {
1143 vm_page_t m = cookie;
1144
1145 vm_page_unwire(m, PQ_ACTIVE);
1146 }
1147
1148 int
1149 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1150 {
1151
1152 if (vcpu < 0 || vcpu >= vm->maxcpus)
1153 return (EINVAL);
1154
1155 if (reg >= VM_REG_LAST)
1156 return (EINVAL);
1157
1158 return (VMGETREG(vm->cookie, vcpu, reg, retval));
1159 }
1160
1161 int
1162 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1163 {
1164 struct vcpu *vcpu;
1165 int error;
1166
1167 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1168 return (EINVAL);
1169
1170 if (reg >= VM_REG_LAST)
1171 return (EINVAL);
1172
1173 error = VMSETREG(vm->cookie, vcpuid, reg, val);
1174 if (error || reg != VM_REG_GUEST_RIP)
1175 return (error);
1176
1177 /* Set 'nextrip' to match the value of %rip */
1178 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1179 vcpu = &vm->vcpu[vcpuid];
1180 vcpu->nextrip = val;
1181 return (0);
1182 }
1183
1184 static bool
1185 is_descriptor_table(int reg)
1186 {
1187 switch (reg) {
1188 case VM_REG_GUEST_IDTR:
1189 case VM_REG_GUEST_GDTR:
1190 return (true);
1191 default:
1192 return (false);
1193 }
1194 }
1195
1196 static bool
1197 is_segment_register(int reg)
1198 {
1199 switch (reg) {
1200 case VM_REG_GUEST_ES:
1201 case VM_REG_GUEST_CS:
1202 case VM_REG_GUEST_SS:
1203 case VM_REG_GUEST_DS:
1204 case VM_REG_GUEST_FS:
1205 case VM_REG_GUEST_GS:
1206 case VM_REG_GUEST_TR:
1207 case VM_REG_GUEST_LDTR:
1208 return (true);
1209 default:
1210 return (false);
1211 }
1212 }
1213
1214 int
1215 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1216 {
1217
1218 if (vcpu < 0 || vcpu >= vm->maxcpus)
1219 return (EINVAL);
1220
1221 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1222 return (EINVAL);
1223
1224 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1225 }
1226
1227 int
1228 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1229 {
1230 if (vcpu < 0 || vcpu >= vm->maxcpus)
1231 return (EINVAL);
1232
1233 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1234 return (EINVAL);
1235
1236 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1237 }
1238
1239 int
1240 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1241 {
1242 struct vcpu *vcpu;
1243
1244 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1245 return (EINVAL);
1246 }
1247
1248 vcpu = &vm->vcpu[vcpuid];
1249
1250 vcpu_lock(vcpu);
1251 *state = vcpu->run_state;
1252 *sipi_vec = vcpu->sipi_vector;
1253 vcpu_unlock(vcpu);
1254
1255 return (0);
1256 }
1257
1258 int
1259 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1260 {
1261 struct vcpu *vcpu;
1262
1263 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1264 return (EINVAL);
1265 }
1266 if (!VRS_IS_VALID(state)) {
1267 return (EINVAL);
1268 }
1269
1270 vcpu = &vm->vcpu[vcpuid];
1271
1272 vcpu_lock(vcpu);
1273 vcpu->run_state = state;
1274 vcpu->sipi_vector = sipi_vec;
1275 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1276 vcpu_unlock(vcpu);
1277
1278 return (0);
1279 }
1280
1281
1282 static void
1283 restore_guest_fpustate(struct vcpu *vcpu)
1284 {
1285
1286 /* flush host state to the pcb */
1287 fpuexit(curthread);
1288
1289 /* restore guest FPU state */
1290 fpu_stop_emulating();
1291 fpurestore(vcpu->guestfpu);
1292
1293 /* restore guest XCR0 if XSAVE is enabled in the host */
1294 if (rcr4() & CR4_XSAVE)
1295 load_xcr(0, vcpu->guest_xcr0);
1296
1297 /*
1298 * The FPU is now "dirty" with the guest's state so turn on emulation
1299 * to trap any access to the FPU by the host.
1300 */
1301 fpu_start_emulating();
1302 }
1303
1304 static void
1305 save_guest_fpustate(struct vcpu *vcpu)
1306 {
1307
1308 if ((rcr0() & CR0_TS) == 0)
1309 panic("fpu emulation not enabled in host!");
1310
1311 /* save guest XCR0 and restore host XCR0 */
1312 if (rcr4() & CR4_XSAVE) {
1313 vcpu->guest_xcr0 = rxcr(0);
1314 load_xcr(0, vmm_get_host_xcr0());
1315 }
1316
1317 /* save guest FPU state */
1318 fpu_stop_emulating();
1319 fpusave(vcpu->guestfpu);
1320 #ifdef __FreeBSD__
1321 fpu_start_emulating();
1322 #else
1323 /*
1324 * Unlike the FreeBSD path above, CR0.TS is not re-enabled here:
1325 * illumos uses eager FPU handling rather than lazy restore traps.
1326 */
1327 #endif
1328 }
1329
1330 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
1331
1332 static int
1333 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1334 bool from_idle)
1335 {
1336 struct vcpu *vcpu;
1337 int error;
1338
1339 vcpu = &vm->vcpu[vcpuid];
1340 vcpu_assert_locked(vcpu);
1341
1342 /*
1343 * State transitions from the vmmdev_ioctl() must always begin from
1344 * the VCPU_IDLE state. This guarantees that there is only a single
1345 * ioctl() operating on a vcpu at any point.
1346 */
1347 if (from_idle) {
1348 while (vcpu->state != VCPU_IDLE) {
1349 vcpu->reqidle = 1;
1350 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1351 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1352 "idle requested", vcpu_state2str(vcpu->state));
1353 #ifdef __FreeBSD__
1354 msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
1355 #else
1356 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1357 #endif
1358 }
1359 } else {
1360 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1361 "vcpu idle state"));
1362 }
1363
1364 if (vcpu->state == VCPU_RUNNING) {
1365 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1366 "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1367 } else {
1368 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1369 "vcpu that is not running", vcpu->hostcpu));
1370 }
1371
1372 /*
1373 * The following state transitions are allowed:
1374 * IDLE -> FROZEN -> IDLE
1375 * FROZEN -> RUNNING -> FROZEN
1376 * FROZEN -> SLEEPING -> FROZEN
1377 */
1378 switch (vcpu->state) {
1379 case VCPU_IDLE:
1380 case VCPU_RUNNING:
1381 case VCPU_SLEEPING:
1382 error = (newstate != VCPU_FROZEN);
1383 break;
1384 case VCPU_FROZEN:
1385 error = (newstate == VCPU_FROZEN);
1386 break;
1387 default:
1388 error = 1;
1389 break;
1390 }
1391
1392 if (error)
1393 return (EBUSY);
1394
1395 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1396 vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1397
1398 vcpu->state = newstate;
1399 if (newstate == VCPU_RUNNING)
1400 vcpu->hostcpu = curcpu;
1401 else
1402 vcpu->hostcpu = NOCPU;
1403
1404 if (newstate == VCPU_IDLE) {
1405 #ifdef __FreeBSD__
1406 wakeup(&vcpu->state);
1407 #else
1408 cv_broadcast(&vcpu->state_cv);
1409 #endif
1410 }
1411
1412 return (0);
1413 }
1414
1415 static void
1416 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1417 {
1418 int error;
1419
1420 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1421 panic("Error %d setting state to %d\n", error, newstate);
1422 }
1423
1424 static void
1425 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1426 {
1427 int error;
1428
1429 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1430 panic("Error %d setting state to %d", error, newstate);
1431 }
1432
1433 /*
1434 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1435 */
1436 static int
1437 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1438 {
1439 struct vcpu *vcpu;
1440 int t, vcpu_halted, vm_halted;
1441 bool userspace_exit = false;
1442
1443 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1444
1445 vcpu = &vm->vcpu[vcpuid];
1446 vcpu_halted = 0;
1447 vm_halted = 0;
1448
1449 vcpu_lock(vcpu);
1450 while (1) {
1451 /*
1452 * Do a final check for pending interrupts (including NMI and
1453 * INIT) before putting this thread to sleep.
1454 */
1455 if (vm_nmi_pending(vm, vcpuid))
1456 break;
1457 if (vcpu_run_state_pending(vm, vcpuid))
1458 break;
1459 if (!intr_disabled) {
1460 if (vm_extint_pending(vm, vcpuid) ||
1461 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1462 break;
1463 }
1464 }
1465
1466 /*
1467 * Also check for software events which would cause a wake-up.
1468 * This will set the appropriate exitcode directly, rather than
1469 * requiring a trip through VM_RUN().
1470 */
1471 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1472 userspace_exit = true;
1473 break;
1474 }
1475
1476 /*
1477 * Some Linux guests implement "halt" by having all vcpus
1478 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1479 * track of the vcpus that have entered this state. When all
1480 * vcpus enter the halted state the virtual machine is halted.
1481 */
1482 if (intr_disabled) {
1483 if (!vcpu_halted && halt_detection_enabled) {
1484 vcpu_halted = 1;
1485 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1486 }
1487 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1488 vm_halted = 1;
1489 break;
1490 }
1491 }
1492
1493 t = ticks;
1494 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1495 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1496 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1497 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1498 }
1499
1500 if (vcpu_halted)
1501 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1502
1503 vcpu_unlock(vcpu);
1504
1505 if (vm_halted)
1506 vm_suspend(vm, VM_SUSPEND_HALT);
1507
1508 return (userspace_exit ? -1 : 0);
1509 }
1510
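/*
 * Handle a nested page fault taken during guest execution.  Accessed/dirty
 * bit faults are first emulated directly against the nested pmap; anything
 * else goes through a full vm_fault() on the guest vmspace.
 */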
1511 static int
1512 vm_handle_paging(struct vm *vm, int vcpuid)
1513 {
1514 int rv, ftype;
1515 struct vm_map *map;
1516 struct vcpu *vcpu;
1517 struct vm_exit *vme;
1518
1519 vcpu = &vm->vcpu[vcpuid];
1520 vme = &vcpu->exitinfo;
1521
1522 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1523 __func__, vme->inst_length));
1524
1525 ftype = vme->u.paging.fault_type;
1526 KASSERT(ftype == VM_PROT_READ ||
1527 ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1528 ("vm_handle_paging: invalid fault_type %d", ftype));
1529
1530 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1531 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1532 vme->u.paging.gpa, ftype);
1533 if (rv == 0) {
1534 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
1535 ftype == VM_PROT_READ ? "accessed" : "dirty",
1536 vme->u.paging.gpa);
1537 goto done;
1538 }
1539 }
1540
1541 map = &vm->vmspace->vm_map;
1542 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1543
1544 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1545 "ftype = %d", rv, vme->u.paging.gpa, ftype);
1546
1547 if (rv != KERN_SUCCESS)
1548 return (EFAULT);
1549 done:
1550 return (0);
1551 }
1552
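/*
 * Service an MMIO read against the in-kernel device models, dispatched by
 * guest physical address: the local APIC page, the vIOAPIC, or the vHPET.
 * Returns ESRCH if no in-kernel device claims the address.
 */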
1553 int
1554 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1555 int rsize)
1556 {
1557 int err = ESRCH;
1558
1559 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1560 err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1561 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1562 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1563 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1564 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1565 }
1566
1567 return (err);
1568 }
1569
1570 int
1571 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1572 int wsize)
1573 {
1574 int err = ESRCH;
1575
1576 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1577 err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1578 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1579 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1580 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1581 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1582 }
1583
1584 return (err);
1585 }
1586
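/*
 * Handle an exit requiring MMIO instruction emulation: fetch and decode the
 * faulting instruction (if not already done), then emulate it against the
 * in-kernel device models, falling back to userspace when that fails.
 */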
1587 static int
1588 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1589 {
1590 struct vie *vie;
1591 struct vcpu *vcpu;
1592 struct vm_exit *vme;
1593 uint64_t inst_addr;
1594 int error, fault, cs_d;
1595
1596 vcpu = &vm->vcpu[vcpuid];
1597 vme = &vcpu->exitinfo;
1598 vie = vcpu->vie_ctx;
1599
1600 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1601 __func__, vme->inst_length));
1602
1603 inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1604 cs_d = vme->u.mmio_emul.cs_d;
1605
1606 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1607 vme->u.mmio_emul.gpa);
1608
1609 /* Fetch the faulting instruction */
1610 if (vie_needs_fetch(vie)) {
1611 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1612 &fault);
1613 if (error != 0) {
1614 return (error);
1615 } else if (fault) {
1616 /*
1617 * If a fault was encountered during instruction fetch, it
1618 * will have asserted that the appropriate exception be
1619 * injected at next entry. No further work is required.
1620 */
1621 return (0);
1622 }
1623 }
1624
1625 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1626 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1627 inst_addr);
1628 /* Dump (unrecognized) instruction bytes in userspace */
1629 vie_fallback_exitinfo(vie, vme);
1630 return (-1);
1631 }
1632 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1633 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1634 /* Decoded GLA does not match GLA from VM exit state */
1635 vie_fallback_exitinfo(vie, vme);
1636 return (-1);
1637 }
1638
1639 repeat:
1640 error = vie_emulate_mmio(vie, vm, vcpuid);
1641 if (error < 0) {
1642 /*
1643 * MMIO not handled by any of the in-kernel-emulated devices, so
1644 * make a trip out to userspace for it.
1645 */
1646 vie_exitinfo(vie, vme);
1647 } else if (error == EAGAIN) {
1648 /*
1649 * Continue emulating the rep-prefixed instruction, which has
1650 * not completed its iterations.
1651 *
1652 * If this can be emulated in-kernel and has a high repetition
1653 * count (causing a tight spin), defer to any pending yield
1654 * conditions rather than monopolizing the host CPU.
1655 */
1656 if (!vcpu_should_yield(vm, vcpuid)) {
1657 goto repeat;
1658 } else {
1659 /*
1660 * Defer to the contending load by making a trip to
1661 * userspace with a no-op (BOGUS) exit reason.
1662 */
1663 vie_reset(vie);
1664 vme->exitcode = VM_EXITCODE_BOGUS;
1665 return (-1);
1666 }
1667 } else if (error == 0) {
1668 /* Update %rip now that instruction has been emulated */
1669 vie_advance_pc(vie, &vcpu->nextrip);
1670 }
1671 return (error);
1672 }
1673
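/*
 * Handle an exit for an in/out instruction.  Emulation is attempted against
 * the registered ioport handlers; anything unhandled (or any emulation
 * failure) results in a trip out to userspace.
 */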
1674 static int
1675 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1676 {
1677 struct vcpu *vcpu;
1678 struct vie *vie;
1679 int err;
1680
1681 vcpu = &vm->vcpu[vcpuid];
1682 vie = vcpu->vie_ctx;
1683
1684 repeat:
1685 err = vie_emulate_inout(vie, vm, vcpuid);
1686
1687 if (err < 0) {
1688 /*
1689 * In/out not handled by any of the in-kernel-emulated devices,
1690 * so make a trip out to userspace for it.
1691 */
1692 vie_exitinfo(vie, vme);
1693 return (err);
1694 } else if (err == EAGAIN) {
1695 /*
1696 * Continue emulating the rep-prefixed ins/outs instruction,
1697 * which has not completed its iterations.
1698 *
1699 * If this can be emulated in-kernel and has a high repetition
1700 * count (causing a tight spin), defer to any pending yield
1701 * conditions rather than monopolizing the host CPU.
1702 */
1703 if (!vcpu_should_yield(vm, vcpuid)) {
1704 goto repeat;
1705 } else {
1706 /*
1707 * Defer to the contending load by making a trip to
1708 * userspace with a no-op (BOGUS) exit reason.
1709 */
1710 vie_reset(vie);
1711 vme->exitcode = VM_EXITCODE_BOGUS;
1712 return (-1);
1713 }
1714 } else if (err != 0) {
1715 /* Emulation failure. Bail all the way out to userspace. */
1716 vme->exitcode = VM_EXITCODE_INST_EMUL;
1717 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1718 return (-1);
1719 }
1720
1721 vie_advance_pc(vie, &vcpu->nextrip);
1722 return (0);
1723 }
1724
1725 static int
1726 vm_handle_suspend(struct vm *vm, int vcpuid)
1727 {
1728 #ifdef __FreeBSD__
1729 int error, i;
1730 struct vcpu *vcpu;
1731 struct thread *td;
1732
1733 error = 0;
1734 vcpu = &vm->vcpu[vcpuid];
1735 td = curthread;
1736 #else
1737 int i;
1738 struct vcpu *vcpu;
1739
1740 vcpu = &vm->vcpu[vcpuid];
1741 #endif
1742
1743 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1744
1745 #ifdef __FreeBSD__
1746 /*
1747 * Wait until all 'active_cpus' have suspended themselves.
1748 *
1749 * Since a VM may be suspended at any time, including when one or
1750 * more vcpus are doing a rendezvous, the rendezvous handler must be
1751 * called while we wait in order to prevent a deadlock.
1752 */
1753 vcpu_lock(vcpu);
1754 while (error == 0) {
1755 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1756 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1757 break;
1758 }
1759
1760 if (vm->rendezvous_func == NULL) {
1761 VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1762 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1763 msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1764 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1765 if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
1766 vcpu_unlock(vcpu);
1767 error = thread_check_susp(td, false);
1768 vcpu_lock(vcpu);
1769 }
1770 } else {
1771 VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1772 vcpu_unlock(vcpu);
1773 error = vm_handle_rendezvous(vm, vcpuid);
1774 vcpu_lock(vcpu);
1775 }
1776 }
1777 vcpu_unlock(vcpu);
1778 #else
1779 vcpu_lock(vcpu);
1780 while (1) {
1781 int rc;
1782
1783 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1784 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1785 break;
1786 }
1787
1788 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1789 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1790 TR_CLOCK_TICK);
1791 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1792
1793 /*
1794 * If the userspace process driving the instance is killed, any
1795 * vCPUs yet to be marked suspended (because they are not
1796 * VM_RUN-ing in the kernel presently) will never reach that
1797 * state.
1798 *
1799 * To avoid vm_handle_suspend() getting stuck in the kernel
1800 * waiting for those vCPUs, offer a bail-out even though it
1801 * means returning without all vCPUs in a suspended state.
1802 */
1803 if (rc <= 0) {
1804 if ((curproc->p_flag & SEXITING) != 0) {
1805 break;
1806 }
1807 }
1808 }
1809 vcpu_unlock(vcpu);
1810
1811 #endif
1812
1813 /*
1814 * Wakeup the other sleeping vcpus and return to userspace.
1815 */
1816 for (i = 0; i < vm->maxcpus; i++) {
1817 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1818 vcpu_notify_event(vm, i);
1819 }
1820 }
1821
1822 return (-1);
1823 }
1824
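/*
 * The vcpu was asked to exit so that it can transition to the IDLE state
 * (see vcpu_set_state_locked()).  Clear the pending request and return to
 * userspace.
 */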
1825 static int
1826 vm_handle_reqidle(struct vm *vm, int vcpuid)
1827 {
1828 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1829
1830 vcpu_lock(vcpu);
1831 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1832 vcpu->reqidle = 0;
1833 vcpu_unlock(vcpu);
1834 return (-1);
1835 }
1836
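/*
 * Process pending INIT and SIPI events for this vcpu, sleeping until it
 * reaches a runnable state (VRS_RUN) or a software event forces an exit to
 * userspace.
 */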
1837 static int
1838 vm_handle_run_state(struct vm *vm, int vcpuid)
1839 {
1840 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1841 bool handled = false;
1842
1843 vcpu_lock(vcpu);
1844 while (1) {
1845 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1846 vcpu_unlock(vcpu);
1847 VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1848 vcpu_lock(vcpu);
1849
1850 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1851 vcpu->run_state |= VRS_INIT;
1852 }
1853
1854 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1855 (VRS_INIT | VRS_PEND_SIPI)) {
1856 const uint8_t vector = vcpu->sipi_vector;
1857
1858 vcpu_unlock(vcpu);
1859 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1860 vcpu_lock(vcpu);
1861
1862 vcpu->run_state &= ~VRS_PEND_SIPI;
1863 vcpu->run_state |= VRS_RUN;
1864 }
1865
1866 /*
1867 * If the vCPU is now in the running state, there is no need to
1868 * wait for anything prior to re-entry.
1869 */
1870 if ((vcpu->run_state & VRS_RUN) != 0) {
1871 handled = true;
1872 break;
1873 }
1874
1875 /*
1876 * Also check for software events which would cause a wake-up.
1877 * This will set the appropriate exitcode directly, rather than
1878 * requiring a trip through VM_RUN().
1879 */
1880 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1881 break;
1882 }
1883
1884 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1885 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1886 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1887 }
1888 vcpu_unlock(vcpu);
1889
1890 return (handled ? 0 : -1);
1891 }
1892
1893 #ifndef __FreeBSD__
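/*
 * Handle guest MSR writes which are emulated in-kernel.  Only writes to the
 * TSC are handled here, by adjusting the per-vcpu offset from the host TSC;
 * anything else returns -1 so the exit can be handled elsewhere.
 */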
1894 static int
1895 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1896 {
1897 struct vcpu *cpu = &vm->vcpu[vcpuid];
1898 const uint32_t code = vme->u.msr.code;
1899 const uint64_t val = vme->u.msr.wval;
1900
1901 switch (code) {
1902 case MSR_TSC:
1903 cpu->tsc_offset = val - rdtsc();
1904 return (0);
1905 }
1906
1907 return (-1);
1908 }
1909 #endif /* __FreeBSD__ */
1910
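/*
 * Initiate a VM-wide suspend with reason 'how'.  Only the first caller wins
 * the race to set vm->suspend; all active vcpus are then notified so they
 * exit and observe the suspend.
 */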
1911 int
1912 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1913 {
1914 int i;
1915
1916 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1917 return (EINVAL);
1918
1919 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1920 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1921 vm->suspend, how);
1922 return (EALREADY);
1923 }
1924
1925 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1926
1927 /*
1928 * Notify all active vcpus that they are now suspended.
1929 */
1930 for (i = 0; i < vm->maxcpus; i++) {
1931 if (CPU_ISSET(i, &vm->active_cpus))
1932 vcpu_notify_event(vm, i);
1933 }
1934
1935 return (0);
1936 }
1937
1938 void
1939 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1940 {
1941 struct vm_exit *vmexit;
1942
1943 vmexit = vm_exitinfo(vm, vcpuid);
1944 vmexit->rip = rip;
1945 vmexit->inst_length = 0;
1946 vmexit->exitcode = VM_EXITCODE_RUN_STATE;
1947 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
1948 }
1949
1950
1951 #ifndef __FreeBSD__
1952 /*
1953 * Some vmm resources, such as the lapic, may have CPU-specific resources
1954 * allocated to them which would benefit from migration onto the host CPU which
1955 * is processing the vcpu state.
1956 */
1957 static void
1958 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
1959 {
1960 /*
1961 * Localizing cyclic resources requires acquisition of cpu_lock, and
1962 * doing so with kpreempt disabled is a recipe for deadlock disaster.
1963 */
1964 VERIFY(curthread->t_preempt == 0);
1965
1966 /*
1967 * Do not bother with localization if this vCPU is about to return to
1968 * the host CPU it was last localized to.
1969 */
1970 if (vcpu->lastloccpu == curcpu)
1971 return;
1972
1973 /*
1974 * Localize system-wide resources to the primary boot vCPU. While any
1975 * of the other vCPUs may access them, it keeps the potential interrupt
1976 * footprint constrained to CPUs involved with this instance.
1977 */
1978 if (vcpu == &vm->vcpu[0]) {
1979 vhpet_localize_resources(vm->vhpet);
1980 vrtc_localize_resources(vm->vrtc);
1981 vatpit_localize_resources(vm->vatpit);
1982 }
1983
1984 vlapic_localize_resources(vcpu->vlapic);
1985
1986 vcpu->lastloccpu = curcpu;
1987 }
1988
1989 static void
1990 vmm_savectx(void *arg)
1991 {
1992 vm_thread_ctx_t *vtc = arg;
1993 struct vm *vm = vtc->vtc_vm;
1994 const int vcpuid = vtc->vtc_vcpuid;
1995
1996 if (ops->vmsavectx != NULL) {
1997 ops->vmsavectx(vm->cookie, vcpuid);
1998 }
1999
2000 /*
2001 * If the CPU holds the restored guest FPU state, save it and restore
2002 * the host FPU state before this thread goes off-cpu.
2003 */
2004 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2005 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2006
2007 save_guest_fpustate(vcpu);
2008 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2009 }
2010 }
2011
2012 static void
2013 vmm_restorectx(void *arg)
2014 {
2015 vm_thread_ctx_t *vtc = arg;
2016 struct vm *vm = vtc->vtc_vm;
2017 const int vcpuid = vtc->vtc_vcpuid;
2018
2019 /*
2020 * When coming back on-cpu, only restore the guest FPU status if the
2021 * thread is in a context marked as requiring it. This should be rare,
2022 * occurring only when a future logic error results in a voluntary
2023 * sleep during the VMRUN critical section.
2024 *
2025 * The common case will result in elision of the guest FPU state
2026 * restoration, deferring that action until it is clearly necessary
2027 * during vm_run.
2028 */
2029 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2030 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2031 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2032
2033 restore_guest_fpustate(vcpu);
2034 vtc->vtc_status |= VTCS_FPU_RESTORED;
2035 }
2036
2037 if (ops->vmrestorectx != NULL) {
2038 ops->vmrestorectx(vm->cookie, vcpuid);
2039 }
2040
2041 }
2042
2043 /*
2044 * If we're in removectx(), we might still have state to tidy up.
2045 */
2046 static void
2047 vmm_freectx(void *arg, int isexec)
2048 {
2049 vmm_savectx(arg);
2050 }
2051
2052 #endif /* __FreeBSD */
2053
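/*
 * Apply any actions requested by userspace for this VM_RUN entry, such as
 * fulfilling the data for a previously-exited MMIO or in/out operation, and
 * resume the interrupted instruction emulation with that data.
 */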
2054 static int
2055 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2056 struct vm_exit *vme)
2057 {
2058 struct vcpu *vcpu;
2059 struct vie *vie;
2060 int err;
2061
2062 vcpu = &vm->vcpu[vcpuid];
2063 vie = vcpu->vie_ctx;
2064 err = 0;
2065
2066 switch (entry->cmd) {
2067 case VEC_DEFAULT:
2068 return (0);
2069 case VEC_DISCARD_INSTR:
2070 vie_reset(vie);
2071 return (0);
2072 case VEC_FULFILL_MMIO:
2073 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2074 if (err == 0) {
2075 err = vie_emulate_mmio(vie, vm, vcpuid);
2076 if (err == 0) {
2077 vie_advance_pc(vie, &vcpu->nextrip);
2078 } else if (err < 0) {
2079 vie_exitinfo(vie, vme);
2080 } else if (err == EAGAIN) {
2081 /*
2082 * Clear the instruction emulation state in
2083 * order to re-enter VM context and continue
2084 * this 'rep <instruction>'
2085 */
2086 vie_reset(vie);
2087 err = 0;
2088 }
2089 }
2090 break;
2091 case VEC_FULFILL_INOUT:
2092 err = vie_fulfill_inout(vie, &entry->u.inout);
2093 if (err == 0) {
2094 err = vie_emulate_inout(vie, vm, vcpuid);
2095 if (err == 0) {
2096 vie_advance_pc(vie, &vcpu->nextrip);
2097 } else if (err < 0) {
2098 vie_exitinfo(vie, vme);
2099 } else if (err == EAGAIN) {
2100 /*
2101 * Clear the instruction emulation state in
2102 * order to re-enter VM context and continue
2103 * this 'rep ins/outs'
2104 */
2105 vie_reset(vie);
2106 err = 0;
2107 }
2108 }
2109 break;
2110 default:
2111 return (EINVAL);
2112 }
2113 return (err);
2114 }
2115
2116 static int
2117 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2118 {
2119 struct vie *vie;
2120
2121 vie = vm->vcpu[vcpuid].vie_ctx;
2122
2123 if (vie_pending(vie)) {
2124 /*
2125 * Userspace has not fulfilled the pending needs of the
2126 * instruction emulation, so bail back out.
2127 */
2128 vie_exitinfo(vie, vme);
2129 return (-1);
2130 }
2131
2132 return (0);
2133 }
2134
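/*
 * Run a vCPU: apply any entry actions from userspace, then repeatedly enter
 * the guest via VMRUN(), handling in-kernel exit reasons (idle requests,
 * run-state changes, HLT, paging, MMIO and in/out emulation, etc.) until an
 * exit must be passed back out to userspace.  On illumos, a thread context is
 * installed so guest FPU state is saved and restored across involuntary
 * context switches, and CPU affinity is pinned around the critical section.
 */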
2135 int
2136 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2137 {
2138 int error;
2139 struct vcpu *vcpu;
2140 #ifdef __FreeBSD__
2141 struct pcb *pcb;
2142 #endif
2143 uint64_t tscval;
2144 struct vm_exit *vme;
2145 bool intr_disabled;
2146 pmap_t pmap;
2147 #ifndef __FreeBSD__
2148 vm_thread_ctx_t vtc;
2149 int affinity_type = CPU_CURRENT;
2150 #endif
2151
2152 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2153 return (EINVAL);
2154
2155 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2156 return (EINVAL);
2157
2158 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2159 return (EINVAL);
2160
2161 pmap = vmspace_pmap(vm->vmspace);
2162 vcpu = &vm->vcpu[vcpuid];
2163 vme = &vcpu->exitinfo;
2164
2165 #ifndef __FreeBSD__
2166 vtc.vtc_vm = vm;
2167 vtc.vtc_vcpuid = vcpuid;
2168 vtc.vtc_status = 0;
2169
2170 installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2171 NULL, vmm_freectx);
2172 #endif
2173
2174 error = vm_entry_actions(vm, vcpuid, entry, vme);
2175 if (error != 0) {
2176 goto exit;
2177 }
2178
2179 restart:
2180 error = vm_loop_checks(vm, vcpuid, vme);
2181 if (error != 0) {
2182 goto exit;
2183 }
2184
2185 #ifndef __FreeBSD__
2186 thread_affinity_set(curthread, affinity_type);
2187 /*
2188 * Resource localization should happen after the CPU affinity for the
2189 * thread has been set to ensure that access from restricted contexts,
2190 * such as VMX-accelerated APIC operations, can occur without inducing
2191 * cyclic cross-calls.
2192 *
2193 * This must be done prior to disabling kpreempt via critical_enter().
2194 */
2195 vm_localize_resources(vm, vcpu);
2196
2197 affinity_type = CPU_CURRENT;
2198 #endif
2199
2200 critical_enter();
2201
2202 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2203 ("vm_run: absurd pm_active"));
2204
2205 tscval = rdtsc();
2206
2207 #ifdef __FreeBSD__
2208 pcb = PCPU_GET(curpcb);
2209 set_pcb_flags(pcb, PCB_FULL_IRET);
2210 #else
2211 /* Force a trip through update_sregs to reload %fs/%gs and friends */
2212 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2213 #endif
2214
2215 #ifdef __FreeBSD__
2216 restore_guest_fpustate(vcpu);
2217 #else
2218 if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2219 restore_guest_fpustate(vcpu);
2220 vtc.vtc_status |= VTCS_FPU_RESTORED;
2221 }
2222 vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2223 #endif
2224
2225 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2226 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2227 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2228
2229 #ifdef __FreeBSD__
2230 save_guest_fpustate(vcpu);
2231 #else
2232 vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2233 #endif
2234
2235 #ifndef __FreeBSD__
2236 /*
2237 * Once clear of the delicate contexts comprising the VM_RUN handler,
2238 * thread CPU affinity can be loosened while other processing occurs.
2239 */
2240 thread_affinity_clear(curthread);
2241 #endif
2242
2243 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
2244
2245 critical_exit();
2246
2247 if (error != 0) {
2248 /* Communicate out any error from VMRUN() above */
2249 goto exit;
2250 }
2251
2252 vcpu->nextrip = vme->rip + vme->inst_length;
2253 switch (vme->exitcode) {
2254 case VM_EXITCODE_REQIDLE:
2255 error = vm_handle_reqidle(vm, vcpuid);
2256 break;
2257 case VM_EXITCODE_RUN_STATE:
2258 error = vm_handle_run_state(vm, vcpuid);
2259 break;
2260 case VM_EXITCODE_SUSPENDED:
2261 error = vm_handle_suspend(vm, vcpuid);
2262 break;
2263 case VM_EXITCODE_IOAPIC_EOI:
2264 vioapic_process_eoi(vm, vcpuid,
2265 vme->u.ioapic_eoi.vector);
2266 break;
2267 case VM_EXITCODE_HLT:
2268 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2269 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2270 break;
2271 case VM_EXITCODE_PAGING:
2272 error = vm_handle_paging(vm, vcpuid);
2273 break;
2274 case VM_EXITCODE_MMIO_EMUL:
2275 error = vm_handle_mmio_emul(vm, vcpuid);
2276 break;
2277 case VM_EXITCODE_INOUT:
2278 error = vm_handle_inout(vm, vcpuid, vme);
2279 break;
2280 case VM_EXITCODE_MONITOR:
2281 case VM_EXITCODE_MWAIT:
2282 case VM_EXITCODE_VMINSN:
2283 vm_inject_ud(vm, vcpuid);
2284 break;
2285 #ifndef __FreeBSD__
2286 case VM_EXITCODE_WRMSR:
2287 if (vm_handle_wrmsr(vm, vcpuid, vme) != 0) {
2288 error = -1;
2289 }
2290 break;
2291
2292 case VM_EXITCODE_HT: {
2293 affinity_type = CPU_BEST;
2294 break;
2295 }
2296 #endif
2297
2298 case VM_EXITCODE_MTRAP:
2299 vm_suspend_cpu(vm, vcpuid);
2300 error = -1;
2301 break;
2302 default:
2303 /* handled in userland */
2304 error = -1;
2305 break;
2306 }
2307
2308 if (error == 0) {
2309 /* VM exit conditions handled in-kernel, continue running */
2310 goto restart;
2311 }
2312
2313 exit:
2314 #ifndef __FreeBSD__
2315 removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2316 NULL, vmm_freectx);
2317 #endif
2318
2319 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2320
2321 return (error);
2322 }
2323
2324 int
2325 vm_restart_instruction(void *arg, int vcpuid)
2326 {
2327 struct vm *vm;
2328 struct vcpu *vcpu;
2329 enum vcpu_state state;
2330 uint64_t rip;
2331 int error;
2332
2333 vm = arg;
2334 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2335 return (EINVAL);
2336
2337 vcpu = &vm->vcpu[vcpuid];
2338 state = vcpu_get_state(vm, vcpuid, NULL);
2339 if (state == VCPU_RUNNING) {
2340 /*
2341 * When a vcpu is "running" the next instruction is determined
2342 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2343 * Thus setting 'inst_length' to zero will cause the current
2344 * instruction to be restarted.
2345 */
2346 vcpu->exitinfo.inst_length = 0;
2347 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2348 "setting inst_length to zero", vcpu->exitinfo.rip);
2349 } else if (state == VCPU_FROZEN) {
2350 /*
2351 * When a vcpu is "frozen" it is outside the critical section
2352 * around VMRUN() and 'nextrip' points to the next instruction.
2353 * Thus instruction restart is achieved by setting 'nextrip'
2354 * to the vcpu's %rip.
2355 */
2356 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2357 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2358 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2359 "nextrip from %lx to %lx", vcpu->nextrip, rip);
2360 vcpu->nextrip = rip;
2361 } else {
2362 panic("%s: invalid state %d", __func__, state);
2363 }
2364 return (0);
2365 }
2366
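/*
 * Record exit-time interrupt information for a vCPU: the event (if any) whose
 * delivery was cut short by a VM exit and which must be re-injected later.
 * The type, vector, and reserved bits are validated before being stored.
 */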
2367 int
2368 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2369 {
2370 struct vcpu *vcpu;
2371 int type, vector;
2372
2373 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2374 return (EINVAL);
2375
2376 vcpu = &vm->vcpu[vcpuid];
2377
2378 if (info & VM_INTINFO_VALID) {
2379 type = info & VM_INTINFO_TYPE;
2380 vector = info & 0xff;
2381 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2382 return (EINVAL);
2383 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2384 return (EINVAL);
2385 if (info & VM_INTINFO_RSVD)
2386 return (EINVAL);
2387 } else {
2388 info = 0;
2389 }
2390 VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2391 vcpu->exitintinfo = info;
2392 return (0);
2393 }
2394
2395 enum exc_class {
2396 EXC_BENIGN,
2397 EXC_CONTRIBUTORY,
2398 EXC_PAGEFAULT
2399 };
2400
2401 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */
2402
2403 static enum exc_class
2404 exception_class(uint64_t info)
2405 {
2406 int type, vector;
2407
2408 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2409 type = info & VM_INTINFO_TYPE;
2410 vector = info & 0xff;
2411
2412 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2413 switch (type) {
2414 case VM_INTINFO_HWINTR:
2415 case VM_INTINFO_SWINTR:
2416 case VM_INTINFO_NMI:
2417 return (EXC_BENIGN);
2418 default:
2419 /*
2420 * Hardware exception.
2421 *
2422 * SVM and VT-x use identical type values to represent NMI,
2423 * hardware interrupt and software interrupt.
2424 *
2425 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2426 * for exceptions except #BP and #OF. #BP and #OF use a type
2427 * value of '5' or '6'. Therefore we don't check for explicit
2428 * values of 'type' to classify 'intinfo' into a hardware
2429 * exception.
2430 */
2431 break;
2432 }
2433
2434 switch (vector) {
2435 case IDT_PF:
2436 case IDT_VE:
2437 return (EXC_PAGEFAULT);
2438 case IDT_DE:
2439 case IDT_TS:
2440 case IDT_NP:
2441 case IDT_SS:
2442 case IDT_GP:
2443 return (EXC_CONTRIBUTORY);
2444 default:
2445 return (EXC_BENIGN);
2446 }
2447 }
2448
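/*
 * Apply the double-fault rules from the Intel SDM to a pair of events: info1
 * describes the event whose delivery was interrupted, info2 the new exception
 * raised while delivering it.  Two contributory exceptions, or a page fault
 * followed by a contributory exception or another page fault, collapse into
 * #DF (e.g. a #GP raised while delivering a #NP becomes #DF); a fault while
 * delivering #DF suspends the VM with a triple fault.  Otherwise the second
 * event is delivered and the two are handled serially.  Returns 1 if
 * *retinfo holds an event to inject, 0 for the triple-fault case.
 */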
2449 static int
2450 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2451 uint64_t *retinfo)
2452 {
2453 enum exc_class exc1, exc2;
2454 int type1, vector1;
2455
2456 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2457 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2458
2459 /*
2460 * If an exception occurs while attempting to call the double-fault
2461 * handler the processor enters shutdown mode (aka triple fault).
2462 */
2463 type1 = info1 & VM_INTINFO_TYPE;
2464 vector1 = info1 & 0xff;
2465 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2466 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2467 info1, info2);
2468 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2469 *retinfo = 0;
2470 return (0);
2471 }
2472
2473 /*
2474 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2475 */
2476 exc1 = exception_class(info1);
2477 exc2 = exception_class(info2);
2478 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2479 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2480 /* Convert nested fault into a double fault. */
2481 *retinfo = IDT_DF;
2482 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2483 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2484 } else {
2485 /* Handle exceptions serially */
2486 *retinfo = info2;
2487 }
2488 return (1);
2489 }
2490
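/*
 * Pack a pending exception into the VM_INTINFO format used for event
 * injection: vector in bits 7:0, type and valid flags, and (when applicable)
 * the error code in the upper 32 bits.
 */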
2491 static uint64_t
2492 vcpu_exception_intinfo(struct vcpu *vcpu)
2493 {
2494 uint64_t info = 0;
2495
2496 if (vcpu->exception_pending) {
2497 info = vcpu->exc_vector & 0xff;
2498 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2499 if (vcpu->exc_errcode_valid) {
2500 info |= VM_INTINFO_DEL_ERRCODE;
2501 info |= (uint64_t)vcpu->exc_errcode << 32;
2502 }
2503 }
2504 return (info);
2505 }
2506
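/*
 * Gather the event, if any, which must be injected on the upcoming VM entry.
 * Exit-time intinfo (info1) and any pending exception (info2) are consumed;
 * if both are valid, nested_fault() decides whether they collapse into a
 * double fault or suspend the VM.  Returns non-zero if *retinfo is valid.
 */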
2507 int
2508 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2509 {
2510 struct vcpu *vcpu;
2511 uint64_t info1, info2;
2512 int valid;
2513
2514 KASSERT(vcpuid >= 0 &&
2515 vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2516
2517 vcpu = &vm->vcpu[vcpuid];
2518
2519 info1 = vcpu->exitintinfo;
2520 vcpu->exitintinfo = 0;
2521
2522 info2 = 0;
2523 if (vcpu->exception_pending) {
2524 info2 = vcpu_exception_intinfo(vcpu);
2525 vcpu->exception_pending = 0;
2526 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2527 vcpu->exc_vector, info2);
2528 }
2529
2530 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2531 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2532 } else if (info1 & VM_INTINFO_VALID) {
2533 *retinfo = info1;
2534 valid = 1;
2535 } else if (info2 & VM_INTINFO_VALID) {
2536 *retinfo = info2;
2537 valid = 1;
2538 } else {
2539 valid = 0;
2540 }
2541
2542 if (valid) {
2543 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2544 "retinfo(%lx)", __func__, info1, info2, *retinfo);
2545 }
2546
2547 return (valid);
2548 }
2549
2550 int
2551 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2552 {
2553 struct vcpu *vcpu;
2554
2555 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2556 return (EINVAL);
2557
2558 vcpu = &vm->vcpu[vcpuid];
2559 *info1 = vcpu->exitintinfo;
2560 *info2 = vcpu_exception_intinfo(vcpu);
2561 return (0);
2562 }
2563
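/*
 * Queue a hardware exception for injection into the guest.  NMI (vector 2)
 * and #DF may not be injected this way, only one exception may be pending at
 * a time, the error code is dropped when the guest is in real mode, and any
 * "STI"/"MOV SS" interrupt shadow is cleared per the SDM.  The faulting
 * instruction is optionally restarted so it is re-executed after the handler.
 */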
2564 int
2565 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2566 uint32_t errcode, int restart_instruction)
2567 {
2568 struct vcpu *vcpu;
2569 uint64_t regval;
2570 int error;
2571
2572 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2573 return (EINVAL);
2574
2575 if (vector < 0 || vector >= 32)
2576 return (EINVAL);
2577
2578 /*
2579 * NMIs (which bear an exception vector of 2) are to be injected via
2580 * their own specialized path using vm_inject_nmi().
2581 */
2582 if (vector == 2) {
2583 return (EINVAL);
2584 }
2585
2586 /*
2587 * A double fault exception should never be injected directly into
2588 * the guest. It is a derived exception that results from specific
2589 * combinations of nested faults.
2590 */
2591 if (vector == IDT_DF)
2592 return (EINVAL);
2593
2594 vcpu = &vm->vcpu[vcpuid];
2595
2596 if (vcpu->exception_pending) {
2597 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2598 "pending exception %d", vector, vcpu->exc_vector);
2599 return (EBUSY);
2600 }
2601
2602 if (errcode_valid) {
2603 /*
2604 * Exceptions don't deliver an error code in real mode.
2605 */
2606 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2607 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2608 if (!(regval & CR0_PE))
2609 errcode_valid = 0;
2610 }
2611
2612 /*
2613 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2614 *
2615 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2616 * one instruction or incurs an exception.
2617 */
2618 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2619 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2620 __func__, error));
2621
2622 if (restart_instruction)
2623 vm_restart_instruction(vm, vcpuid);
2624
2625 vcpu->exception_pending = 1;
2626 vcpu->exc_vector = vector;
2627 vcpu->exc_errcode = errcode;
2628 vcpu->exc_errcode_valid = errcode_valid;
2629 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2630 return (0);
2631 }
2632
2633 void
2634 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2635 int errcode)
2636 {
2637 int error;
2638
2639 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2640 errcode, 1);
2641 KASSERT(error == 0, ("vm_inject_exception error %d", error));
2642 }
2643
2644 void
2645 vm_inject_ud(struct vm *vm, int vcpuid)
2646 {
2647 vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2648 }
2649
2650 void
2651 vm_inject_gp(struct vm *vm, int vcpuid)
2652 {
2653 vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2654 }
2655
2656 void
2657 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2658 {
2659 vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2660 }
2661
2662 void
2663 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2664 {
2665 vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2666 }
2667
2668 void
2669 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2670 {
2671 int error;
2672
2673 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2674 error_code, cr2);
2675
2676 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2677 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2678
2679 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2680 }
2681
2682 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2683
2684 int
2685 vm_inject_nmi(struct vm *vm, int vcpuid)
2686 {
2687 struct vcpu *vcpu;
2688
2689 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2690 return (EINVAL);
2691
2692 vcpu = &vm->vcpu[vcpuid];
2693
2694 vcpu->nmi_pending = 1;
2695 vcpu_notify_event(vm, vcpuid);
2696 return (0);
2697 }
2698
2699 int
2700 vm_nmi_pending(struct vm *vm, int vcpuid)
2701 {
2702 struct vcpu *vcpu;
2703
2704 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2705 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2706
2707 vcpu = &vm->vcpu[vcpuid];
2708
2709 return (vcpu->nmi_pending);
2710 }
2711
2712 void
2713 vm_nmi_clear(struct vm *vm, int vcpuid)
2714 {
2715 struct vcpu *vcpu;
2716
2717 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2718 		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
2719
2720 vcpu = &vm->vcpu[vcpuid];
2721
2722 if (vcpu->nmi_pending == 0)
2723 panic("vm_nmi_clear: inconsistent nmi_pending state");
2724
2725 vcpu->nmi_pending = 0;
2726 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2727 }
2728
2729 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2730
2731 int
2732 vm_inject_extint(struct vm *vm, int vcpuid)
2733 {
2734 struct vcpu *vcpu;
2735
2736 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2737 return (EINVAL);
2738
2739 vcpu = &vm->vcpu[vcpuid];
2740
2741 vcpu->extint_pending = 1;
2742 vcpu_notify_event(vm, vcpuid);
2743 return (0);
2744 }
2745
2746 int
2747 vm_extint_pending(struct vm *vm, int vcpuid)
2748 {
2749 struct vcpu *vcpu;
2750
2751 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2752 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2753
2754 vcpu = &vm->vcpu[vcpuid];
2755
2756 return (vcpu->extint_pending);
2757 }
2758
2759 void
2760 vm_extint_clear(struct vm *vm, int vcpuid)
2761 {
2762 struct vcpu *vcpu;
2763
2764 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2765 		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
2766
2767 vcpu = &vm->vcpu[vcpuid];
2768
2769 if (vcpu->extint_pending == 0)
2770 panic("vm_extint_clear: inconsistent extint_pending state");
2771
2772 vcpu->extint_pending = 0;
2773 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2774 }
2775
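/*
 * Post an INIT to a vCPU.  The request is latched in the run_state and the
 * vCPU is kicked so the pending state change is picked up by the run-state
 * handling in vm_run() (VM_EXITCODE_RUN_STATE).
 */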
2776 int
2777 vm_inject_init(struct vm *vm, int vcpuid)
2778 {
2779 struct vcpu *vcpu;
2780
2781 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2782 return (EINVAL);
2783
2784 vcpu = &vm->vcpu[vcpuid];
2785 vcpu_lock(vcpu);
2786 vcpu->run_state |= VRS_PEND_INIT;
2787 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2788 vcpu_unlock(vcpu);
2789 return (0);
2790 }
2791
2792 int
2793 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2794 {
2795 struct vcpu *vcpu;
2796
2797 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2798 return (EINVAL);
2799
2800 vcpu = &vm->vcpu[vcpuid];
2801 vcpu_lock(vcpu);
2802 vcpu->run_state |= VRS_PEND_SIPI;
2803 vcpu->sipi_vector = vector;
2804 /* SIPI is only actionable if the CPU is waiting in INIT state */
2805 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2806 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2807 }
2808 vcpu_unlock(vcpu);
2809 return (0);
2810 }
2811
2812 bool
2813 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2814 {
2815 struct vcpu *vcpu;
2816
2817 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2818 vcpu = &vm->vcpu[vcpuid];
2819
2820 /* Of interest: vCPU not in running state or with pending INIT */
2821 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2822 }
2823
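/*
 * Reset vCPU register state to the architectural power-on/INIT defaults:
 * general purpose, control, and debug registers are cleared or set to their
 * defined values (RFLAGS=2, %rip=0xfff0, CR0=0x60000010), the segment
 * registers are loaded with real-mode descriptors, and pending event state
 * is discarded.  A full reset (init_only == false) additionally resets
 * %xcr0 and the guest FPU save area, which an INIT leaves untouched.
 */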
2824 int
2825 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2826 {
2827 struct seg_desc desc;
2828 const enum vm_reg_name clear_regs[] = {
2829 VM_REG_GUEST_CR2,
2830 VM_REG_GUEST_CR3,
2831 VM_REG_GUEST_CR4,
2832 VM_REG_GUEST_RAX,
2833 VM_REG_GUEST_RBX,
2834 VM_REG_GUEST_RCX,
2835 VM_REG_GUEST_RSI,
2836 VM_REG_GUEST_RDI,
2837 VM_REG_GUEST_RBP,
2838 VM_REG_GUEST_RSP,
2839 VM_REG_GUEST_R8,
2840 VM_REG_GUEST_R9,
2841 VM_REG_GUEST_R10,
2842 VM_REG_GUEST_R11,
2843 VM_REG_GUEST_R12,
2844 VM_REG_GUEST_R13,
2845 VM_REG_GUEST_R14,
2846 VM_REG_GUEST_R15,
2847 VM_REG_GUEST_DR0,
2848 VM_REG_GUEST_DR1,
2849 VM_REG_GUEST_DR2,
2850 VM_REG_GUEST_DR3,
2851 VM_REG_GUEST_EFER,
2852 };
2853 const enum vm_reg_name data_segs[] = {
2854 VM_REG_GUEST_SS,
2855 VM_REG_GUEST_DS,
2856 VM_REG_GUEST_ES,
2857 VM_REG_GUEST_FS,
2858 VM_REG_GUEST_GS,
2859 };
2860 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2861
2862 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2863 return (EINVAL);
2864
2865 for (uint_t i = 0; i < nitems(clear_regs); i++) {
2866 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2867 }
2868
2869 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2870 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2871 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2872
2873 /*
2874 * The prescribed contents of %rdx differ slightly between the Intel and
2875 * AMD architectural definitions. The former expects the Extended Model
2876 	 * in bits 16-19, where the latter expects the Family, Model, and
2877 	 * Stepping to all be there.  Common boot ROMs appear to disregard this
2878 	 * anyway, so we stick with a compromise value similar to what is
2879 * spelled out in the Intel SDM.
2880 */
2881 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2882
2883 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2884 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2885
2886 /* CS: Present, R/W, Accessed */
2887 desc.access = 0x0093;
2888 desc.base = 0xffff0000;
2889 desc.limit = 0xffff;
2890 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2891 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2892
2893 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2894 desc.access = 0x0093;
2895 desc.base = 0;
2896 desc.limit = 0xffff;
2897 for (uint_t i = 0; i < nitems(data_segs); i++) {
2898 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2899 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2900 }
2901
2902 /* GDTR, IDTR */
2903 desc.base = 0;
2904 desc.limit = 0xffff;
2905 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2906 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2907
2908 /* LDTR: Present, LDT */
2909 desc.access = 0x0082;
2910 desc.base = 0;
2911 desc.limit = 0xffff;
2912 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2913 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2914
2915 /* TR: Present, 32-bit TSS */
2916 desc.access = 0x008b;
2917 desc.base = 0;
2918 desc.limit = 0xffff;
2919 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2920 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2921
2922 vlapic_reset(vm_lapic(vm, vcpuid));
2923
2924 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2925
2926 vcpu->exitintinfo = 0;
2927 vcpu->exception_pending = 0;
2928 vcpu->nmi_pending = 0;
2929 vcpu->extint_pending = 0;
2930
2931 /*
2932 * A CPU reset caused by power-on or system reset clears more state than
2933 	 * one which is triggered by an INIT IPI.
2934 */
2935 if (!init_only) {
2936 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2937 fpu_save_area_reset(vcpu->guestfpu);
2938
2939 /* XXX: clear MSRs and other pieces */
2940 }
2941
2942 return (0);
2943 }
2944
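/*
 * Point a vCPU at its SIPI startup trampoline: %cs is loaded with a selector
 * of (vector << 8) and a base of (vector << 12) and %rip is zeroed, so
 * execution begins at physical address vector * 4KiB.  For example, a SIPI
 * vector of 0x9a starts the AP at 0x9a000 with %cs:%ip = 9a00:0000.
 */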
2945 static int
2946 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2947 {
2948 struct seg_desc desc;
2949
2950 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2951 return (EINVAL);
2952
2953 /* CS: Present, R/W, Accessed */
2954 desc.access = 0x0093;
2955 desc.base = (uint64_t)vector << 12;
2956 desc.limit = 0xffff;
2957 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2958 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2959 (uint64_t)vector << 8));
2960
2961 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2962
2963 return (0);
2964 }
2965
2966 int
2967 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2968 {
2969 if (vcpu < 0 || vcpu >= vm->maxcpus)
2970 return (EINVAL);
2971
2972 if (type < 0 || type >= VM_CAP_MAX)
2973 return (EINVAL);
2974
2975 return (VMGETCAP(vm->cookie, vcpu, type, retval));
2976 }
2977
2978 int
2979 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
2980 {
2981 if (vcpu < 0 || vcpu >= vm->maxcpus)
2982 return (EINVAL);
2983
2984 if (type < 0 || type >= VM_CAP_MAX)
2985 return (EINVAL);
2986
2987 return (VMSETCAP(vm->cookie, vcpu, type, val));
2988 }
2989
2990 struct vlapic *
2991 vm_lapic(struct vm *vm, int cpu)
2992 {
2993 return (vm->vcpu[cpu].vlapic);
2994 }
2995
2996 struct vioapic *
2997 vm_ioapic(struct vm *vm)
2998 {
2999
3000 return (vm->vioapic);
3001 }
3002
3003 struct vhpet *
3004 vm_hpet(struct vm *vm)
3005 {
3006
3007 return (vm->vhpet);
3008 }
3009
3010 #ifdef __FreeBSD__
3011 bool
3012 vmm_is_pptdev(int bus, int slot, int func)
3013 {
3014 int b, f, i, n, s;
3015 char *val, *cp, *cp2;
3016 bool found;
3017
3018 /*
3019 * XXX
3020 * The length of an environment variable is limited to 128 bytes which
3021 * puts an upper limit on the number of passthru devices that may be
3022 * specified using a single environment variable.
3023 *
3024 * Work around this by scanning multiple environment variable
3025 * names instead of a single one - yuck!
3026 */
3027 const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
3028
3029 /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
3030 found = false;
3031 for (i = 0; names[i] != NULL && !found; i++) {
3032 cp = val = kern_getenv(names[i]);
3033 while (cp != NULL && *cp != '\0') {
3034 if ((cp2 = strchr(cp, ' ')) != NULL)
3035 *cp2 = '\0';
3036
3037 n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
3038 if (n == 3 && bus == b && slot == s && func == f) {
3039 found = true;
3040 break;
3041 }
3042
3043 if (cp2 != NULL)
3044 *cp2++ = ' ';
3045
3046 cp = cp2;
3047 }
3048 freeenv(val);
3049 }
3050 return (found);
3051 }
3052 #endif
3053
3054 void *
3055 vm_iommu_domain(struct vm *vm)
3056 {
3057
3058 return (vm->iommu);
3059 }
3060
3061 int
3062 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3063 bool from_idle)
3064 {
3065 int error;
3066 struct vcpu *vcpu;
3067
3068 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3069 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3070
3071 vcpu = &vm->vcpu[vcpuid];
3072
3073 vcpu_lock(vcpu);
3074 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3075 vcpu_unlock(vcpu);
3076
3077 return (error);
3078 }
3079
3080 enum vcpu_state
3081 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3082 {
3083 struct vcpu *vcpu;
3084 enum vcpu_state state;
3085
3086 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3087 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3088
3089 vcpu = &vm->vcpu[vcpuid];
3090
3091 vcpu_lock(vcpu);
3092 state = vcpu->state;
3093 if (hostcpu != NULL)
3094 *hostcpu = vcpu->hostcpu;
3095 vcpu_unlock(vcpu);
3096
3097 return (state);
3098 }
3099
3100 #ifndef __FreeBSD__
3101 uint64_t
3102 vcpu_tsc_offset(struct vm *vm, int vcpuid)
3103 {
3104 return (vm->vcpu[vcpuid].tsc_offset);
3105 }
3106 #endif /* __FreeBSD__ */
3107
3108 int
3109 vm_activate_cpu(struct vm *vm, int vcpuid)
3110 {
3111
3112 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3113 return (EINVAL);
3114
3115 if (CPU_ISSET(vcpuid, &vm->active_cpus))
3116 return (EBUSY);
3117
3118 VCPU_CTR0(vm, vcpuid, "activated");
3119 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3120 return (0);
3121 }
3122
3123 int
3124 vm_suspend_cpu(struct vm *vm, int vcpuid)
3125 {
3126 int i;
3127
3128 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3129 return (EINVAL);
3130
3131 if (vcpuid == -1) {
3132 vm->debug_cpus = vm->active_cpus;
3133 for (i = 0; i < vm->maxcpus; i++) {
3134 if (CPU_ISSET(i, &vm->active_cpus))
3135 vcpu_notify_event(vm, i);
3136 }
3137 } else {
3138 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3139 return (EINVAL);
3140
3141 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3142 vcpu_notify_event(vm, vcpuid);
3143 }
3144 return (0);
3145 }
3146
3147 int
3148 vm_resume_cpu(struct vm *vm, int vcpuid)
3149 {
3150
3151 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3152 return (EINVAL);
3153
3154 if (vcpuid == -1) {
3155 CPU_ZERO(&vm->debug_cpus);
3156 } else {
3157 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3158 return (EINVAL);
3159
3160 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3161 }
3162 return (0);
3163 }
3164
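/*
 * Check for conditions which require a vCPU to bail out to userspace rather
 * than (re)enter the guest: VM suspension, an idle request, a pending
 * host-side yield (AST), or the vCPU being held for debug.  The exitcode (and
 * %rip, when bailing out at VM entry) is filled into the vm_exit structure.
 */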
3165 static bool
3166 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3167 uint64_t entry_rip)
3168 {
3169 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3170 struct vm_exit *vme = &vcpu->exitinfo;
3171 bool bail = false;
3172
3173 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3174
3175 if (vm->suspend) {
3176 if (on_entry) {
3177 VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3178 vm->suspend < VM_SUSPEND_LAST);
3179
3180 vme->exitcode = VM_EXITCODE_SUSPENDED;
3181 vme->u.suspended.how = vm->suspend;
3182 } else {
3183 /*
3184 * Handling VM suspend is complicated, so if that
3185 * condition is detected outside of VM-entry itself,
3186 * just emit a BOGUS exitcode so we take a lap to pick
3187 * up the event during an entry and are directed into
3188 * the vm_handle_suspend() logic.
3189 */
3190 vme->exitcode = VM_EXITCODE_BOGUS;
3191 }
3192 bail = true;
3193 }
3194 if (vcpu->reqidle) {
3195 vme->exitcode = VM_EXITCODE_REQIDLE;
3196 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3197
3198 if (!on_entry) {
3199 /*
3200 * A reqidle request detected outside of VM-entry can be
3201 * handled directly by clearing the request (and taking
3202 * a lap to userspace).
3203 */
3204 vcpu_assert_locked(vcpu);
3205 vcpu->reqidle = 0;
3206 }
3207 bail = true;
3208 }
3209 if (vcpu_should_yield(vm, vcpuid)) {
3210 vme->exitcode = VM_EXITCODE_BOGUS;
3211 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3212 bail = true;
3213 }
3214 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3215 vme->exitcode = VM_EXITCODE_DEBUG;
3216 bail = true;
3217 }
3218
3219 if (bail) {
3220 if (on_entry) {
3221 /*
3222 * If bailing out during VM-entry, the current %rip must
3223 * be recorded in the exitinfo.
3224 */
3225 vme->rip = entry_rip;
3226 }
3227 vme->inst_length = 0;
3228 }
3229 return (bail);
3230 }
3231
3232 static bool
3233 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3234 {
3235 /*
3236 	 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3237 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3238 * structure, and we would only modify the exitcode.
3239 */
3240 return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3241 }
3242
3243 bool
3244 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3245 {
3246 /*
3247 * Bail-out checks done as part of VM entry require an updated %rip to
3248 * populate the vm_exit struct if any of the conditions of interest are
3249 * matched in the check.
3250 */
3251 return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3252 }
3253
3254 cpuset_t
3255 vm_active_cpus(struct vm *vm)
3256 {
3257
3258 return (vm->active_cpus);
3259 }
3260
3261 cpuset_t
3262 vm_debug_cpus(struct vm *vm)
3263 {
3264
3265 return (vm->debug_cpus);
3266 }
3267
3268 cpuset_t
3269 vm_suspended_cpus(struct vm *vm)
3270 {
3271
3272 return (vm->suspended_cpus);
3273 }
3274
3275 void *
3276 vcpu_stats(struct vm *vm, int vcpuid)
3277 {
3278
3279 return (vm->vcpu[vcpuid].stats);
3280 }
3281
3282 int
3283 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3284 {
3285 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3286 return (EINVAL);
3287
3288 *state = vm->vcpu[vcpuid].x2apic_state;
3289
3290 return (0);
3291 }
3292
3293 int
3294 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3295 {
3296 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3297 return (EINVAL);
3298
3299 if (state >= X2APIC_STATE_LAST)
3300 return (EINVAL);
3301
3302 vm->vcpu[vcpuid].x2apic_state = state;
3303
3304 vlapic_set_x2apic_state(vm, vcpuid, state);
3305
3306 return (0);
3307 }
3308
3309 /*
3310 * This function is called to ensure that a vcpu "sees" a pending event
3311 * as soon as possible:
3312 * - If the vcpu thread is sleeping then it is woken up.
3313 * - If the vcpu is running on a different host_cpu then an IPI will be directed
3314 * to the host_cpu to cause the vcpu to trap into the hypervisor.
3315 */
3316 static void
3317 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3318 {
3319 int hostcpu;
3320
3321 	ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3322
3323 hostcpu = vcpu->hostcpu;
3324 if (vcpu->state == VCPU_RUNNING) {
3325 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3326 if (hostcpu != curcpu) {
3327 if (ntype == VCPU_NOTIFY_APIC) {
3328 vlapic_post_intr(vcpu->vlapic, hostcpu,
3329 vmm_ipinum);
3330 } else {
3331 ipi_cpu(hostcpu, vmm_ipinum);
3332 }
3333 } else {
3334 /*
3335 * If the 'vcpu' is running on 'curcpu' then it must
3336 * be sending a notification to itself (e.g. SELF_IPI).
3337 * The pending event will be picked up when the vcpu
3338 * transitions back to guest context.
3339 */
3340 }
3341 } else {
3342 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3343 "with hostcpu %d", vcpu->state, hostcpu));
3344 if (vcpu->state == VCPU_SLEEPING) {
3345 #ifdef __FreeBSD__
3346 wakeup_one(vcpu);
3347 #else
3348 cv_signal(&vcpu->vcpu_cv);
3349 #endif
3350 }
3351 }
3352 }
3353
3354 void
3355 vcpu_notify_event(struct vm *vm, int vcpuid)
3356 {
3357 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3358
3359 vcpu_lock(vcpu);
3360 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3361 vcpu_unlock(vcpu);
3362 }
3363
3364 void
3365 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3366 {
3367 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3368
3369 if (ntype == VCPU_NOTIFY_NONE) {
3370 return;
3371 }
3372
3373 vcpu_lock(vcpu);
3374 vcpu_notify_event_locked(vcpu, ntype);
3375 vcpu_unlock(vcpu);
3376 }
3377
3378 struct vmspace *
3379 vm_get_vmspace(struct vm *vm)
3380 {
3381
3382 return (vm->vmspace);
3383 }
3384
3385 int
3386 vm_apicid2vcpuid(struct vm *vm, int apicid)
3387 {
3388 /*
3389 * XXX apic id is assumed to be numerically identical to vcpu id
3390 */
3391 return (apicid);
3392 }
3393
3394 struct vatpic *
3395 vm_atpic(struct vm *vm)
3396 {
3397 return (vm->vatpic);
3398 }
3399
3400 struct vatpit *
3401 vm_atpit(struct vm *vm)
3402 {
3403 return (vm->vatpit);
3404 }
3405
3406 struct vpmtmr *
3407 vm_pmtmr(struct vm *vm)
3408 {
3409
3410 return (vm->vpmtmr);
3411 }
3412
3413 struct vrtc *
3414 vm_rtc(struct vm *vm)
3415 {
3416
3417 return (vm->vrtc);
3418 }
3419
3420 enum vm_reg_name
3421 vm_segment_name(int seg)
3422 {
3423 static enum vm_reg_name seg_names[] = {
3424 VM_REG_GUEST_ES,
3425 VM_REG_GUEST_CS,
3426 VM_REG_GUEST_SS,
3427 VM_REG_GUEST_DS,
3428 VM_REG_GUEST_FS,
3429 VM_REG_GUEST_GS
3430 };
3431
3432 KASSERT(seg >= 0 && seg < nitems(seg_names),
3433 ("%s: invalid segment encoding %d", __func__, seg));
3434 return (seg_names[seg]);
3435 }
3436
3437 void
3438 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3439 int num_copyinfo)
3440 {
3441 int idx;
3442
3443 for (idx = 0; idx < num_copyinfo; idx++) {
3444 if (copyinfo[idx].cookie != NULL)
3445 vm_gpa_release(copyinfo[idx].cookie);
3446 }
3447 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3448 }
3449
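/*
 * Prepare to copy data between the kernel and a guest-linear address range:
 * the range is translated page-by-page via vm_gla2gpa() and each guest page
 * is held with vm_gpa_hold(), filling up to num_copyinfo entries for use by
 * vm_copyin()/vm_copyout().  On a guest-visible translation fault, *fault is
 * set; if a page cannot be held, the holds are released and EFAULT returned.
 */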
3450 int
3451 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3452 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3453 int num_copyinfo, int *fault)
3454 {
3455 int error, idx, nused;
3456 size_t n, off, remaining;
3457 void *hva, *cookie;
3458 uint64_t gpa;
3459
3460 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3461
3462 nused = 0;
3463 remaining = len;
3464 while (remaining > 0) {
3465 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3466 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3467 if (error || *fault)
3468 return (error);
3469 off = gpa & PAGE_MASK;
3470 n = min(remaining, PAGE_SIZE - off);
3471 copyinfo[nused].gpa = gpa;
3472 copyinfo[nused].len = n;
3473 remaining -= n;
3474 gla += n;
3475 nused++;
3476 }
3477
3478 for (idx = 0; idx < nused; idx++) {
3479 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3480 copyinfo[idx].len, prot, &cookie);
3481 if (hva == NULL)
3482 break;
3483 copyinfo[idx].hva = hva;
3484 copyinfo[idx].cookie = cookie;
3485 }
3486
3487 if (idx != nused) {
3488 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3489 return (EFAULT);
3490 } else {
3491 *fault = 0;
3492 return (0);
3493 }
3494 }
3495
3496 void
3497 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3498 size_t len)
3499 {
3500 char *dst;
3501 int idx;
3502
3503 dst = kaddr;
3504 idx = 0;
3505 while (len > 0) {
3506 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3507 len -= copyinfo[idx].len;
3508 dst += copyinfo[idx].len;
3509 idx++;
3510 }
3511 }
3512
3513 void
3514 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3515 struct vm_copyinfo *copyinfo, size_t len)
3516 {
3517 const char *src;
3518 int idx;
3519
3520 src = kaddr;
3521 idx = 0;
3522 while (len > 0) {
3523 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3524 len -= copyinfo[idx].len;
3525 src += copyinfo[idx].len;
3526 idx++;
3527 }
3528 }
3529
3530 /*
3531 * Return the amount of in-use and wired memory for the VM. Since
3532  * these are global stats, only return the values for vCPU 0.
3533 */
3534 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3535 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3536
3537 static void
3538 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3539 {
3540
3541 if (vcpu == 0) {
3542 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3543 PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3544 }
3545 }
3546
3547 static void
3548 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3549 {
3550
3551 if (vcpu == 0) {
3552 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3553 PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3554 }
3555 }
3556
3557 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3558 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
3559
3560 int
3561 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3562 uint8_t bytes, uint32_t *val)
3563 {
3564 return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3565 }
3566
3567 /*
3568 * bhyve-internal interfaces to attach or detach IO port handlers.
3569 * Must be called with VM write lock held for safety.
3570 */
3571 int
3572 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3573 void **cookie)
3574 {
3575 int err;
3576 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3577 if (err == 0) {
3578 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3579 }
3580 return (err);
3581 }
3582 int
3583 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3584 void **old_arg)
3585 {
3586 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3587 int err;
3588
3589 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3590 if (err == 0) {
3591 *cookie = NULL;
3592 }
3593 return (err);
3594 }
3595
3596 /*
3597 * External driver interfaces to attach or detach IO port handlers.
3598 * Must be called with VM write lock held for safety.
3599 */
3600 int
3601 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3602 void *arg, void **cookie)
3603 {
3604 int err;
3605
3606 if (port == 0) {
3607 return (EINVAL);
3608 }
3609
3610 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3611 if (err == 0) {
3612 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3613 }
3614 return (err);
3615 }
3616 void
3617 vm_ioport_unhook(struct vm *vm, void **cookie)
3618 {
3619 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3620 ioport_handler_t old_func;
3621 void *old_arg;
3622 int err;
3623
3624 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3625
3626 /* ioport-hook-using drivers are expected to be well-behaved */
3627 VERIFY0(err);
3628 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3629
3630 *cookie = NULL;
3631 }