1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD$
29 */
30 /*
31 * This file and its contents are supplied under the terms of the
32 * Common Development and Distribution License ("CDDL"), version 1.0.
33 * You may only use this file in accordance with the terms of version
34 * 1.0 of the CDDL.
35 *
36 * A full copy of the text of the CDDL should have accompanied this
37 * source. A copy of the CDDL is also available via the Internet at
38 * http://www.illumos.org/license/CDDL.
39 *
40 * Copyright 2015 Pluribus Networks Inc.
41 * Copyright 2021 Joyent, Inc.
42 * Copyright 2021 Oxide Computer Company
43 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
44 */
45
46 #include <sys/cdefs.h>
47 __FBSDID("$FreeBSD$");
48
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/kernel.h>
52 #include <sys/module.h>
53 #include <sys/sysctl.h>
54 #include <sys/malloc.h>
55 #include <sys/pcpu.h>
56 #include <sys/lock.h>
57 #include <sys/mutex.h>
58 #include <sys/proc.h>
59 #include <sys/rwlock.h>
60 #include <sys/sched.h>
61 #include <sys/smp.h>
62 #include <sys/systm.h>
63
64 #include <machine/pcb.h>
65 #include <machine/smp.h>
66 #include <machine/md_var.h>
67 #include <x86/psl.h>
68 #include <x86/apicreg.h>
69
70 #include <machine/specialreg.h>
71 #include <machine/vmm.h>
72 #include <machine/vmm_dev.h>
73 #include <machine/vmparam.h>
74 #include <sys/vmm_instruction_emul.h>
75 #include <sys/vmm_vm.h>
76
77 #include "vmm_ioport.h"
78 #include "vmm_ktr.h"
79 #include "vmm_host.h"
80 #include "vmm_mem.h"
81 #include "vmm_util.h"
82 #include "vatpic.h"
83 #include "vatpit.h"
84 #include "vhpet.h"
85 #include "vioapic.h"
86 #include "vlapic.h"
87 #include "vpmtmr.h"
88 #include "vrtc.h"
89 #include "vmm_stat.h"
90 #include "vmm_lapic.h"
91
92 #include "io/ppt.h"
93 #include "io/iommu.h"
94
95 struct vlapic;
96
97 /*
98 * Initialization:
99 * (a) allocated when vcpu is created
100 * (i) initialized when vcpu is created and when it is reinitialized
101 * (o) initialized the first time the vcpu is created
102 * (x) initialized before use
103 */
104 struct vcpu {
105 /* (o) protects state, run_state, hostcpu, sipi_vector */
106 struct mtx mtx;
107
108 enum vcpu_state state; /* (o) vcpu state */
109 enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */
110 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */
111 kcondvar_t state_cv; /* (o) IDLE-transition cv */
112 int hostcpu; /* (o) vcpu's current host cpu */
113 int lastloccpu; /* (o) last host cpu localized to */
114 int reqidle; /* (i) request vcpu to idle */
115 struct vlapic *vlapic; /* (i) APIC device model */
116 enum x2apic_state x2apic_state; /* (i) APIC mode */
117 uint64_t exitintinfo; /* (i) events pending at VM exit */
118 int nmi_pending; /* (i) NMI pending */
119 int extint_pending; /* (i) INTR pending */
120 int exception_pending; /* (i) exception pending */
121 int exc_vector; /* (x) exception collateral */
122 int exc_errcode_valid;
123 uint32_t exc_errcode;
124 uint8_t sipi_vector; /* (i) SIPI vector */
125 struct savefpu *guestfpu; /* (a,i) guest fpu state */
126 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
127 void *stats; /* (a,i) statistics */
128 struct vm_exit exitinfo; /* (x) exit reason and collateral */
129 uint64_t nextrip; /* (x) next instruction to execute */
130 struct vie *vie_ctx; /* (x) instruction emulation context */
131 uint64_t tsc_offset; /* (x) offset from host TSC */
132
133 enum vcpu_ustate ustate; /* (i) microstate for the vcpu */
134 hrtime_t ustate_when; /* (i) time of last ustate change */
135 uint64_t ustate_total[VU_MAX]; /* (o) total time spent in ustates */
136 };
137
138 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
139 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
140 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
141 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
142 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
143
144 struct mem_seg {
145 size_t len;
146 bool sysmem;
147 struct vm_object *object;
148 };
149 #define VM_MAX_MEMSEGS 4
150
151 struct mem_map {
152 vm_paddr_t gpa;
153 size_t len;
154 vm_ooffset_t segoff;
155 int segid;
156 int prot;
157 int flags;
158 };
159 #define VM_MAX_MEMMAPS 8
160
161 /*
162 * Initialization:
163 * (o) initialized the first time the VM is created
164 * (i) initialized when VM is created and when it is reinitialized
165 * (x) initialized before use
166 */
167 struct vm {
168 void *cookie; /* (i) cpu-specific data */
169 void *iommu; /* (x) iommu-specific data */
170 struct vhpet *vhpet; /* (i) virtual HPET */
171 struct vioapic *vioapic; /* (i) virtual ioapic */
172 struct vatpic *vatpic; /* (i) virtual atpic */
173 struct vatpit *vatpit; /* (i) virtual atpit */
174 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
175 struct vrtc *vrtc; /* (o) virtual RTC */
176 volatile cpuset_t active_cpus; /* (i) active vcpus */
177 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */
178 int suspend; /* (i) stop VM execution */
179 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
180 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
181 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
182 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
183 struct vmspace *vmspace; /* (o) guest's address space */
184 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
185 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
186 /* The following describe the vm cpu topology */
187 uint16_t sockets; /* (o) num of sockets */
188 uint16_t cores; /* (o) num of cores/socket */
189 uint16_t threads; /* (o) num of threads/core */
190 uint16_t maxcpus; /* (o) max pluggable cpus */
191 uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */
192 size_t arc_resv; /* # of pages taken from ARC */
193
194 struct ioport_config ioports; /* (o) ioport handling */
195 };
196
197 static int vmm_initialized;
198
200 static void
201 nullop_panic(void)
202 {
203 panic("null vmm operation call");
204 }
205
206 /* Do not allow use of an un-set `ops` to do anything but panic */
207 static struct vmm_ops vmm_ops_null = {
208 .init = (vmm_init_func_t)nullop_panic,
209 .cleanup = (vmm_cleanup_func_t)nullop_panic,
210 .resume = (vmm_resume_func_t)nullop_panic,
211 .vminit = (vmi_init_func_t)nullop_panic,
212 .vmrun = (vmi_run_func_t)nullop_panic,
213 .vmcleanup = (vmi_cleanup_func_t)nullop_panic,
214 .vmgetreg = (vmi_get_register_t)nullop_panic,
215 .vmsetreg = (vmi_set_register_t)nullop_panic,
216 .vmgetdesc = (vmi_get_desc_t)nullop_panic,
217 .vmsetdesc = (vmi_set_desc_t)nullop_panic,
218 .vmgetcap = (vmi_get_cap_t)nullop_panic,
219 .vmsetcap = (vmi_set_cap_t)nullop_panic,
220 .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic,
221 .vmspace_free = (vmi_vmspace_free)nullop_panic,
222 .vlapic_init = (vmi_vlapic_init)nullop_panic,
223 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
224 .vmsavectx = (vmi_savectx)nullop_panic,
225 .vmrestorectx = (vmi_restorectx)nullop_panic,
226 };
227
228 static struct vmm_ops *ops = &vmm_ops_null;
229
230 #define VMM_INIT(num) ((*ops->init)(num))
231 #define VMM_CLEANUP() ((*ops->cleanup)())
232 #define VMM_RESUME() ((*ops->resume)())
233
234 #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
235 #define VMRUN(vmi, vcpu, rip, pmap) \
236 ((*ops->vmrun)(vmi, vcpu, rip, pmap))
237 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
238 #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
239 #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
240
241 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
242 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
243 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
244 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
245 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv))
246 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val))
247 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu))
248 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic))
249
250 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
251 #define fpu_stop_emulating() clts()
252
253 SDT_PROVIDER_DEFINE(vmm);
254
255 static MALLOC_DEFINE(M_VM, "vm", "vm");
256
257 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
258 NULL);
259
260 /*
261 * Halt the guest if all vcpus are executing a HLT instruction with
262 * interrupts disabled.
263 */
264 static int halt_detection_enabled = 1;
265
266 /* IPI vector used for vcpu notifications */
267 static int vmm_ipinum;
268
269 /* Trap into hypervisor on all guest exceptions and reflect them back */
270 static int trace_guest_exceptions;
271
272 static void vm_free_memmap(struct vm *vm, int ident);
273 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
274 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
275 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
276 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
277
278 extern int arc_virt_machine_reserve(size_t);
279 extern void arc_virt_machine_release(size_t);
280
281 /* Flags for vtc_status */
282 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */
283 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */
284
285 typedef struct vm_thread_ctx {
286 struct vm *vtc_vm;
287 int vtc_vcpuid;
288 uint_t vtc_status;
289 enum vcpu_ustate vtc_ustate;
290 } vm_thread_ctx_t;
291
292 #ifdef KTR
293 static const char *
294 vcpu_state2str(enum vcpu_state state)
295 {
296
297 switch (state) {
298 case VCPU_IDLE:
299 return ("idle");
300 case VCPU_FROZEN:
301 return ("frozen");
302 case VCPU_RUNNING:
303 return ("running");
304 case VCPU_SLEEPING:
305 return ("sleeping");
306 default:
307 return ("unknown");
308 }
309 }
310 #endif
311
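/*
 * Tear down per-vcpu state. The vlapic is always cleaned up; the stats,
 * guest FPU save area, and instruction emulation context are freed only
 * when the VM itself is being destroyed.
 */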
312 static void
313 vcpu_cleanup(struct vm *vm, int i, bool destroy)
314 {
315 struct vcpu *vcpu = &vm->vcpu[i];
316
317 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
318 if (destroy) {
319 vmm_stat_free(vcpu->stats);
320 fpu_save_area_free(vcpu->guestfpu);
321 vie_free(vcpu->vie_ctx);
322 vcpu->vie_ctx = NULL;
323 }
324 }
325
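/*
 * Initialize (create == true) or reinitialize a vcpu. Creation performs
 * the one-time setup (lock, FPU save area, stats, instruction emulation
 * context); both paths then reset the per-boot vcpu state.
 */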
326 static void
327 vcpu_init(struct vm *vm, int vcpu_id, bool create)
328 {
329 struct vcpu *vcpu;
330
331 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
332 ("vcpu_init: invalid vcpu %d", vcpu_id));
333
334 vcpu = &vm->vcpu[vcpu_id];
335
336 if (create) {
337 vcpu_lock_init(vcpu);
338 vcpu->state = VCPU_IDLE;
339 vcpu->hostcpu = NOCPU;
340 vcpu->lastloccpu = NOCPU;
341 vcpu->guestfpu = fpu_save_area_alloc();
342 vcpu->stats = vmm_stat_alloc();
343 vcpu->vie_ctx = vie_alloc();
344
345 vcpu->ustate = VU_INIT;
346 vcpu->ustate_when = gethrtime();
347 } else {
348 vie_reset(vcpu->vie_ctx);
349 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
350 if (vcpu->ustate != VU_INIT) {
351 vcpu_ustate_change(vm, vcpu_id, VU_INIT);
352 }
353 }
354
355 vcpu->run_state = VRS_HALT;
356 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
357 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
358 vcpu->reqidle = 0;
359 vcpu->exitintinfo = 0;
360 vcpu->nmi_pending = 0;
361 vcpu->extint_pending = 0;
362 vcpu->exception_pending = 0;
363 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
364 fpu_save_area_reset(vcpu->guestfpu);
365 vmm_stat_init(vcpu->stats);
366 vcpu->tsc_offset = 0;
367 }
368
369 int
370 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
371 {
372
373 return (trace_guest_exceptions);
374 }
375
376 struct vm_exit *
377 vm_exitinfo(struct vm *vm, int cpuid)
378 {
379 struct vcpu *vcpu;
380
381 if (cpuid < 0 || cpuid >= vm->maxcpus)
382 panic("vm_exitinfo: invalid cpuid %d", cpuid);
383
384 vcpu = &vm->vcpu[cpuid];
385
386 return (&vcpu->exitinfo);
387 }
388
389 struct vie *
390 vm_vie_ctx(struct vm *vm, int cpuid)
391 {
392 if (cpuid < 0 || cpuid >= vm->maxcpus)
393 panic("vm_vie_ctx: invalid cpuid %d", cpuid);
394
395 return (vm->vcpu[cpuid].vie_ctx);
396 }
397
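/*
 * One-time module initialization: set up host state and the vmm memory
 * subsystem, then select and initialize the Intel (VMX) or AMD (SVM)
 * backend ops.
 */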
398 static int
399 vmm_init(void)
400 {
401 int error;
402
403 vmm_host_state_init();
404
405 /* We use cpu_poke() for IPIs */
406 vmm_ipinum = 0;
407
408 error = vmm_mem_init();
409 if (error)
410 return (error);
411
412 if (vmm_is_intel())
413 ops = &vmm_ops_intel;
414 else if (vmm_is_svm())
415 ops = &vmm_ops_amd;
416 else
417 return (ENXIO);
418
419 return (VMM_INIT(vmm_ipinum));
420 }
421
422 int
423 vmm_mod_load()
424 {
425 int error;
426
427 VERIFY(vmm_initialized == 0);
428
429 error = vmm_init();
430 if (error == 0)
431 vmm_initialized = 1;
432
433 return (error);
434 }
435
436 int
437 vmm_mod_unload()
438 {
439 int error;
440
441 VERIFY(vmm_initialized == 1);
442
443 iommu_cleanup();
444 error = VMM_CLEANUP();
445 if (error)
446 return (error);
447 vmm_initialized = 0;
448
449 return (0);
450 }
451
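/*
 * Initialize VM-wide state for creation or reinitialization: the
 * backend-specific cookie, the emulated devices (the vRTC only on
 * create), ioport handling, the vcpu sets, each vcpu, and the boot-time
 * TSC offset.
 */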
452 static void
453 vm_init(struct vm *vm, bool create)
454 {
455 int i;
456
457 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
458 vm->iommu = NULL;
459 vm->vioapic = vioapic_init(vm);
460 vm->vhpet = vhpet_init(vm);
461 vm->vatpic = vatpic_init(vm);
462 vm->vatpit = vatpit_init(vm);
463 vm->vpmtmr = vpmtmr_init(vm);
464 if (create)
465 vm->vrtc = vrtc_init(vm);
466
467 vm_inout_init(vm, &vm->ioports);
468
469 CPU_ZERO(&vm->active_cpus);
470 CPU_ZERO(&vm->debug_cpus);
471
472 vm->suspend = 0;
473 CPU_ZERO(&vm->suspended_cpus);
474
475 for (i = 0; i < vm->maxcpus; i++)
476 vcpu_init(vm, i, create);
477
478 /*
479 * Configure the VM-wide TSC offset so that the call to vm_init()
480 * represents the boot time (when the TSC(s) read 0). Each vCPU will
481 * have its own offset from this, which is altered if/when the guest
482 * writes to MSR_TSC.
483 *
484 * The TSC offsetting math is all unsigned, using overflow for negative
485 * offsets. A reading of the TSC is negated to form the boot offset.
486 */
487 vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
488 }
489
490 /*
491 * The default CPU topology is a single thread per package.
492 */
493 uint_t cores_per_package = 1;
494 uint_t threads_per_core = 1;
495
496 int
497 vm_create(const char *name, struct vm **retvm)
498 {
499 struct vm *vm;
500 struct vmspace *vmspace;
501
502 /*
503 * If vmm.ko could not be successfully initialized then don't attempt
504 * to create the virtual machine.
505 */
506 if (!vmm_initialized)
507 return (ENXIO);
508
509 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
510 return (EINVAL);
511
512 vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
513 if (vmspace == NULL)
514 return (ENOMEM);
515
516 vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
517 strcpy(vm->name, name);
518 vm->vmspace = vmspace;
519
520 vm->sockets = 1;
521 vm->cores = cores_per_package; /* XXX backwards compatibility */
522 vm->threads = threads_per_core; /* XXX backwards compatibility */
523 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
524
525 vm_init(vm, true);
526
527 *retvm = vm;
528 return (0);
529 }
530
531 void
532 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
533 uint16_t *threads, uint16_t *maxcpus)
534 {
535 *sockets = vm->sockets;
536 *cores = vm->cores;
537 *threads = vm->threads;
538 *maxcpus = vm->maxcpus;
539 }
540
541 uint16_t
542 vm_get_maxcpus(struct vm *vm)
543 {
544 return (vm->maxcpus);
545 }
546
547 int
548 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
549 uint16_t threads, uint16_t maxcpus)
550 {
551 if (maxcpus != 0)
552 return (EINVAL); /* XXX remove when supported */
553 if ((sockets * cores * threads) > vm->maxcpus)
554 return (EINVAL);
555 /* XXX need to check sockets * cores * threads == vCPU, how? */
556 vm->sockets = sockets;
557 vm->cores = cores;
558 vm->threads = threads;
559 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
560 return (0);
561 }
562
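/*
 * Undo vm_init(). When destroy is false (VM reset) the system memory
 * mappings, memory segments, and vmspace are preserved; when destroy is
 * true everything is torn down, including the ARC reservation.
 */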
563 static void
564 vm_cleanup(struct vm *vm, bool destroy)
565 {
566 struct mem_map *mm;
567 int i;
568
569 ppt_unassign_all(vm);
570
571 if (vm->iommu != NULL)
572 iommu_destroy_domain(vm->iommu);
573
574 /*
575 * Devices which attach their own ioport hooks should be cleaned up
576 * first so they can tear down those registrations.
577 */
578 vpmtmr_cleanup(vm->vpmtmr);
579
580 vm_inout_cleanup(vm, &vm->ioports);
581
582 if (destroy)
583 vrtc_cleanup(vm->vrtc);
584 else
585 vrtc_reset(vm->vrtc);
586
587 vatpit_cleanup(vm->vatpit);
588 vhpet_cleanup(vm->vhpet);
589 vatpic_cleanup(vm->vatpic);
590 vioapic_cleanup(vm->vioapic);
591
592 for (i = 0; i < vm->maxcpus; i++)
593 vcpu_cleanup(vm, i, destroy);
594
595 VMCLEANUP(vm->cookie);
596
597 /*
598 * System memory is removed from the guest address space only when
599 * the VM is destroyed. This is because the mapping remains the same
600 * across VM reset.
601 *
602 * Device memory can be relocated by the guest (e.g. using PCI BARs)
603 * so those mappings are removed on a VM reset.
604 */
605 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
606 mm = &vm->mem_maps[i];
607 if (destroy || !sysmem_mapping(vm, mm)) {
608 vm_free_memmap(vm, i);
609 } else {
610 /*
611 * We need to reset the IOMMU flag so this mapping can
612 * be reused when a VM is rebooted. Since the IOMMU
613 * domain has already been destroyed we can just reset
614 * the flag here.
615 */
616 mm->flags &= ~VM_MEMMAP_F_IOMMU;
617 }
618 }
619
620 if (destroy) {
621 for (i = 0; i < VM_MAX_MEMSEGS; i++)
622 vm_free_memseg(vm, i);
623
624 VMSPACE_FREE(vm->vmspace);
625 vm->vmspace = NULL;
626
627 arc_virt_machine_release(vm->arc_resv);
628 vm->arc_resv = 0;
629 }
630 }
631
632 void
633 vm_destroy(struct vm *vm)
634 {
635 vm_cleanup(vm, true);
636 free(vm, M_VM);
637 }
638
639 int
640 vm_reinit(struct vm *vm)
641 {
642 int error;
643
644 /*
645 * A virtual machine can be reset only if all vcpus are suspended.
646 */
647 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
648 vm_cleanup(vm, false);
649 vm_init(vm, false);
650 error = 0;
651 } else {
652 error = EBUSY;
653 }
654
655 return (error);
656 }
657
658 const char *
659 vm_name(struct vm *vm)
660 {
661 return (vm->name);
662 }
663
664 int
665 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
666 {
667 vm_object_t obj;
668
669 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
670 return (ENOMEM);
671 else
672 return (0);
673 }
674
675 int
676 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
677 {
678 return (vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len));
679 }
680
681 /*
682 * Return 'true' if 'gpa' is allocated in the guest address space.
683 *
684 * This function is called in the context of a running vcpu which acts as
685 * an implicit lock on 'vm->mem_maps[]'.
686 */
687 bool
688 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
689 {
690 struct mem_map *mm;
691 int i;
692
693 #ifdef INVARIANTS
694 int hostcpu, state;
695 state = vcpu_get_state(vm, vcpuid, &hostcpu);
696 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
697 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
698 #endif
699
700 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
701 mm = &vm->mem_maps[i];
702 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
703 return (true); /* 'gpa' is sysmem or devmem */
704 }
705
706 if (ppt_is_mmio(vm, gpa))
707 return (true); /* 'gpa' is pci passthru mmio */
708
709 return (false);
710 }
711
712 int
713 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
714 {
715 struct mem_seg *seg;
716 vm_object_t obj;
717
718 #ifndef __FreeBSD__
719 extern pgcnt_t get_max_page_get(void);
720 #endif
721
722 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
723 return (EINVAL);
724
725 if (len == 0 || (len & PAGE_MASK))
726 return (EINVAL);
727
728 #ifndef __FreeBSD__
729 if (len > ptob(get_max_page_get()))
730 return (EINVAL);
731 #endif
732
733 seg = &vm->mem_segs[ident];
734 if (seg->object != NULL) {
735 if (seg->len == len && seg->sysmem == sysmem)
736 return (EEXIST);
737 else
738 return (EINVAL);
739 }
740
741 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
742 if (obj == NULL)
743 return (ENOMEM);
744
745 seg->len = len;
746 seg->object = obj;
747 seg->sysmem = sysmem;
748 return (0);
749 }
750
751 int
752 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
753 vm_object_t *objptr)
754 {
755 struct mem_seg *seg;
756
757 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
758 return (EINVAL);
759
760 seg = &vm->mem_segs[ident];
761 if (len)
762 *len = seg->len;
763 if (sysmem)
764 *sysmem = seg->sysmem;
765 if (objptr)
766 *objptr = seg->object;
767 return (0);
768 }
769
770 void
771 vm_free_memseg(struct vm *vm, int ident)
772 {
773 struct mem_seg *seg;
774
775 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
776 ("%s: invalid memseg ident %d", __func__, ident));
777
778 seg = &vm->mem_segs[ident];
779 if (seg->object != NULL) {
780 vm_object_deallocate(seg->object);
781 bzero(seg, sizeof (struct mem_seg));
782 }
783 }
784
785 int
786 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
787 size_t len, int prot, int flags)
788 {
789 struct mem_seg *seg;
790 struct mem_map *m, *map;
791 vm_ooffset_t last;
792 int i, error;
793
794 if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
795 return (EINVAL);
796
797 if (flags & ~VM_MEMMAP_F_WIRED)
798 return (EINVAL);
799
800 if (segid < 0 || segid >= VM_MAX_MEMSEGS)
801 return (EINVAL);
802
803 seg = &vm->mem_segs[segid];
804 if (seg->object == NULL)
805 return (EINVAL);
806
807 last = first + len;
808 if (first < 0 || first >= last || last > seg->len)
809 return (EINVAL);
810
811 if ((gpa | first | last) & PAGE_MASK)
812 return (EINVAL);
813
814 map = NULL;
815 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
816 m = &vm->mem_maps[i];
817 if (m->len == 0) {
818 map = m;
819 break;
820 }
821 }
822
823 if (map == NULL)
824 return (ENOSPC);
825
826 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
827 len, 0, VMFS_NO_SPACE, prot, prot, 0);
828 if (error != 0)
829 return (EFAULT);
830
831 vm_object_reference(seg->object);
832
833 if ((flags & VM_MEMMAP_F_WIRED) != 0) {
834 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
835 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
836 if (error != 0) {
837 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
838 return (EFAULT);
839 }
840 }
841
842 map->gpa = gpa;
843 map->len = len;
844 map->segoff = first;
845 map->segid = segid;
846 map->prot = prot;
847 map->flags = flags;
848 return (0);
849 }
850
851 int
852 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
853 {
854 struct mem_map *m;
855 int i;
856
857 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
858 m = &vm->mem_maps[i];
859 if (m->gpa == gpa && m->len == len &&
860 (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
861 vm_free_memmap(vm, i);
862 return (0);
863 }
864 }
865
866 return (EINVAL);
867 }
868
869 int
870 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
871 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
872 {
873 struct mem_map *mm, *mmnext;
874 int i;
875
876 mmnext = NULL;
877 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
878 mm = &vm->mem_maps[i];
879 if (mm->len == 0 || mm->gpa < *gpa)
880 continue;
881 if (mmnext == NULL || mm->gpa < mmnext->gpa)
882 mmnext = mm;
883 }
884
885 if (mmnext != NULL) {
886 *gpa = mmnext->gpa;
887 if (segid)
888 *segid = mmnext->segid;
889 if (segoff)
890 *segoff = mmnext->segoff;
891 if (len)
892 *len = mmnext->len;
893 if (prot)
894 *prot = mmnext->prot;
895 if (flags)
896 *flags = mmnext->flags;
897 return (0);
898 } else {
899 return (ENOENT);
900 }
901 }
902
903 static void
904 vm_free_memmap(struct vm *vm, int ident)
905 {
906 struct mem_map *mm;
907 int error;
908
909 mm = &vm->mem_maps[ident];
910 if (mm->len) {
911 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
912 mm->gpa + mm->len);
913 KASSERT(error == 0, ("%s: vm_map_remove error %d",
914 __func__, error));
915 bzero(mm, sizeof (struct mem_map));
916 }
917 }
918
919 static __inline bool
920 sysmem_mapping(struct vm *vm, struct mem_map *mm)
921 {
922
923 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
924 return (true);
925 else
926 return (false);
927 }
928
929 vm_paddr_t
930 vmm_sysmem_maxaddr(struct vm *vm)
931 {
932 struct mem_map *mm;
933 vm_paddr_t maxaddr;
934 int i;
935
936 maxaddr = 0;
937 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
938 mm = &vm->mem_maps[i];
939 if (sysmem_mapping(vm, mm)) {
940 if (maxaddr < mm->gpa + mm->len)
941 maxaddr = mm->gpa + mm->len;
942 }
943 }
944 return (maxaddr);
945 }
946
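/*
 * Establish (map == true) or remove the IOMMU translations covering the
 * wired system-memory mappings of the VM, one page at a time, and then
 * invalidate the affected IOMMU TLB entries.
 */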
947 static void
948 vm_iommu_modify(struct vm *vm, bool map)
949 {
950 int i, sz;
951 vm_paddr_t gpa, hpa;
952 struct mem_map *mm;
953 #ifdef __FreeBSD__
954 void *vp, *cookie, *host_domain;
955 #else
956 void *vp, *cookie, *host_domain __unused;
957 #endif
958
959 sz = PAGE_SIZE;
960 host_domain = iommu_host_domain();
961
962 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
963 mm = &vm->mem_maps[i];
964 if (!sysmem_mapping(vm, mm))
965 continue;
966
967 if (map) {
968 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
969 ("iommu map found invalid memmap %lx/%lx/%x",
970 mm->gpa, mm->len, mm->flags));
971 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
972 continue;
973 mm->flags |= VM_MEMMAP_F_IOMMU;
974 } else {
975 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
976 continue;
977 mm->flags &= ~VM_MEMMAP_F_IOMMU;
978 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
979 ("iommu unmap found invalid memmap %lx/%lx/%x",
980 mm->gpa, mm->len, mm->flags));
981 }
982
983 gpa = mm->gpa;
984 while (gpa < mm->gpa + mm->len) {
985 vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, PROT_WRITE,
986 &cookie);
987 KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
988 vm_name(vm), gpa));
989
990 vm_gpa_release(cookie);
991
992 hpa = DMAP_TO_PHYS((uintptr_t)vp);
993 if (map) {
994 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
995 #ifdef __FreeBSD__
996 iommu_remove_mapping(host_domain, hpa, sz);
997 #endif
998 } else {
999 iommu_remove_mapping(vm->iommu, gpa, sz);
1000 #ifdef __FreeBSD__
1001 iommu_create_mapping(host_domain, hpa, hpa, sz);
1002 #endif
1003 }
1004
1005 gpa += PAGE_SIZE;
1006 }
1007 }
1008
1009 /*
1010 * Invalidate the cached translations associated with the domain
1011 * from which pages were removed.
1012 */
1013 #ifdef __FreeBSD__
1014 if (map)
1015 iommu_invalidate_tlb(host_domain);
1016 else
1017 iommu_invalidate_tlb(vm->iommu);
1018 #else
1019 iommu_invalidate_tlb(vm->iommu);
1020 #endif
1021 }
1022
1023 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false)
1024 #define vm_iommu_map(vm) vm_iommu_modify((vm), true)
1025
1026 int
1027 vm_unassign_pptdev(struct vm *vm, int pptfd)
1028 {
1029 int error;
1030
1031 error = ppt_unassign_device(vm, pptfd);
1032 if (error)
1033 return (error);
1034
1035 if (ppt_assigned_devices(vm) == 0)
1036 vm_iommu_unmap(vm);
1037
1038 return (0);
1039 }
1040
1041 int
1042 vm_assign_pptdev(struct vm *vm, int pptfd)
1043 {
1044 int error;
1045 vm_paddr_t maxaddr;
1046
1047 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1048 if (ppt_assigned_devices(vm) == 0) {
1049 KASSERT(vm->iommu == NULL,
1050 ("vm_assign_pptdev: iommu must be NULL"));
1051 maxaddr = vmm_sysmem_maxaddr(vm);
1052 vm->iommu = iommu_create_domain(maxaddr);
1053 if (vm->iommu == NULL)
1054 return (ENXIO);
1055 vm_iommu_map(vm);
1056 }
1057
1058 error = ppt_assign_device(vm, pptfd);
1059 return (error);
1060 }
1061
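/*
 * Wire the host page backing guest physical address 'gpa' and return a
 * pointer to it; the 'len' bytes may not cross a page boundary. Returns
 * NULL if 'gpa' is not backed by a memory mapping. The cookie must be
 * passed to vm_gpa_release() when the caller is done with the page.
 */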
1062 void *
1063 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
1064 void **cookie)
1065 {
1066 int i, count, pageoff;
1067 struct mem_map *mm;
1068 vm_page_t m;
1069 #ifdef INVARIANTS
1070 /*
1071 * All vcpus are frozen by ioctls that modify the memory map
1072 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->mem_maps[]' stability is
1073 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
1074 */
1075 int state;
1076 KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
1077 __func__, vcpuid));
1078 for (i = 0; i < vm->maxcpus; i++) {
1079 if (vcpuid != -1 && vcpuid != i)
1080 continue;
1081 state = vcpu_get_state(vm, i, NULL);
1082 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1083 __func__, state));
1084 }
1085 #endif
1086 pageoff = gpa & PAGE_MASK;
1087 if (len > PAGE_SIZE - pageoff)
1088 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1089
1090 count = 0;
1091 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1092 mm = &vm->mem_maps[i];
1093 if (mm->len == 0) {
1094 continue;
1095 }
1096 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1097 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1098 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1099 break;
1100 }
1101 }
1102
1103 if (count == 1) {
1104 *cookie = m;
1105 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1106 } else {
1107 *cookie = NULL;
1108 return (NULL);
1109 }
1110 }
1111
1112 void
1113 vm_gpa_release(void *cookie)
1114 {
1115 vm_page_t m = cookie;
1116
1117 vm_page_unwire(m, PQ_ACTIVE);
1118 }
1119
1120 int
1121 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1122 {
1123
1124 if (vcpu < 0 || vcpu >= vm->maxcpus)
1125 return (EINVAL);
1126
1127 if (reg >= VM_REG_LAST)
1128 return (EINVAL);
1129
1130 return (VMGETREG(vm->cookie, vcpu, reg, retval));
1131 }
1132
1133 int
1134 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1135 {
1136 struct vcpu *vcpu;
1137 int error;
1138
1139 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1140 return (EINVAL);
1141
1142 if (reg >= VM_REG_LAST)
1143 return (EINVAL);
1144
1145 error = VMSETREG(vm->cookie, vcpuid, reg, val);
1146 if (error || reg != VM_REG_GUEST_RIP)
1147 return (error);
1148
1149 /* Set 'nextrip' to match the value of %rip */
1150 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1151 vcpu = &vm->vcpu[vcpuid];
1152 vcpu->nextrip = val;
1153 return (0);
1154 }
1155
1156 static bool
1157 is_descriptor_table(int reg)
1158 {
1159 switch (reg) {
1160 case VM_REG_GUEST_IDTR:
1161 case VM_REG_GUEST_GDTR:
1162 return (true);
1163 default:
1164 return (false);
1165 }
1166 }
1167
1168 static bool
1169 is_segment_register(int reg)
1170 {
1171 switch (reg) {
1172 case VM_REG_GUEST_ES:
1173 case VM_REG_GUEST_CS:
1174 case VM_REG_GUEST_SS:
1175 case VM_REG_GUEST_DS:
1176 case VM_REG_GUEST_FS:
1177 case VM_REG_GUEST_GS:
1178 case VM_REG_GUEST_TR:
1179 case VM_REG_GUEST_LDTR:
1180 return (true);
1181 default:
1182 return (false);
1183 }
1184 }
1185
1186 int
1187 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1188 {
1189
1190 if (vcpu < 0 || vcpu >= vm->maxcpus)
1191 return (EINVAL);
1192
1193 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1194 return (EINVAL);
1195
1196 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1197 }
1198
1199 int
1200 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1201 {
1202 if (vcpu < 0 || vcpu >= vm->maxcpus)
1203 return (EINVAL);
1204
1205 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1206 return (EINVAL);
1207
1208 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1209 }
1210
1211 int
1212 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1213 {
1214 struct vcpu *vcpu;
1215
1216 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1217 return (EINVAL);
1218 }
1219
1220 vcpu = &vm->vcpu[vcpuid];
1221
1222 vcpu_lock(vcpu);
1223 *state = vcpu->run_state;
1224 *sipi_vec = vcpu->sipi_vector;
1225 vcpu_unlock(vcpu);
1226
1227 return (0);
1228 }
1229
1230 int
1231 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1232 {
1233 struct vcpu *vcpu;
1234
1235 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1236 return (EINVAL);
1237 }
1238 if (!VRS_IS_VALID(state)) {
1239 return (EINVAL);
1240 }
1241
1242 vcpu = &vm->vcpu[vcpuid];
1243
1244 vcpu_lock(vcpu);
1245 vcpu->run_state = state;
1246 vcpu->sipi_vector = sipi_vec;
1247 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1248 vcpu_unlock(vcpu);
1249
1250 return (0);
1251 }
1252
1253
1254 static void
1255 restore_guest_fpustate(struct vcpu *vcpu)
1256 {
1257
1258 /* flush host state to the pcb */
1259 fpuexit(curthread);
1260
1261 /* restore guest FPU state */
1262 fpu_stop_emulating();
1263 fpurestore(vcpu->guestfpu);
1264
1265 /* restore guest XCR0 if XSAVE is enabled in the host */
1266 if (rcr4() & CR4_XSAVE)
1267 load_xcr(0, vcpu->guest_xcr0);
1268
1269 /*
1270 * The FPU is now "dirty" with the guest's state so turn on emulation
1271 * to trap any access to the FPU by the host.
1272 */
1273 fpu_start_emulating();
1274 }
1275
1276 static void
1277 save_guest_fpustate(struct vcpu *vcpu)
1278 {
1279
1280 if ((rcr0() & CR0_TS) == 0)
1281 panic("fpu emulation not enabled in host!");
1282
1283 /* save guest XCR0 and restore host XCR0 */
1284 if (rcr4() & CR4_XSAVE) {
1285 vcpu->guest_xcr0 = rxcr(0);
1286 load_xcr(0, vmm_get_host_xcr0());
1287 }
1288
1289 /* save guest FPU state */
1290 fpu_stop_emulating();
1291 fpusave(vcpu->guestfpu);
1292 /*
1293 * When the host state has been restored, we should not re-enable
1294 * CR0.TS on illumos for eager FPU.
1295 */
1296 }
1297
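/*
 * Transition a vcpu between the IDLE/FROZEN/RUNNING/SLEEPING states with
 * its lock held. Transitions driven by ioctls (from_idle == true) wait
 * for the vcpu to return to IDLE before proceeding.
 */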
1298 static int
1299 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1300 bool from_idle)
1301 {
1302 struct vcpu *vcpu;
1303 int error;
1304
1305 vcpu = &vm->vcpu[vcpuid];
1306 vcpu_assert_locked(vcpu);
1307
1308 /*
1309 * State transitions from the vmmdev_ioctl() must always begin from
1310 * the VCPU_IDLE state. This guarantees that there is only a single
1311 * ioctl() operating on a vcpu at any point.
1312 */
1313 if (from_idle) {
1314 while (vcpu->state != VCPU_IDLE) {
1315 vcpu->reqidle = 1;
1316 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1317 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1318 "idle requested", vcpu_state2str(vcpu->state));
1319 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1320 }
1321 } else {
1322 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1323 "vcpu idle state"));
1324 }
1325
1326 if (vcpu->state == VCPU_RUNNING) {
1327 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1328 "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1329 } else {
1330 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1331 "vcpu that is not running", vcpu->hostcpu));
1332 }
1333
1334 /*
1335 * The following state transitions are allowed:
1336 * IDLE -> FROZEN -> IDLE
1337 * FROZEN -> RUNNING -> FROZEN
1338 * FROZEN -> SLEEPING -> FROZEN
1339 */
1340 switch (vcpu->state) {
1341 case VCPU_IDLE:
1342 case VCPU_RUNNING:
1343 case VCPU_SLEEPING:
1344 error = (newstate != VCPU_FROZEN);
1345 break;
1346 case VCPU_FROZEN:
1347 error = (newstate == VCPU_FROZEN);
1348 break;
1349 default:
1350 error = 1;
1351 break;
1352 }
1353
1354 if (error)
1355 return (EBUSY);
1356
1357 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1358 vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1359
1360 vcpu->state = newstate;
1361 if (newstate == VCPU_RUNNING)
1362 vcpu->hostcpu = curcpu;
1363 else
1364 vcpu->hostcpu = NOCPU;
1365
1366 if (newstate == VCPU_IDLE) {
1367 cv_broadcast(&vcpu->state_cv);
1368 }
1369
1370 return (0);
1371 }
1372
1373 static void
1374 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1375 {
1376 int error;
1377
1378 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1379 panic("Error %d setting state to %d\n", error, newstate);
1380 }
1381
1382 static void
1383 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1384 {
1385 int error;
1386
1387 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1388 panic("Error %d setting state to %d", error, newstate);
1389 }
1390
1391 /*
1392 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1393 */
1394 static int
1395 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1396 {
1397 struct vcpu *vcpu;
1398 int vcpu_halted, vm_halted;
1399 bool userspace_exit = false;
1400
1401 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1402
1403 vcpu = &vm->vcpu[vcpuid];
1404 vcpu_halted = 0;
1405 vm_halted = 0;
1406
1407 vcpu_lock(vcpu);
1408 while (1) {
1409 /*
1410 * Do a final check for pending interrupts (including NMI and
1411 * INIT) before putting this thread to sleep.
1412 */
1413 if (vm_nmi_pending(vm, vcpuid))
1414 break;
1415 if (vcpu_run_state_pending(vm, vcpuid))
1416 break;
1417 if (!intr_disabled) {
1418 if (vm_extint_pending(vm, vcpuid) ||
1419 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1420 break;
1421 }
1422 }
1423
1424 /*
1425 * Also check for software events which would cause a wake-up.
1426 * This will set the appropriate exitcode directly, rather than
1427 * requiring a trip through VM_RUN().
1428 */
1429 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1430 userspace_exit = true;
1431 break;
1432 }
1433
1434 /*
1435 * Some Linux guests implement "halt" by having all vcpus
1436 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1437 * track of the vcpus that have entered this state. When all
1438 * vcpus enter the halted state the virtual machine is halted.
1439 */
1440 if (intr_disabled) {
1441 if (!vcpu_halted && halt_detection_enabled) {
1442 vcpu_halted = 1;
1443 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1444 }
1445 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1446 vm_halted = 1;
1447 break;
1448 }
1449 }
1450
1451 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1452 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1453 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1454 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1455 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1456 }
1457
1458 if (vcpu_halted)
1459 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1460
1461 vcpu_unlock(vcpu);
1462
1463 if (vm_halted)
1464 vm_suspend(vm, VM_SUSPEND_HALT);
1465
1466 return (userspace_exit ? -1 : 0);
1467 }
1468
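/*
 * Handle a nested page fault exit: first attempt accessed/dirty bit
 * emulation for read and write faults, then fall back to a full
 * vm_fault() on the guest's vmspace.
 */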
1469 static int
1470 vm_handle_paging(struct vm *vm, int vcpuid)
1471 {
1472 int rv, ftype;
1473 struct vm_map *map;
1474 struct vcpu *vcpu;
1475 struct vm_exit *vme;
1476
1477 vcpu = &vm->vcpu[vcpuid];
1478 vme = &vcpu->exitinfo;
1479
1480 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1481 __func__, vme->inst_length));
1482
1483 ftype = vme->u.paging.fault_type;
1484 KASSERT(ftype == PROT_READ ||
1485 ftype == PROT_WRITE || ftype == PROT_EXEC,
1486 ("vm_handle_paging: invalid fault_type %d", ftype));
1487
1488 if (ftype == PROT_READ || ftype == PROT_WRITE) {
1489 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1490 vme->u.paging.gpa, ftype);
1491 if (rv == 0) {
1492 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
1493 ftype == PROT_READ ? "accessed" : "dirty",
1494 vme->u.paging.gpa);
1495 goto done;
1496 }
1497 }
1498
1499 map = &vm->vmspace->vm_map;
1500 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1501
1502 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1503 "ftype = %d", rv, vme->u.paging.gpa, ftype);
1504
1505 if (rv != 0)
1506 return (EFAULT);
1507 done:
1508 return (0);
1509 }
1510
1511 int
1512 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1513 int rsize)
1514 {
1515 int err = ESRCH;
1516
1517 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1518 err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1519 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1520 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1521 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1522 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1523 }
1524
1525 return (err);
1526 }
1527
1528 int
1529 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1530 int wsize)
1531 {
1532 int err = ESRCH;
1533
1534 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1535 err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1536 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1537 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1538 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1539 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1540 }
1541
1542 return (err);
1543 }
1544
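/*
 * Handle an MMIO-emulation exit: fetch and decode the faulting
 * instruction, then attempt in-kernel emulation of the access, punting
 * out to userspace when no in-kernel device handles it.
 */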
1545 static int
1546 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1547 {
1548 struct vie *vie;
1549 struct vcpu *vcpu;
1550 struct vm_exit *vme;
1551 uint64_t inst_addr;
1552 int error, fault, cs_d;
1553
1554 vcpu = &vm->vcpu[vcpuid];
1555 vme = &vcpu->exitinfo;
1556 vie = vcpu->vie_ctx;
1557
1558 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1559 __func__, vme->inst_length));
1560
1561 inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1562 cs_d = vme->u.mmio_emul.cs_d;
1563
1564 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1565 vme->u.mmio_emul.gpa);
1566
1567 /* Fetch the faulting instruction */
1568 if (vie_needs_fetch(vie)) {
1569 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1570 &fault);
1571 if (error != 0) {
1572 return (error);
1573 } else if (fault) {
1574 /*
1575 * If a fault during instruction fetch was encountered,
1576 * it will have asserted that the appropriate exception
1577 * be injected at next entry.
1578 * No further work is required.
1579 */
1580 return (0);
1581 }
1582 }
1583
1584 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1585 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1586 inst_addr);
1587 /* Dump (unrecognized) instruction bytes in userspace */
1588 vie_fallback_exitinfo(vie, vme);
1589 return (-1);
1590 }
1591 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1592 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1593 /* Decoded GLA does not match GLA from VM exit state */
1594 vie_fallback_exitinfo(vie, vme);
1595 return (-1);
1596 }
1597
1598 repeat:
1599 error = vie_emulate_mmio(vie, vm, vcpuid);
1600 if (error < 0) {
1601 /*
1602 * MMIO not handled by any of the in-kernel-emulated devices, so
1603 * make a trip out to userspace for it.
1604 */
1605 vie_exitinfo(vie, vme);
1606 } else if (error == EAGAIN) {
1607 /*
1608 * Continue emulating the rep-prefixed instruction, which has
1609 * not completed its iterations.
1610 *
1611 * In case this can be emulated in-kernel and has a high
1612 * repetition count (causing a tight spin), it should be
1613 * deferential to yield conditions.
1614 */
1615 if (!vcpu_should_yield(vm, vcpuid)) {
1616 goto repeat;
1617 } else {
1618 /*
1619 * Defer to the contending load by making a trip to
1620 * userspace with a no-op (BOGUS) exit reason.
1621 */
1622 vie_reset(vie);
1623 vme->exitcode = VM_EXITCODE_BOGUS;
1624 return (-1);
1625 }
1626 } else if (error == 0) {
1627 /* Update %rip now that instruction has been emulated */
1628 vie_advance_pc(vie, &vcpu->nextrip);
1629 }
1630 return (error);
1631 }
1632
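/*
 * Handle an in/out exit: attempt in-kernel emulation of the port access,
 * exiting to userspace if no in-kernel device claims the port or if the
 * emulation fails outright.
 */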
1633 static int
1634 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1635 {
1636 struct vcpu *vcpu;
1637 struct vie *vie;
1638 int err;
1639
1640 vcpu = &vm->vcpu[vcpuid];
1641 vie = vcpu->vie_ctx;
1642
1643 repeat:
1644 err = vie_emulate_inout(vie, vm, vcpuid);
1645
1646 if (err < 0) {
1647 /*
1648 * In/out not handled by any of the in-kernel-emulated devices,
1649 * so make a trip out to userspace for it.
1650 */
1651 vie_exitinfo(vie, vme);
1652 return (err);
1653 } else if (err == EAGAIN) {
1654 /*
1655 * Continue emulating the rep-prefixed ins/outs instruction, which
1656 * has not completed its iterations.
1657 *
1658 * In case this can be emulated in-kernel and has a high
1659 * repetition count (causing a tight spin), it should be
1660 * deferential to yield conditions.
1661 */
1662 if (!vcpu_should_yield(vm, vcpuid)) {
1663 goto repeat;
1664 } else {
1665 /*
1666 * Defer to the contending load by making a trip to
1667 * userspace with a no-op (BOGUS) exit reason.
1668 */
1669 vie_reset(vie);
1670 vme->exitcode = VM_EXITCODE_BOGUS;
1671 return (-1);
1672 }
1673 } else if (err != 0) {
1674 /* Emulation failure. Bail all the way out to userspace. */
1675 vme->exitcode = VM_EXITCODE_INST_EMUL;
1676 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1677 return (-1);
1678 }
1679
1680 vie_advance_pc(vie, &vcpu->nextrip);
1681 return (0);
1682 }
1683
1684 static int
1685 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1686 {
1687 struct vie *vie;
1688 struct vcpu *vcpu;
1689 struct vm_exit *vme;
1690 uint64_t cs_base;
1691 int error, fault, cs_d;
1692
1693 vcpu = &vm->vcpu[vcpuid];
1694 vme = &vcpu->exitinfo;
1695 vie = vcpu->vie_ctx;
1696
1697 vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1698
1699 /* Fetch the faulting instruction */
1700 ASSERT(vie_needs_fetch(vie));
1701 error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1702 &fault);
1703 if (error != 0) {
1704 return (error);
1705 } else if (fault) {
1706 /*
1707 * If a fault during instruction fetch was encountered, it will
1708 * have asserted that the appropriate exception be injected at
1709 * next entry. No further work is required.
1710 */
1711 return (0);
1712 }
1713
1714 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1715 /* Dump (unrecognized) instruction bytes in userspace */
1716 vie_fallback_exitinfo(vie, vme);
1717 return (-1);
1718 }
1719
1720 error = vie_emulate_other(vie, vm, vcpuid);
1721 if (error != 0) {
1722 /*
1723 * Instruction emulation was unable to complete successfully, so
1724 * kick it out to userspace for handling.
1725 */
1726 vie_fallback_exitinfo(vie, vme);
1727 } else {
1728 /* Update %rip now that instruction has been emulated */
1729 vie_advance_pc(vie, &vcpu->nextrip);
1730 }
1731 return (error);
1732 }
1733
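/*
 * Mark this vcpu as suspended and wait (with a bail-out for an exiting
 * process) until all active vcpus have done the same, then wake the
 * other sleepers and return to userspace.
 */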
1734 static int
1735 vm_handle_suspend(struct vm *vm, int vcpuid)
1736 {
1737 int i;
1738 struct vcpu *vcpu;
1739
1740 vcpu = &vm->vcpu[vcpuid];
1741
1742 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1743
1744 /*
1745 * Wait until all 'active_cpus' have suspended themselves.
1746 */
1747 vcpu_lock(vcpu);
1748 vcpu_ustate_change(vm, vcpuid, VU_INIT);
1749 while (1) {
1750 int rc;
1751
1752 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1753 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1754 break;
1755 }
1756
1757 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1758 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1759 TR_CLOCK_TICK);
1760 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1761
1762 /*
1763 * If the userspace process driving the instance is killed, any
1764 * vCPUs yet to be marked suspended (because they are not
1765 * VM_RUN-ing in the kernel presently) will never reach that
1766 * state.
1767 *
1768 * To avoid vm_handle_suspend() getting stuck in the kernel
1769 * waiting for those vCPUs, offer a bail-out even though it
1770 * means returning without all vCPUs in a suspended state.
1771 */
1772 if (rc <= 0) {
1773 if ((curproc->p_flag & SEXITING) != 0) {
1774 break;
1775 }
1776 }
1777 }
1778 vcpu_unlock(vcpu);
1779
1780 /*
1781 * Wakeup the other sleeping vcpus and return to userspace.
1782 */
1783 for (i = 0; i < vm->maxcpus; i++) {
1784 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1785 vcpu_notify_event(vm, i);
1786 }
1787 }
1788
1789 return (-1);
1790 }
1791
1792 static int
1793 vm_handle_reqidle(struct vm *vm, int vcpuid)
1794 {
1795 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1796
1797 vcpu_lock(vcpu);
1798 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1799 vcpu->reqidle = 0;
1800 vcpu_unlock(vcpu);
1801 return (-1);
1802 }
1803
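/*
 * Service pending INIT and SIPI requests for a vcpu, sleeping until it
 * reaches the running state or a software event forces a trip out to
 * userspace.
 */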
1804 static int
1805 vm_handle_run_state(struct vm *vm, int vcpuid)
1806 {
1807 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1808 bool handled = false;
1809
1810 vcpu_lock(vcpu);
1811 while (1) {
1812 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1813 vcpu_unlock(vcpu);
1814 VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1815 vcpu_lock(vcpu);
1816
1817 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1818 vcpu->run_state |= VRS_INIT;
1819 }
1820
1821 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1822 (VRS_INIT | VRS_PEND_SIPI)) {
1823 const uint8_t vector = vcpu->sipi_vector;
1824
1825 vcpu_unlock(vcpu);
1826 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1827 vcpu_lock(vcpu);
1828
1829 vcpu->run_state &= ~VRS_PEND_SIPI;
1830 vcpu->run_state |= VRS_RUN;
1831 }
1832
1833 /*
1834 * If the vCPU is now in the running state, there is no need to
1835 * wait for anything prior to re-entry.
1836 */
1837 if ((vcpu->run_state & VRS_RUN) != 0) {
1838 handled = true;
1839 break;
1840 }
1841
1842 /*
1843 * Also check for software events which would cause a wake-up.
1844 * This will set the appropriate exitcode directly, rather than
1845 * requiring a trip through VM_RUN().
1846 */
1847 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1848 break;
1849 }
1850
1851 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1852 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1853 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1854 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1855 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1856 }
1857 vcpu_unlock(vcpu);
1858
1859 return (handled ? 0 : -1);
1860 }
1861
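/*
 * Handle MSR reads which were not serviced in guest context, emulating a
 * small set of MSRs in-kernel and punting the rest out to userspace.
 */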
1862 static int
1863 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1864 {
1865 const uint32_t code = vme->u.msr.code;
1866 uint64_t val = 0;
1867
1868 switch (code) {
1869 case MSR_MCG_CAP:
1870 case MSR_MCG_STATUS:
1871 val = 0;
1872 break;
1873
1874 case MSR_MTRRcap:
1875 case MSR_MTRRdefType:
1876 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1877 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1878 case MSR_MTRR64kBase:
1879 val = 0;
1880 break;
1881
1882 case MSR_TSC:
1883 /*
1884 * In all likelihood, this should always be handled in guest
1885 * context by VMX/SVM rather than taking an exit. (Both VMX and
1886 * SVM pass through read-only access to MSR_TSC to the guest.)
1887 *
1888 * No physical offset is requested of vcpu_tsc_offset() since
1889 * rdtsc_offset() takes care of that instead.
1890 */
1891 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
1892 break;
1893
1894 default:
1895 /*
1896 * Anything not handled at this point will be kicked out to
1897 * userspace for attempted processing there.
1898 */
1899 return (-1);
1900 }
1901
1902 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
1903 val & 0xffffffff));
1904 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
1905 val >> 32));
1906 return (0);
1907 }
1908
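/*
 * Handle MSR writes which were not serviced in guest context. Writes to
 * the TSC are folded into the per-vCPU offset; unrecognized MSRs are
 * punted out to userspace.
 */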
1909 static int
1910 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1911 {
1912 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1913 const uint32_t code = vme->u.msr.code;
1914 const uint64_t val = vme->u.msr.wval;
1915
1916 switch (code) {
1917 case MSR_MCG_CAP:
1918 case MSR_MCG_STATUS:
1919 /* Ignore writes */
1920 break;
1921
1922 case MSR_MTRRcap:
1923 vm_inject_gp(vm, vcpuid);
1924 break;
1925 case MSR_MTRRdefType:
1926 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1927 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1928 case MSR_MTRR64kBase:
1929 /* Ignore writes */
1930 break;
1931
1932 case MSR_TSC:
1933 /*
1934 * The effect of writing the TSC MSR is that a subsequent read
1935 * of the TSC would report that value written (plus any time
1936 * elapsed between the write and the read). The guest TSC value
1937 * is calculated from a global offset for the guest (which
1938 * effectively makes its TSC read 0 at guest boot) and a
1939 * per-vCPU offset to handle these writes to the MSR.
1940 *
1941 * To calculate that per-vCPU offset, we can work backwards from
1942 * the guest value at the time of write:
1943 *
1944 * value = host TSC + VM boot offset + vCPU offset
1945 *
1946 * so therefore:
1947 *
1948 * value - host TSC - VM boot offset = vCPU offset
1949 */
1950 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
1951 break;
1952
1953 default:
1954 /*
1955 * Anything not handled at this point will be kicked out to
1956 * userspace for attempted processing there.
1957 */
1958 return (-1);
1959 }
1960
1961 return (0);
1962 }
1963
1964 int
1965 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1966 {
1967 int i;
1968
1969 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1970 return (EINVAL);
1971
1972 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1973 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1974 vm->suspend, how);
1975 return (EALREADY);
1976 }
1977
1978 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1979
1980 /*
1981 * Notify all active vcpus that they are now suspended.
1982 */
1983 for (i = 0; i < vm->maxcpus; i++) {
1984 if (CPU_ISSET(i, &vm->active_cpus))
1985 vcpu_notify_event(vm, i);
1986 }
1987
1988 return (0);
1989 }
1990
1991 void
1992 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1993 {
1994 struct vm_exit *vmexit;
1995
1996 vmexit = vm_exitinfo(vm, vcpuid);
1997 vmexit->rip = rip;
1998 vmexit->inst_length = 0;
1999 vmexit->exitcode = VM_EXITCODE_RUN_STATE;
2000 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
2001 }
2002
2003 /*
2004 * Some vmm resources, such as the lapic, may have CPU-specific resources
2005 * allocated to them which would benefit from migration onto the host CPU which
2006 * is processing the vcpu state.
2007 */
2008 static void
2009 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2010 {
2011 /*
2012 * Localizing cyclic resources requires acquisition of cpu_lock, and
2013 * doing so with kpreempt disabled is a recipe for deadlock disaster.
2014 */
2015 VERIFY(curthread->t_preempt == 0);
2016
2017 /*
2018 * Do not bother with localization if this vCPU is about to return to
2019 * the host CPU it was last localized to.
2020 */
2021 if (vcpu->lastloccpu == curcpu)
2022 return;
2023
2024 /*
2025 * Localize system-wide resources to the primary boot vCPU. While any
2026 * of the other vCPUs may access them, it keeps the potential interrupt
2027 * footprint constrained to CPUs involved with this instance.
2028 */
2029 if (vcpu == &vm->vcpu[0]) {
2030 vhpet_localize_resources(vm->vhpet);
2031 vrtc_localize_resources(vm->vrtc);
2032 vatpit_localize_resources(vm->vatpit);
2033 }
2034
2035 vlapic_localize_resources(vcpu->vlapic);
2036
2037 vcpu->lastloccpu = curcpu;
2038 }
2039
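/*
 * Context-switch callback run when a thread hosting a vCPU goes off-cpu:
 * notify the backend, update microstate accounting, and save any loaded
 * guest FPU state.
 */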
2040 static void
2041 vmm_savectx(void *arg)
2042 {
2043 vm_thread_ctx_t *vtc = arg;
2044 struct vm *vm = vtc->vtc_vm;
2045 const int vcpuid = vtc->vtc_vcpuid;
2046
2047 if (ops->vmsavectx != NULL) {
2048 ops->vmsavectx(vm->cookie, vcpuid);
2049 }
2050
2051 /*
2052 * Account for going off-cpu, unless the vCPU is idled, where being
2053 * off-cpu is the explicit point.
2054 */
2055 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2056 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2057 vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2058 }
2059
2060 /*
2061 * If the CPU holds the restored guest FPU state, save it and restore
2062 * the host FPU state before this thread goes off-cpu.
2063 */
2064 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2065 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2066
2067 save_guest_fpustate(vcpu);
2068 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2069 }
2070 }
2071
2072 static void
2073 vmm_restorectx(void *arg)
2074 {
2075 vm_thread_ctx_t *vtc = arg;
2076 struct vm *vm = vtc->vtc_vm;
2077 const int vcpuid = vtc->vtc_vcpuid;
2078
2079 /* Complete microstate accounting for vCPU being off-cpu */
2080 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2081 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2082 }
2083
2084 /*
2085 * When coming back on-cpu, only restore the guest FPU status if the
2086 * thread is in a context marked as requiring it. This should be rare,
2087 * occurring only when a future logic error results in a voluntary
2088 * sleep during the VMRUN critical section.
2089 *
2090 * The common case will result in elision of the guest FPU state
2091 * restoration, deferring that action until it is clearly necessary
2092 * during vm_run.
2093 */
2094 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2095 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2096 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2097
2098 restore_guest_fpustate(vcpu);
2099 vtc->vtc_status |= VTCS_FPU_RESTORED;
2100 }
2101
2102 if (ops->vmrestorectx != NULL) {
2103 ops->vmrestorectx(vm->cookie, vcpuid);
2104 }
2106 }
2107
2108 /*
2109  * A context being freed (e.g. via removectx()) may still hold guest state to save.
2110 */
2111 static void
2112 vmm_freectx(void *arg, int isexec)
2113 {
2114 vmm_savectx(arg);
2115 }
2116
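/*
 * Apply the entry directives supplied by userspace before (re)entering guest
 * context: either nothing, discarding any in-progress instruction emulation,
 * or fulfilling a previously-reported MMIO or in/out request and completing
 * its emulation.
 */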
2117 static int
2118 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2119 struct vm_exit *vme)
2120 {
2121 struct vcpu *vcpu;
2122 struct vie *vie;
2123 int err;
2124
2125 vcpu = &vm->vcpu[vcpuid];
2126 vie = vcpu->vie_ctx;
2127 err = 0;
2128
2129 switch (entry->cmd) {
2130 case VEC_DEFAULT:
2131 return (0);
2132 case VEC_DISCARD_INSTR:
2133 vie_reset(vie);
2134 return (0);
2135 case VEC_FULFILL_MMIO:
2136 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2137 if (err == 0) {
2138 err = vie_emulate_mmio(vie, vm, vcpuid);
2139 if (err == 0) {
2140 vie_advance_pc(vie, &vcpu->nextrip);
2141 } else if (err < 0) {
2142 vie_exitinfo(vie, vme);
2143 } else if (err == EAGAIN) {
2144 /*
2145 * Clear the instruction emulation state in
2146 * order to re-enter VM context and continue
2147 * this 'rep <instruction>'
2148 */
2149 vie_reset(vie);
2150 err = 0;
2151 }
2152 }
2153 break;
2154 case VEC_FULFILL_INOUT:
2155 err = vie_fulfill_inout(vie, &entry->u.inout);
2156 if (err == 0) {
2157 err = vie_emulate_inout(vie, vm, vcpuid);
2158 if (err == 0) {
2159 vie_advance_pc(vie, &vcpu->nextrip);
2160 } else if (err < 0) {
2161 vie_exitinfo(vie, vme);
2162 } else if (err == EAGAIN) {
2163 /*
2164 * Clear the instruction emulation state in
2165 * order to re-enter VM context and continue
2166 * this 'rep ins/outs'
2167 */
2168 vie_reset(vie);
2169 err = 0;
2170 }
2171 }
2172 break;
2173 default:
2174 return (EINVAL);
2175 }
2176 return (err);
2177 }
2178
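/*
 * Before looping back into guest context, verify that no instruction-emulation
 * state is still awaiting fulfillment from userspace; if it is, convert it
 * into exit information and bail back out.
 */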
2179 static int
2180 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2181 {
2182 struct vie *vie;
2183
2184 vie = vm->vcpu[vcpuid].vie_ctx;
2185
2186 if (vie_pending(vie)) {
2187 /*
2188 * Userspace has not fulfilled the pending needs of the
2189 * instruction emulation, so bail back out.
2190 */
2191 vie_exitinfo(vie, vme);
2192 return (-1);
2193 }
2194
2195 return (0);
2196 }
2197
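/*
 * Top-level run loop for a vCPU: apply userspace entry actions, bind to a host
 * CPU and localize instance resources, enter guest context via the backend
 * VMRUN, and handle the resulting exit in-kernel where possible, otherwise
 * returning it to userspace.  The thread context ops installed here keep guest
 * FPU state and microstate accounting consistent across context switches.
 */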
2198 int
2199 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2200 {
2201 int error;
2202 struct vcpu *vcpu;
2203 struct vm_exit *vme;
2204 bool intr_disabled;
2205 pmap_t pmap;
2206 vm_thread_ctx_t vtc;
2207 int affinity_type = CPU_CURRENT;
2208
2209 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2210 return (EINVAL);
2211 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2212 return (EINVAL);
2213 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2214 return (EINVAL);
2215
2216 pmap = vmspace_pmap(vm->vmspace);
2217 vcpu = &vm->vcpu[vcpuid];
2218 vme = &vcpu->exitinfo;
2219
2220 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2221
2222 vtc.vtc_vm = vm;
2223 vtc.vtc_vcpuid = vcpuid;
2224 vtc.vtc_status = 0;
2225 installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2226 NULL, vmm_freectx, NULL);
2227
2228 error = vm_entry_actions(vm, vcpuid, entry, vme);
2229 if (error != 0) {
2230 goto exit;
2231 }
2232
2233 restart:
2234 error = vm_loop_checks(vm, vcpuid, vme);
2235 if (error != 0) {
2236 goto exit;
2237 }
2238
2239 thread_affinity_set(curthread, affinity_type);
2240 /*
2241 * Resource localization should happen after the CPU affinity for the
2242 * thread has been set to ensure that access from restricted contexts,
2243 * such as VMX-accelerated APIC operations, can occur without inducing
2244 * cyclic cross-calls.
2245 *
2246 * This must be done prior to disabling kpreempt via critical_enter().
2247 */
2248 vm_localize_resources(vm, vcpu);
2249 affinity_type = CPU_CURRENT;
2250 critical_enter();
2251
2252 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2253 ("vm_run: absurd pm_active"));
2254
2255 /* Force a trip through update_sregs to reload %fs/%gs and friends */
2256 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2257
2258 if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2259 restore_guest_fpustate(vcpu);
2260 vtc.vtc_status |= VTCS_FPU_RESTORED;
2261 }
2262 vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2263
2264 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2265 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2266 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2267
2268 /*
2269 * Once clear of the delicate contexts comprising the VM_RUN handler,
2270 * thread CPU affinity can be loosened while other processing occurs.
2271 */
2272 vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2273 thread_affinity_clear(curthread);
2274 critical_exit();
2275
2276 if (error != 0) {
2277 /* Communicate out any error from VMRUN() above */
2278 goto exit;
2279 }
2280
2281 vcpu->nextrip = vme->rip + vme->inst_length;
2282 switch (vme->exitcode) {
2283 case VM_EXITCODE_REQIDLE:
2284 error = vm_handle_reqidle(vm, vcpuid);
2285 break;
2286 case VM_EXITCODE_RUN_STATE:
2287 error = vm_handle_run_state(vm, vcpuid);
2288 break;
2289 case VM_EXITCODE_SUSPENDED:
2290 error = vm_handle_suspend(vm, vcpuid);
2291 break;
2292 case VM_EXITCODE_IOAPIC_EOI:
2293 vioapic_process_eoi(vm, vcpuid,
2294 vme->u.ioapic_eoi.vector);
2295 break;
2296 case VM_EXITCODE_HLT:
2297 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2298 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2299 break;
2300 case VM_EXITCODE_PAGING:
2301 error = vm_handle_paging(vm, vcpuid);
2302 break;
2303 case VM_EXITCODE_MMIO_EMUL:
2304 error = vm_handle_mmio_emul(vm, vcpuid);
2305 break;
2306 case VM_EXITCODE_INOUT:
2307 error = vm_handle_inout(vm, vcpuid, vme);
2308 break;
2309 case VM_EXITCODE_INST_EMUL:
2310 error = vm_handle_inst_emul(vm, vcpuid);
2311 break;
2312 case VM_EXITCODE_MONITOR:
2313 case VM_EXITCODE_MWAIT:
2314 case VM_EXITCODE_VMINSN:
2315 vm_inject_ud(vm, vcpuid);
2316 break;
2317 case VM_EXITCODE_RDMSR:
2318 error = vm_handle_rdmsr(vm, vcpuid, vme);
2319 break;
2320 case VM_EXITCODE_WRMSR:
2321 error = vm_handle_wrmsr(vm, vcpuid, vme);
2322 break;
2323 case VM_EXITCODE_HT:
2324 affinity_type = CPU_BEST;
2325 break;
2326 case VM_EXITCODE_MTRAP:
2327 vm_suspend_cpu(vm, vcpuid);
2328 error = -1;
2329 break;
2330 default:
2331 /* handled in userland */
2332 error = -1;
2333 break;
2334 }
2335
2336 if (error == 0) {
2337 /* VM exit conditions handled in-kernel, continue running */
2338 goto restart;
2339 }
2340
2341 exit:
2342 removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2343 NULL, vmm_freectx);
2344
2345 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2346
2347 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2348 return (error);
2349 }
2350
2351 int
2352 vm_restart_instruction(void *arg, int vcpuid)
2353 {
2354 struct vm *vm;
2355 struct vcpu *vcpu;
2356 enum vcpu_state state;
2357 uint64_t rip;
2358 int error;
2359
2360 vm = arg;
2361 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2362 return (EINVAL);
2363
2364 vcpu = &vm->vcpu[vcpuid];
2365 state = vcpu_get_state(vm, vcpuid, NULL);
2366 if (state == VCPU_RUNNING) {
2367 /*
2368 * When a vcpu is "running" the next instruction is determined
2369 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2370 * Thus setting 'inst_length' to zero will cause the current
2371 * instruction to be restarted.
2372 */
2373 vcpu->exitinfo.inst_length = 0;
2374 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2375 "setting inst_length to zero", vcpu->exitinfo.rip);
2376 } else if (state == VCPU_FROZEN) {
2377 /*
2378 * When a vcpu is "frozen" it is outside the critical section
2379 * around VMRUN() and 'nextrip' points to the next instruction.
2380 * Thus instruction restart is achieved by setting 'nextrip'
2381 * to the vcpu's %rip.
2382 */
2383 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2384 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2385 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2386 "nextrip from %lx to %lx", vcpu->nextrip, rip);
2387 vcpu->nextrip = rip;
2388 } else {
2389 panic("%s: invalid state %d", __func__, state);
2390 }
2391 return (0);
2392 }
2393
2394 int
2395 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2396 {
2397 struct vcpu *vcpu;
2398 int type, vector;
2399
2400 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2401 return (EINVAL);
2402
2403 vcpu = &vm->vcpu[vcpuid];
2404
2405 if (info & VM_INTINFO_VALID) {
2406 type = info & VM_INTINFO_TYPE;
2407 vector = info & 0xff;
2408 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2409 return (EINVAL);
2410 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2411 return (EINVAL);
2412 if (info & VM_INTINFO_RSVD)
2413 return (EINVAL);
2414 } else {
2415 info = 0;
2416 }
2417 VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2418 vcpu->exitintinfo = info;
2419 return (0);
2420 }
2421
2422 enum exc_class {
2423 EXC_BENIGN,
2424 EXC_CONTRIBUTORY,
2425 EXC_PAGEFAULT
2426 };
2427
2428 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */
2429
2430 static enum exc_class
2431 exception_class(uint64_t info)
2432 {
2433 int type, vector;
2434
2435 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2436 type = info & VM_INTINFO_TYPE;
2437 vector = info & 0xff;
2438
2439 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2440 switch (type) {
2441 case VM_INTINFO_HWINTR:
2442 case VM_INTINFO_SWINTR:
2443 case VM_INTINFO_NMI:
2444 return (EXC_BENIGN);
2445 default:
2446 /*
2447 * Hardware exception.
2448 *
2449 * SVM and VT-x use identical type values to represent NMI,
2450 * hardware interrupt and software interrupt.
2451 *
2452 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2453 * for exceptions except #BP and #OF. #BP and #OF use a type
2454 * value of '5' or '6'. Therefore we don't check for explicit
2455 * values of 'type' to classify 'intinfo' into a hardware
2456 * exception.
2457 */
2458 break;
2459 }
2460
2461 switch (vector) {
2462 case IDT_PF:
2463 case IDT_VE:
2464 return (EXC_PAGEFAULT);
2465 case IDT_DE:
2466 case IDT_TS:
2467 case IDT_NP:
2468 case IDT_SS:
2469 case IDT_GP:
2470 return (EXC_CONTRIBUTORY);
2471 default:
2472 return (EXC_BENIGN);
2473 }
2474 }
2475
2476 static int
2477 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2478 uint64_t *retinfo)
2479 {
2480 enum exc_class exc1, exc2;
2481 int type1, vector1;
2482
2483 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2484 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2485
2486 /*
2487 * If an exception occurs while attempting to call the double-fault
2488 * handler the processor enters shutdown mode (aka triple fault).
2489 */
2490 type1 = info1 & VM_INTINFO_TYPE;
2491 vector1 = info1 & 0xff;
2492 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2493 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2494 info1, info2);
2495 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2496 *retinfo = 0;
2497 return (0);
2498 }
2499
2500 /*
2501 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2502 */
2503 exc1 = exception_class(info1);
2504 exc2 = exception_class(info2);
2505 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2506 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2507 /* Convert nested fault into a double fault. */
2508 *retinfo = IDT_DF;
2509 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2510 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2511 } else {
2512 /* Handle exceptions serially */
2513 *retinfo = info2;
2514 }
2515 return (1);
2516 }
2517
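/*
 * Compose a VM_INTINFO-format value describing the vCPU's pending exception:
 * vector in the low byte, valid and hardware-exception type bits set, and the
 * error code (when valid) in the upper 32 bits.
 */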
2518 static uint64_t
2519 vcpu_exception_intinfo(struct vcpu *vcpu)
2520 {
2521 uint64_t info = 0;
2522
2523 if (vcpu->exception_pending) {
2524 info = vcpu->exc_vector & 0xff;
2525 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2526 if (vcpu->exc_errcode_valid) {
2527 info |= VM_INTINFO_DEL_ERRCODE;
2528 info |= (uint64_t)vcpu->exc_errcode << 32;
2529 }
2530 }
2531 return (info);
2532 }
2533
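/*
 * Gather the event-injection information for the next VM entry, combining any
 * exit-time intinfo with a pending exception.  When both are present they are
 * reconciled via nested_fault(), which may escalate to a double (or triple)
 * fault.  Returns non-zero if *retinfo holds a valid event to inject.
 */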
2534 int
2535 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2536 {
2537 struct vcpu *vcpu;
2538 uint64_t info1, info2;
2539 int valid;
2540
2541 KASSERT(vcpuid >= 0 &&
2542 vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2543
2544 vcpu = &vm->vcpu[vcpuid];
2545
2546 info1 = vcpu->exitintinfo;
2547 vcpu->exitintinfo = 0;
2548
2549 info2 = 0;
2550 if (vcpu->exception_pending) {
2551 info2 = vcpu_exception_intinfo(vcpu);
2552 vcpu->exception_pending = 0;
2553 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2554 vcpu->exc_vector, info2);
2555 }
2556
2557 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2558 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2559 } else if (info1 & VM_INTINFO_VALID) {
2560 *retinfo = info1;
2561 valid = 1;
2562 } else if (info2 & VM_INTINFO_VALID) {
2563 *retinfo = info2;
2564 valid = 1;
2565 } else {
2566 valid = 0;
2567 }
2568
2569 if (valid) {
2570 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2571 "retinfo(%lx)", __func__, info1, info2, *retinfo);
2572 }
2573
2574 return (valid);
2575 }
2576
2577 int
2578 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2579 {
2580 struct vcpu *vcpu;
2581
2582 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2583 return (EINVAL);
2584
2585 vcpu = &vm->vcpu[vcpuid];
2586 *info1 = vcpu->exitintinfo;
2587 *info2 = vcpu_exception_intinfo(vcpu);
2588 return (0);
2589 }
2590
2591 int
2592 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2593 uint32_t errcode, int restart_instruction)
2594 {
2595 struct vcpu *vcpu;
2596 uint64_t regval;
2597 int error;
2598
2599 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2600 return (EINVAL);
2601
2602 if (vector < 0 || vector >= 32)
2603 return (EINVAL);
2604
2605 /*
2606 * NMIs (which bear an exception vector of 2) are to be injected via
2607 * their own specialized path using vm_inject_nmi().
2608 */
2609 if (vector == 2) {
2610 return (EINVAL);
2611 }
2612
2613 /*
2614 * A double fault exception should never be injected directly into
2615 * the guest. It is a derived exception that results from specific
2616 * combinations of nested faults.
2617 */
2618 if (vector == IDT_DF)
2619 return (EINVAL);
2620
2621 vcpu = &vm->vcpu[vcpuid];
2622
2623 if (vcpu->exception_pending) {
2624 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2625 "pending exception %d", vector, vcpu->exc_vector);
2626 return (EBUSY);
2627 }
2628
2629 if (errcode_valid) {
2630 /*
2631 * Exceptions don't deliver an error code in real mode.
2632 */
2633 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2634 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2635 if (!(regval & CR0_PE))
2636 errcode_valid = 0;
2637 }
2638
2639 /*
2640 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2641 *
2642 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2643 * one instruction or incurs an exception.
2644 */
2645 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2646 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2647 __func__, error));
2648
2649 if (restart_instruction)
2650 vm_restart_instruction(vm, vcpuid);
2651
2652 vcpu->exception_pending = 1;
2653 vcpu->exc_vector = vector;
2654 vcpu->exc_errcode = errcode;
2655 vcpu->exc_errcode_valid = errcode_valid;
2656 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2657 return (0);
2658 }
2659
2660 void
2661 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2662 int errcode)
2663 {
2664 int error;
2665
2666 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2667 errcode, 1);
2668 KASSERT(error == 0, ("vm_inject_exception error %d", error));
2669 }
2670
2671 void
2672 vm_inject_ud(struct vm *vm, int vcpuid)
2673 {
2674 vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2675 }
2676
2677 void
2678 vm_inject_gp(struct vm *vm, int vcpuid)
2679 {
2680 vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2681 }
2682
2683 void
2684 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2685 {
2686 vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2687 }
2688
2689 void
2690 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2691 {
2692 vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2693 }
2694
2695 void
2696 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2697 {
2698 int error;
2699
2700 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2701 error_code, cr2);
2702
2703 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2704 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2705
2706 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2707 }
2708
2709 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2710
2711 int
2712 vm_inject_nmi(struct vm *vm, int vcpuid)
2713 {
2714 struct vcpu *vcpu;
2715
2716 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2717 return (EINVAL);
2718
2719 vcpu = &vm->vcpu[vcpuid];
2720
2721 vcpu->nmi_pending = 1;
2722 vcpu_notify_event(vm, vcpuid);
2723 return (0);
2724 }
2725
2726 int
2727 vm_nmi_pending(struct vm *vm, int vcpuid)
2728 {
2729 struct vcpu *vcpu;
2730
2731 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2732 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2733
2734 vcpu = &vm->vcpu[vcpuid];
2735
2736 return (vcpu->nmi_pending);
2737 }
2738
2739 void
2740 vm_nmi_clear(struct vm *vm, int vcpuid)
2741 {
2742 struct vcpu *vcpu;
2743
2744 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2745 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2746
2747 vcpu = &vm->vcpu[vcpuid];
2748
2749 if (vcpu->nmi_pending == 0)
2750 panic("vm_nmi_clear: inconsistent nmi_pending state");
2751
2752 vcpu->nmi_pending = 0;
2753 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2754 }
2755
2756 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2757
2758 int
2759 vm_inject_extint(struct vm *vm, int vcpuid)
2760 {
2761 struct vcpu *vcpu;
2762
2763 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2764 return (EINVAL);
2765
2766 vcpu = &vm->vcpu[vcpuid];
2767
2768 vcpu->extint_pending = 1;
2769 vcpu_notify_event(vm, vcpuid);
2770 return (0);
2771 }
2772
2773 int
2774 vm_extint_pending(struct vm *vm, int vcpuid)
2775 {
2776 struct vcpu *vcpu;
2777
2778 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2779 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2780
2781 vcpu = &vm->vcpu[vcpuid];
2782
2783 return (vcpu->extint_pending);
2784 }
2785
2786 void
2787 vm_extint_clear(struct vm *vm, int vcpuid)
2788 {
2789 struct vcpu *vcpu;
2790
2791 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2792 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2793
2794 vcpu = &vm->vcpu[vcpuid];
2795
2796 if (vcpu->extint_pending == 0)
2797 panic("vm_extint_clear: inconsistent extint_pending state");
2798
2799 vcpu->extint_pending = 0;
2800 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2801 }
2802
2803 int
2804 vm_inject_init(struct vm *vm, int vcpuid)
2805 {
2806 struct vcpu *vcpu;
2807
2808 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2809 return (EINVAL);
2810
2811 vcpu = &vm->vcpu[vcpuid];
2812 vcpu_lock(vcpu);
2813 vcpu->run_state |= VRS_PEND_INIT;
2814 /*
2815 	 * As part of queuing the INIT request, clear any pending SIPI. A SIPI
2816 	 * issued prior to this INIT is not meant to survive the reset which the
2817 	 * vCPU will undergo, and letting it linger would risk it being mistaken
2818 	 * for a subsequent (post-INIT) SIPI request.
2819 */
2820 vcpu->run_state &= ~VRS_PEND_SIPI;
2821 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2822
2823 vcpu_unlock(vcpu);
2824 return (0);
2825 }
2826
2827 int
2828 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2829 {
2830 struct vcpu *vcpu;
2831
2832 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2833 return (EINVAL);
2834
2835 vcpu = &vm->vcpu[vcpuid];
2836 vcpu_lock(vcpu);
2837 vcpu->run_state |= VRS_PEND_SIPI;
2838 vcpu->sipi_vector = vector;
2839 /* SIPI is only actionable if the CPU is waiting in INIT state */
2840 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2841 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2842 }
2843 vcpu_unlock(vcpu);
2844 return (0);
2845 }
2846
2847 bool
2848 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2849 {
2850 struct vcpu *vcpu;
2851
2852 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2853 vcpu = &vm->vcpu[vcpuid];
2854
2855 /* Of interest: vCPU not in running state or with pending INIT */
2856 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2857 }
2858
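/*
 * Reset the architectural state of a vCPU to its power-on values.  When
 * 'init_only' is set, state which is preserved across an INIT IPI (such as
 * the FPU state and %xcr0) is left untouched.
 */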
2859 int
2860 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2861 {
2862 struct seg_desc desc;
2863 const enum vm_reg_name clear_regs[] = {
2864 VM_REG_GUEST_CR2,
2865 VM_REG_GUEST_CR3,
2866 VM_REG_GUEST_CR4,
2867 VM_REG_GUEST_RAX,
2868 VM_REG_GUEST_RBX,
2869 VM_REG_GUEST_RCX,
2870 VM_REG_GUEST_RSI,
2871 VM_REG_GUEST_RDI,
2872 VM_REG_GUEST_RBP,
2873 VM_REG_GUEST_RSP,
2874 VM_REG_GUEST_R8,
2875 VM_REG_GUEST_R9,
2876 VM_REG_GUEST_R10,
2877 VM_REG_GUEST_R11,
2878 VM_REG_GUEST_R12,
2879 VM_REG_GUEST_R13,
2880 VM_REG_GUEST_R14,
2881 VM_REG_GUEST_R15,
2882 VM_REG_GUEST_DR0,
2883 VM_REG_GUEST_DR1,
2884 VM_REG_GUEST_DR2,
2885 VM_REG_GUEST_DR3,
2886 VM_REG_GUEST_EFER,
2887 };
2888 const enum vm_reg_name data_segs[] = {
2889 VM_REG_GUEST_SS,
2890 VM_REG_GUEST_DS,
2891 VM_REG_GUEST_ES,
2892 VM_REG_GUEST_FS,
2893 VM_REG_GUEST_GS,
2894 };
2895 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2896
2897 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2898 return (EINVAL);
2899
2900 for (uint_t i = 0; i < nitems(clear_regs); i++) {
2901 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2902 }
2903
2904 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2905 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2906 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2907
2908 /*
2909 * The prescribed contents of %rdx differ slightly between the Intel and
2910 	 * AMD architectural definitions: the former expects only the Extended
2911 	 * Model in bits 16-19, while the latter expects the full Family, Model,
2912 	 * and Stepping to be there. Common boot ROMs appear to disregard this
2913 	 * anyway, so we stick with a compromise value similar to what is
2914 * spelled out in the Intel SDM.
2915 */
2916 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2917
2918 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2919 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2920
2921 /* CS: Present, R/W, Accessed */
2922 desc.access = 0x0093;
2923 desc.base = 0xffff0000;
2924 desc.limit = 0xffff;
2925 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2926 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2927
2928 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2929 desc.access = 0x0093;
2930 desc.base = 0;
2931 desc.limit = 0xffff;
2932 for (uint_t i = 0; i < nitems(data_segs); i++) {
2933 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2934 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2935 }
2936
2937 /* GDTR, IDTR */
2938 desc.base = 0;
2939 desc.limit = 0xffff;
2940 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2941 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2942
2943 /* LDTR: Present, LDT */
2944 desc.access = 0x0082;
2945 desc.base = 0;
2946 desc.limit = 0xffff;
2947 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2948 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2949
2950 /* TR: Present, 32-bit TSS */
2951 desc.access = 0x008b;
2952 desc.base = 0;
2953 desc.limit = 0xffff;
2954 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2955 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2956
2957 vlapic_reset(vm_lapic(vm, vcpuid));
2958
2959 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2960
2961 vcpu->exitintinfo = 0;
2962 vcpu->exception_pending = 0;
2963 vcpu->nmi_pending = 0;
2964 vcpu->extint_pending = 0;
2965
2966 /*
2967 * A CPU reset caused by power-on or system reset clears more state than
2968 	 * one which is triggered by an INIT IPI.
2969 */
2970 if (!init_only) {
2971 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2972 fpu_save_area_reset(vcpu->guestfpu);
2973
2974 /* XXX: clear MSRs and other pieces */
2975 }
2976
2977 return (0);
2978 }
2979
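/*
 * Point a vCPU at its startup trampoline in response to a SIPI: %cs is loaded
 * with selector (vector << 8) and a matching base of (vector << 12), with
 * %rip zeroed so execution begins at the start of that segment.
 */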
2980 static int
2981 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2982 {
2983 struct seg_desc desc;
2984
2985 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2986 return (EINVAL);
2987
2988 /* CS: Present, R/W, Accessed */
2989 desc.access = 0x0093;
2990 desc.base = (uint64_t)vector << 12;
2991 desc.limit = 0xffff;
2992 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2993 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2994 (uint64_t)vector << 8));
2995
2996 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2997
2998 return (0);
2999 }
3000
3001 int
3002 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
3003 {
3004 if (vcpu < 0 || vcpu >= vm->maxcpus)
3005 return (EINVAL);
3006
3007 if (type < 0 || type >= VM_CAP_MAX)
3008 return (EINVAL);
3009
3010 return (VMGETCAP(vm->cookie, vcpu, type, retval));
3011 }
3012
3013 int
3014 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3015 {
3016 if (vcpu < 0 || vcpu >= vm->maxcpus)
3017 return (EINVAL);
3018
3019 if (type < 0 || type >= VM_CAP_MAX)
3020 return (EINVAL);
3021
3022 return (VMSETCAP(vm->cookie, vcpu, type, val));
3023 }
3024
3025 struct vlapic *
3026 vm_lapic(struct vm *vm, int cpu)
3027 {
3028 return (vm->vcpu[cpu].vlapic);
3029 }
3030
3031 struct vioapic *
3032 vm_ioapic(struct vm *vm)
3033 {
3034
3035 return (vm->vioapic);
3036 }
3037
3038 struct vhpet *
3039 vm_hpet(struct vm *vm)
3040 {
3041
3042 return (vm->vhpet);
3043 }
3044
3045 void *
3046 vm_iommu_domain(struct vm *vm)
3047 {
3048
3049 return (vm->iommu);
3050 }
3051
3052 int
3053 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3054 bool from_idle)
3055 {
3056 int error;
3057 struct vcpu *vcpu;
3058
3059 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3060 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3061
3062 vcpu = &vm->vcpu[vcpuid];
3063
3064 vcpu_lock(vcpu);
3065 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3066 vcpu_unlock(vcpu);
3067
3068 return (error);
3069 }
3070
3071 enum vcpu_state
3072 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3073 {
3074 struct vcpu *vcpu;
3075 enum vcpu_state state;
3076
3077 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3078 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3079
3080 vcpu = &vm->vcpu[vcpuid];
3081
3082 vcpu_lock(vcpu);
3083 state = vcpu->state;
3084 if (hostcpu != NULL)
3085 *hostcpu = vcpu->hostcpu;
3086 vcpu_unlock(vcpu);
3087
3088 return (state);
3089 }
3090
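/*
 * Compute the TSC offset to apply for a vCPU: the VM-wide boot offset plus any
 * per-vCPU adjustment, optionally including the delta for the physical CPU on
 * which the caller is currently running.
 */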
3091 uint64_t
3092 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3093 {
3094 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3095
3096 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3097
3098 if (phys_adj) {
3099 /* Include any offset for the current physical CPU too */
3100 extern hrtime_t tsc_gethrtime_tick_delta(void);
3101 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3102 }
3103
3104 return (vcpu_off);
3105 }
3106
3107 int
3108 vm_activate_cpu(struct vm *vm, int vcpuid)
3109 {
3110
3111 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3112 return (EINVAL);
3113
3114 if (CPU_ISSET(vcpuid, &vm->active_cpus))
3115 return (EBUSY);
3116
3117 VCPU_CTR0(vm, vcpuid, "activated");
3118 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3119 return (0);
3120 }
3121
3122 int
3123 vm_suspend_cpu(struct vm *vm, int vcpuid)
3124 {
3125 int i;
3126
3127 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3128 return (EINVAL);
3129
3130 if (vcpuid == -1) {
3131 vm->debug_cpus = vm->active_cpus;
3132 for (i = 0; i < vm->maxcpus; i++) {
3133 if (CPU_ISSET(i, &vm->active_cpus))
3134 vcpu_notify_event(vm, i);
3135 }
3136 } else {
3137 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3138 return (EINVAL);
3139
3140 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3141 vcpu_notify_event(vm, vcpuid);
3142 }
3143 return (0);
3144 }
3145
3146 int
3147 vm_resume_cpu(struct vm *vm, int vcpuid)
3148 {
3149
3150 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3151 return (EINVAL);
3152
3153 if (vcpuid == -1) {
3154 CPU_ZERO(&vm->debug_cpus);
3155 } else {
3156 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3157 return (EINVAL);
3158
3159 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3160 }
3161 return (0);
3162 }
3163
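/*
 * Check for conditions which require a vCPU to bail out to userspace (or back
 * around the run loop) rather than enter or remain in guest context: VM
 * suspension, a reqidle request, a pending yield, or the vCPU being held for
 * debug.  The vm_exit structure is filled in accordingly.
 */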
3164 static bool
3165 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3166 uint64_t entry_rip)
3167 {
3168 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3169 struct vm_exit *vme = &vcpu->exitinfo;
3170 bool bail = false;
3171
3172 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3173
3174 if (vm->suspend) {
3175 if (on_entry) {
3176 VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3177 vm->suspend < VM_SUSPEND_LAST);
3178
3179 vme->exitcode = VM_EXITCODE_SUSPENDED;
3180 vme->u.suspended.how = vm->suspend;
3181 } else {
3182 /*
3183 * Handling VM suspend is complicated, so if that
3184 * condition is detected outside of VM-entry itself,
3185 * just emit a BOGUS exitcode so we take a lap to pick
3186 * up the event during an entry and are directed into
3187 * the vm_handle_suspend() logic.
3188 */
3189 vme->exitcode = VM_EXITCODE_BOGUS;
3190 }
3191 bail = true;
3192 }
3193 if (vcpu->reqidle) {
3194 vme->exitcode = VM_EXITCODE_REQIDLE;
3195 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3196
3197 if (!on_entry) {
3198 /*
3199 * A reqidle request detected outside of VM-entry can be
3200 * handled directly by clearing the request (and taking
3201 * a lap to userspace).
3202 */
3203 vcpu_assert_locked(vcpu);
3204 vcpu->reqidle = 0;
3205 }
3206 bail = true;
3207 }
3208 if (vcpu_should_yield(vm, vcpuid)) {
3209 vme->exitcode = VM_EXITCODE_BOGUS;
3210 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3211 bail = true;
3212 }
3213 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3214 vme->exitcode = VM_EXITCODE_DEBUG;
3215 bail = true;
3216 }
3217
3218 if (bail) {
3219 if (on_entry) {
3220 /*
3221 * If bailing out during VM-entry, the current %rip must
3222 * be recorded in the exitinfo.
3223 */
3224 vme->rip = entry_rip;
3225 }
3226 vme->inst_length = 0;
3227 }
3228 return (bail);
3229 }
3230
3231 static bool
3232 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3233 {
3234 /*
3235 	 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3236 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3237 * structure, and we would only modify the exitcode.
3238 */
3239 return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3240 }
3241
3242 bool
3243 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3244 {
3245 /*
3246 * Bail-out checks done as part of VM entry require an updated %rip to
3247 * populate the vm_exit struct if any of the conditions of interest are
3248 * matched in the check.
3249 */
3250 return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3251 }
3252
3253 cpuset_t
3254 vm_active_cpus(struct vm *vm)
3255 {
3256
3257 return (vm->active_cpus);
3258 }
3259
3260 cpuset_t
3261 vm_debug_cpus(struct vm *vm)
3262 {
3263
3264 return (vm->debug_cpus);
3265 }
3266
3267 cpuset_t
3268 vm_suspended_cpus(struct vm *vm)
3269 {
3270
3271 return (vm->suspended_cpus);
3272 }
3273
3274 void *
3275 vcpu_stats(struct vm *vm, int vcpuid)
3276 {
3277
3278 return (vm->vcpu[vcpuid].stats);
3279 }
3280
3281 int
3282 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3283 {
3284 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3285 return (EINVAL);
3286
3287 *state = vm->vcpu[vcpuid].x2apic_state;
3288
3289 return (0);
3290 }
3291
3292 int
3293 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3294 {
3295 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3296 return (EINVAL);
3297
3298 if (state >= X2APIC_STATE_LAST)
3299 return (EINVAL);
3300
3301 vm->vcpu[vcpuid].x2apic_state = state;
3302
3303 vlapic_set_x2apic_state(vm, vcpuid, state);
3304
3305 return (0);
3306 }
3307
3308 /*
3309 * This function is called to ensure that a vcpu "sees" a pending event
3310 * as soon as possible:
3311 * - If the vcpu thread is sleeping then it is woken up.
3312 * - If the vcpu is running on a different host_cpu then an IPI will be directed
3313 * to the host_cpu to cause the vcpu to trap into the hypervisor.
3314 */
3315 static void
3316 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3317 {
3318 int hostcpu;
3319
3320 	ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3321
3322 hostcpu = vcpu->hostcpu;
3323 if (vcpu->state == VCPU_RUNNING) {
3324 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3325 if (hostcpu != curcpu) {
3326 if (ntype == VCPU_NOTIFY_APIC) {
3327 vlapic_post_intr(vcpu->vlapic, hostcpu,
3328 vmm_ipinum);
3329 } else {
3330 ipi_cpu(hostcpu, vmm_ipinum);
3331 }
3332 } else {
3333 /*
3334 * If the 'vcpu' is running on 'curcpu' then it must
3335 * be sending a notification to itself (e.g. SELF_IPI).
3336 * The pending event will be picked up when the vcpu
3337 * transitions back to guest context.
3338 */
3339 }
3340 } else {
3341 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3342 "with hostcpu %d", vcpu->state, hostcpu));
3343 if (vcpu->state == VCPU_SLEEPING) {
3344 cv_signal(&vcpu->vcpu_cv);
3345 }
3346 }
3347 }
3348
3349 void
3350 vcpu_notify_event(struct vm *vm, int vcpuid)
3351 {
3352 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3353
3354 vcpu_lock(vcpu);
3355 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3356 vcpu_unlock(vcpu);
3357 }
3358
3359 void
3360 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3361 {
3362 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3363
3364 if (ntype == VCPU_NOTIFY_NONE) {
3365 return;
3366 }
3367
3368 vcpu_lock(vcpu);
3369 vcpu_notify_event_locked(vcpu, ntype);
3370 vcpu_unlock(vcpu);
3371 }
3372
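/*
 * Transition a vCPU to a new microstate, charging the time since the last
 * transition to the state it is leaving.
 */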
3373 void
3374 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3375 {
3376 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3377 hrtime_t now = gethrtime();
3378
3379 ASSERT3U(ustate, !=, vcpu->ustate);
3380 ASSERT3S(ustate, <, VU_MAX);
3381 ASSERT3S(ustate, >=, VU_INIT);
3382
3383 hrtime_t delta = now - vcpu->ustate_when;
3384 vcpu->ustate_total[vcpu->ustate] += delta;
3385
3386 membar_producer();
3387
3388 vcpu->ustate_when = now;
3389 vcpu->ustate = ustate;
3390 }
3391
3392 struct vmspace *
3393 vm_get_vmspace(struct vm *vm)
3394 {
3395
3396 return (vm->vmspace);
3397 }
3398
3399 int
3400 vm_apicid2vcpuid(struct vm *vm, int apicid)
3401 {
3402 /*
3403 * XXX apic id is assumed to be numerically identical to vcpu id
3404 */
3405 return (apicid);
3406 }
3407
3408 struct vatpic *
3409 vm_atpic(struct vm *vm)
3410 {
3411 return (vm->vatpic);
3412 }
3413
3414 struct vatpit *
3415 vm_atpit(struct vm *vm)
3416 {
3417 return (vm->vatpit);
3418 }
3419
3420 struct vpmtmr *
3421 vm_pmtmr(struct vm *vm)
3422 {
3423
3424 return (vm->vpmtmr);
3425 }
3426
3427 struct vrtc *
3428 vm_rtc(struct vm *vm)
3429 {
3430
3431 return (vm->vrtc);
3432 }
3433
3434 enum vm_reg_name
3435 vm_segment_name(int seg)
3436 {
3437 static enum vm_reg_name seg_names[] = {
3438 VM_REG_GUEST_ES,
3439 VM_REG_GUEST_CS,
3440 VM_REG_GUEST_SS,
3441 VM_REG_GUEST_DS,
3442 VM_REG_GUEST_FS,
3443 VM_REG_GUEST_GS
3444 };
3445
3446 KASSERT(seg >= 0 && seg < nitems(seg_names),
3447 ("%s: invalid segment encoding %d", __func__, seg));
3448 return (seg_names[seg]);
3449 }
3450
3451 void
3452 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3453 int num_copyinfo)
3454 {
3455 int idx;
3456
3457 for (idx = 0; idx < num_copyinfo; idx++) {
3458 if (copyinfo[idx].cookie != NULL)
3459 vm_gpa_release(copyinfo[idx].cookie);
3460 }
3461 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3462 }
3463
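/*
 * Translate a guest-linear range into a series of page-sized chunks, holding
 * each backing page so it can later be copied to or from with vm_copyin() and
 * vm_copyout().  On success *fault is cleared; translation faults are
 * reported via *fault, and a failure to hold pages returns EFAULT.
 */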
3464 int
3465 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3466 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3467 int num_copyinfo, int *fault)
3468 {
3469 int error, idx, nused;
3470 size_t n, off, remaining;
3471 void *hva, *cookie;
3472 uint64_t gpa;
3473
3474 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3475
3476 nused = 0;
3477 remaining = len;
3478 while (remaining > 0) {
3479 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3480 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3481 if (error || *fault)
3482 return (error);
3483 off = gpa & PAGE_MASK;
3484 n = min(remaining, PAGE_SIZE - off);
3485 copyinfo[nused].gpa = gpa;
3486 copyinfo[nused].len = n;
3487 remaining -= n;
3488 gla += n;
3489 nused++;
3490 }
3491
3492 for (idx = 0; idx < nused; idx++) {
3493 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3494 copyinfo[idx].len, prot, &cookie);
3495 if (hva == NULL)
3496 break;
3497 copyinfo[idx].hva = hva;
3498 copyinfo[idx].cookie = cookie;
3499 }
3500
3501 if (idx != nused) {
3502 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3503 return (EFAULT);
3504 } else {
3505 *fault = 0;
3506 return (0);
3507 }
3508 }
3509
3510 void
3511 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3512 size_t len)
3513 {
3514 char *dst;
3515 int idx;
3516
3517 dst = kaddr;
3518 idx = 0;
3519 while (len > 0) {
3520 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3521 len -= copyinfo[idx].len;
3522 dst += copyinfo[idx].len;
3523 idx++;
3524 }
3525 }
3526
3527 void
3528 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3529 struct vm_copyinfo *copyinfo, size_t len)
3530 {
3531 const char *src;
3532 int idx;
3533
3534 src = kaddr;
3535 idx = 0;
3536 while (len > 0) {
3537 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3538 len -= copyinfo[idx].len;
3539 src += copyinfo[idx].len;
3540 idx++;
3541 }
3542 }
3543
3544 /*
3545 * Return the amount of in-use and wired memory for the VM. Since
3546  * these are global stats, only return the values for vCPU 0.
3547 */
3548 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3549 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3550
3551 static void
3552 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3553 {
3554
3555 if (vcpu == 0) {
3556 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3557 PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3558 }
3559 }
3560
3561 static void
3562 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3563 {
3564
3565 if (vcpu == 0) {
3566 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3567 PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3568 }
3569 }
3570
3571 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3572 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
3573
3574 int
3575 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3576 uint8_t bytes, uint32_t *val)
3577 {
3578 return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3579 }
3580
3581 /*
3582 * bhyve-internal interfaces to attach or detach IO port handlers.
3583 * Must be called with VM write lock held for safety.
3584 */
3585 int
3586 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3587 void **cookie)
3588 {
3589 int err;
3590 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3591 if (err == 0) {
3592 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3593 }
3594 return (err);
3595 }
3596 int
3597 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3598 void **old_arg)
3599 {
3600 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3601 int err;
3602
3603 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3604 if (err == 0) {
3605 *cookie = NULL;
3606 }
3607 return (err);
3608 }
3609
3610 /*
3611 * External driver interfaces to attach or detach IO port handlers.
3612 * Must be called with VM write lock held for safety.
3613 */
3614 int
3615 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3616 void *arg, void **cookie)
3617 {
3618 int err;
3619
3620 if (port == 0) {
3621 return (EINVAL);
3622 }
3623
3624 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3625 if (err == 0) {
3626 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3627 }
3628 return (err);
3629 }
3630 void
3631 vm_ioport_unhook(struct vm *vm, void **cookie)
3632 {
3633 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3634 ioport_handler_t old_func;
3635 void *old_arg;
3636 int err;
3637
3638 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3639
3640 /* ioport-hook-using drivers are expected to be well-behaved */
3641 VERIFY0(err);
3642 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3643
3644 *cookie = NULL;
3645 }
3646
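/*
 * kstat update handler which publishes the per-vCPU microstate totals
 * (init/run/idle/emulation/scheduled time) for the vCPU named in the kstat
 * data.
 */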
3647 int
3648 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3649 {
3650 struct vm *vm = ksp->ks_private;
3651 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3652 const int vcpuid = vvk->vvk_vcpu.value.ui32;
3653 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3654
3655 ASSERT3U(vcpuid, <, VM_MAXCPU);
3656
3657 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3658 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3659 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3660 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3661 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3662 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3663
3664 return (0);
3665 }
3666
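/*
 * Reserve 'len' bytes (rounded up to whole pages) against the ARC on behalf of
 * this VM, tracking the running total of pages reserved by the instance.
 */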
3667 int
3668 vm_arc_resv(struct vm *vm, uint64_t len)
3669 {
3670 /* Since we already have the compat macros included, we use those */
3671 size_t pages = (size_t)roundup2(len, PAGE_SIZE) >> PAGE_SHIFT;
3672 int err = 0;
3673
3674 err = arc_virt_machine_reserve(pages);
3675 if (err != 0)
3676 return (err);
3677
3678 vm->arc_resv += pages;
3679 return (0);
3680 }