1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD$
29 */
30 /*
31 * This file and its contents are supplied under the terms of the
32 * Common Development and Distribution License ("CDDL"), version 1.0.
33 * You may only use this file in accordance with the terms of version
34 * 1.0 of the CDDL.
35 *
36 * A full copy of the text of the CDDL should have accompanied this
37 * source. A copy of the CDDL is also available via the Internet at
38 * http://www.illumos.org/license/CDDL.
39 *
40 * Copyright 2015 Pluribus Networks Inc.
41 * Copyright 2018 Joyent, Inc.
42 * Copyright 2021 Oxide Computer Company
43 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
44 */
45
46 #include <sys/cdefs.h>
47 __FBSDID("$FreeBSD$");
48
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/kernel.h>
52 #include <sys/module.h>
53 #include <sys/sysctl.h>
54 #include <sys/malloc.h>
55 #include <sys/pcpu.h>
56 #include <sys/lock.h>
57 #include <sys/mutex.h>
58 #include <sys/proc.h>
59 #include <sys/rwlock.h>
60 #include <sys/sched.h>
61 #include <sys/smp.h>
63
64 #include <machine/pcb.h>
65 #include <machine/smp.h>
66 #include <machine/md_var.h>
67 #include <x86/psl.h>
68 #include <x86/apicreg.h>
69
70 #include <machine/specialreg.h>
71 #include <machine/vmm.h>
72 #include <machine/vmm_dev.h>
73 #include <machine/vmparam.h>
74 #include <sys/vmm_instruction_emul.h>
75 #include <sys/vmm_vm.h>
76
77 #include "vmm_ioport.h"
78 #include "vmm_ktr.h"
79 #include "vmm_host.h"
80 #include "vmm_mem.h"
81 #include "vmm_util.h"
82 #include "vatpic.h"
83 #include "vatpit.h"
84 #include "vhpet.h"
85 #include "vioapic.h"
86 #include "vlapic.h"
87 #include "vpmtmr.h"
88 #include "vrtc.h"
89 #include "vmm_stat.h"
90 #include "vmm_lapic.h"
91
92 #include "io/ppt.h"
93 #include "io/iommu.h"
94
95 struct vlapic;
96
97 /*
98 * Initialization:
99 * (a) allocated when vcpu is created
100 * (i) initialized when vcpu is created and when it is reinitialized
101 * (o) initialized the first time the vcpu is created
102 * (x) initialized before use
103 */
104 struct vcpu {
105 /* (o) protects state, run_state, hostcpu, sipi_vector */
106 struct mtx mtx;
107
108 enum vcpu_state state; /* (o) vcpu state */
109 enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */
110 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */
111 kcondvar_t state_cv; /* (o) IDLE-transition cv */
112 int hostcpu; /* (o) vcpu's current host cpu */
113 int lastloccpu; /* (o) last host cpu localized to */
114 int reqidle; /* (i) request vcpu to idle */
115 struct vlapic *vlapic; /* (i) APIC device model */
116 enum x2apic_state x2apic_state; /* (i) APIC mode */
117 uint64_t exitintinfo; /* (i) events pending at VM exit */
118 int nmi_pending; /* (i) NMI pending */
119 int extint_pending; /* (i) INTR pending */
120 int exception_pending; /* (i) exception pending */
121 int exc_vector; /* (x) exception collateral */
122 int exc_errcode_valid;
123 uint32_t exc_errcode;
124 uint8_t sipi_vector; /* (i) SIPI vector */
125 struct savefpu *guestfpu; /* (a,i) guest fpu state */
126 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
127 void *stats; /* (a,i) statistics */
128 struct vm_exit exitinfo; /* (x) exit reason and collateral */
129 uint64_t nextrip; /* (x) next instruction to execute */
130 struct vie *vie_ctx; /* (x) instruction emulation context */
131 uint64_t tsc_offset; /* (x) offset from host TSC */
132
133 enum vcpu_ustate ustate; /* (i) microstate for the vcpu */
134 hrtime_t ustate_when; /* (i) time of last ustate change */
135 uint64_t ustate_total[VU_MAX]; /* (o) total time spent in ustates */
136 };
137
138 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
139 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
140 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
141 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
142 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
143
144 struct mem_seg {
145 size_t len;
146 bool sysmem;
147 struct vm_object *object;
148 };
149 #define VM_MAX_MEMSEGS 4
150
151 struct mem_map {
152 vm_paddr_t gpa;
153 size_t len;
154 vm_ooffset_t segoff;
155 int segid;
156 int prot;
157 int flags;
158 };
159 #define VM_MAX_MEMMAPS 8
160
161 /*
162 * Initialization:
163 * (o) initialized the first time the VM is created
164 * (i) initialized when VM is created and when it is reinitialized
165 * (x) initialized before use
166 */
167 struct vm {
168 void *cookie; /* (i) cpu-specific data */
169 void *iommu; /* (x) iommu-specific data */
170 struct vhpet *vhpet; /* (i) virtual HPET */
171 struct vioapic *vioapic; /* (i) virtual ioapic */
172 struct vatpic *vatpic; /* (i) virtual atpic */
173 struct vatpit *vatpit; /* (i) virtual atpit */
174 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
175 struct vrtc *vrtc; /* (o) virtual RTC */
176 volatile cpuset_t active_cpus; /* (i) active vcpus */
177 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */
178 int suspend; /* (i) stop VM execution */
179 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
180 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
181 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
182 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
183 struct vmspace *vmspace; /* (o) guest's address space */
184 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
185 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
186 /* The following describe the vm cpu topology */
187 uint16_t sockets; /* (o) num of sockets */
188 uint16_t cores; /* (o) num of cores/socket */
189 uint16_t threads; /* (o) num of threads/core */
190 uint16_t maxcpus; /* (o) max pluggable cpus */
191 uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */
192
193 struct ioport_config ioports; /* (o) ioport handling */
194 };
195
196 static int vmm_initialized;
197
198
199 static void
200 nullop_panic(void)
201 {
202 panic("null vmm operation call");
203 }
204
205 /* Do not allow use of an un-set `ops` to do anything but panic */
206 static struct vmm_ops vmm_ops_null = {
207 .init = (vmm_init_func_t)nullop_panic,
208 .cleanup = (vmm_cleanup_func_t)nullop_panic,
209 .resume = (vmm_resume_func_t)nullop_panic,
210 .vminit = (vmi_init_func_t)nullop_panic,
211 .vmrun = (vmi_run_func_t)nullop_panic,
212 .vmcleanup = (vmi_cleanup_func_t)nullop_panic,
213 .vmgetreg = (vmi_get_register_t)nullop_panic,
214 .vmsetreg = (vmi_set_register_t)nullop_panic,
215 .vmgetdesc = (vmi_get_desc_t)nullop_panic,
216 .vmsetdesc = (vmi_set_desc_t)nullop_panic,
217 .vmgetcap = (vmi_get_cap_t)nullop_panic,
218 .vmsetcap = (vmi_set_cap_t)nullop_panic,
219 .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic,
220 .vmspace_free = (vmi_vmspace_free)nullop_panic,
221 .vlapic_init = (vmi_vlapic_init)nullop_panic,
222 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
223 .vmsavectx = (vmi_savectx)nullop_panic,
224 .vmrestorectx = (vmi_restorectx)nullop_panic,
225 };
226
227 static struct vmm_ops *ops = &vmm_ops_null;
228
229 #define VMM_INIT(num) ((*ops->init)(num))
230 #define VMM_CLEANUP() ((*ops->cleanup)())
231 #define VMM_RESUME() ((*ops->resume)())
232
233 #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
234 #define VMRUN(vmi, vcpu, rip, pmap) \
235 ((*ops->vmrun)(vmi, vcpu, rip, pmap))
236 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
237 #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
238 #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
239
240 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
241 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
242 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
243 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
244 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv))
245 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val))
246 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu))
247 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic))
248
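/*
 * With CR0.TS set, the next FPU instruction executed on this CPU raises a
 * device-not-available fault (#NM), trapping any stray host use of the FPU
 * while guest FPU state is loaded; clts() clears TS so the save/restore
 * paths themselves can touch the FPU directly.
 */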
249 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
250 #define fpu_stop_emulating() clts()
251
252 SDT_PROVIDER_DEFINE(vmm);
253
254 static MALLOC_DEFINE(M_VM, "vm", "vm");
255
256 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
257 NULL);
258
259 /*
260 * Halt the guest if all vcpus are executing a HLT instruction with
261 * interrupts disabled.
262 */
263 static int halt_detection_enabled = 1;
264
265 /* IPI vector used for vcpu notifications */
266 static int vmm_ipinum;
267
268 /* Trap into hypervisor on all guest exceptions and reflect them back */
269 static int trace_guest_exceptions;
270
271 static void vm_free_memmap(struct vm *vm, int ident);
272 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
273 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
274 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
275 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
276
277 /* Flags for vtc_status */
278 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */
279 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */
280
281 typedef struct vm_thread_ctx {
282 struct vm *vtc_vm;
283 int vtc_vcpuid;
284 uint_t vtc_status;
285 enum vcpu_ustate vtc_ustate;
286 } vm_thread_ctx_t;
287
288 #ifdef KTR
289 static const char *
290 vcpu_state2str(enum vcpu_state state)
291 {
292
293 switch (state) {
294 case VCPU_IDLE:
295 return ("idle");
296 case VCPU_FROZEN:
297 return ("frozen");
298 case VCPU_RUNNING:
299 return ("running");
300 case VCPU_SLEEPING:
301 return ("sleeping");
302 default:
303 return ("unknown");
304 }
305 }
306 #endif
307
308 static void
309 vcpu_cleanup(struct vm *vm, int i, bool destroy)
310 {
311 struct vcpu *vcpu = &vm->vcpu[i];
312
313 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
314 if (destroy) {
315 vmm_stat_free(vcpu->stats);
316 fpu_save_area_free(vcpu->guestfpu);
317 vie_free(vcpu->vie_ctx);
318 vcpu->vie_ctx = NULL;
319 }
320 }
321
322 static void
323 vcpu_init(struct vm *vm, int vcpu_id, bool create)
324 {
325 struct vcpu *vcpu;
326
327 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
328 ("vcpu_init: invalid vcpu %d", vcpu_id));
329
330 vcpu = &vm->vcpu[vcpu_id];
331
332 if (create) {
333 vcpu_lock_init(vcpu);
334 vcpu->state = VCPU_IDLE;
335 vcpu->hostcpu = NOCPU;
336 vcpu->lastloccpu = NOCPU;
337 vcpu->guestfpu = fpu_save_area_alloc();
338 vcpu->stats = vmm_stat_alloc();
339 vcpu->vie_ctx = vie_alloc();
340
341 vcpu->ustate = VU_INIT;
342 vcpu->ustate_when = gethrtime();
343 } else {
344 vie_reset(vcpu->vie_ctx);
345 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
346 if (vcpu->ustate != VU_INIT) {
347 vcpu_ustate_change(vm, vcpu_id, VU_INIT);
348 }
349 }
350
351 vcpu->run_state = VRS_HALT;
352 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
353 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
354 vcpu->reqidle = 0;
355 vcpu->exitintinfo = 0;
356 vcpu->nmi_pending = 0;
357 vcpu->extint_pending = 0;
358 vcpu->exception_pending = 0;
359 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
360 fpu_save_area_reset(vcpu->guestfpu);
361 vmm_stat_init(vcpu->stats);
362 vcpu->tsc_offset = 0;
363 }
364
365 int
366 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
367 {
368
369 return (trace_guest_exceptions);
370 }
371
372 struct vm_exit *
373 vm_exitinfo(struct vm *vm, int cpuid)
374 {
375 struct vcpu *vcpu;
376
377 if (cpuid < 0 || cpuid >= vm->maxcpus)
378 panic("vm_exitinfo: invalid cpuid %d", cpuid);
379
380 vcpu = &vm->vcpu[cpuid];
381
382 return (&vcpu->exitinfo);
383 }
384
385 struct vie *
386 vm_vie_ctx(struct vm *vm, int cpuid)
387 {
388 if (cpuid < 0 || cpuid >= vm->maxcpus)
389 panic("vm_vie_ctx: invalid cpuid %d", cpuid);
390
391 return (vm->vcpu[cpuid].vie_ctx);
392 }
393
394 static int
395 vmm_init(void)
396 {
397 int error;
398
399 vmm_host_state_init();
400
401 /* We use cpu_poke() for IPIs */
402 vmm_ipinum = 0;
403
404 error = vmm_mem_init();
405 if (error)
406 return (error);
407
408 if (vmm_is_intel())
409 ops = &vmm_ops_intel;
410 else if (vmm_is_svm())
411 ops = &vmm_ops_amd;
412 else
413 return (ENXIO);
414
415 return (VMM_INIT(vmm_ipinum));
416 }
417
418 int
419 vmm_mod_load()
420 {
421 int error;
422
423 VERIFY(vmm_initialized == 0);
424
425 error = vmm_init();
426 if (error == 0)
427 vmm_initialized = 1;
428
429 return (error);
430 }
431
432 int
433 vmm_mod_unload()
434 {
435 int error;
436
437 VERIFY(vmm_initialized == 1);
438
439 iommu_cleanup();
440 error = VMM_CLEANUP();
441 if (error)
442 return (error);
443 vmm_initialized = 0;
444
445 return (0);
446 }
447
448 static void
449 vm_init(struct vm *vm, bool create)
450 {
451 int i;
452
453 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
454 vm->iommu = NULL;
455 vm->vioapic = vioapic_init(vm);
456 vm->vhpet = vhpet_init(vm);
457 vm->vatpic = vatpic_init(vm);
458 vm->vatpit = vatpit_init(vm);
459 vm->vpmtmr = vpmtmr_init(vm);
460 if (create)
461 vm->vrtc = vrtc_init(vm);
462
463 vm_inout_init(vm, &vm->ioports);
464
465 CPU_ZERO(&vm->active_cpus);
466 CPU_ZERO(&vm->debug_cpus);
467
468 vm->suspend = 0;
469 CPU_ZERO(&vm->suspended_cpus);
470
471 for (i = 0; i < vm->maxcpus; i++)
472 vcpu_init(vm, i, create);
473
474 /*
475 * Configure the VM-wide TSC offset so that the call to vm_init()
476 * represents the boot time (when the TSC(s) read 0). Each vCPU will
477 * have its own offset from this, which is altered if/when the guest
478 * writes to MSR_TSC.
479 *
480 * The TSC offsetting math is all unsigned, using overflow for negative
481 * offsets. A reading of the TSC is negated to form the boot offset.
482 */
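	/*
	 * For example, if rdtsc_offset() returns 1000 at this point, then
	 * boot_tsc_offset becomes (uint64_t)-1000, and a guest TSC computed
	 * as (host TSC + boot_tsc_offset) reads ~0 at boot and counts up
	 * from there.
	 */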
483 vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
484 }
485
486 /*
487 * The default CPU topology is a single thread per package.
488 */
489 uint_t cores_per_package = 1;
490 uint_t threads_per_core = 1;
491
492 int
493 vm_create(const char *name, struct vm **retvm)
494 {
495 struct vm *vm;
496 struct vmspace *vmspace;
497
498 /*
499 * If vmm.ko could not be successfully initialized then don't attempt
500 * to create the virtual machine.
501 */
502 if (!vmm_initialized)
503 return (ENXIO);
504
505 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
506 return (EINVAL);
507
508 vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
509 if (vmspace == NULL)
510 return (ENOMEM);
511
512 vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
513 strcpy(vm->name, name);
514 vm->vmspace = vmspace;
515
516 vm->sockets = 1;
517 vm->cores = cores_per_package; /* XXX backwards compatibility */
518 vm->threads = threads_per_core; /* XXX backwards compatibility */
519 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
520
521 vm_init(vm, true);
522
523 *retvm = vm;
524 return (0);
525 }
526
527 void
528 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
529 uint16_t *threads, uint16_t *maxcpus)
530 {
531 *sockets = vm->sockets;
532 *cores = vm->cores;
533 *threads = vm->threads;
534 *maxcpus = vm->maxcpus;
535 }
536
537 uint16_t
538 vm_get_maxcpus(struct vm *vm)
539 {
540 return (vm->maxcpus);
541 }
542
543 int
544 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
545 uint16_t threads, uint16_t maxcpus)
546 {
547 if (maxcpus != 0)
548 return (EINVAL); /* XXX remove when supported */
549 if ((sockets * cores * threads) > vm->maxcpus)
550 return (EINVAL);
551 /* XXX need to check sockets * cores * threads == vCPU, how? */
552 vm->sockets = sockets;
553 vm->cores = cores;
554 vm->threads = threads;
555 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
556 return (0);
557 }
558
559 static void
560 vm_cleanup(struct vm *vm, bool destroy)
561 {
562 struct mem_map *mm;
563 int i;
564
565 ppt_unassign_all(vm);
566
567 if (vm->iommu != NULL)
568 iommu_destroy_domain(vm->iommu);
569
570 /*
571 * Devices which attach their own ioport hooks should be cleaned up
572 * first so they can tear down those registrations.
573 */
574 vpmtmr_cleanup(vm->vpmtmr);
575
576 vm_inout_cleanup(vm, &vm->ioports);
577
578 if (destroy)
579 vrtc_cleanup(vm->vrtc);
580 else
581 vrtc_reset(vm->vrtc);
582
583 vatpit_cleanup(vm->vatpit);
584 vhpet_cleanup(vm->vhpet);
585 vatpic_cleanup(vm->vatpic);
586 vioapic_cleanup(vm->vioapic);
587
588 for (i = 0; i < vm->maxcpus; i++)
589 vcpu_cleanup(vm, i, destroy);
590
591 VMCLEANUP(vm->cookie);
592
593 /*
594 * System memory is removed from the guest address space only when
595 * the VM is destroyed. This is because the mapping remains the same
596 * across VM reset.
597 *
598 * Device memory can be relocated by the guest (e.g. using PCI BARs)
599 * so those mappings are removed on a VM reset.
600 */
601 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
602 mm = &vm->mem_maps[i];
603 if (destroy || !sysmem_mapping(vm, mm)) {
604 vm_free_memmap(vm, i);
605 } else {
606 /*
607 * We need to reset the IOMMU flag so this mapping can
608 * be reused when a VM is rebooted. Since the IOMMU
609 * domain has already been destroyed we can just reset
610 * the flag here.
611 */
612 mm->flags &= ~VM_MEMMAP_F_IOMMU;
613 }
614 }
615
616 if (destroy) {
617 for (i = 0; i < VM_MAX_MEMSEGS; i++)
618 vm_free_memseg(vm, i);
619
620 VMSPACE_FREE(vm->vmspace);
621 vm->vmspace = NULL;
622 }
623 }
624
625 void
626 vm_destroy(struct vm *vm)
627 {
628 vm_cleanup(vm, true);
629 free(vm, M_VM);
630 }
631
632 int
633 vm_reinit(struct vm *vm)
634 {
635 int error;
636
637 /*
638 * A virtual machine can be reset only if all vcpus are suspended.
639 */
640 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
641 vm_cleanup(vm, false);
642 vm_init(vm, false);
643 error = 0;
644 } else {
645 error = EBUSY;
646 }
647
648 return (error);
649 }
650
651 const char *
652 vm_name(struct vm *vm)
653 {
654 return (vm->name);
655 }
656
657 int
658 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
659 {
660 vm_object_t obj;
661
662 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
663 return (ENOMEM);
664 else
665 return (0);
666 }
667
668 int
669 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
670 {
671 return (vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len));
672 }
673
674 /*
675 * Return 'true' if 'gpa' is allocated in the guest address space.
676 *
677 * This function is called in the context of a running vcpu which acts as
678 * an implicit lock on 'vm->mem_maps[]'.
679 */
680 bool
681 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
682 {
683 struct mem_map *mm;
684 int i;
685
686 #ifdef INVARIANTS
687 int hostcpu, state;
688 state = vcpu_get_state(vm, vcpuid, &hostcpu);
689 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
690 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
691 #endif
692
693 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
694 mm = &vm->mem_maps[i];
695 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
696 return (true); /* 'gpa' is sysmem or devmem */
697 }
698
699 if (ppt_is_mmio(vm, gpa))
700 return (true); /* 'gpa' is pci passthru mmio */
701
702 return (false);
703 }
704
705 int
706 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
707 {
708 struct mem_seg *seg;
709 vm_object_t obj;
710
711 #ifndef __FreeBSD__
712 extern pgcnt_t get_max_page_get(void);
713 #endif
714
715 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
716 return (EINVAL);
717
718 if (len == 0 || (len & PAGE_MASK))
719 return (EINVAL);
720
721 #ifndef __FreeBSD__
722 if (len > ptob(get_max_page_get()))
723 return (EINVAL);
724 #endif
725
726 seg = &vm->mem_segs[ident];
727 if (seg->object != NULL) {
728 if (seg->len == len && seg->sysmem == sysmem)
729 return (EEXIST);
730 else
731 return (EINVAL);
732 }
733
734 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
735 if (obj == NULL)
736 return (ENOMEM);
737
738 seg->len = len;
739 seg->object = obj;
740 seg->sysmem = sysmem;
741 return (0);
742 }
743
744 int
745 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
746 vm_object_t *objptr)
747 {
748 struct mem_seg *seg;
749
750 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
751 return (EINVAL);
752
753 seg = &vm->mem_segs[ident];
754 if (len)
755 *len = seg->len;
756 if (sysmem)
757 *sysmem = seg->sysmem;
758 if (objptr)
759 *objptr = seg->object;
760 return (0);
761 }
762
763 void
764 vm_free_memseg(struct vm *vm, int ident)
765 {
766 struct mem_seg *seg;
767
768 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
769 ("%s: invalid memseg ident %d", __func__, ident));
770
771 seg = &vm->mem_segs[ident];
772 if (seg->object != NULL) {
773 vm_object_deallocate(seg->object);
774 bzero(seg, sizeof (struct mem_seg));
775 }
776 }
777
778 int
779 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
780 size_t len, int prot, int flags)
781 {
782 struct mem_seg *seg;
783 struct mem_map *m, *map;
784 vm_ooffset_t last;
785 int i, error;
786
787 if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
788 return (EINVAL);
789
790 if (flags & ~VM_MEMMAP_F_WIRED)
791 return (EINVAL);
792
793 if (segid < 0 || segid >= VM_MAX_MEMSEGS)
794 return (EINVAL);
795
796 seg = &vm->mem_segs[segid];
797 if (seg->object == NULL)
798 return (EINVAL);
799
800 last = first + len;
801 if (first < 0 || first >= last || last > seg->len)
802 return (EINVAL);
803
804 if ((gpa | first | last) & PAGE_MASK)
805 return (EINVAL);
806
807 map = NULL;
808 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
809 m = &vm->mem_maps[i];
810 if (m->len == 0) {
811 map = m;
812 break;
813 }
814 }
815
816 if (map == NULL)
817 return (ENOSPC);
818
819 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
820 len, 0, VMFS_NO_SPACE, prot, prot, 0);
821 if (error != 0)
822 return (EFAULT);
823
824 vm_object_reference(seg->object);
825
826 if ((flags & VM_MEMMAP_F_WIRED) != 0) {
827 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
828 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
829 if (error != 0) {
830 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
831 return (EFAULT);
832 }
833 }
834
835 map->gpa = gpa;
836 map->len = len;
837 map->segoff = first;
838 map->segid = segid;
839 map->prot = prot;
840 map->flags = flags;
841 return (0);
842 }
843
844 int
845 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
846 {
847 struct mem_map *m;
848 int i;
849
850 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
851 m = &vm->mem_maps[i];
852 if (m->gpa == gpa && m->len == len &&
853 (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
854 vm_free_memmap(vm, i);
855 return (0);
856 }
857 }
858
859 return (EINVAL);
860 }
861
862 int
863 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
864 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
865 {
866 struct mem_map *mm, *mmnext;
867 int i;
868
869 mmnext = NULL;
870 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
871 mm = &vm->mem_maps[i];
872 if (mm->len == 0 || mm->gpa < *gpa)
873 continue;
874 if (mmnext == NULL || mm->gpa < mmnext->gpa)
875 mmnext = mm;
876 }
877
878 if (mmnext != NULL) {
879 *gpa = mmnext->gpa;
880 if (segid)
881 *segid = mmnext->segid;
882 if (segoff)
883 *segoff = mmnext->segoff;
884 if (len)
885 *len = mmnext->len;
886 if (prot)
887 *prot = mmnext->prot;
888 if (flags)
889 *flags = mmnext->flags;
890 return (0);
891 } else {
892 return (ENOENT);
893 }
894 }
895
896 static void
897 vm_free_memmap(struct vm *vm, int ident)
898 {
899 struct mem_map *mm;
900 int error;
901
902 mm = &vm->mem_maps[ident];
903 if (mm->len) {
904 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
905 mm->gpa + mm->len);
906 KASSERT(error == 0, ("%s: vm_map_remove error %d",
907 __func__, error));
908 bzero(mm, sizeof (struct mem_map));
909 }
910 }
911
912 static __inline bool
913 sysmem_mapping(struct vm *vm, struct mem_map *mm)
914 {
915
916 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
917 return (true);
918 else
919 return (false);
920 }
921
922 vm_paddr_t
923 vmm_sysmem_maxaddr(struct vm *vm)
924 {
925 struct mem_map *mm;
926 vm_paddr_t maxaddr;
927 int i;
928
929 maxaddr = 0;
930 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
931 mm = &vm->mem_maps[i];
932 if (sysmem_mapping(vm, mm)) {
933 if (maxaddr < mm->gpa + mm->len)
934 maxaddr = mm->gpa + mm->len;
935 }
936 }
937 return (maxaddr);
938 }
939
940 static void
941 vm_iommu_modify(struct vm *vm, bool map)
942 {
943 int i, sz;
944 vm_paddr_t gpa, hpa;
945 struct mem_map *mm;
946 #ifdef __FreeBSD__
947 void *vp, *cookie, *host_domain;
948 #else
949 void *vp, *cookie, *host_domain __unused;
950 #endif
951
952 sz = PAGE_SIZE;
953 host_domain = iommu_host_domain();
954
955 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
956 mm = &vm->mem_maps[i];
957 if (!sysmem_mapping(vm, mm))
958 continue;
959
960 if (map) {
961 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
962 ("iommu map found invalid memmap %lx/%lx/%x",
963 mm->gpa, mm->len, mm->flags));
964 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
965 continue;
966 mm->flags |= VM_MEMMAP_F_IOMMU;
967 } else {
968 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
969 continue;
970 mm->flags &= ~VM_MEMMAP_F_IOMMU;
971 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
972 ("iommu unmap found invalid memmap %lx/%lx/%x",
973 mm->gpa, mm->len, mm->flags));
974 }
975
976 gpa = mm->gpa;
977 while (gpa < mm->gpa + mm->len) {
978 vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, PROT_WRITE,
979 &cookie);
980 KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
981 vm_name(vm), gpa));
982
983 vm_gpa_release(cookie);
984
985 hpa = DMAP_TO_PHYS((uintptr_t)vp);
986 if (map) {
987 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
988 #ifdef __FreeBSD__
989 iommu_remove_mapping(host_domain, hpa, sz);
990 #endif
991 } else {
992 iommu_remove_mapping(vm->iommu, gpa, sz);
993 #ifdef __FreeBSD__
994 iommu_create_mapping(host_domain, hpa, hpa, sz);
995 #endif
996 }
997
998 gpa += PAGE_SIZE;
999 }
1000 }
1001
1002 /*
1003 * Invalidate the cached translations associated with the domain
1004 * from which pages were removed.
1005 */
1006 #ifdef __FreeBSD__
1007 if (map)
1008 iommu_invalidate_tlb(host_domain);
1009 else
1010 iommu_invalidate_tlb(vm->iommu);
1011 #else
1012 iommu_invalidate_tlb(vm->iommu);
1013 #endif
1014 }
1015
1016 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false)
1017 #define vm_iommu_map(vm) vm_iommu_modify((vm), true)
1018
1019 int
1020 vm_unassign_pptdev(struct vm *vm, int pptfd)
1021 {
1022 int error;
1023
1024 error = ppt_unassign_device(vm, pptfd);
1025 if (error)
1026 return (error);
1027
1028 if (ppt_assigned_devices(vm) == 0)
1029 vm_iommu_unmap(vm);
1030
1031 return (0);
1032 }
1033
1034 int
1035 vm_assign_pptdev(struct vm *vm, int pptfd)
1036 {
1037 int error;
1038 vm_paddr_t maxaddr;
1039
1040 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1041 if (ppt_assigned_devices(vm) == 0) {
1042 KASSERT(vm->iommu == NULL,
1043 ("vm_assign_pptdev: iommu must be NULL"));
1044 maxaddr = vmm_sysmem_maxaddr(vm);
1045 vm->iommu = iommu_create_domain(maxaddr);
1046 if (vm->iommu == NULL)
1047 return (ENXIO);
1048 vm_iommu_map(vm);
1049 }
1050
1051 error = ppt_assign_device(vm, pptfd);
1052 return (error);
1053 }
1054
1055 void *
1056 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
1057 void **cookie)
1058 {
1059 int i, count, pageoff;
1060 struct mem_map *mm;
1061 vm_page_t m;
1062 #ifdef INVARIANTS
1063 /*
1064 * All vcpus are frozen by ioctls that modify the memory map
1065 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->mem_maps[]' stability is
1066 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
1067 */
1068 int state;
1069 KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
1070 __func__, vcpuid));
1071 for (i = 0; i < vm->maxcpus; i++) {
1072 if (vcpuid != -1 && vcpuid != i)
1073 continue;
1074 state = vcpu_get_state(vm, i, NULL);
1075 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1076 __func__, state));
1077 }
1078 #endif
1079 pageoff = gpa & PAGE_MASK;
1080 if (len > PAGE_SIZE - pageoff)
1081 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1082
1083 count = 0;
1084 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1085 mm = &vm->mem_maps[i];
1086 if (mm->len == 0) {
1087 continue;
1088 }
1089 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1090 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1091 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1092 break;
1093 }
1094 }
1095
1096 if (count == 1) {
1097 *cookie = m;
1098 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1099 } else {
1100 *cookie = NULL;
1101 return (NULL);
1102 }
1103 }
1104
1105 void
1106 vm_gpa_release(void *cookie)
1107 {
1108 vm_page_t m = cookie;
1109
1110 vm_page_unwire(m, PQ_ACTIVE);
1111 }
1112
1113 int
1114 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1115 {
1116
1117 if (vcpu < 0 || vcpu >= vm->maxcpus)
1118 return (EINVAL);
1119
1120 if (reg >= VM_REG_LAST)
1121 return (EINVAL);
1122
1123 return (VMGETREG(vm->cookie, vcpu, reg, retval));
1124 }
1125
1126 int
1127 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1128 {
1129 struct vcpu *vcpu;
1130 int error;
1131
1132 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1133 return (EINVAL);
1134
1135 if (reg >= VM_REG_LAST)
1136 return (EINVAL);
1137
1138 error = VMSETREG(vm->cookie, vcpuid, reg, val);
1139 if (error || reg != VM_REG_GUEST_RIP)
1140 return (error);
1141
1142 /* Set 'nextrip' to match the value of %rip */
1143 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1144 vcpu = &vm->vcpu[vcpuid];
1145 vcpu->nextrip = val;
1146 return (0);
1147 }
1148
1149 static bool
1150 is_descriptor_table(int reg)
1151 {
1152 switch (reg) {
1153 case VM_REG_GUEST_IDTR:
1154 case VM_REG_GUEST_GDTR:
1155 return (true);
1156 default:
1157 return (false);
1158 }
1159 }
1160
1161 static bool
1162 is_segment_register(int reg)
1163 {
1164 switch (reg) {
1165 case VM_REG_GUEST_ES:
1166 case VM_REG_GUEST_CS:
1167 case VM_REG_GUEST_SS:
1168 case VM_REG_GUEST_DS:
1169 case VM_REG_GUEST_FS:
1170 case VM_REG_GUEST_GS:
1171 case VM_REG_GUEST_TR:
1172 case VM_REG_GUEST_LDTR:
1173 return (true);
1174 default:
1175 return (false);
1176 }
1177 }
1178
1179 int
1180 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1181 {
1182
1183 if (vcpu < 0 || vcpu >= vm->maxcpus)
1184 return (EINVAL);
1185
1186 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1187 return (EINVAL);
1188
1189 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1190 }
1191
1192 int
1193 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1194 {
1195 if (vcpu < 0 || vcpu >= vm->maxcpus)
1196 return (EINVAL);
1197
1198 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1199 return (EINVAL);
1200
1201 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1202 }
1203
1204 int
1205 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1206 {
1207 struct vcpu *vcpu;
1208
1209 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1210 return (EINVAL);
1211 }
1212
1213 vcpu = &vm->vcpu[vcpuid];
1214
1215 vcpu_lock(vcpu);
1216 *state = vcpu->run_state;
1217 *sipi_vec = vcpu->sipi_vector;
1218 vcpu_unlock(vcpu);
1219
1220 return (0);
1221 }
1222
1223 int
1224 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1225 {
1226 struct vcpu *vcpu;
1227
1228 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1229 return (EINVAL);
1230 }
1231 if (!VRS_IS_VALID(state)) {
1232 return (EINVAL);
1233 }
1234
1235 vcpu = &vm->vcpu[vcpuid];
1236
1237 vcpu_lock(vcpu);
1238 vcpu->run_state = state;
1239 vcpu->sipi_vector = sipi_vec;
1240 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1241 vcpu_unlock(vcpu);
1242
1243 return (0);
1244 }
1245
1246
1247 static void
1248 restore_guest_fpustate(struct vcpu *vcpu)
1249 {
1250
1251 /* flush host state to the pcb */
1252 fpuexit(curthread);
1253
1254 /* restore guest FPU state */
1255 fpu_stop_emulating();
1256 fpurestore(vcpu->guestfpu);
1257
1258 /* restore guest XCR0 if XSAVE is enabled in the host */
1259 if (rcr4() & CR4_XSAVE)
1260 load_xcr(0, vcpu->guest_xcr0);
1261
1262 /*
1263 * The FPU is now "dirty" with the guest's state so turn on emulation
1264 * to trap any access to the FPU by the host.
1265 */
1266 fpu_start_emulating();
1267 }
1268
1269 static void
1270 save_guest_fpustate(struct vcpu *vcpu)
1271 {
1272
1273 if ((rcr0() & CR0_TS) == 0)
1274 panic("fpu emulation not enabled in host!");
1275
1276 /* save guest XCR0 and restore host XCR0 */
1277 if (rcr4() & CR4_XSAVE) {
1278 vcpu->guest_xcr0 = rxcr(0);
1279 load_xcr(0, vmm_get_host_xcr0());
1280 }
1281
1282 /* save guest FPU state */
1283 fpu_stop_emulating();
1284 fpusave(vcpu->guestfpu);
1285 /*
1286 * When the host state has been restored, we should not re-enable
1287 * CR0.TS on illumos for eager FPU.
1288 */
1289 }
1290
1291 static int
1292 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1293 bool from_idle)
1294 {
1295 struct vcpu *vcpu;
1296 int error;
1297
1298 vcpu = &vm->vcpu[vcpuid];
1299 vcpu_assert_locked(vcpu);
1300
1301 /*
1302 * State transitions from the vmmdev_ioctl() must always begin from
1303 * the VCPU_IDLE state. This guarantees that there is only a single
1304 * ioctl() operating on a vcpu at any point.
1305 */
1306 if (from_idle) {
1307 while (vcpu->state != VCPU_IDLE) {
1308 vcpu->reqidle = 1;
1309 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1310 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1311 "idle requested", vcpu_state2str(vcpu->state));
1312 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1313 }
1314 } else {
1315 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1316 "vcpu idle state"));
1317 }
1318
1319 if (vcpu->state == VCPU_RUNNING) {
1320 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1321 "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1322 } else {
1323 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1324 "vcpu that is not running", vcpu->hostcpu));
1325 }
1326
1327 /*
1328 * The following state transitions are allowed:
1329 * IDLE -> FROZEN -> IDLE
1330 * FROZEN -> RUNNING -> FROZEN
1331 * FROZEN -> SLEEPING -> FROZEN
1332 */
1333 switch (vcpu->state) {
1334 case VCPU_IDLE:
1335 case VCPU_RUNNING:
1336 case VCPU_SLEEPING:
1337 error = (newstate != VCPU_FROZEN);
1338 break;
1339 case VCPU_FROZEN:
1340 error = (newstate == VCPU_FROZEN);
1341 break;
1342 default:
1343 error = 1;
1344 break;
1345 }
1346
1347 if (error)
1348 return (EBUSY);
1349
1350 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1351 vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1352
1353 vcpu->state = newstate;
1354 if (newstate == VCPU_RUNNING)
1355 vcpu->hostcpu = curcpu;
1356 else
1357 vcpu->hostcpu = NOCPU;
1358
1359 if (newstate == VCPU_IDLE) {
1360 cv_broadcast(&vcpu->state_cv);
1361 }
1362
1363 return (0);
1364 }
1365
1366 static void
1367 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1368 {
1369 int error;
1370
1371 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1372 panic("Error %d setting state to %d\n", error, newstate);
1373 }
1374
1375 static void
1376 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1377 {
1378 int error;
1379
1380 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1381 panic("Error %d setting state to %d", error, newstate);
1382 }
1383
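/*
 * The vm_handle_*() exit handlers below return 0 when an exit has been
 * serviced in-kernel (allowing the vCPU to re-enter guest context), and any
 * other value when the VM_RUN loop must bail back out to userspace.
 */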
1384 /*
1385 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1386 */
1387 static int
1388 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1389 {
1390 struct vcpu *vcpu;
1391 int vcpu_halted, vm_halted;
1392 bool userspace_exit = false;
1393
1394 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1395
1396 vcpu = &vm->vcpu[vcpuid];
1397 vcpu_halted = 0;
1398 vm_halted = 0;
1399
1400 vcpu_lock(vcpu);
1401 while (1) {
1402 /*
1403 * Do a final check for pending interrupts (including NMI and
1404 * INIT) before putting this thread to sleep.
1405 */
1406 if (vm_nmi_pending(vm, vcpuid))
1407 break;
1408 if (vcpu_run_state_pending(vm, vcpuid))
1409 break;
1410 if (!intr_disabled) {
1411 if (vm_extint_pending(vm, vcpuid) ||
1412 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1413 break;
1414 }
1415 }
1416
1417 /*
1418 * Also check for software events which would cause a wake-up.
1419 * This will set the appropriate exitcode directly, rather than
1420 * requiring a trip through VM_RUN().
1421 */
1422 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1423 userspace_exit = true;
1424 break;
1425 }
1426
1427 /*
1428 * Some Linux guests implement "halt" by having all vcpus
1429 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1430 * track of the vcpus that have entered this state. When all
1431 * vcpus enter the halted state the virtual machine is halted.
1432 */
1433 if (intr_disabled) {
1434 if (!vcpu_halted && halt_detection_enabled) {
1435 vcpu_halted = 1;
1436 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1437 }
1438 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1439 vm_halted = 1;
1440 break;
1441 }
1442 }
1443
1444 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1445 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1446 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1447 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1448 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1449 }
1450
1451 if (vcpu_halted)
1452 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1453
1454 vcpu_unlock(vcpu);
1455
1456 if (vm_halted)
1457 vm_suspend(vm, VM_SUSPEND_HALT);
1458
1459 return (userspace_exit ? -1 : 0);
1460 }
1461
1462 static int
1463 vm_handle_paging(struct vm *vm, int vcpuid)
1464 {
1465 int rv, ftype;
1466 struct vm_map *map;
1467 struct vcpu *vcpu;
1468 struct vm_exit *vme;
1469
1470 vcpu = &vm->vcpu[vcpuid];
1471 vme = &vcpu->exitinfo;
1472
1473 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1474 __func__, vme->inst_length));
1475
1476 ftype = vme->u.paging.fault_type;
1477 KASSERT(ftype == PROT_READ ||
1478 ftype == PROT_WRITE || ftype == PROT_EXEC,
1479 ("vm_handle_paging: invalid fault_type %d", ftype));
1480
1481 if (ftype == PROT_READ || ftype == PROT_WRITE) {
1482 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1483 vme->u.paging.gpa, ftype);
1484 if (rv == 0) {
1485 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
1486 ftype == PROT_READ ? "accessed" : "dirty",
1487 vme->u.paging.gpa);
1488 goto done;
1489 }
1490 }
1491
1492 map = &vm->vmspace->vm_map;
1493 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1494
1495 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1496 "ftype = %d", rv, vme->u.paging.gpa, ftype);
1497
1498 if (rv != 0)
1499 return (EFAULT);
1500 done:
1501 return (0);
1502 }
1503
1504 int
1505 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1506 int rsize)
1507 {
1508 int err = ESRCH;
1509
1510 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1511 err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1512 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1513 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1514 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1515 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1516 }
1517
1518 return (err);
1519 }
1520
1521 int
1522 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1523 int wsize)
1524 {
1525 int err = ESRCH;
1526
1527 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1528 err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1529 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1530 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1531 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1532 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1533 }
1534
1535 return (err);
1536 }
1537
1538 static int
1539 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1540 {
1541 struct vie *vie;
1542 struct vcpu *vcpu;
1543 struct vm_exit *vme;
1544 uint64_t inst_addr;
1545 int error, fault, cs_d;
1546
1547 vcpu = &vm->vcpu[vcpuid];
1548 vme = &vcpu->exitinfo;
1549 vie = vcpu->vie_ctx;
1550
1551 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1552 __func__, vme->inst_length));
1553
1554 inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1555 cs_d = vme->u.mmio_emul.cs_d;
1556
1557 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1558 vme->u.mmio_emul.gpa);
1559
1560 /* Fetch the faulting instruction */
1561 if (vie_needs_fetch(vie)) {
1562 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1563 &fault);
1564 if (error != 0) {
1565 return (error);
1566 } else if (fault) {
1567 /*
1568 * If a fault during instruction fetch was encountered,
1569 * it will have asserted that the appropriate exception
1570 * be injected at next entry.
1571 * No further work is required.
1572 */
1573 return (0);
1574 }
1575 }
1576
1577 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1578 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1579 inst_addr);
1580 /* Dump (unrecognized) instruction bytes in userspace */
1581 vie_fallback_exitinfo(vie, vme);
1582 return (-1);
1583 }
1584 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1585 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1586 /* Decoded GLA does not match GLA from VM exit state */
1587 vie_fallback_exitinfo(vie, vme);
1588 return (-1);
1589 }
1590
1591 repeat:
1592 error = vie_emulate_mmio(vie, vm, vcpuid);
1593 if (error < 0) {
1594 /*
1595 * MMIO not handled by any of the in-kernel-emulated devices, so
1596 * make a trip out to userspace for it.
1597 */
1598 vie_exitinfo(vie, vme);
1599 } else if (error == EAGAIN) {
1600 /*
1601 * Continue emulating the rep-prefixed instruction, which has
1602 * not completed its iterations.
1603 *
1604 * If this can be emulated in-kernel and has a high repetition
1605 * count (causing a tight spin), it should still defer to any
1606 * pending yield conditions.
1607 */
1608 if (!vcpu_should_yield(vm, vcpuid)) {
1609 goto repeat;
1610 } else {
1611 /*
1612 * Defer to the contending load by making a trip to
1613 * userspace with a no-op (BOGUS) exit reason.
1614 */
1615 vie_reset(vie);
1616 vme->exitcode = VM_EXITCODE_BOGUS;
1617 return (-1);
1618 }
1619 } else if (error == 0) {
1620 /* Update %rip now that instruction has been emulated */
1621 vie_advance_pc(vie, &vcpu->nextrip);
1622 }
1623 return (error);
1624 }
1625
1626 static int
1627 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1628 {
1629 struct vcpu *vcpu;
1630 struct vie *vie;
1631 int err;
1632
1633 vcpu = &vm->vcpu[vcpuid];
1634 vie = vcpu->vie_ctx;
1635
1636 repeat:
1637 err = vie_emulate_inout(vie, vm, vcpuid);
1638
1639 if (err < 0) {
1640 /*
1641 * In/out not handled by any of the in-kernel-emulated devices,
1642 * so make a trip out to userspace for it.
1643 */
1644 vie_exitinfo(vie, vme);
1645 return (err);
1646 } else if (err == EAGAIN) {
1647 /*
1648 * Continue emulating the rep-prefixed ins/outs instruction,
1649 * which has not yet completed its iterations.
1650 *
1651 * If this can be emulated in-kernel and has a high repetition
1652 * count (causing a tight spin), it should still defer to any
1653 * pending yield conditions.
1654 */
1655 if (!vcpu_should_yield(vm, vcpuid)) {
1656 goto repeat;
1657 } else {
1658 /*
1659 * Defer to the contending load by making a trip to
1660 * userspace with a no-op (BOGUS) exit reason.
1661 */
1662 vie_reset(vie);
1663 vme->exitcode = VM_EXITCODE_BOGUS;
1664 return (-1);
1665 }
1666 } else if (err != 0) {
1667 /* Emulation failure. Bail all the way out to userspace. */
1668 vme->exitcode = VM_EXITCODE_INST_EMUL;
1669 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1670 return (-1);
1671 }
1672
1673 vie_advance_pc(vie, &vcpu->nextrip);
1674 return (0);
1675 }
1676
1677 static int
1678 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1679 {
1680 struct vie *vie;
1681 struct vcpu *vcpu;
1682 struct vm_exit *vme;
1683 uint64_t cs_base;
1684 int error, fault, cs_d;
1685
1686 vcpu = &vm->vcpu[vcpuid];
1687 vme = &vcpu->exitinfo;
1688 vie = vcpu->vie_ctx;
1689
1690 vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1691
1692 /* Fetch the faulting instruction */
1693 ASSERT(vie_needs_fetch(vie));
1694 error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1695 &fault);
1696 if (error != 0) {
1697 return (error);
1698 } else if (fault) {
1699 /*
1700 * If a fault during instruction fetch was encountered, it will
1701 * have asserted that the appropriate exception be injected at
1702 * next entry. No further work is required.
1703 */
1704 return (0);
1705 }
1706
1707 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1708 /* Dump (unrecognized) instruction bytes in userspace */
1709 vie_fallback_exitinfo(vie, vme);
1710 return (-1);
1711 }
1712
1713 error = vie_emulate_other(vie, vm, vcpuid);
1714 if (error != 0) {
1715 /*
1716 * Instruction emulation was unable to complete successfully, so
1717 * kick it out to userspace for handling.
1718 */
1719 vie_fallback_exitinfo(vie, vme);
1720 } else {
1721 /* Update %rip now that instruction has been emulated */
1722 vie_advance_pc(vie, &vcpu->nextrip);
1723 }
1724 return (error);
1725 }
1726
1727 static int
1728 vm_handle_suspend(struct vm *vm, int vcpuid)
1729 {
1730 int i;
1731 struct vcpu *vcpu;
1732
1733 vcpu = &vm->vcpu[vcpuid];
1734
1735 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1736
1737 /*
1738 * Wait until all 'active_cpus' have suspended themselves.
1739 */
1740 vcpu_lock(vcpu);
1741 vcpu_ustate_change(vm, vcpuid, VU_INIT);
1742 while (1) {
1743 int rc;
1744
1745 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1746 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1747 break;
1748 }
1749
1750 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1751 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1752 TR_CLOCK_TICK);
1753 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1754
1755 /*
1756 * If the userspace process driving the instance is killed, any
1757 * vCPUs yet to be marked suspended (because they are not
1758 * VM_RUN-ing in the kernel presently) will never reach that
1759 * state.
1760 *
1761 * To avoid vm_handle_suspend() getting stuck in the kernel
1762 * waiting for those vCPUs, offer a bail-out even though it
1763 * means returning without all vCPUs in a suspended state.
1764 */
1765 if (rc <= 0) {
1766 if ((curproc->p_flag & SEXITING) != 0) {
1767 break;
1768 }
1769 }
1770 }
1771 vcpu_unlock(vcpu);
1772
1773 /*
1774 * Wakeup the other sleeping vcpus and return to userspace.
1775 */
1776 for (i = 0; i < vm->maxcpus; i++) {
1777 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1778 vcpu_notify_event(vm, i);
1779 }
1780 }
1781
1782 return (-1);
1783 }
1784
1785 static int
1786 vm_handle_reqidle(struct vm *vm, int vcpuid)
1787 {
1788 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1789
1790 vcpu_lock(vcpu);
1791 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1792 vcpu->reqidle = 0;
1793 vcpu_unlock(vcpu);
1794 return (-1);
1795 }
1796
1797 static int
1798 vm_handle_run_state(struct vm *vm, int vcpuid)
1799 {
1800 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1801 bool handled = false;
1802
1803 vcpu_lock(vcpu);
1804 while (1) {
1805 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1806 vcpu_unlock(vcpu);
1807 VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1808 vcpu_lock(vcpu);
1809
1810 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1811 vcpu->run_state |= VRS_INIT;
1812 }
1813
1814 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1815 (VRS_INIT | VRS_PEND_SIPI)) {
1816 const uint8_t vector = vcpu->sipi_vector;
1817
1818 vcpu_unlock(vcpu);
1819 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1820 vcpu_lock(vcpu);
1821
1822 vcpu->run_state &= ~VRS_PEND_SIPI;
1823 vcpu->run_state |= VRS_RUN;
1824 }
1825
1826 /*
1827 * If the vCPU is now in the running state, there is no need to
1828 * wait for anything prior to re-entry.
1829 */
1830 if ((vcpu->run_state & VRS_RUN) != 0) {
1831 handled = true;
1832 break;
1833 }
1834
1835 /*
1836 * Also check for software events which would cause a wake-up.
1837 * This will set the appropriate exitcode directly, rather than
1838 * requiring a trip through VM_RUN().
1839 */
1840 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1841 break;
1842 }
1843
1844 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1845 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1846 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1847 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1848 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1849 }
1850 vcpu_unlock(vcpu);
1851
1852 return (handled ? 0 : -1);
1853 }
1854
1855 static int
1856 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1857 {
1858 const uint32_t code = vme->u.msr.code;
1859 uint64_t val = 0;
1860
1861 switch (code) {
1862 case MSR_MCG_CAP:
1863 case MSR_MCG_STATUS:
1864 val = 0;
1865 break;
1866
1867 case MSR_MTRRcap:
1868 case MSR_MTRRdefType:
1869 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1870 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1871 case MSR_MTRR64kBase:
1872 val = 0;
1873 break;
1874
1875 case MSR_TSC:
1876 /*
1877 * In all likelihood, this should always be handled in guest
1878 * context by VMX/SVM rather than taking an exit. (Both VMX and
1879 * SVM pass through read-only access to MSR_TSC to the guest.)
1880 *
1881 * No physical offset is requested of vcpu_tsc_offset() since
1882 * rdtsc_offset() takes care of that instead.
1883 */
1884 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
1885 break;
1886
1887 default:
1888 /*
1889 * Anything not handled at this point will be kicked out to
1890 * userspace for attempted processing there.
1891 */
1892 return (-1);
1893 }
1894
1895 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
1896 val & 0xffffffff));
1897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
1898 val >> 32));
1899 return (0);
1900 }
1901
1902 static int
1903 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1904 {
1905 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1906 const uint32_t code = vme->u.msr.code;
1907 const uint64_t val = vme->u.msr.wval;
1908
1909 switch (code) {
1910 case MSR_MCG_CAP:
1911 case MSR_MCG_STATUS:
1912 /* Ignore writes */
1913 break;
1914
1915 case MSR_MTRRcap:
1916 vm_inject_gp(vm, vcpuid);
1917 break;
1918 case MSR_MTRRdefType:
1919 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1920 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1921 case MSR_MTRR64kBase:
1922 /* Ignore writes */
1923 break;
1924
1925 case MSR_TSC:
1926 /*
1927 * The effect of writing the TSC MSR is that a subsequent read
1928 * of the TSC would report that value written (plus any time
1929 * elapsed between the write and the read). The guest TSC value
1930 * is calculated from a global offset for the guest (which
1931 * effectively makes its TSC read 0 at guest boot) and a
1932 * per-vCPU offset to handle these writes to the MSR.
1933 *
1934 * To calculate that per-vCPU offset, we can work backwards from
1935 * the guest value at the time of write:
1936 *
1937 * value = host TSC + VM boot offset + vCPU offset
1938 *
1939 * so therefore:
1940 *
1941 * value - host TSC - VM boot offset = vCPU offset
1942 */
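		/*
		 * For example: a guest write of 5000 while the host TSC reads
		 * 9000 on a VM whose boot_tsc_offset is -8000 (it booted when
		 * the host TSC read 8000) yields a per-vCPU offset of
		 * 5000 - (-8000) - 9000 = 4000, so an immediate guest read
		 * sees 9000 + (-8000) + 4000 = 5000, as expected.
		 */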
1943 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
1944 break;
1945
1946 default:
1947 /*
1948 * Anything not handled at this point will be kicked out to
1949 * userspace for attempted processing there.
1950 */
1951 return (-1);
1952 }
1953
1954 return (0);
1955 }
1956
1957 int
1958 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1959 {
1960 int i;
1961
1962 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1963 return (EINVAL);
1964
1965 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1966 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1967 vm->suspend, how);
1968 return (EALREADY);
1969 }
1970
1971 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1972
1973 /*
1974 * Notify all active vcpus that they are now suspended.
1975 */
1976 for (i = 0; i < vm->maxcpus; i++) {
1977 if (CPU_ISSET(i, &vm->active_cpus))
1978 vcpu_notify_event(vm, i);
1979 }
1980
1981 return (0);
1982 }
1983
1984 void
1985 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1986 {
1987 struct vm_exit *vmexit;
1988
1989 vmexit = vm_exitinfo(vm, vcpuid);
1990 vmexit->rip = rip;
1991 vmexit->inst_length = 0;
1992 vmexit->exitcode = VM_EXITCODE_RUN_STATE;
1993 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
1994 }
1995
1996 /*
1997 * Some vmm resources, such as the lapic, may have CPU-specific resources
1998 * allocated to them which would benefit from migration onto the host CPU which
1999 * is processing the vcpu state.
2000 */
2001 static void
2002 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2003 {
2004 /*
2005 * Localizing cyclic resources requires acquisition of cpu_lock, and
2006 * doing so with kpreempt disabled is a recipe for deadlock disaster.
2007 */
2008 VERIFY(curthread->t_preempt == 0);
2009
2010 /*
2011 * Do not bother with localization if this vCPU is about to return to
2012 * the host CPU it was last localized to.
2013 */
2014 if (vcpu->lastloccpu == curcpu)
2015 return;
2016
2017 /*
2018 * Localize system-wide resources to the primary boot vCPU. While any
2019 * of the other vCPUs may access them, it keeps the potential interrupt
2020 * footprint constrained to CPUs involved with this instance.
2021 */
2022 if (vcpu == &vm->vcpu[0]) {
2023 vhpet_localize_resources(vm->vhpet);
2024 vrtc_localize_resources(vm->vrtc);
2025 vatpit_localize_resources(vm->vatpit);
2026 }
2027
2028 vlapic_localize_resources(vcpu->vlapic);
2029
2030 vcpu->lastloccpu = curcpu;
2031 }
2032
2033 static void
2034 vmm_savectx(void *arg)
2035 {
2036 vm_thread_ctx_t *vtc = arg;
2037 struct vm *vm = vtc->vtc_vm;
2038 const int vcpuid = vtc->vtc_vcpuid;
2039
2040 if (ops->vmsavectx != NULL) {
2041 ops->vmsavectx(vm->cookie, vcpuid);
2042 }
2043
2044 /*
2045 * Account for the time spent off-cpu, unless the vCPU is idled,
2046 * in which case being off-cpu is the entire point.
2047 */
2048 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2049 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2050 vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2051 }
2052
2053 /*
2054 * If the CPU holds the restored guest FPU state, save it and restore
2055 * the host FPU state before this thread goes off-cpu.
2056 */
2057 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2058 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2059
2060 save_guest_fpustate(vcpu);
2061 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2062 }
2063 }
2064
2065 static void
2066 vmm_restorectx(void *arg)
2067 {
2068 vm_thread_ctx_t *vtc = arg;
2069 struct vm *vm = vtc->vtc_vm;
2070 const int vcpuid = vtc->vtc_vcpuid;
2071
2072 /* Complete microstate accounting for vCPU being off-cpu */
2073 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2074 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2075 }
2076
2077 /*
2078 * When coming back on-cpu, only restore the guest FPU status if the
2079 * thread is in a context marked as requiring it. This should be rare,
2080 * occurring only when a future logic error results in a voluntary
2081 * sleep during the VMRUN critical section.
2082 *
2083 * The common case will result in elision of the guest FPU state
2084 * restoration, deferring that action until it is clearly necessary
2085 * during vm_run.
2086 */
2087 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2088 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2089 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2090
2091 restore_guest_fpustate(vcpu);
2092 vtc->vtc_status |= VTCS_FPU_RESTORED;
2093 }
2094
2095 if (ops->vmrestorectx != NULL) {
2096 ops->vmrestorectx(vm->cookie, vcpuid);
2097 }
2098
2099 }
2100
2101 /*
2102 * If we're in removectx(), we might still have state to tidy up.
2103 */
2104 static void
2105 vmm_freectx(void *arg, int isexec)
2106 {
2107 vmm_savectx(arg);
2108 }
2109
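/*
 * Perform any actions requested by userspace for this entry (via the vm_entry
 * command): either proceed as-is, discard in-progress instruction emulation,
 * or fulfill the MMIO/in-out portion of an emulation which previously
 * required a trip out to userspace, completing it before re-entry.
 */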
2110 static int
2111 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2112 struct vm_exit *vme)
2113 {
2114 struct vcpu *vcpu;
2115 struct vie *vie;
2116 int err;
2117
2118 vcpu = &vm->vcpu[vcpuid];
2119 vie = vcpu->vie_ctx;
2120 err = 0;
2121
2122 switch (entry->cmd) {
2123 case VEC_DEFAULT:
2124 return (0);
2125 case VEC_DISCARD_INSTR:
2126 vie_reset(vie);
2127 return (0);
2128 case VEC_FULFILL_MMIO:
2129 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2130 if (err == 0) {
2131 err = vie_emulate_mmio(vie, vm, vcpuid);
2132 if (err == 0) {
2133 vie_advance_pc(vie, &vcpu->nextrip);
2134 } else if (err < 0) {
2135 vie_exitinfo(vie, vme);
2136 } else if (err == EAGAIN) {
2137 /*
2138 * Clear the instruction emulation state in
2139 * order to re-enter VM context and continue
2140 * this 'rep <instruction>'
2141 */
2142 vie_reset(vie);
2143 err = 0;
2144 }
2145 }
2146 break;
2147 case VEC_FULFILL_INOUT:
2148 err = vie_fulfill_inout(vie, &entry->u.inout);
2149 if (err == 0) {
2150 err = vie_emulate_inout(vie, vm, vcpuid);
2151 if (err == 0) {
2152 vie_advance_pc(vie, &vcpu->nextrip);
2153 } else if (err < 0) {
2154 vie_exitinfo(vie, vme);
2155 } else if (err == EAGAIN) {
2156 /*
2157 * Clear the instruction emulation state in
2158 * order to re-enter VM context and continue
2159 * this 'rep ins/outs'
2160 */
2161 vie_reset(vie);
2162 err = 0;
2163 }
2164 }
2165 break;
2166 default:
2167 return (EINVAL);
2168 }
2169 return (err);
2170 }
2171
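/*
 * Checks made at the top of every pass through the vm_run() loop: if
 * instruction emulation still has needs which userspace has not fulfilled,
 * emit the corresponding exit information and bail back out.
 */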
2172 static int
2173 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2174 {
2175 struct vie *vie;
2176
2177 vie = vm->vcpu[vcpuid].vie_ctx;
2178
2179 if (vie_pending(vie)) {
2180 /*
2181 * Userspace has not fulfilled the pending needs of the
2182 * instruction emulation, so bail back out.
2183 */
2184 vie_exitinfo(vie, vme);
2185 return (-1);
2186 }
2187
2188 return (0);
2189 }
2190
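/*
 * Run a vCPU: apply entry actions, localize resources, enter guest context
 * via VMRUN() within a critical section, and service in-kernel exit handlers
 * until an exit (or error) requires attention from userspace.
 */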
2191 int
2192 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2193 {
2194 int error;
2195 struct vcpu *vcpu;
2196 struct vm_exit *vme;
2197 bool intr_disabled;
2198 pmap_t pmap;
2199 vm_thread_ctx_t vtc;
2200 int affinity_type = CPU_CURRENT;
2201
2202 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2203 return (EINVAL);
2204 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2205 return (EINVAL);
2206 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2207 return (EINVAL);
2208
2209 pmap = vmspace_pmap(vm->vmspace);
2210 vcpu = &vm->vcpu[vcpuid];
2211 vme = &vcpu->exitinfo;
2212
2213 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2214
2215 vtc.vtc_vm = vm;
2216 vtc.vtc_vcpuid = vcpuid;
2217 vtc.vtc_status = 0;
2218 installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2219 NULL, vmm_freectx, NULL);
2220
2221 error = vm_entry_actions(vm, vcpuid, entry, vme);
2222 if (error != 0) {
2223 goto exit;
2224 }
2225
2226 restart:
2227 error = vm_loop_checks(vm, vcpuid, vme);
2228 if (error != 0) {
2229 goto exit;
2230 }
2231
2232 thread_affinity_set(curthread, affinity_type);
2233 /*
2234 * Resource localization should happen after the CPU affinity for the
2235 * thread has been set to ensure that access from restricted contexts,
2236 * such as VMX-accelerated APIC operations, can occur without inducing
2237 * cyclic cross-calls.
2238 *
2239 * This must be done prior to disabling kpreempt via critical_enter().
2240 */
2241 vm_localize_resources(vm, vcpu);
2242 affinity_type = CPU_CURRENT;
2243 critical_enter();
2244
2245 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2246 ("vm_run: absurd pm_active"));
2247
2248 /* Force a trip through update_sregs to reload %fs/%gs and friends */
2249 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2250
2251 if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2252 restore_guest_fpustate(vcpu);
2253 vtc.vtc_status |= VTCS_FPU_RESTORED;
2254 }
2255 vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2256
2257 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2258 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2259 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2260
2261 /*
2262 * Once clear of the delicate contexts comprising the VM_RUN handler,
2263 * thread CPU affinity can be loosened while other processing occurs.
2264 */
2265 vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2266 thread_affinity_clear(curthread);
2267 critical_exit();
2268
2269 if (error != 0) {
2270 /* Communicate out any error from VMRUN() above */
2271 goto exit;
2272 }
2273
2274 vcpu->nextrip = vme->rip + vme->inst_length;
2275 switch (vme->exitcode) {
2276 case VM_EXITCODE_REQIDLE:
2277 error = vm_handle_reqidle(vm, vcpuid);
2278 break;
2279 case VM_EXITCODE_RUN_STATE:
2280 error = vm_handle_run_state(vm, vcpuid);
2281 break;
2282 case VM_EXITCODE_SUSPENDED:
2283 error = vm_handle_suspend(vm, vcpuid);
2284 break;
2285 case VM_EXITCODE_IOAPIC_EOI:
2286 vioapic_process_eoi(vm, vcpuid,
2287 vme->u.ioapic_eoi.vector);
2288 break;
2289 case VM_EXITCODE_HLT:
2290 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2291 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2292 break;
2293 case VM_EXITCODE_PAGING:
2294 error = vm_handle_paging(vm, vcpuid);
2295 break;
2296 case VM_EXITCODE_MMIO_EMUL:
2297 error = vm_handle_mmio_emul(vm, vcpuid);
2298 break;
2299 case VM_EXITCODE_INOUT:
2300 error = vm_handle_inout(vm, vcpuid, vme);
2301 break;
2302 case VM_EXITCODE_INST_EMUL:
2303 error = vm_handle_inst_emul(vm, vcpuid);
2304 break;
2305 case VM_EXITCODE_MONITOR:
2306 case VM_EXITCODE_MWAIT:
2307 case VM_EXITCODE_VMINSN:
2308 vm_inject_ud(vm, vcpuid);
2309 break;
2310 case VM_EXITCODE_RDMSR:
2311 error = vm_handle_rdmsr(vm, vcpuid, vme);
2312 break;
2313 case VM_EXITCODE_WRMSR:
2314 error = vm_handle_wrmsr(vm, vcpuid, vme);
2315 break;
2316 case VM_EXITCODE_HT:
2317 affinity_type = CPU_BEST;
2318 break;
2319 case VM_EXITCODE_MTRAP:
2320 vm_suspend_cpu(vm, vcpuid);
2321 error = -1;
2322 break;
2323 default:
2324 /* handled in userland */
2325 error = -1;
2326 break;
2327 }
2328
2329 if (error == 0) {
2330 /* VM exit conditions handled in-kernel, continue running */
2331 goto restart;
2332 }
2333
2334 exit:
2335 removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2336 NULL, vmm_freectx);
2337
2338 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2339
2340 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2341 return (error);
2342 }
2343
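/*
 * Arrange for the current instruction to be re-executed when the vCPU next
 * enters guest context, using whichever mechanism matches its current state.
 */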
2344 int
2345 vm_restart_instruction(void *arg, int vcpuid)
2346 {
2347 struct vm *vm;
2348 struct vcpu *vcpu;
2349 enum vcpu_state state;
2350 uint64_t rip;
2351 int error;
2352
2353 vm = arg;
2354 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2355 return (EINVAL);
2356
2357 vcpu = &vm->vcpu[vcpuid];
2358 state = vcpu_get_state(vm, vcpuid, NULL);
2359 if (state == VCPU_RUNNING) {
2360 /*
2361 * When a vcpu is "running" the next instruction is determined
2362 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2363 * Thus setting 'inst_length' to zero will cause the current
2364 * instruction to be restarted.
2365 */
2366 vcpu->exitinfo.inst_length = 0;
2367 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2368 "setting inst_length to zero", vcpu->exitinfo.rip);
2369 } else if (state == VCPU_FROZEN) {
2370 /*
2371 * When a vcpu is "frozen" it is outside the critical section
2372 * around VMRUN() and 'nextrip' points to the next instruction.
2373 * Thus instruction restart is achieved by setting 'nextrip'
2374 * to the vcpu's %rip.
2375 */
2376 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2377 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2378 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2379 "nextrip from %lx to %lx", vcpu->nextrip, rip);
2380 vcpu->nextrip = rip;
2381 } else {
2382 panic("%s: invalid state %d", __func__, state);
2383 }
2384 return (0);
2385 }
2386
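/*
 * Record event information ("intinfo") which was pending at the time of a VM
 * exit so that it can be considered for injection on a later entry via
 * vm_entry_intinfo().
 */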
2387 int
2388 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2389 {
2390 struct vcpu *vcpu;
2391 int type, vector;
2392
2393 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2394 return (EINVAL);
2395
2396 vcpu = &vm->vcpu[vcpuid];
2397
2398 if (info & VM_INTINFO_VALID) {
2399 type = info & VM_INTINFO_TYPE;
2400 vector = info & 0xff;
2401 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2402 return (EINVAL);
2403 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2404 return (EINVAL);
2405 if (info & VM_INTINFO_RSVD)
2406 return (EINVAL);
2407 } else {
2408 info = 0;
2409 }
2410 VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2411 vcpu->exitintinfo = info;
2412 return (0);
2413 }
2414
2415 enum exc_class {
2416 EXC_BENIGN,
2417 EXC_CONTRIBUTORY,
2418 EXC_PAGEFAULT
2419 };
2420
2421 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */
2422
2423 static enum exc_class
2424 exception_class(uint64_t info)
2425 {
2426 int type, vector;
2427
2428 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2429 type = info & VM_INTINFO_TYPE;
2430 vector = info & 0xff;
2431
2432 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2433 switch (type) {
2434 case VM_INTINFO_HWINTR:
2435 case VM_INTINFO_SWINTR:
2436 case VM_INTINFO_NMI:
2437 return (EXC_BENIGN);
2438 default:
2439 /*
2440 * Hardware exception.
2441 *
2442 * SVM and VT-x use identical type values to represent NMI,
2443 * hardware interrupt and software interrupt.
2444 *
2445 * SVM uses type '3' for all exceptions. VT-x uses type '3'
* for exceptions except #BP and #OF, which are delivered with the
* software-exception type '6'. Therefore we don't check for explicit
2448 * values of 'type' to classify 'intinfo' into a hardware
2449 * exception.
2450 */
2451 break;
2452 }
2453
2454 switch (vector) {
2455 case IDT_PF:
2456 case IDT_VE:
2457 return (EXC_PAGEFAULT);
2458 case IDT_DE:
2459 case IDT_TS:
2460 case IDT_NP:
2461 case IDT_SS:
2462 case IDT_GP:
2463 return (EXC_CONTRIBUTORY);
2464 default:
2465 return (EXC_BENIGN);
2466 }
2467 }
2468
2469 static int
2470 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2471 uint64_t *retinfo)
2472 {
2473 enum exc_class exc1, exc2;
2474 int type1, vector1;
2475
2476 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2477 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2478
2479 /*
2480 * If an exception occurs while attempting to call the double-fault
2481 * handler the processor enters shutdown mode (aka triple fault).
2482 */
2483 type1 = info1 & VM_INTINFO_TYPE;
2484 vector1 = info1 & 0xff;
2485 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2486 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2487 info1, info2);
2488 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2489 *retinfo = 0;
2490 return (0);
2491 }
2492
2493 /*
2494 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2495 */
2496 exc1 = exception_class(info1);
2497 exc2 = exception_class(info2);
2498 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2499 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2500 /* Convert nested fault into a double fault. */
2501 *retinfo = IDT_DF;
2502 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2503 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2504 } else {
2505 /* Handle exceptions serially */
2506 *retinfo = info2;
2507 }
2508 return (1);
2509 }
2510
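/*
 * Express a pending exception on the vCPU in its "intinfo" form: vector,
 * hardware-exception type, and error code when one is valid.
 */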
2511 static uint64_t
2512 vcpu_exception_intinfo(struct vcpu *vcpu)
2513 {
2514 uint64_t info = 0;
2515
2516 if (vcpu->exception_pending) {
2517 info = vcpu->exc_vector & 0xff;
2518 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2519 if (vcpu->exc_errcode_valid) {
2520 info |= VM_INTINFO_DEL_ERRCODE;
2521 info |= (uint64_t)vcpu->exc_errcode << 32;
2522 }
2523 }
2524 return (info);
2525 }
2526
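/*
 * Gather the events which must be injected on the next VM entry: any intinfo
 * recorded at exit time and any pending exception are consumed here, and if
 * both are present they are merged according to the nested-fault rules.
 * Returns non-zero when *retinfo holds a valid event.
 */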
2527 int
2528 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2529 {
2530 struct vcpu *vcpu;
2531 uint64_t info1, info2;
2532 int valid;
2533
2534 KASSERT(vcpuid >= 0 &&
2535 vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2536
2537 vcpu = &vm->vcpu[vcpuid];
2538
2539 info1 = vcpu->exitintinfo;
2540 vcpu->exitintinfo = 0;
2541
2542 info2 = 0;
2543 if (vcpu->exception_pending) {
2544 info2 = vcpu_exception_intinfo(vcpu);
2545 vcpu->exception_pending = 0;
2546 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2547 vcpu->exc_vector, info2);
2548 }
2549
2550 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2551 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2552 } else if (info1 & VM_INTINFO_VALID) {
2553 *retinfo = info1;
2554 valid = 1;
2555 } else if (info2 & VM_INTINFO_VALID) {
2556 *retinfo = info2;
2557 valid = 1;
2558 } else {
2559 valid = 0;
2560 }
2561
2562 if (valid) {
2563 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2564 "retinfo(%lx)", __func__, info1, info2, *retinfo);
2565 }
2566
2567 return (valid);
2568 }
2569
2570 int
2571 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2572 {
2573 struct vcpu *vcpu;
2574
2575 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2576 return (EINVAL);
2577
2578 vcpu = &vm->vcpu[vcpuid];
2579 *info1 = vcpu->exitintinfo;
2580 *info2 = vcpu_exception_intinfo(vcpu);
2581 return (0);
2582 }
2583
2584 int
2585 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2586 uint32_t errcode, int restart_instruction)
2587 {
2588 struct vcpu *vcpu;
2589 uint64_t regval;
2590 int error;
2591
2592 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2593 return (EINVAL);
2594
2595 if (vector < 0 || vector >= 32)
2596 return (EINVAL);
2597
2598 /*
2599 * NMIs (which bear an exception vector of 2) are to be injected via
2600 * their own specialized path using vm_inject_nmi().
2601 */
2602 if (vector == 2) {
2603 return (EINVAL);
2604 }
2605
2606 /*
2607 * A double fault exception should never be injected directly into
2608 * the guest. It is a derived exception that results from specific
2609 * combinations of nested faults.
2610 */
2611 if (vector == IDT_DF)
2612 return (EINVAL);
2613
2614 vcpu = &vm->vcpu[vcpuid];
2615
2616 if (vcpu->exception_pending) {
2617 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2618 "pending exception %d", vector, vcpu->exc_vector);
2619 return (EBUSY);
2620 }
2621
2622 if (errcode_valid) {
2623 /*
2624 * Exceptions don't deliver an error code in real mode.
2625 */
error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2627 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2628 if (!(regval & CR0_PE))
2629 errcode_valid = 0;
2630 }
2631
2632 /*
2633 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2634 *
2635 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2636 * one instruction or incurs an exception.
2637 */
2638 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2639 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2640 __func__, error));
2641
2642 if (restart_instruction)
2643 vm_restart_instruction(vm, vcpuid);
2644
2645 vcpu->exception_pending = 1;
2646 vcpu->exc_vector = vector;
2647 vcpu->exc_errcode = errcode;
2648 vcpu->exc_errcode_valid = errcode_valid;
2649 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2650 return (0);
2651 }
2652
2653 void
2654 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2655 int errcode)
2656 {
2657 int error;
2658
2659 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2660 errcode, 1);
2661 KASSERT(error == 0, ("vm_inject_exception error %d", error));
2662 }
2663
2664 void
2665 vm_inject_ud(struct vm *vm, int vcpuid)
2666 {
2667 vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2668 }
2669
2670 void
2671 vm_inject_gp(struct vm *vm, int vcpuid)
2672 {
2673 vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2674 }
2675
2676 void
2677 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2678 {
2679 vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2680 }
2681
2682 void
2683 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2684 {
2685 vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2686 }
2687
2688 void
2689 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2690 {
2691 int error;
2692
2693 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2694 error_code, cr2);
2695
2696 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2697 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2698
2699 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2700 }
2701
2702 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2703
2704 int
2705 vm_inject_nmi(struct vm *vm, int vcpuid)
2706 {
2707 struct vcpu *vcpu;
2708
2709 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2710 return (EINVAL);
2711
2712 vcpu = &vm->vcpu[vcpuid];
2713
2714 vcpu->nmi_pending = 1;
2715 vcpu_notify_event(vm, vcpuid);
2716 return (0);
2717 }
2718
2719 int
2720 vm_nmi_pending(struct vm *vm, int vcpuid)
2721 {
2722 struct vcpu *vcpu;
2723
2724 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2725 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2726
2727 vcpu = &vm->vcpu[vcpuid];
2728
2729 return (vcpu->nmi_pending);
2730 }
2731
2732 void
2733 vm_nmi_clear(struct vm *vm, int vcpuid)
2734 {
2735 struct vcpu *vcpu;
2736
2737 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2738 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2739
2740 vcpu = &vm->vcpu[vcpuid];
2741
2742 if (vcpu->nmi_pending == 0)
2743 panic("vm_nmi_clear: inconsistent nmi_pending state");
2744
2745 vcpu->nmi_pending = 0;
2746 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2747 }
2748
2749 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2750
2751 int
2752 vm_inject_extint(struct vm *vm, int vcpuid)
2753 {
2754 struct vcpu *vcpu;
2755
2756 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2757 return (EINVAL);
2758
2759 vcpu = &vm->vcpu[vcpuid];
2760
2761 vcpu->extint_pending = 1;
2762 vcpu_notify_event(vm, vcpuid);
2763 return (0);
2764 }
2765
2766 int
2767 vm_extint_pending(struct vm *vm, int vcpuid)
2768 {
2769 struct vcpu *vcpu;
2770
2771 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2772 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2773
2774 vcpu = &vm->vcpu[vcpuid];
2775
2776 return (vcpu->extint_pending);
2777 }
2778
2779 void
2780 vm_extint_clear(struct vm *vm, int vcpuid)
2781 {
2782 struct vcpu *vcpu;
2783
2784 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2785 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2786
2787 vcpu = &vm->vcpu[vcpuid];
2788
2789 if (vcpu->extint_pending == 0)
2790 panic("vm_extint_clear: inconsistent extint_pending state");
2791
2792 vcpu->extint_pending = 0;
2793 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2794 }
2795
2796 int
2797 vm_inject_init(struct vm *vm, int vcpuid)
2798 {
2799 struct vcpu *vcpu;
2800
2801 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2802 return (EINVAL);
2803
2804 vcpu = &vm->vcpu[vcpuid];
2805 vcpu_lock(vcpu);
2806 vcpu->run_state |= VRS_PEND_INIT;
2807 /*
2808 * As part of queuing the INIT request, clear any pending SIPI. It
2809 * would not otherwise survive across the reset of the vCPU when it
2810 * undergoes the requested INIT. We would not want it to linger when it
* could be mistaken for a subsequent (after the INIT) SIPI request.
2812 */
2813 vcpu->run_state &= ~VRS_PEND_SIPI;
2814 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2815
2816 vcpu_unlock(vcpu);
2817 return (0);
2818 }
2819
2820 int
2821 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2822 {
2823 struct vcpu *vcpu;
2824
2825 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2826 return (EINVAL);
2827
2828 vcpu = &vm->vcpu[vcpuid];
2829 vcpu_lock(vcpu);
2830 vcpu->run_state |= VRS_PEND_SIPI;
2831 vcpu->sipi_vector = vector;
2832 /* SIPI is only actionable if the CPU is waiting in INIT state */
2833 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2834 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2835 }
2836 vcpu_unlock(vcpu);
2837 return (0);
2838 }
2839
2840 bool
2841 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2842 {
2843 struct vcpu *vcpu;
2844
2845 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2846 vcpu = &vm->vcpu[vcpuid];
2847
2848 /* Of interest: vCPU not in running state or with pending INIT */
2849 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2850 }
2851
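/*
 * Reset the architectural state of a vCPU to its power-on values.  When
 * 'init_only' is set, state which survives an INIT IPI (such as %xcr0 and the
 * FPU contents) is left untouched.
 */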
2852 int
2853 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2854 {
2855 struct seg_desc desc;
2856 const enum vm_reg_name clear_regs[] = {
2857 VM_REG_GUEST_CR2,
2858 VM_REG_GUEST_CR3,
2859 VM_REG_GUEST_CR4,
2860 VM_REG_GUEST_RAX,
2861 VM_REG_GUEST_RBX,
2862 VM_REG_GUEST_RCX,
2863 VM_REG_GUEST_RSI,
2864 VM_REG_GUEST_RDI,
2865 VM_REG_GUEST_RBP,
2866 VM_REG_GUEST_RSP,
2867 VM_REG_GUEST_R8,
2868 VM_REG_GUEST_R9,
2869 VM_REG_GUEST_R10,
2870 VM_REG_GUEST_R11,
2871 VM_REG_GUEST_R12,
2872 VM_REG_GUEST_R13,
2873 VM_REG_GUEST_R14,
2874 VM_REG_GUEST_R15,
2875 VM_REG_GUEST_DR0,
2876 VM_REG_GUEST_DR1,
2877 VM_REG_GUEST_DR2,
2878 VM_REG_GUEST_DR3,
2879 VM_REG_GUEST_EFER,
2880 };
2881 const enum vm_reg_name data_segs[] = {
2882 VM_REG_GUEST_SS,
2883 VM_REG_GUEST_DS,
2884 VM_REG_GUEST_ES,
2885 VM_REG_GUEST_FS,
2886 VM_REG_GUEST_GS,
2887 };
2888 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2889
2890 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2891 return (EINVAL);
2892
2893 for (uint_t i = 0; i < nitems(clear_regs); i++) {
2894 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2895 }
2896
2897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2898 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2899 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2900
2901 /*
2902 * The prescribed contents of %rdx differ slightly between the Intel and
2903 * AMD architectural definitions. The former expects the Extended Model
2904 * in bits 16-19 where the latter expects all the Family, Model, and
* Stepping to be there. Common boot ROMs appear to disregard this
* anyway, so we stick with a compromise value similar to what is
2907 * spelled out in the Intel SDM.
2908 */
2909 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2910
2911 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2912 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2913
2914 /* CS: Present, R/W, Accessed */
2915 desc.access = 0x0093;
2916 desc.base = 0xffff0000;
2917 desc.limit = 0xffff;
2918 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2919 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2920
2921 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2922 desc.access = 0x0093;
2923 desc.base = 0;
2924 desc.limit = 0xffff;
2925 for (uint_t i = 0; i < nitems(data_segs); i++) {
2926 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2927 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2928 }
2929
2930 /* GDTR, IDTR */
2931 desc.base = 0;
2932 desc.limit = 0xffff;
2933 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2934 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2935
2936 /* LDTR: Present, LDT */
2937 desc.access = 0x0082;
2938 desc.base = 0;
2939 desc.limit = 0xffff;
2940 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2941 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2942
2943 /* TR: Present, 32-bit TSS */
2944 desc.access = 0x008b;
2945 desc.base = 0;
2946 desc.limit = 0xffff;
2947 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2948 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2949
2950 vlapic_reset(vm_lapic(vm, vcpuid));
2951
2952 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2953
2954 vcpu->exitintinfo = 0;
2955 vcpu->exception_pending = 0;
2956 vcpu->nmi_pending = 0;
2957 vcpu->extint_pending = 0;
2958
2959 /*
2960 * A CPU reset caused by power-on or system reset clears more state than
* one which is triggered by an INIT IPI.
2962 */
2963 if (!init_only) {
2964 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2965 fpu_save_area_reset(vcpu->guestfpu);
2966
2967 /* XXX: clear MSRs and other pieces */
2968 }
2969
2970 return (0);
2971 }
2972
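/*
 * Direct a vCPU to the real-mode entry point indicated by a SIPI vector: %cs
 * is loaded with (vector << 8), its base with (vector << 12), and %rip is
 * cleared so execution begins at that page boundary.
 */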
2973 static int
2974 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2975 {
2976 struct seg_desc desc;
2977
2978 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2979 return (EINVAL);
2980
2981 /* CS: Present, R/W, Accessed */
2982 desc.access = 0x0093;
2983 desc.base = (uint64_t)vector << 12;
2984 desc.limit = 0xffff;
2985 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2986 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2987 (uint64_t)vector << 8));
2988
2989 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2990
2991 return (0);
2992 }
2993
2994 int
2995 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2996 {
2997 if (vcpu < 0 || vcpu >= vm->maxcpus)
2998 return (EINVAL);
2999
3000 if (type < 0 || type >= VM_CAP_MAX)
3001 return (EINVAL);
3002
3003 return (VMGETCAP(vm->cookie, vcpu, type, retval));
3004 }
3005
3006 int
3007 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3008 {
3009 if (vcpu < 0 || vcpu >= vm->maxcpus)
3010 return (EINVAL);
3011
3012 if (type < 0 || type >= VM_CAP_MAX)
3013 return (EINVAL);
3014
3015 return (VMSETCAP(vm->cookie, vcpu, type, val));
3016 }
3017
3018 struct vlapic *
3019 vm_lapic(struct vm *vm, int cpu)
3020 {
3021 return (vm->vcpu[cpu].vlapic);
3022 }
3023
3024 struct vioapic *
3025 vm_ioapic(struct vm *vm)
3026 {
3027
3028 return (vm->vioapic);
3029 }
3030
3031 struct vhpet *
3032 vm_hpet(struct vm *vm)
3033 {
3034
3035 return (vm->vhpet);
3036 }
3037
3038 void *
3039 vm_iommu_domain(struct vm *vm)
3040 {
3041
3042 return (vm->iommu);
3043 }
3044
3045 int
3046 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3047 bool from_idle)
3048 {
3049 int error;
3050 struct vcpu *vcpu;
3051
3052 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3053 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3054
3055 vcpu = &vm->vcpu[vcpuid];
3056
3057 vcpu_lock(vcpu);
3058 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3059 vcpu_unlock(vcpu);
3060
3061 return (error);
3062 }
3063
3064 enum vcpu_state
3065 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3066 {
3067 struct vcpu *vcpu;
3068 enum vcpu_state state;
3069
3070 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3071 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3072
3073 vcpu = &vm->vcpu[vcpuid];
3074
3075 vcpu_lock(vcpu);
3076 state = vcpu->state;
3077 if (hostcpu != NULL)
3078 *hostcpu = vcpu->hostcpu;
3079 vcpu_unlock(vcpu);
3080
3081 return (state);
3082 }
3083
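/*
 * Compute the TSC offset for a vCPU by combining the boot-time offset of the
 * VM with the per-vCPU adjustment and, if 'phys_adj' is set, the tick delta
 * of the current physical CPU.
 */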
3084 uint64_t
3085 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3086 {
3087 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3088
3089 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3090
3091 if (phys_adj) {
3092 /* Include any offset for the current physical CPU too */
3093 extern hrtime_t tsc_gethrtime_tick_delta(void);
3094 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3095 }
3096
3097 return (vcpu_off);
3098 }
3099
3100 int
3101 vm_activate_cpu(struct vm *vm, int vcpuid)
3102 {
3103
3104 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3105 return (EINVAL);
3106
3107 if (CPU_ISSET(vcpuid, &vm->active_cpus))
3108 return (EBUSY);
3109
3110 VCPU_CTR0(vm, vcpuid, "activated");
3111 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3112 return (0);
3113 }
3114
3115 int
3116 vm_suspend_cpu(struct vm *vm, int vcpuid)
3117 {
3118 int i;
3119
3120 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3121 return (EINVAL);
3122
3123 if (vcpuid == -1) {
3124 vm->debug_cpus = vm->active_cpus;
3125 for (i = 0; i < vm->maxcpus; i++) {
3126 if (CPU_ISSET(i, &vm->active_cpus))
3127 vcpu_notify_event(vm, i);
3128 }
3129 } else {
3130 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3131 return (EINVAL);
3132
3133 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3134 vcpu_notify_event(vm, vcpuid);
3135 }
3136 return (0);
3137 }
3138
3139 int
3140 vm_resume_cpu(struct vm *vm, int vcpuid)
3141 {
3142
3143 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3144 return (EINVAL);
3145
3146 if (vcpuid == -1) {
3147 CPU_ZERO(&vm->debug_cpus);
3148 } else {
3149 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3150 return (EINVAL);
3151
3152 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3153 }
3154 return (0);
3155 }
3156
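/*
 * Check for conditions which require the vCPU to bail out of guest context:
 * VM suspension, a reqidle request, a pending AST, or the vCPU being held for
 * debug.  The vm_exit structure is populated according to whether the check
 * is made at VM entry or from some other context.
 */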
3157 static bool
3158 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3159 uint64_t entry_rip)
3160 {
3161 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3162 struct vm_exit *vme = &vcpu->exitinfo;
3163 bool bail = false;
3164
3165 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3166
3167 if (vm->suspend) {
3168 if (on_entry) {
3169 VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3170 vm->suspend < VM_SUSPEND_LAST);
3171
3172 vme->exitcode = VM_EXITCODE_SUSPENDED;
3173 vme->u.suspended.how = vm->suspend;
3174 } else {
3175 /*
3176 * Handling VM suspend is complicated, so if that
3177 * condition is detected outside of VM-entry itself,
3178 * just emit a BOGUS exitcode so we take a lap to pick
3179 * up the event during an entry and are directed into
3180 * the vm_handle_suspend() logic.
3181 */
3182 vme->exitcode = VM_EXITCODE_BOGUS;
3183 }
3184 bail = true;
3185 }
3186 if (vcpu->reqidle) {
3187 vme->exitcode = VM_EXITCODE_REQIDLE;
3188 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3189
3190 if (!on_entry) {
3191 /*
3192 * A reqidle request detected outside of VM-entry can be
3193 * handled directly by clearing the request (and taking
3194 * a lap to userspace).
3195 */
3196 vcpu_assert_locked(vcpu);
3197 vcpu->reqidle = 0;
3198 }
3199 bail = true;
3200 }
3201 if (vcpu_should_yield(vm, vcpuid)) {
3202 vme->exitcode = VM_EXITCODE_BOGUS;
3203 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3204 bail = true;
3205 }
3206 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3207 vme->exitcode = VM_EXITCODE_DEBUG;
3208 bail = true;
3209 }
3210
3211 if (bail) {
3212 if (on_entry) {
3213 /*
3214 * If bailing out during VM-entry, the current %rip must
3215 * be recorded in the exitinfo.
3216 */
3217 vme->rip = entry_rip;
3218 }
3219 vme->inst_length = 0;
3220 }
3221 return (bail);
3222 }
3223
3224 static bool
3225 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3226 {
3227 /*
* Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
* wait-for-SIPI) expect that %rip is already populated in the vm_exit
* structure, so only the exitcode needs to be modified.
3231 */
3232 return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3233 }
3234
3235 bool
3236 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3237 {
3238 /*
3239 * Bail-out checks done as part of VM entry require an updated %rip to
3240 * populate the vm_exit struct if any of the conditions of interest are
3241 * matched in the check.
3242 */
3243 return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3244 }
3245
3246 cpuset_t
3247 vm_active_cpus(struct vm *vm)
3248 {
3249
3250 return (vm->active_cpus);
3251 }
3252
3253 cpuset_t
3254 vm_debug_cpus(struct vm *vm)
3255 {
3256
3257 return (vm->debug_cpus);
3258 }
3259
3260 cpuset_t
3261 vm_suspended_cpus(struct vm *vm)
3262 {
3263
3264 return (vm->suspended_cpus);
3265 }
3266
3267 void *
3268 vcpu_stats(struct vm *vm, int vcpuid)
3269 {
3270
3271 return (vm->vcpu[vcpuid].stats);
3272 }
3273
3274 int
3275 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3276 {
3277 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3278 return (EINVAL);
3279
3280 *state = vm->vcpu[vcpuid].x2apic_state;
3281
3282 return (0);
3283 }
3284
3285 int
3286 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3287 {
3288 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3289 return (EINVAL);
3290
3291 if (state >= X2APIC_STATE_LAST)
3292 return (EINVAL);
3293
3294 vm->vcpu[vcpuid].x2apic_state = state;
3295
3296 vlapic_set_x2apic_state(vm, vcpuid, state);
3297
3298 return (0);
3299 }
3300
3301 /*
3302 * This function is called to ensure that a vcpu "sees" a pending event
3303 * as soon as possible:
3304 * - If the vcpu thread is sleeping then it is woken up.
3305 * - If the vcpu is running on a different host_cpu then an IPI will be directed
3306 * to the host_cpu to cause the vcpu to trap into the hypervisor.
3307 */
3308 static void
3309 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3310 {
3311 int hostcpu;
3312
ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3314
3315 hostcpu = vcpu->hostcpu;
3316 if (vcpu->state == VCPU_RUNNING) {
3317 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3318 if (hostcpu != curcpu) {
3319 if (ntype == VCPU_NOTIFY_APIC) {
3320 vlapic_post_intr(vcpu->vlapic, hostcpu,
3321 vmm_ipinum);
3322 } else {
3323 ipi_cpu(hostcpu, vmm_ipinum);
3324 }
3325 } else {
3326 /*
3327 * If the 'vcpu' is running on 'curcpu' then it must
3328 * be sending a notification to itself (e.g. SELF_IPI).
3329 * The pending event will be picked up when the vcpu
3330 * transitions back to guest context.
3331 */
3332 }
3333 } else {
3334 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3335 "with hostcpu %d", vcpu->state, hostcpu));
3336 if (vcpu->state == VCPU_SLEEPING) {
3337 cv_signal(&vcpu->vcpu_cv);
3338 }
3339 }
3340 }
3341
3342 void
3343 vcpu_notify_event(struct vm *vm, int vcpuid)
3344 {
3345 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3346
3347 vcpu_lock(vcpu);
3348 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3349 vcpu_unlock(vcpu);
3350 }
3351
3352 void
3353 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3354 {
3355 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3356
3357 if (ntype == VCPU_NOTIFY_NONE) {
3358 return;
3359 }
3360
3361 vcpu_lock(vcpu);
3362 vcpu_notify_event_locked(vcpu, ntype);
3363 vcpu_unlock(vcpu);
3364 }
3365
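/*
 * Move a vCPU into a new microstate, charging the time elapsed since the last
 * transition to the state being exited.
 */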
3366 void
3367 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3368 {
3369 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3370 hrtime_t now = gethrtime();
3371
3372 ASSERT3U(ustate, !=, vcpu->ustate);
3373 ASSERT3S(ustate, <, VU_MAX);
3374 ASSERT3S(ustate, >=, VU_INIT);
3375
3376 hrtime_t delta = now - vcpu->ustate_when;
3377 vcpu->ustate_total[vcpu->ustate] += delta;
3378
3379 membar_producer();
3380
3381 vcpu->ustate_when = now;
3382 vcpu->ustate = ustate;
3383 }
3384
3385 struct vmspace *
3386 vm_get_vmspace(struct vm *vm)
3387 {
3388
3389 return (vm->vmspace);
3390 }
3391
3392 int
3393 vm_apicid2vcpuid(struct vm *vm, int apicid)
3394 {
3395 /*
3396 * XXX apic id is assumed to be numerically identical to vcpu id
3397 */
3398 return (apicid);
3399 }
3400
3401 struct vatpic *
3402 vm_atpic(struct vm *vm)
3403 {
3404 return (vm->vatpic);
3405 }
3406
3407 struct vatpit *
3408 vm_atpit(struct vm *vm)
3409 {
3410 return (vm->vatpit);
3411 }
3412
3413 struct vpmtmr *
3414 vm_pmtmr(struct vm *vm)
3415 {
3416
3417 return (vm->vpmtmr);
3418 }
3419
3420 struct vrtc *
3421 vm_rtc(struct vm *vm)
3422 {
3423
3424 return (vm->vrtc);
3425 }
3426
3427 enum vm_reg_name
3428 vm_segment_name(int seg)
3429 {
3430 static enum vm_reg_name seg_names[] = {
3431 VM_REG_GUEST_ES,
3432 VM_REG_GUEST_CS,
3433 VM_REG_GUEST_SS,
3434 VM_REG_GUEST_DS,
3435 VM_REG_GUEST_FS,
3436 VM_REG_GUEST_GS
3437 };
3438
3439 KASSERT(seg >= 0 && seg < nitems(seg_names),
3440 ("%s: invalid segment encoding %d", __func__, seg));
3441 return (seg_names[seg]);
3442 }
3443
3444 void
3445 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3446 int num_copyinfo)
3447 {
3448 int idx;
3449
3450 for (idx = 0; idx < num_copyinfo; idx++) {
3451 if (copyinfo[idx].cookie != NULL)
3452 vm_gpa_release(copyinfo[idx].cookie);
3453 }
3454 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3455 }
3456
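/*
 * Translate a guest-linear range into a series of held guest-physical
 * segments which can then be accessed with vm_copyin()/vm_copyout() and must
 * be released with vm_copy_teardown().  A sketch of the usual pattern (the
 * PROT_READ value, buffer, and array sizes are illustrative only):
 *
 *	error = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
 *	    copyinfo, ncopyinfo, &fault);
 *	if (error == 0 && fault == 0) {
 *		vm_copyin(vm, vcpuid, copyinfo, buf, len);
 *		vm_copy_teardown(vm, vcpuid, copyinfo, ncopyinfo);
 *	}
 */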
3457 int
3458 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3459 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3460 int num_copyinfo, int *fault)
3461 {
3462 int error, idx, nused;
3463 size_t n, off, remaining;
3464 void *hva, *cookie;
3465 uint64_t gpa;
3466
3467 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3468
3469 nused = 0;
3470 remaining = len;
3471 while (remaining > 0) {
3472 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3473 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3474 if (error || *fault)
3475 return (error);
3476 off = gpa & PAGE_MASK;
3477 n = min(remaining, PAGE_SIZE - off);
3478 copyinfo[nused].gpa = gpa;
3479 copyinfo[nused].len = n;
3480 remaining -= n;
3481 gla += n;
3482 nused++;
3483 }
3484
3485 for (idx = 0; idx < nused; idx++) {
3486 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3487 copyinfo[idx].len, prot, &cookie);
3488 if (hva == NULL)
3489 break;
3490 copyinfo[idx].hva = hva;
3491 copyinfo[idx].cookie = cookie;
3492 }
3493
3494 if (idx != nused) {
3495 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3496 return (EFAULT);
3497 } else {
3498 *fault = 0;
3499 return (0);
3500 }
3501 }
3502
3503 void
3504 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3505 size_t len)
3506 {
3507 char *dst;
3508 int idx;
3509
3510 dst = kaddr;
3511 idx = 0;
3512 while (len > 0) {
3513 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3514 len -= copyinfo[idx].len;
3515 dst += copyinfo[idx].len;
3516 idx++;
3517 }
3518 }
3519
3520 void
3521 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3522 struct vm_copyinfo *copyinfo, size_t len)
3523 {
3524 const char *src;
3525 int idx;
3526
3527 src = kaddr;
3528 idx = 0;
3529 while (len > 0) {
3530 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3531 len -= copyinfo[idx].len;
3532 src += copyinfo[idx].len;
3533 idx++;
3534 }
3535 }
3536
3537 /*
3538 * Return the amount of in-use and wired memory for the VM. Since
* these are global stats, only return the values for vCPU 0.
3540 */
3541 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3542 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3543
3544 static void
3545 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3546 {
3547
3548 if (vcpu == 0) {
3549 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3550 PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3551 }
3552 }
3553
3554 static void
3555 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3556 {
3557
3558 if (vcpu == 0) {
3559 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3560 PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3561 }
3562 }
3563
3564 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3565 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
3566
3567 int
3568 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3569 uint8_t bytes, uint32_t *val)
3570 {
3571 return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3572 }
3573
3574 /*
3575 * bhyve-internal interfaces to attach or detach IO port handlers.
3576 * Must be called with VM write lock held for safety.
3577 */
3578 int
3579 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3580 void **cookie)
3581 {
3582 int err;
3583 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3584 if (err == 0) {
3585 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3586 }
3587 return (err);
3588 }

int
3590 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3591 void **old_arg)
3592 {
3593 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3594 int err;
3595
3596 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3597 if (err == 0) {
3598 *cookie = NULL;
3599 }
3600 return (err);
3601 }
3602
3603 /*
3604 * External driver interfaces to attach or detach IO port handlers.
3605 * Must be called with VM write lock held for safety.
3606 */
3607 int
3608 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3609 void *arg, void **cookie)
3610 {
3611 int err;
3612
3613 if (port == 0) {
3614 return (EINVAL);
3615 }
3616
3617 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3618 if (err == 0) {
3619 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3620 }
3621 return (err);
3622 }

void
3624 vm_ioport_unhook(struct vm *vm, void **cookie)
3625 {
3626 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3627 ioport_handler_t old_func;
3628 void *old_arg;
3629 int err;
3630
3631 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3632
3633 /* ioport-hook-using drivers are expected to be well-behaved */
3634 VERIFY0(err);
3635 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3636
3637 *cookie = NULL;
3638 }
3639
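/*
 * kstat update handler for the per-vCPU kstats: copy the accumulated
 * microstate totals for the vCPU identified in the kstat data.
 */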
3640 int
3641 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3642 {
3643 struct vm *vm = ksp->ks_private;
3644 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3645 const int vcpuid = vvk->vvk_vcpu.value.ui32;
3646 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3647
3648 ASSERT3U(vcpuid, <, VM_MAXCPU);
3649
3650 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3651 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3652 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3653 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3654 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3655 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3656
3657 return (0);
3658 }