13902 Fix for 13717 may break 8-disk raidz2
13915 installctx() blocking allocate causes problems
Portions contributed by: Jerry Jelinek <gjelinek@gmail.com>
Change-Id: I934d69946cec42630fc541fa8c7385b862b69ca2
--- old/usr/src/uts/i86pc/io/vmm/vmm.c
+++ new/usr/src/uts/i86pc/io/vmm/vmm.c
1 1 /*-
2 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 3 *
4 4 * Copyright (c) 2011 NetApp, Inc.
5 5 * All rights reserved.
6 6 *
7 7 * Redistribution and use in source and binary forms, with or without
8 8 * modification, are permitted provided that the following conditions
9 9 * are met:
10 10 * 1. Redistributions of source code must retain the above copyright
11 11 * notice, this list of conditions and the following disclaimer.
12 12 * 2. Redistributions in binary form must reproduce the above copyright
13 13 * notice, this list of conditions and the following disclaimer in the
14 14 * documentation and/or other materials provided with the distribution.
15 15 *
16 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 26 * SUCH DAMAGE.
27 27 *
28 28 * $FreeBSD$
29 29 */
30 30 /*
31 31 * This file and its contents are supplied under the terms of the
32 32 * Common Development and Distribution License ("CDDL"), version 1.0.
33 33 * You may only use this file in accordance with the terms of version
34 34 * 1.0 of the CDDL.
35 35 *
36 36 * A full copy of the text of the CDDL should have accompanied this
37 37 * source. A copy of the CDDL is also available via the Internet at
38 38 * http://www.illumos.org/license/CDDL.
39 39 *
40 40 * Copyright 2015 Pluribus Networks Inc.
41 41 * Copyright 2018 Joyent, Inc.
42 42 * Copyright 2021 Oxide Computer Company
43 43 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
44 44 */
45 45
46 46 #include <sys/cdefs.h>
47 47 __FBSDID("$FreeBSD$");
48 48
49 49 #include <sys/param.h>
50 50 #include <sys/systm.h>
51 51 #include <sys/kernel.h>
52 52 #include <sys/module.h>
53 53 #include <sys/sysctl.h>
54 54 #include <sys/malloc.h>
55 55 #include <sys/pcpu.h>
56 56 #include <sys/lock.h>
57 57 #include <sys/mutex.h>
58 58 #include <sys/proc.h>
59 59 #include <sys/rwlock.h>
60 60 #include <sys/sched.h>
61 61 #include <sys/smp.h>
62 62 #include <sys/systm.h>
63 63
64 64 #include <machine/pcb.h>
65 65 #include <machine/smp.h>
66 66 #include <machine/md_var.h>
67 67 #include <x86/psl.h>
68 68 #include <x86/apicreg.h>
69 69
70 70 #include <machine/specialreg.h>
71 71 #include <machine/vmm.h>
72 72 #include <machine/vmm_dev.h>
73 73 #include <machine/vmparam.h>
74 74 #include <sys/vmm_instruction_emul.h>
75 75 #include <sys/vmm_vm.h>
76 76
77 77 #include "vmm_ioport.h"
78 78 #include "vmm_ktr.h"
79 79 #include "vmm_host.h"
80 80 #include "vmm_mem.h"
81 81 #include "vmm_util.h"
82 82 #include "vatpic.h"
83 83 #include "vatpit.h"
84 84 #include "vhpet.h"
85 85 #include "vioapic.h"
86 86 #include "vlapic.h"
87 87 #include "vpmtmr.h"
88 88 #include "vrtc.h"
89 89 #include "vmm_stat.h"
90 90 #include "vmm_lapic.h"
91 91
92 92 #include "io/ppt.h"
93 93 #include "io/iommu.h"
94 94
95 95 struct vlapic;
96 96
97 97 /*
98 98 * Initialization:
99 99 * (a) allocated when vcpu is created
100 100 * (i) initialized when vcpu is created and when it is reinitialized
101 101 * (o) initialized the first time the vcpu is created
102 102 * (x) initialized before use
103 103 */
104 104 struct vcpu {
105 105 /* (o) protects state, run_state, hostcpu, sipi_vector */
106 106 struct mtx mtx;
107 107
108 108 enum vcpu_state state; /* (o) vcpu state */
109 109 enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */
110 110 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */
111 111 kcondvar_t state_cv; /* (o) IDLE-transition cv */
112 112 int hostcpu; /* (o) vcpu's current host cpu */
113 113 int lastloccpu; /* (o) last host cpu localized to */
114 114 int reqidle; /* (i) request vcpu to idle */
115 115 struct vlapic *vlapic; /* (i) APIC device model */
116 116 enum x2apic_state x2apic_state; /* (i) APIC mode */
117 117 uint64_t exitintinfo; /* (i) events pending at VM exit */
118 118 int nmi_pending; /* (i) NMI pending */
119 119 int extint_pending; /* (i) INTR pending */
120 120 int exception_pending; /* (i) exception pending */
121 121 int exc_vector; /* (x) exception collateral */
122 122 int exc_errcode_valid;
123 123 uint32_t exc_errcode;
124 124 uint8_t sipi_vector; /* (i) SIPI vector */
125 125 struct savefpu *guestfpu; /* (a,i) guest fpu state */
126 126 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
127 127 void *stats; /* (a,i) statistics */
128 128 struct vm_exit exitinfo; /* (x) exit reason and collateral */
129 129 uint64_t nextrip; /* (x) next instruction to execute */
130 130 struct vie *vie_ctx; /* (x) instruction emulation context */
131 131 uint64_t tsc_offset; /* (x) offset from host TSC */
132 132
133 133 enum vcpu_ustate ustate; /* (i) microstate for the vcpu */
134 134 hrtime_t ustate_when; /* (i) time of last ustate change */
135 135 uint64_t ustate_total[VU_MAX]; /* (o) total time spent in ustates */
136 136 };
137 137
138 138 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
139 139 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
140 140 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
141 141 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
142 142 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
143 143
144 144 struct mem_seg {
145 145 size_t len;
146 146 bool sysmem;
147 147 struct vm_object *object;
148 148 };
149 149 #define VM_MAX_MEMSEGS 4
150 150
151 151 struct mem_map {
152 152 vm_paddr_t gpa;
153 153 size_t len;
154 154 vm_ooffset_t segoff;
155 155 int segid;
156 156 int prot;
157 157 int flags;
158 158 };
159 159 #define VM_MAX_MEMMAPS 8
160 160
161 161 /*
162 162 * Initialization:
163 163 * (o) initialized the first time the VM is created
164 164 * (i) initialized when VM is created and when it is reinitialized
165 165 * (x) initialized before use
166 166 */
167 167 struct vm {
168 168 void *cookie; /* (i) cpu-specific data */
169 169 void *iommu; /* (x) iommu-specific data */
170 170 struct vhpet *vhpet; /* (i) virtual HPET */
171 171 struct vioapic *vioapic; /* (i) virtual ioapic */
172 172 struct vatpic *vatpic; /* (i) virtual atpic */
173 173 struct vatpit *vatpit; /* (i) virtual atpit */
174 174 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
175 175 struct vrtc *vrtc; /* (o) virtual RTC */
176 176 volatile cpuset_t active_cpus; /* (i) active vcpus */
177 177 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */
178 178 int suspend; /* (i) stop VM execution */
179 179 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
180 180 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
181 181 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
182 182 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
183 183 struct vmspace *vmspace; /* (o) guest's address space */
184 184 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
185 185 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
186 186 /* The following describe the vm cpu topology */
187 187 uint16_t sockets; /* (o) num of sockets */
188 188 uint16_t cores; /* (o) num of cores/socket */
189 189 uint16_t threads; /* (o) num of threads/core */
190 190 uint16_t maxcpus; /* (o) max pluggable cpus */
191 191 uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */
192 192
193 193 struct ioport_config ioports; /* (o) ioport handling */
194 194 };
195 195
196 196 static int vmm_initialized;
197 197
198 198
199 199 static void
200 200 nullop_panic(void)
201 201 {
202 202 panic("null vmm operation call");
203 203 }
204 204
205 205 /* Do not allow use of an un-set `ops` to do anything but panic */
206 206 static struct vmm_ops vmm_ops_null = {
207 207 .init = (vmm_init_func_t)nullop_panic,
208 208 .cleanup = (vmm_cleanup_func_t)nullop_panic,
209 209 .resume = (vmm_resume_func_t)nullop_panic,
210 210 .vminit = (vmi_init_func_t)nullop_panic,
211 211 .vmrun = (vmi_run_func_t)nullop_panic,
212 212 .vmcleanup = (vmi_cleanup_func_t)nullop_panic,
213 213 .vmgetreg = (vmi_get_register_t)nullop_panic,
214 214 .vmsetreg = (vmi_set_register_t)nullop_panic,
215 215 .vmgetdesc = (vmi_get_desc_t)nullop_panic,
216 216 .vmsetdesc = (vmi_set_desc_t)nullop_panic,
217 217 .vmgetcap = (vmi_get_cap_t)nullop_panic,
218 218 .vmsetcap = (vmi_set_cap_t)nullop_panic,
219 219 .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic,
220 220 .vmspace_free = (vmi_vmspace_free)nullop_panic,
221 221 .vlapic_init = (vmi_vlapic_init)nullop_panic,
222 222 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
223 223 .vmsavectx = (vmi_savectx)nullop_panic,
224 224 .vmrestorectx = (vmi_restorectx)nullop_panic,
225 225 };
226 226
227 227 static struct vmm_ops *ops = &vmm_ops_null;
228 228
229 229 #define VMM_INIT(num) ((*ops->init)(num))
230 230 #define VMM_CLEANUP() ((*ops->cleanup)())
231 231 #define VMM_RESUME() ((*ops->resume)())
232 232
233 233 #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
234 234 #define VMRUN(vmi, vcpu, rip, pmap) \
235 235 ((*ops->vmrun)(vmi, vcpu, rip, pmap))
236 236 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
237 237 #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
238 238 #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
239 239
240 240 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
241 241 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
242 242 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
243 243 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
244 244 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv))
245 245 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val))
246 246 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu))
247 247 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic))
248 248
249 249 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
250 250 #define fpu_stop_emulating() clts()
251 251
252 252 SDT_PROVIDER_DEFINE(vmm);
253 253
254 254 static MALLOC_DEFINE(M_VM, "vm", "vm");
255 255
256 256 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
257 257 NULL);
258 258
259 259 /*
260 260 * Halt the guest if all vcpus are executing a HLT instruction with
261 261 * interrupts disabled.
262 262 */
263 263 static int halt_detection_enabled = 1;
264 264
265 265 /* IPI vector used for vcpu notifications */
266 266 static int vmm_ipinum;
267 267
268 268 /* Trap into hypervisor on all guest exceptions and reflect them back */
269 269 static int trace_guest_exceptions;
270 270
271 271 static void vm_free_memmap(struct vm *vm, int ident);
272 272 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
273 273 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
274 274 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
275 275 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
276 276
277 277 /* Flags for vtc_status */
278 278 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */
279 279 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */
280 280
281 281 typedef struct vm_thread_ctx {
282 282 struct vm *vtc_vm;
283 283 int vtc_vcpuid;
284 284 uint_t vtc_status;
285 285 enum vcpu_ustate vtc_ustate;
286 286 } vm_thread_ctx_t;
287 287
288 288 #ifdef KTR
289 289 static const char *
290 290 vcpu_state2str(enum vcpu_state state)
291 291 {
292 292
293 293 switch (state) {
294 294 case VCPU_IDLE:
295 295 return ("idle");
296 296 case VCPU_FROZEN:
297 297 return ("frozen");
298 298 case VCPU_RUNNING:
299 299 return ("running");
300 300 case VCPU_SLEEPING:
301 301 return ("sleeping");
302 302 default:
303 303 return ("unknown");
304 304 }
305 305 }
306 306 #endif
307 307
308 308 static void
309 309 vcpu_cleanup(struct vm *vm, int i, bool destroy)
310 310 {
311 311 struct vcpu *vcpu = &vm->vcpu[i];
312 312
313 313 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
314 314 if (destroy) {
315 315 vmm_stat_free(vcpu->stats);
316 316 fpu_save_area_free(vcpu->guestfpu);
317 317 vie_free(vcpu->vie_ctx);
318 318 vcpu->vie_ctx = NULL;
319 319 }
320 320 }
321 321
322 322 static void
323 323 vcpu_init(struct vm *vm, int vcpu_id, bool create)
324 324 {
325 325 struct vcpu *vcpu;
326 326
327 327 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
328 328 ("vcpu_init: invalid vcpu %d", vcpu_id));
329 329
330 330 vcpu = &vm->vcpu[vcpu_id];
331 331
332 332 if (create) {
333 333 vcpu_lock_init(vcpu);
334 334 vcpu->state = VCPU_IDLE;
335 335 vcpu->hostcpu = NOCPU;
336 336 vcpu->lastloccpu = NOCPU;
337 337 vcpu->guestfpu = fpu_save_area_alloc();
338 338 vcpu->stats = vmm_stat_alloc();
339 339 vcpu->vie_ctx = vie_alloc();
340 340
341 341 vcpu->ustate = VU_INIT;
342 342 vcpu->ustate_when = gethrtime();
343 343 } else {
344 344 vie_reset(vcpu->vie_ctx);
345 345 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
346 346 if (vcpu->ustate != VU_INIT) {
347 347 vcpu_ustate_change(vm, vcpu_id, VU_INIT);
348 348 }
349 349 }
350 350
351 351 vcpu->run_state = VRS_HALT;
352 352 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
353 353 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
354 354 vcpu->reqidle = 0;
355 355 vcpu->exitintinfo = 0;
356 356 vcpu->nmi_pending = 0;
357 357 vcpu->extint_pending = 0;
358 358 vcpu->exception_pending = 0;
359 359 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
360 360 fpu_save_area_reset(vcpu->guestfpu);
361 361 vmm_stat_init(vcpu->stats);
362 362 vcpu->tsc_offset = 0;
363 363 }
364 364
365 365 int
366 366 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
367 367 {
368 368
369 369 return (trace_guest_exceptions);
370 370 }
371 371
372 372 struct vm_exit *
373 373 vm_exitinfo(struct vm *vm, int cpuid)
374 374 {
375 375 struct vcpu *vcpu;
376 376
377 377 if (cpuid < 0 || cpuid >= vm->maxcpus)
378 378 panic("vm_exitinfo: invalid cpuid %d", cpuid);
379 379
380 380 vcpu = &vm->vcpu[cpuid];
381 381
382 382 return (&vcpu->exitinfo);
383 383 }
384 384
385 385 struct vie *
386 386 vm_vie_ctx(struct vm *vm, int cpuid)
387 387 {
388 388 if (cpuid < 0 || cpuid >= vm->maxcpus)
389 389 panic("vm_vie_ctx: invalid cpuid %d", cpuid);
390 390
391 391 return (vm->vcpu[cpuid].vie_ctx);
392 392 }
393 393
394 394 static int
395 395 vmm_init(void)
396 396 {
397 397 int error;
398 398
399 399 vmm_host_state_init();
400 400
401 401 /* We use cpu_poke() for IPIs */
402 402 vmm_ipinum = 0;
403 403
404 404 error = vmm_mem_init();
405 405 if (error)
406 406 return (error);
407 407
408 408 if (vmm_is_intel())
409 409 ops = &vmm_ops_intel;
410 410 else if (vmm_is_svm())
411 411 ops = &vmm_ops_amd;
412 412 else
413 413 return (ENXIO);
414 414
415 415 return (VMM_INIT(vmm_ipinum));
416 416 }
417 417
418 418 int
419 419 vmm_mod_load()
420 420 {
421 421 int error;
422 422
423 423 VERIFY(vmm_initialized == 0);
424 424
425 425 error = vmm_init();
426 426 if (error == 0)
427 427 vmm_initialized = 1;
428 428
429 429 return (error);
430 430 }
431 431
432 432 int
433 433 vmm_mod_unload()
434 434 {
435 435 int error;
436 436
437 437 VERIFY(vmm_initialized == 1);
438 438
439 439 iommu_cleanup();
440 440 error = VMM_CLEANUP();
441 441 if (error)
442 442 return (error);
443 443 vmm_initialized = 0;
444 444
445 445 return (0);
446 446 }
447 447
448 448 static void
449 449 vm_init(struct vm *vm, bool create)
450 450 {
451 451 int i;
452 452
453 453 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
454 454 vm->iommu = NULL;
455 455 vm->vioapic = vioapic_init(vm);
456 456 vm->vhpet = vhpet_init(vm);
457 457 vm->vatpic = vatpic_init(vm);
458 458 vm->vatpit = vatpit_init(vm);
459 459 vm->vpmtmr = vpmtmr_init(vm);
460 460 if (create)
461 461 vm->vrtc = vrtc_init(vm);
462 462
463 463 vm_inout_init(vm, &vm->ioports);
464 464
465 465 CPU_ZERO(&vm->active_cpus);
466 466 CPU_ZERO(&vm->debug_cpus);
467 467
468 468 vm->suspend = 0;
469 469 CPU_ZERO(&vm->suspended_cpus);
470 470
471 471 for (i = 0; i < vm->maxcpus; i++)
472 472 vcpu_init(vm, i, create);
473 473
474 474 /*
475 475 * Configure the VM-wide TSC offset so that the call to vm_init()
476 476 * represents the boot time (when the TSC(s) read 0). Each vCPU will
477 477 * have its own offset from this, which is altered if/when the guest
478 478 * writes to MSR_TSC.
479 479 *
480 480 * The TSC offsetting math is all unsigned, using overflow for negative
481 481 * offsets. A reading of the TSC is negated to form the boot offset.
482 482 */
483 483 vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
484 484 }
485 485
486 486 /*
487 487 * The default CPU topology is a single thread per package.
488 488 */
489 489 uint_t cores_per_package = 1;
490 490 uint_t threads_per_core = 1;
491 491
492 492 int
493 493 vm_create(const char *name, struct vm **retvm)
494 494 {
495 495 struct vm *vm;
496 496 struct vmspace *vmspace;
497 497
498 498 /*
499 499 * If vmm.ko could not be successfully initialized then don't attempt
500 500 * to create the virtual machine.
501 501 */
502 502 if (!vmm_initialized)
503 503 return (ENXIO);
504 504
505 505 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
506 506 return (EINVAL);
507 507
508 508 vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
509 509 if (vmspace == NULL)
510 510 return (ENOMEM);
511 511
512 512 vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
513 513 strcpy(vm->name, name);
514 514 vm->vmspace = vmspace;
515 515
516 516 vm->sockets = 1;
517 517 vm->cores = cores_per_package; /* XXX backwards compatibility */
518 518 vm->threads = threads_per_core; /* XXX backwards compatibility */
519 519 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
520 520
521 521 vm_init(vm, true);
522 522
523 523 *retvm = vm;
524 524 return (0);
525 525 }
526 526
527 527 void
528 528 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
529 529 uint16_t *threads, uint16_t *maxcpus)
530 530 {
531 531 *sockets = vm->sockets;
532 532 *cores = vm->cores;
533 533 *threads = vm->threads;
534 534 *maxcpus = vm->maxcpus;
535 535 }
536 536
537 537 uint16_t
538 538 vm_get_maxcpus(struct vm *vm)
539 539 {
540 540 return (vm->maxcpus);
541 541 }
542 542
543 543 int
544 544 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
545 545 uint16_t threads, uint16_t maxcpus)
546 546 {
547 547 if (maxcpus != 0)
548 548 return (EINVAL); /* XXX remove when supported */
549 549 if ((sockets * cores * threads) > vm->maxcpus)
550 550 return (EINVAL);
551 551 /* XXX need to check sockets * cores * threads == vCPU, how? */
552 552 vm->sockets = sockets;
553 553 vm->cores = cores;
554 554 vm->threads = threads;
555 555 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
556 556 return (0);
557 557 }
558 558
559 559 static void
560 560 vm_cleanup(struct vm *vm, bool destroy)
561 561 {
562 562 struct mem_map *mm;
563 563 int i;
564 564
565 565 ppt_unassign_all(vm);
566 566
567 567 if (vm->iommu != NULL)
568 568 iommu_destroy_domain(vm->iommu);
569 569
570 570 /*
571 571 * Devices which attach their own ioport hooks should be cleaned up
572 572 * first so they can tear down those registrations.
573 573 */
574 574 vpmtmr_cleanup(vm->vpmtmr);
575 575
576 576 vm_inout_cleanup(vm, &vm->ioports);
577 577
578 578 if (destroy)
579 579 vrtc_cleanup(vm->vrtc);
580 580 else
581 581 vrtc_reset(vm->vrtc);
582 582
583 583 vatpit_cleanup(vm->vatpit);
584 584 vhpet_cleanup(vm->vhpet);
585 585 vatpic_cleanup(vm->vatpic);
586 586 vioapic_cleanup(vm->vioapic);
587 587
588 588 for (i = 0; i < vm->maxcpus; i++)
589 589 vcpu_cleanup(vm, i, destroy);
590 590
591 591 VMCLEANUP(vm->cookie);
592 592
593 593 /*
594 594 * System memory is removed from the guest address space only when
595 595 * the VM is destroyed. This is because the mapping remains the same
596 596 * across VM reset.
597 597 *
598 598 * Device memory can be relocated by the guest (e.g. using PCI BARs)
599 599 * so those mappings are removed on a VM reset.
600 600 */
601 601 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
602 602 mm = &vm->mem_maps[i];
603 603 if (destroy || !sysmem_mapping(vm, mm)) {
604 604 vm_free_memmap(vm, i);
605 605 } else {
606 606 /*
607 607 * We need to reset the IOMMU flag so this mapping can
608 608 * be reused when a VM is rebooted. Since the IOMMU
609 609 * domain has already been destroyed we can just reset
610 610 * the flag here.
611 611 */
612 612 mm->flags &= ~VM_MEMMAP_F_IOMMU;
613 613 }
614 614 }
615 615
616 616 if (destroy) {
617 617 for (i = 0; i < VM_MAX_MEMSEGS; i++)
618 618 vm_free_memseg(vm, i);
619 619
620 620 VMSPACE_FREE(vm->vmspace);
621 621 vm->vmspace = NULL;
622 622 }
623 623 }
624 624
625 625 void
626 626 vm_destroy(struct vm *vm)
627 627 {
628 628 vm_cleanup(vm, true);
629 629 free(vm, M_VM);
630 630 }
631 631
632 632 int
633 633 vm_reinit(struct vm *vm)
634 634 {
635 635 int error;
636 636
637 637 /*
638 638 * A virtual machine can be reset only if all vcpus are suspended.
639 639 */
640 640 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
641 641 vm_cleanup(vm, false);
642 642 vm_init(vm, false);
643 643 error = 0;
644 644 } else {
645 645 error = EBUSY;
646 646 }
647 647
648 648 return (error);
649 649 }
650 650
651 651 const char *
652 652 vm_name(struct vm *vm)
653 653 {
654 654 return (vm->name);
655 655 }
656 656
657 657 int
658 658 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
659 659 {
660 660 vm_object_t obj;
661 661
662 662 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
663 663 return (ENOMEM);
664 664 else
665 665 return (0);
666 666 }
667 667
668 668 int
669 669 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
670 670 {
671 671 return (vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len));
672 672 }
673 673
674 674 /*
675 675 * Return 'true' if 'gpa' is allocated in the guest address space.
676 676 *
677 677 * This function is called in the context of a running vcpu which acts as
678 678 * an implicit lock on 'vm->mem_maps[]'.
679 679 */
680 680 bool
681 681 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
682 682 {
683 683 struct mem_map *mm;
684 684 int i;
685 685
686 686 #ifdef INVARIANTS
687 687 int hostcpu, state;
688 688 state = vcpu_get_state(vm, vcpuid, &hostcpu);
689 689 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
690 690 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
691 691 #endif
692 692
693 693 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
694 694 mm = &vm->mem_maps[i];
695 695 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
696 696 return (true); /* 'gpa' is sysmem or devmem */
697 697 }
698 698
699 699 if (ppt_is_mmio(vm, gpa))
700 700 return (true); /* 'gpa' is pci passthru mmio */
701 701
702 702 return (false);
703 703 }
704 704
705 705 int
706 706 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
707 707 {
708 708 struct mem_seg *seg;
709 709 vm_object_t obj;
710 710
711 711 #ifndef __FreeBSD__
712 712 extern pgcnt_t get_max_page_get(void);
713 713 #endif
714 714
715 715 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
716 716 return (EINVAL);
717 717
718 718 if (len == 0 || (len & PAGE_MASK))
719 719 return (EINVAL);
720 720
721 721 #ifndef __FreeBSD__
722 722 if (len > ptob(get_max_page_get()))
723 723 return (EINVAL);
724 724 #endif
725 725
726 726 seg = &vm->mem_segs[ident];
727 727 if (seg->object != NULL) {
728 728 if (seg->len == len && seg->sysmem == sysmem)
729 729 return (EEXIST);
730 730 else
731 731 return (EINVAL);
732 732 }
733 733
734 734 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
735 735 if (obj == NULL)
736 736 return (ENOMEM);
737 737
738 738 seg->len = len;
739 739 seg->object = obj;
740 740 seg->sysmem = sysmem;
741 741 return (0);
742 742 }
743 743
744 744 int
745 745 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
746 746 vm_object_t *objptr)
747 747 {
748 748 struct mem_seg *seg;
749 749
750 750 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
751 751 return (EINVAL);
752 752
753 753 seg = &vm->mem_segs[ident];
754 754 if (len)
755 755 *len = seg->len;
756 756 if (sysmem)
757 757 *sysmem = seg->sysmem;
758 758 if (objptr)
759 759 *objptr = seg->object;
760 760 return (0);
761 761 }
762 762
763 763 void
764 764 vm_free_memseg(struct vm *vm, int ident)
765 765 {
766 766 struct mem_seg *seg;
767 767
768 768 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
769 769 ("%s: invalid memseg ident %d", __func__, ident));
770 770
771 771 seg = &vm->mem_segs[ident];
772 772 if (seg->object != NULL) {
773 773 vm_object_deallocate(seg->object);
774 774 bzero(seg, sizeof (struct mem_seg));
775 775 }
776 776 }
777 777
778 778 int
779 779 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
780 780 size_t len, int prot, int flags)
781 781 {
782 782 struct mem_seg *seg;
783 783 struct mem_map *m, *map;
784 784 vm_ooffset_t last;
785 785 int i, error;
786 786
787 787 if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
788 788 return (EINVAL);
789 789
790 790 if (flags & ~VM_MEMMAP_F_WIRED)
791 791 return (EINVAL);
792 792
793 793 if (segid < 0 || segid >= VM_MAX_MEMSEGS)
794 794 return (EINVAL);
795 795
796 796 seg = &vm->mem_segs[segid];
797 797 if (seg->object == NULL)
798 798 return (EINVAL);
799 799
800 800 last = first + len;
801 801 if (first < 0 || first >= last || last > seg->len)
802 802 return (EINVAL);
803 803
804 804 if ((gpa | first | last) & PAGE_MASK)
805 805 return (EINVAL);
806 806
807 807 map = NULL;
808 808 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
809 809 m = &vm->mem_maps[i];
810 810 if (m->len == 0) {
811 811 map = m;
812 812 break;
813 813 }
814 814 }
815 815
816 816 if (map == NULL)
817 817 return (ENOSPC);
818 818
819 819 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
820 820 len, 0, VMFS_NO_SPACE, prot, prot, 0);
821 821 if (error != 0)
822 822 return (EFAULT);
823 823
824 824 vm_object_reference(seg->object);
825 825
826 826 if ((flags & VM_MEMMAP_F_WIRED) != 0) {
827 827 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
828 828 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
829 829 if (error != 0) {
830 830 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
831 831 return (EFAULT);
832 832 }
833 833 }
834 834
835 835 map->gpa = gpa;
836 836 map->len = len;
837 837 map->segoff = first;
838 838 map->segid = segid;
839 839 map->prot = prot;
840 840 map->flags = flags;
841 841 return (0);
842 842 }
843 843
844 844 int
845 845 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
846 846 {
847 847 struct mem_map *m;
848 848 int i;
849 849
850 850 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
851 851 m = &vm->mem_maps[i];
852 852 if (m->gpa == gpa && m->len == len &&
853 853 (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
854 854 vm_free_memmap(vm, i);
855 855 return (0);
856 856 }
857 857 }
858 858
859 859 return (EINVAL);
860 860 }
861 861
862 862 int
863 863 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
864 864 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
865 865 {
866 866 struct mem_map *mm, *mmnext;
867 867 int i;
868 868
869 869 mmnext = NULL;
870 870 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
871 871 mm = &vm->mem_maps[i];
872 872 if (mm->len == 0 || mm->gpa < *gpa)
873 873 continue;
874 874 if (mmnext == NULL || mm->gpa < mmnext->gpa)
875 875 mmnext = mm;
876 876 }
877 877
878 878 if (mmnext != NULL) {
879 879 *gpa = mmnext->gpa;
880 880 if (segid)
881 881 *segid = mmnext->segid;
882 882 if (segoff)
883 883 *segoff = mmnext->segoff;
884 884 if (len)
885 885 *len = mmnext->len;
886 886 if (prot)
887 887 *prot = mmnext->prot;
888 888 if (flags)
889 889 *flags = mmnext->flags;
890 890 return (0);
891 891 } else {
892 892 return (ENOENT);
893 893 }
894 894 }
895 895
896 896 static void
897 897 vm_free_memmap(struct vm *vm, int ident)
898 898 {
899 899 struct mem_map *mm;
900 900 int error;
901 901
902 902 mm = &vm->mem_maps[ident];
903 903 if (mm->len) {
904 904 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
905 905 mm->gpa + mm->len);
906 906 KASSERT(error == 0, ("%s: vm_map_remove error %d",
907 907 __func__, error));
908 908 bzero(mm, sizeof (struct mem_map));
909 909 }
910 910 }
911 911
912 912 static __inline bool
913 913 sysmem_mapping(struct vm *vm, struct mem_map *mm)
914 914 {
915 915
916 916 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
917 917 return (true);
918 918 else
919 919 return (false);
920 920 }
921 921
922 922 vm_paddr_t
923 923 vmm_sysmem_maxaddr(struct vm *vm)
924 924 {
925 925 struct mem_map *mm;
926 926 vm_paddr_t maxaddr;
927 927 int i;
928 928
929 929 maxaddr = 0;
930 930 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
931 931 mm = &vm->mem_maps[i];
932 932 if (sysmem_mapping(vm, mm)) {
933 933 if (maxaddr < mm->gpa + mm->len)
934 934 maxaddr = mm->gpa + mm->len;
935 935 }
936 936 }
937 937 return (maxaddr);
938 938 }
939 939
940 940 static void
941 941 vm_iommu_modify(struct vm *vm, bool map)
942 942 {
943 943 int i, sz;
944 944 vm_paddr_t gpa, hpa;
945 945 struct mem_map *mm;
946 946 #ifdef __FreeBSD__
947 947 void *vp, *cookie, *host_domain;
948 948 #else
949 949 void *vp, *cookie, *host_domain __unused;
950 950 #endif
951 951
952 952 sz = PAGE_SIZE;
953 953 host_domain = iommu_host_domain();
954 954
955 955 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
956 956 mm = &vm->mem_maps[i];
957 957 if (!sysmem_mapping(vm, mm))
958 958 continue;
959 959
960 960 if (map) {
961 961 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
962 962 ("iommu map found invalid memmap %lx/%lx/%x",
963 963 mm->gpa, mm->len, mm->flags));
964 964 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
965 965 continue;
966 966 mm->flags |= VM_MEMMAP_F_IOMMU;
967 967 } else {
968 968 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
969 969 continue;
970 970 mm->flags &= ~VM_MEMMAP_F_IOMMU;
971 971 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
972 972 ("iommu unmap found invalid memmap %lx/%lx/%x",
973 973 mm->gpa, mm->len, mm->flags));
974 974 }
975 975
976 976 gpa = mm->gpa;
977 977 while (gpa < mm->gpa + mm->len) {
978 978 vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, PROT_WRITE,
979 979 &cookie);
980 980 KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
981 981 vm_name(vm), gpa));
982 982
983 983 vm_gpa_release(cookie);
984 984
985 985 hpa = DMAP_TO_PHYS((uintptr_t)vp);
986 986 if (map) {
987 987 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
988 988 #ifdef __FreeBSD__
989 989 iommu_remove_mapping(host_domain, hpa, sz);
990 990 #endif
991 991 } else {
992 992 iommu_remove_mapping(vm->iommu, gpa, sz);
993 993 #ifdef __FreeBSD__
994 994 iommu_create_mapping(host_domain, hpa, hpa, sz);
995 995 #endif
996 996 }
997 997
998 998 gpa += PAGE_SIZE;
999 999 }
1000 1000 }
1001 1001
1002 1002 /*
1003 1003 * Invalidate the cached translations associated with the domain
1004 1004 * from which pages were removed.
1005 1005 */
1006 1006 #ifdef __FreeBSD__
1007 1007 if (map)
1008 1008 iommu_invalidate_tlb(host_domain);
1009 1009 else
1010 1010 iommu_invalidate_tlb(vm->iommu);
1011 1011 #else
1012 1012 iommu_invalidate_tlb(vm->iommu);
1013 1013 #endif
1014 1014 }
1015 1015
1016 1016 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false)
1017 1017 #define vm_iommu_map(vm) vm_iommu_modify((vm), true)
1018 1018
1019 1019 int
1020 1020 vm_unassign_pptdev(struct vm *vm, int pptfd)
1021 1021 {
1022 1022 int error;
1023 1023
1024 1024 error = ppt_unassign_device(vm, pptfd);
1025 1025 if (error)
1026 1026 return (error);
1027 1027
1028 1028 if (ppt_assigned_devices(vm) == 0)
1029 1029 vm_iommu_unmap(vm);
1030 1030
1031 1031 return (0);
1032 1032 }
1033 1033
1034 1034 int
1035 1035 vm_assign_pptdev(struct vm *vm, int pptfd)
1036 1036 {
1037 1037 int error;
1038 1038 vm_paddr_t maxaddr;
1039 1039
1040 1040 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1041 1041 if (ppt_assigned_devices(vm) == 0) {
1042 1042 KASSERT(vm->iommu == NULL,
1043 1043 ("vm_assign_pptdev: iommu must be NULL"));
1044 1044 maxaddr = vmm_sysmem_maxaddr(vm);
1045 1045 vm->iommu = iommu_create_domain(maxaddr);
1046 1046 if (vm->iommu == NULL)
1047 1047 return (ENXIO);
1048 1048 vm_iommu_map(vm);
1049 1049 }
1050 1050
1051 1051 error = ppt_assign_device(vm, pptfd);
1052 1052 return (error);
1053 1053 }
1054 1054
1055 1055 void *
1056 1056 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
1057 1057 void **cookie)
1058 1058 {
1059 1059 int i, count, pageoff;
1060 1060 struct mem_map *mm;
1061 1061 vm_page_t m;
1062 1062 #ifdef INVARIANTS
1063 1063 /*
1064 1064 * All vcpus are frozen by ioctls that modify the memory map
1065 1065 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
1066 1066 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
1067 1067 */
1068 1068 int state;
1069 1069 KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
1070 1070 __func__, vcpuid));
1071 1071 for (i = 0; i < vm->maxcpus; i++) {
1072 1072 if (vcpuid != -1 && vcpuid != i)
1073 1073 continue;
1074 1074 state = vcpu_get_state(vm, i, NULL);
1075 1075 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1076 1076 __func__, state));
1077 1077 }
1078 1078 #endif
1079 1079 pageoff = gpa & PAGE_MASK;
1080 1080 if (len > PAGE_SIZE - pageoff)
1081 1081 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1082 1082
1083 1083 count = 0;
1084 1084 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1085 1085 mm = &vm->mem_maps[i];
1086 1086 if (mm->len == 0) {
1087 1087 continue;
1088 1088 }
1089 1089 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1090 1090 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1091 1091 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1092 1092 break;
1093 1093 }
1094 1094 }
1095 1095
1096 1096 if (count == 1) {
1097 1097 *cookie = m;
1098 1098 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1099 1099 } else {
1100 1100 *cookie = NULL;
1101 1101 return (NULL);
1102 1102 }
1103 1103 }
1104 1104
1105 1105 void
1106 1106 vm_gpa_release(void *cookie)
1107 1107 {
1108 1108 vm_page_t m = cookie;
1109 1109
1110 1110 vm_page_unwire(m, PQ_ACTIVE);
1111 1111 }
1112 1112
1113 1113 int
1114 1114 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1115 1115 {
1116 1116
1117 1117 if (vcpu < 0 || vcpu >= vm->maxcpus)
1118 1118 return (EINVAL);
1119 1119
1120 1120 if (reg >= VM_REG_LAST)
1121 1121 return (EINVAL);
1122 1122
1123 1123 return (VMGETREG(vm->cookie, vcpu, reg, retval));
1124 1124 }
1125 1125
1126 1126 int
1127 1127 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1128 1128 {
1129 1129 struct vcpu *vcpu;
1130 1130 int error;
1131 1131
1132 1132 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1133 1133 return (EINVAL);
1134 1134
1135 1135 if (reg >= VM_REG_LAST)
1136 1136 return (EINVAL);
1137 1137
1138 1138 error = VMSETREG(vm->cookie, vcpuid, reg, val);
1139 1139 if (error || reg != VM_REG_GUEST_RIP)
1140 1140 return (error);
1141 1141
1142 1142 /* Set 'nextrip' to match the value of %rip */
1143 1143 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1144 1144 vcpu = &vm->vcpu[vcpuid];
1145 1145 vcpu->nextrip = val;
1146 1146 return (0);
1147 1147 }
1148 1148
1149 1149 static bool
1150 1150 is_descriptor_table(int reg)
1151 1151 {
1152 1152 switch (reg) {
1153 1153 case VM_REG_GUEST_IDTR:
1154 1154 case VM_REG_GUEST_GDTR:
1155 1155 return (true);
1156 1156 default:
1157 1157 return (false);
1158 1158 }
1159 1159 }
1160 1160
1161 1161 static bool
1162 1162 is_segment_register(int reg)
1163 1163 {
1164 1164 switch (reg) {
1165 1165 case VM_REG_GUEST_ES:
1166 1166 case VM_REG_GUEST_CS:
1167 1167 case VM_REG_GUEST_SS:
1168 1168 case VM_REG_GUEST_DS:
1169 1169 case VM_REG_GUEST_FS:
1170 1170 case VM_REG_GUEST_GS:
1171 1171 case VM_REG_GUEST_TR:
1172 1172 case VM_REG_GUEST_LDTR:
1173 1173 return (true);
1174 1174 default:
1175 1175 return (false);
1176 1176 }
1177 1177 }
1178 1178
1179 1179 int
1180 1180 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1181 1181 {
1182 1182
1183 1183 if (vcpu < 0 || vcpu >= vm->maxcpus)
1184 1184 return (EINVAL);
1185 1185
1186 1186 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1187 1187 return (EINVAL);
1188 1188
1189 1189 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1190 1190 }
1191 1191
1192 1192 int
1193 1193 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1194 1194 {
1195 1195 if (vcpu < 0 || vcpu >= vm->maxcpus)
1196 1196 return (EINVAL);
1197 1197
1198 1198 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1199 1199 return (EINVAL);
1200 1200
1201 1201 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1202 1202 }
1203 1203
1204 1204 int
1205 1205 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1206 1206 {
1207 1207 struct vcpu *vcpu;
1208 1208
1209 1209 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1210 1210 return (EINVAL);
1211 1211 }
1212 1212
1213 1213 vcpu = &vm->vcpu[vcpuid];
1214 1214
1215 1215 vcpu_lock(vcpu);
1216 1216 *state = vcpu->run_state;
1217 1217 *sipi_vec = vcpu->sipi_vector;
1218 1218 vcpu_unlock(vcpu);
1219 1219
1220 1220 return (0);
1221 1221 }
1222 1222
1223 1223 int
1224 1224 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1225 1225 {
1226 1226 struct vcpu *vcpu;
1227 1227
1228 1228 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1229 1229 return (EINVAL);
1230 1230 }
1231 1231 if (!VRS_IS_VALID(state)) {
1232 1232 return (EINVAL);
1233 1233 }
1234 1234
1235 1235 vcpu = &vm->vcpu[vcpuid];
1236 1236
1237 1237 vcpu_lock(vcpu);
1238 1238 vcpu->run_state = state;
1239 1239 vcpu->sipi_vector = sipi_vec;
1240 1240 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1241 1241 vcpu_unlock(vcpu);
1242 1242
1243 1243 return (0);
1244 1244 }
1245 1245
1246 1246
1247 1247 static void
1248 1248 restore_guest_fpustate(struct vcpu *vcpu)
1249 1249 {
1250 1250
1251 1251 /* flush host state to the pcb */
1252 1252 fpuexit(curthread);
1253 1253
1254 1254 /* restore guest FPU state */
1255 1255 fpu_stop_emulating();
1256 1256 fpurestore(vcpu->guestfpu);
1257 1257
1258 1258 /* restore guest XCR0 if XSAVE is enabled in the host */
1259 1259 if (rcr4() & CR4_XSAVE)
1260 1260 load_xcr(0, vcpu->guest_xcr0);
1261 1261
1262 1262 /*
1263 1263 * The FPU is now "dirty" with the guest's state so turn on emulation
1264 1264 * to trap any access to the FPU by the host.
1265 1265 */
1266 1266 fpu_start_emulating();
1267 1267 }
1268 1268
1269 1269 static void
1270 1270 save_guest_fpustate(struct vcpu *vcpu)
1271 1271 {
1272 1272
1273 1273 if ((rcr0() & CR0_TS) == 0)
1274 1274 panic("fpu emulation not enabled in host!");
1275 1275
1276 1276 /* save guest XCR0 and restore host XCR0 */
1277 1277 if (rcr4() & CR4_XSAVE) {
1278 1278 vcpu->guest_xcr0 = rxcr(0);
1279 1279 load_xcr(0, vmm_get_host_xcr0());
1280 1280 }
1281 1281
1282 1282 /* save guest FPU state */
1283 1283 fpu_stop_emulating();
1284 1284 fpusave(vcpu->guestfpu);
1285 1285 /*
1286 1286 * When the host state has been restored, we should not re-enable
1287 1287 * CR0.TS on illumos for eager FPU.
1288 1288 */
1289 1289 }
1290 1290
1291 1291 static int
1292 1292 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1293 1293 bool from_idle)
1294 1294 {
1295 1295 struct vcpu *vcpu;
1296 1296 int error;
1297 1297
1298 1298 vcpu = &vm->vcpu[vcpuid];
1299 1299 vcpu_assert_locked(vcpu);
1300 1300
1301 1301 /*
1302 1302 * State transitions from the vmmdev_ioctl() must always begin from
1303 1303 * the VCPU_IDLE state. This guarantees that there is only a single
1304 1304 * ioctl() operating on a vcpu at any point.
1305 1305 */
1306 1306 if (from_idle) {
1307 1307 while (vcpu->state != VCPU_IDLE) {
1308 1308 vcpu->reqidle = 1;
1309 1309 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1310 1310 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1311 1311 "idle requested", vcpu_state2str(vcpu->state));
1312 1312 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1313 1313 }
1314 1314 } else {
1315 1315 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1316 1316 "vcpu idle state"));
1317 1317 }
1318 1318
1319 1319 if (vcpu->state == VCPU_RUNNING) {
1320 1320 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1321 1321 "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1322 1322 } else {
1323 1323 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1324 1324 "vcpu that is not running", vcpu->hostcpu));
1325 1325 }
1326 1326
1327 1327 /*
1328 1328 * The following state transitions are allowed:
1329 1329 * IDLE -> FROZEN -> IDLE
1330 1330 * FROZEN -> RUNNING -> FROZEN
1331 1331 * FROZEN -> SLEEPING -> FROZEN
1332 1332 */
1333 1333 switch (vcpu->state) {
1334 1334 case VCPU_IDLE:
1335 1335 case VCPU_RUNNING:
1336 1336 case VCPU_SLEEPING:
1337 1337 error = (newstate != VCPU_FROZEN);
1338 1338 break;
1339 1339 case VCPU_FROZEN:
1340 1340 error = (newstate == VCPU_FROZEN);
1341 1341 break;
1342 1342 default:
1343 1343 error = 1;
1344 1344 break;
1345 1345 }
1346 1346
1347 1347 if (error)
1348 1348 return (EBUSY);
1349 1349
1350 1350 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1351 1351 vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1352 1352
1353 1353 vcpu->state = newstate;
1354 1354 if (newstate == VCPU_RUNNING)
1355 1355 vcpu->hostcpu = curcpu;
1356 1356 else
1357 1357 vcpu->hostcpu = NOCPU;
1358 1358
1359 1359 if (newstate == VCPU_IDLE) {
1360 1360 cv_broadcast(&vcpu->state_cv);
1361 1361 }
1362 1362
1363 1363 return (0);
1364 1364 }
1365 1365
1366 1366 static void
1367 1367 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1368 1368 {
1369 1369 int error;
1370 1370
1371 1371 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1372 1372 panic("Error %d setting state to %d\n", error, newstate);
1373 1373 }
1374 1374
1375 1375 static void
1376 1376 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1377 1377 {
1378 1378 int error;
1379 1379
1380 1380 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1381 1381 panic("Error %d setting state to %d", error, newstate);
1382 1382 }
1383 1383
1384 1384 /*
1385 1385 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1386 1386 */
1387 1387 static int
1388 1388 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1389 1389 {
1390 1390 struct vcpu *vcpu;
1391 1391 int vcpu_halted, vm_halted;
1392 1392 bool userspace_exit = false;
1393 1393
1394 1394 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1395 1395
1396 1396 vcpu = &vm->vcpu[vcpuid];
1397 1397 vcpu_halted = 0;
1398 1398 vm_halted = 0;
1399 1399
1400 1400 vcpu_lock(vcpu);
1401 1401 while (1) {
1402 1402 /*
1403 1403 * Do a final check for pending interrupts (including NMI and
1404 1404 * INIT) before putting this thread to sleep.
1405 1405 */
1406 1406 if (vm_nmi_pending(vm, vcpuid))
1407 1407 break;
1408 1408 if (vcpu_run_state_pending(vm, vcpuid))
1409 1409 break;
1410 1410 if (!intr_disabled) {
1411 1411 if (vm_extint_pending(vm, vcpuid) ||
1412 1412 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1413 1413 break;
1414 1414 }
1415 1415 }
1416 1416
1417 1417 /*
1418 1418 * Also check for software events which would cause a wake-up.
1419 1419 * This will set the appropriate exitcode directly, rather than
1420 1420 * requiring a trip through VM_RUN().
1421 1421 */
1422 1422 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1423 1423 userspace_exit = true;
1424 1424 break;
1425 1425 }
1426 1426
1427 1427 /*
1428 1428 * Some Linux guests implement "halt" by having all vcpus
1429 1429 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1430 1430 * track of the vcpus that have entered this state. When all
1431 1431 * vcpus enter the halted state the virtual machine is halted.
1432 1432 */
1433 1433 if (intr_disabled) {
1434 1434 if (!vcpu_halted && halt_detection_enabled) {
1435 1435 vcpu_halted = 1;
1436 1436 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1437 1437 }
1438 1438 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1439 1439 vm_halted = 1;
1440 1440 break;
1441 1441 }
1442 1442 }
1443 1443
1444 1444 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1445 1445 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1446 1446 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1447 1447 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1448 1448 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1449 1449 }
1450 1450
1451 1451 if (vcpu_halted)
1452 1452 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1453 1453
1454 1454 vcpu_unlock(vcpu);
1455 1455
1456 1456 if (vm_halted)
1457 1457 vm_suspend(vm, VM_SUSPEND_HALT);
1458 1458
1459 1459 return (userspace_exit ? -1 : 0);
1460 1460 }
1461 1461
1462 1462 static int
1463 1463 vm_handle_paging(struct vm *vm, int vcpuid)
1464 1464 {
1465 1465 int rv, ftype;
1466 1466 struct vm_map *map;
1467 1467 struct vcpu *vcpu;
1468 1468 struct vm_exit *vme;
1469 1469
1470 1470 vcpu = &vm->vcpu[vcpuid];
1471 1471 vme = &vcpu->exitinfo;
1472 1472
1473 1473 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1474 1474 __func__, vme->inst_length));
1475 1475
1476 1476 ftype = vme->u.paging.fault_type;
1477 1477 KASSERT(ftype == PROT_READ ||
1478 1478 ftype == PROT_WRITE || ftype == PROT_EXEC,
1479 1479 ("vm_handle_paging: invalid fault_type %d", ftype));
1480 1480
1481 1481 if (ftype == PROT_READ || ftype == PROT_WRITE) {
1482 1482 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1483 1483 vme->u.paging.gpa, ftype);
1484 1484 if (rv == 0) {
1485 1485 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
1486 1486 ftype == PROT_READ ? "accessed" : "dirty",
1487 1487 vme->u.paging.gpa);
1488 1488 goto done;
1489 1489 }
1490 1490 }
1491 1491
1492 1492 map = &vm->vmspace->vm_map;
1493 1493 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1494 1494
1495 1495 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1496 1496 "ftype = %d", rv, vme->u.paging.gpa, ftype);
1497 1497
1498 1498 if (rv != 0)
1499 1499 return (EFAULT);
1500 1500 done:
1501 1501 return (0);
1502 1502 }
1503 1503
1504 1504 int
1505 1505 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1506 1506 int rsize)
1507 1507 {
1508 1508 int err = ESRCH;
1509 1509
1510 1510 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1511 1511 err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1512 1512 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1513 1513 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1514 1514 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1515 1515 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1516 1516 }
1517 1517
1518 1518 return (err);
1519 1519 }
1520 1520
1521 1521 int
1522 1522 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1523 1523 int wsize)
1524 1524 {
1525 1525 int err = ESRCH;
1526 1526
1527 1527 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1528 1528 err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1529 1529 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1530 1530 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1531 1531 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1532 1532 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1533 1533 }
1534 1534
1535 1535 return (err);
1536 1536 }
1537 1537
1538 1538 static int
1539 1539 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1540 1540 {
1541 1541 struct vie *vie;
1542 1542 struct vcpu *vcpu;
1543 1543 struct vm_exit *vme;
1544 1544 uint64_t inst_addr;
1545 1545 int error, fault, cs_d;
1546 1546
1547 1547 vcpu = &vm->vcpu[vcpuid];
1548 1548 vme = &vcpu->exitinfo;
1549 1549 vie = vcpu->vie_ctx;
1550 1550
1551 1551 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1552 1552 __func__, vme->inst_length));
1553 1553
1554 1554 inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1555 1555 cs_d = vme->u.mmio_emul.cs_d;
1556 1556
1557 1557 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1558 1558 vme->u.mmio_emul.gpa);
1559 1559
1560 1560 /* Fetch the faulting instruction */
1561 1561 if (vie_needs_fetch(vie)) {
1562 1562 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1563 1563 &fault);
1564 1564 if (error != 0) {
1565 1565 return (error);
1566 1566 } else if (fault) {
1567 1567 /*
1568 1568 * If a fault during instruction fetch was encountered,
1569 1569 * it will have asserted that the appropriate exception
1570 1570 * be injected at next entry.
1571 1571 * No further work is required.
1572 1572 */
1573 1573 return (0);
1574 1574 }
1575 1575 }
1576 1576
1577 1577 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1578 1578 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1579 1579 inst_addr);
1580 1580 /* Dump (unrecognized) instruction bytes in userspace */
1581 1581 vie_fallback_exitinfo(vie, vme);
1582 1582 return (-1);
1583 1583 }
1584 1584 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1585 1585 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1586 1586 /* Decoded GLA does not match GLA from VM exit state */
1587 1587 vie_fallback_exitinfo(vie, vme);
1588 1588 return (-1);
1589 1589 }
1590 1590
1591 1591 repeat:
1592 1592 error = vie_emulate_mmio(vie, vm, vcpuid);
1593 1593 if (error < 0) {
1594 1594 /*
1595 1595 * MMIO not handled by any of the in-kernel-emulated devices, so
1596 1596 * make a trip out to userspace for it.
1597 1597 */
1598 1598 vie_exitinfo(vie, vme);
1599 1599 } else if (error == EAGAIN) {
1600 1600 /*
1601 1601 * Continue emulating the rep-prefixed instruction, which has
1602 1602 * not completed its iterations.
1603 1603 *
1604 1604 * In case this can be emulated in-kernel and has a high
1605 1605 * repetition count (causing a tight spin), it should be
1606 1606 * deferential to yield conditions.
1607 1607 */
1608 1608 if (!vcpu_should_yield(vm, vcpuid)) {
1609 1609 goto repeat;
1610 1610 } else {
1611 1611 /*
1612 1612 * Defer to the contending load by making a trip to
1613 1613 * userspace with a no-op (BOGUS) exit reason.
1614 1614 */
1615 1615 vie_reset(vie);
1616 1616 vme->exitcode = VM_EXITCODE_BOGUS;
1617 1617 return (-1);
1618 1618 }
1619 1619 } else if (error == 0) {
1620 1620 /* Update %rip now that instruction has been emulated */
1621 1621 vie_advance_pc(vie, &vcpu->nextrip);
1622 1622 }
1623 1623 return (error);
1624 1624 }
1625 1625
1626 1626 static int
1627 1627 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1628 1628 {
1629 1629 struct vcpu *vcpu;
1630 1630 struct vie *vie;
1631 1631 int err;
1632 1632
1633 1633 vcpu = &vm->vcpu[vcpuid];
1634 1634 vie = vcpu->vie_ctx;
1635 1635
1636 1636 repeat:
1637 1637 err = vie_emulate_inout(vie, vm, vcpuid);
1638 1638
1639 1639 if (err < 0) {
1640 1640 /*
1641 1641 * In/out not handled by any of the in-kernel-emulated devices,
1642 1642 * so make a trip out to userspace for it.
1643 1643 */
1644 1644 vie_exitinfo(vie, vme);
1645 1645 return (err);
1646 1646 } else if (err == EAGAIN) {
1647 1647 /*
1648 1648 * Continue emulating the rep-prefixed ins/outs, which has not
1649 1649 * completed its iterations.
1650 1650 *
1651 1651 * In case this can be emulated in-kernel and has a high
1652 1652 * repetition count (causing a tight spin), it should be
1653 1653 * deferential to yield conditions.
1654 1654 */
1655 1655 if (!vcpu_should_yield(vm, vcpuid)) {
1656 1656 goto repeat;
1657 1657 } else {
1658 1658 /*
1659 1659 * Defer to the contending load by making a trip to
1660 1660 * userspace with a no-op (BOGUS) exit reason.
1661 1661 */
1662 1662 vie_reset(vie);
1663 1663 vme->exitcode = VM_EXITCODE_BOGUS;
1664 1664 return (-1);
1665 1665 }
1666 1666 } else if (err != 0) {
1667 1667 /* Emulation failure. Bail all the way out to userspace. */
1668 1668 vme->exitcode = VM_EXITCODE_INST_EMUL;
1669 1669 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1670 1670 return (-1);
1671 1671 }
1672 1672
1673 1673 vie_advance_pc(vie, &vcpu->nextrip);
1674 1674 return (0);
1675 1675 }
1676 1676
1677 1677 static int
1678 1678 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1679 1679 {
1680 1680 struct vie *vie;
1681 1681 struct vcpu *vcpu;
1682 1682 struct vm_exit *vme;
1683 1683 uint64_t cs_base;
1684 1684 int error, fault, cs_d;
1685 1685
1686 1686 vcpu = &vm->vcpu[vcpuid];
1687 1687 vme = &vcpu->exitinfo;
1688 1688 vie = vcpu->vie_ctx;
1689 1689
1690 1690 vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1691 1691
1692 1692 /* Fetch the faulting instruction */
1693 1693 ASSERT(vie_needs_fetch(vie));
1694 1694 error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1695 1695 &fault);
1696 1696 if (error != 0) {
1697 1697 return (error);
1698 1698 } else if (fault) {
1699 1699 /*
1700 1700 * If a fault during instruction fetch was encountered, it will
1701 1701 * have asserted that the appropriate exception be injected at
1702 1702 * next entry. No further work is required.
1703 1703 */
1704 1704 return (0);
1705 1705 }
1706 1706
1707 1707 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1708 1708 /* Dump (unrecognized) instruction bytes in userspace */
1709 1709 vie_fallback_exitinfo(vie, vme);
1710 1710 return (-1);
1711 1711 }
1712 1712
1713 1713 error = vie_emulate_other(vie, vm, vcpuid);
1714 1714 if (error != 0) {
1715 1715 /*
1716 1716 * Instruction emulation was unable to complete successfully, so
1717 1717 * kick it out to userspace for handling.
1718 1718 */
1719 1719 vie_fallback_exitinfo(vie, vme);
1720 1720 } else {
1721 1721 /* Update %rip now that instruction has been emulated */
1722 1722 vie_advance_pc(vie, &vcpu->nextrip);
1723 1723 }
1724 1724 return (error);
1725 1725 }
1726 1726
1727 1727 static int
1728 1728 vm_handle_suspend(struct vm *vm, int vcpuid)
1729 1729 {
1730 1730 int i;
1731 1731 struct vcpu *vcpu;
1732 1732
1733 1733 vcpu = &vm->vcpu[vcpuid];
1734 1734
1735 1735 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1736 1736
1737 1737 /*
1738 1738 * Wait until all 'active_cpus' have suspended themselves.
1739 1739 */
1740 1740 vcpu_lock(vcpu);
1741 1741 vcpu_ustate_change(vm, vcpuid, VU_INIT);
1742 1742 while (1) {
1743 1743 int rc;
1744 1744
1745 1745 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1746 1746 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1747 1747 break;
1748 1748 }
1749 1749
1750 1750 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1751 1751 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1752 1752 TR_CLOCK_TICK);
1753 1753 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1754 1754
1755 1755 /*
1756 1756 * If the userspace process driving the instance is killed, any
1757 1757 * vCPUs yet to be marked suspended (because they are not
1758 1758 * VM_RUN-ing in the kernel presently) will never reach that
1759 1759 * state.
1760 1760 *
1761 1761 * To avoid vm_handle_suspend() getting stuck in the kernel
1762 1762 * waiting for those vCPUs, offer a bail-out even though it
1763 1763 * means returning without all vCPUs in a suspended state.
1764 1764 */
1765 1765 if (rc <= 0) {
1766 1766 if ((curproc->p_flag & SEXITING) != 0) {
1767 1767 break;
1768 1768 }
1769 1769 }
1770 1770 }
1771 1771 vcpu_unlock(vcpu);
1772 1772
1773 1773 /*
1774 1774 * Wakeup the other sleeping vcpus and return to userspace.
1775 1775 */
1776 1776 for (i = 0; i < vm->maxcpus; i++) {
1777 1777 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1778 1778 vcpu_notify_event(vm, i);
1779 1779 }
1780 1780 }
1781 1781
1782 1782 return (-1);
1783 1783 }
1784 1784
1785 1785 static int
1786 1786 vm_handle_reqidle(struct vm *vm, int vcpuid)
1787 1787 {
1788 1788 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1789 1789
1790 1790 vcpu_lock(vcpu);
1791 1791 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1792 1792 vcpu->reqidle = 0;
1793 1793 vcpu_unlock(vcpu);
1794 1794 return (-1);
1795 1795 }
1796 1796
1797 1797 static int
1798 1798 vm_handle_run_state(struct vm *vm, int vcpuid)
1799 1799 {
1800 1800 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1801 1801 bool handled = false;
1802 1802
1803 1803 vcpu_lock(vcpu);
1804 1804 while (1) {
1805 1805 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1806 1806 vcpu_unlock(vcpu);
1807 1807 VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1808 1808 vcpu_lock(vcpu);
1809 1809
1810 1810 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1811 1811 vcpu->run_state |= VRS_INIT;
1812 1812 }
1813 1813
1814 1814 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1815 1815 (VRS_INIT | VRS_PEND_SIPI)) {
1816 1816 const uint8_t vector = vcpu->sipi_vector;
1817 1817
1818 1818 vcpu_unlock(vcpu);
1819 1819 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1820 1820 vcpu_lock(vcpu);
1821 1821
1822 1822 vcpu->run_state &= ~VRS_PEND_SIPI;
1823 1823 vcpu->run_state |= VRS_RUN;
1824 1824 }
1825 1825
1826 1826 /*
1827 1827 * If the vCPU is now in the running state, there is no need to
1828 1828 * wait for anything prior to re-entry.
1829 1829 */
1830 1830 if ((vcpu->run_state & VRS_RUN) != 0) {
1831 1831 handled = true;
1832 1832 break;
1833 1833 }
1834 1834
1835 1835 /*
1836 1836 * Also check for software events which would cause a wake-up.
1837 1837 * This will set the appropriate exitcode directly, rather than
1838 1838 * requiring a trip through VM_RUN().
1839 1839 */
1840 1840 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1841 1841 break;
1842 1842 }
1843 1843
1844 1844 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1845 1845 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1846 1846 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1847 1847 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1848 1848 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1849 1849 }
1850 1850 vcpu_unlock(vcpu);
1851 1851
1852 1852 return (handled ? 0 : -1);
1853 1853 }
1854 1854
1855 1855 static int
1856 1856 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1857 1857 {
1858 1858 const uint32_t code = vme->u.msr.code;
1859 1859 uint64_t val = 0;
1860 1860
1861 1861 switch (code) {
1862 1862 case MSR_MCG_CAP:
1863 1863 case MSR_MCG_STATUS:
1864 1864 val = 0;
1865 1865 break;
1866 1866
1867 1867 case MSR_MTRRcap:
1868 1868 case MSR_MTRRdefType:
1869 1869 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
1870 1870 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1871 1871 case MSR_MTRR64kBase:
1872 1872 val = 0;
1873 1873 break;
1874 1874
1875 1875 case MSR_TSC:
1876 1876 /*
1877 1877 * In all likelihood, this should always be handled in guest
1878 1878 * context by VMX/SVM rather than taking an exit. (Both VMX and
1879 1879 * SVM pass through read-only access to MSR_TSC to the guest.)
1880 1880 *
1881 1881 * No physical offset is requested of vcpu_tsc_offset() since
1882 1882 * rdtsc_offset() takes care of that instead.
1883 1883 */
1884 1884 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
1885 1885 break;
1886 1886
1887 1887 default:
1888 1888 /*
1889 1889 * Anything not handled at this point will be kicked out to
1890 1890 * userspace for attempted processing there.
1891 1891 */
1892 1892 return (-1);
1893 1893 }
1894 1894
1895 1895 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
1896 1896 val & 0xffffffff));
1897 1897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
1898 1898 val >> 32));
1899 1899 return (0);
1900 1900 }
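
The pair of vm_set_register() calls above follow the usual RDMSR convention of returning the 64-bit value through EDX:EAX. A minimal standalone sketch of that split (the value is made up; this is illustration only, not part of the change under review):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t val = 0x0123456789abcdefULL;   /* value an MSR handler produced */

        /* RDMSR convention: low 32 bits go to %eax, high 32 bits to %edx */
        uint32_t eax = (uint32_t)(val & 0xffffffff);
        uint32_t edx = (uint32_t)(val >> 32);

        (void) printf("eax=%08x edx=%08x\n", (unsigned)eax, (unsigned)edx);
        return (0);
    }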
1901 1901
1902 1902 static int
1903 1903 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1904 1904 {
1905 1905 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1906 1906 const uint32_t code = vme->u.msr.code;
1907 1907 const uint64_t val = vme->u.msr.wval;
1908 1908
1909 1909 switch (code) {
1910 1910 case MSR_MCG_CAP:
1911 1911 case MSR_MCG_STATUS:
1912 1912 /* Ignore writes */
1913 1913 break;
1914 1914
1915 1915 case MSR_MTRRcap:
1916 1916 vm_inject_gp(vm, vcpuid);
1917 1917 break;
1918 1918 case MSR_MTRRdefType:
1919 1919 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
1920 1920 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1921 1921 case MSR_MTRR64kBase:
1922 1922 /* Ignore writes */
1923 1923 break;
1924 1924
1925 1925 case MSR_TSC:
1926 1926 /*
1927 1927 * The effect of writing the TSC MSR is that a subsequent read
1928 1928 * of the TSC would report that value written (plus any time
1929 1929 * elapsed between the write and the read). The guest TSC value
1930 1930 * is calculated from a global offset for the guest (which
1931 1931 * effectively makes its TSC read 0 at guest boot) and a
1932 1932 * per-vCPU offset to handle these writes to the MSR.
1933 1933 *
1934 1934 * To calculate that per-vCPU offset, we can work backwards from
1935 1935 * the guest value at the time of write:
1936 1936 *
1937 1937 * value = host TSC + VM boot offset + vCPU offset
1938 1938 *
1939 1939 * so therefore:
1940 1940 *
1941 1941 * value - host TSC - VM boot offset = vCPU offset
1942 1942 */
1943 1943 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
1944 1944 break;
1945 1945
1946 1946 default:
1947 1947 /*
1948 1948 * Anything not handled at this point will be kicked out to
1949 1949 * userspace for attempted processing there.
1950 1950 */
1951 1951 return (-1);
1952 1952 }
1953 1953
1954 1954 return (0);
1955 1955 }
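
The MSR_TSC comment above derives the per-vCPU offset by rearranging "value = host TSC + VM boot offset + vCPU offset". A hedged, standalone sketch of that arithmetic with purely illustrative numbers (all values invented for the example):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Hypothetical values, chosen only to show the arithmetic. */
        uint64_t host_tsc = 1000000;                    /* stands in for rdtsc_offset() */
        uint64_t boot_offset = (uint64_t)-1000000;      /* makes the guest TSC read 0 at boot */
        uint64_t wval = 5000;                           /* value the guest wrote to the TSC MSR */

        /* value = host TSC + boot offset + vCPU offset  =>  solve for the vCPU offset */
        uint64_t vcpu_offset = wval - boot_offset - host_tsc;

        /* A subsequent guest read then observes (approximately) the written value. */
        uint64_t guest_tsc = host_tsc + boot_offset + vcpu_offset;
        (void) printf("guest TSC after write: %llu\n",
            (unsigned long long)guest_tsc);
        return (0);
    }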
1956 1956
1957 1957 int
1958 1958 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1959 1959 {
1960 1960 int i;
1961 1961
1962 1962 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1963 1963 return (EINVAL);
1964 1964
1965 1965 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1966 1966 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1967 1967 vm->suspend, how);
1968 1968 return (EALREADY);
1969 1969 }
1970 1970
1971 1971 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1972 1972
1973 1973 /*
1974 1974 * Notify all active vcpus that they are now suspended.
1975 1975 */
1976 1976 for (i = 0; i < vm->maxcpus; i++) {
1977 1977 if (CPU_ISSET(i, &vm->active_cpus))
1978 1978 vcpu_notify_event(vm, i);
1979 1979 }
1980 1980
1981 1981 return (0);
1982 1982 }
1983 1983
1984 1984 void
1985 1985 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1986 1986 {
1987 1987 struct vm_exit *vmexit;
1988 1988
1989 1989 vmexit = vm_exitinfo(vm, vcpuid);
1990 1990 vmexit->rip = rip;
1991 1991 vmexit->inst_length = 0;
1992 1992 vmexit->exitcode = VM_EXITCODE_RUN_STATE;
1993 1993 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
1994 1994 }
1995 1995
1996 1996 /*
1997 1997 * Some vmm resources, such as the lapic, may have CPU-specific allocations
1998 1998 * which would benefit from migration onto the host CPU that is processing
1999 1999 * the vcpu state.
2000 2000 */
2001 2001 static void
2002 2002 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2003 2003 {
2004 2004 /*
2005 2005 * Localizing cyclic resources requires acquisition of cpu_lock, and
2006 2006 * doing so with kpreempt disabled is a recipe for deadlock disaster.
2007 2007 */
2008 2008 VERIFY(curthread->t_preempt == 0);
2009 2009
2010 2010 /*
2011 2011 * Do not bother with localization if this vCPU is about to return to
2012 2012 * the host CPU it was last localized to.
2013 2013 */
2014 2014 if (vcpu->lastloccpu == curcpu)
2015 2015 return;
2016 2016
2017 2017 /*
2018 2018 * Localize system-wide resources to the primary boot vCPU. While any
2019 2019 * of the other vCPUs may access them, it keeps the potential interrupt
2020 2020 * footprint constrained to CPUs involved with this instance.
2021 2021 */
2022 2022 if (vcpu == &vm->vcpu[0]) {
2023 2023 vhpet_localize_resources(vm->vhpet);
2024 2024 vrtc_localize_resources(vm->vrtc);
2025 2025 vatpit_localize_resources(vm->vatpit);
2026 2026 }
2027 2027
2028 2028 vlapic_localize_resources(vcpu->vlapic);
2029 2029
2030 2030 vcpu->lastloccpu = curcpu;
2031 2031 }
2032 2032
2033 2033 static void
2034 2034 vmm_savectx(void *arg)
2035 2035 {
2036 2036 vm_thread_ctx_t *vtc = arg;
2037 2037 struct vm *vm = vtc->vtc_vm;
2038 2038 const int vcpuid = vtc->vtc_vcpuid;
2039 2039
2040 2040 if (ops->vmsavectx != NULL) {
2041 2041 ops->vmsavectx(vm->cookie, vcpuid);
2042 2042 }
2043 2043
2044 2044 /*
2045 2045 * Account for going off-cpu, unless the vCPU is idled, in which case
2046 2046 * being off-cpu is the whole point.
2047 2047 */
2048 2048 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2049 2049 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2050 2050 vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2051 2051 }
2052 2052
2053 2053 /*
2054 2054 * If the CPU holds the restored guest FPU state, save it and restore
2055 2055 * the host FPU state before this thread goes off-cpu.
2056 2056 */
2057 2057 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2058 2058 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2059 2059
2060 2060 save_guest_fpustate(vcpu);
2061 2061 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2062 2062 }
2063 2063 }
2064 2064
2065 2065 static void
2066 2066 vmm_restorectx(void *arg)
2067 2067 {
2068 2068 vm_thread_ctx_t *vtc = arg;
2069 2069 struct vm *vm = vtc->vtc_vm;
2070 2070 const int vcpuid = vtc->vtc_vcpuid;
2071 2071
2072 2072 /* Complete microstate accounting for vCPU being off-cpu */
2073 2073 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2074 2074 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2075 2075 }
2076 2076
2077 2077 /*
2078 2078 * When coming back on-cpu, only restore the guest FPU status if the
2079 2079 * thread is in a context marked as requiring it. This should be rare,
2080 2080 * occurring only when a future logic error results in a voluntary
2081 2081 * sleep during the VMRUN critical section.
2082 2082 *
2083 2083 * The common case will result in elision of the guest FPU state
2084 2084 * restoration, deferring that action until it is clearly necessary
2085 2085 * during vm_run.
2086 2086 */
2087 2087 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2088 2088 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2089 2089 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2090 2090
2091 2091 restore_guest_fpustate(vcpu);
2092 2092 vtc->vtc_status |= VTCS_FPU_RESTORED;
2093 2093 }
2094 2094
2095 2095 if (ops->vmrestorectx != NULL) {
2096 2096 ops->vmrestorectx(vm->cookie, vcpuid);
2097 2097 }
2098 2098
2099 2099 }
2100 2100
2101 2101 /*
2102 2102 * If we're in removectx(), we might still have state to tidy up.
2103 2103 */
2104 2104 static void
2105 2105 vmm_freectx(void *arg, int isexec)
2106 2106 {
2107 2107 vmm_savectx(arg);
2108 2108 }
2109 2109
2110 2110 static int
2111 2111 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2112 2112 struct vm_exit *vme)
2113 2113 {
2114 2114 struct vcpu *vcpu;
2115 2115 struct vie *vie;
2116 2116 int err;
2117 2117
2118 2118 vcpu = &vm->vcpu[vcpuid];
2119 2119 vie = vcpu->vie_ctx;
2120 2120 err = 0;
2121 2121
2122 2122 switch (entry->cmd) {
2123 2123 case VEC_DEFAULT:
2124 2124 return (0);
2125 2125 case VEC_DISCARD_INSTR:
2126 2126 vie_reset(vie);
2127 2127 return (0);
2128 2128 case VEC_FULFILL_MMIO:
2129 2129 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2130 2130 if (err == 0) {
2131 2131 err = vie_emulate_mmio(vie, vm, vcpuid);
2132 2132 if (err == 0) {
2133 2133 vie_advance_pc(vie, &vcpu->nextrip);
2134 2134 } else if (err < 0) {
2135 2135 vie_exitinfo(vie, vme);
2136 2136 } else if (err == EAGAIN) {
2137 2137 /*
2138 2138 * Clear the instruction emulation state in
2139 2139 * order to re-enter VM context and continue
2140 2140 * this 'rep <instruction>'
2141 2141 */
2142 2142 vie_reset(vie);
2143 2143 err = 0;
2144 2144 }
2145 2145 }
2146 2146 break;
2147 2147 case VEC_FULFILL_INOUT:
2148 2148 err = vie_fulfill_inout(vie, &entry->u.inout);
2149 2149 if (err == 0) {
2150 2150 err = vie_emulate_inout(vie, vm, vcpuid);
2151 2151 if (err == 0) {
2152 2152 vie_advance_pc(vie, &vcpu->nextrip);
2153 2153 } else if (err < 0) {
2154 2154 vie_exitinfo(vie, vme);
2155 2155 } else if (err == EAGAIN) {
2156 2156 /*
2157 2157 * Clear the instruction emulation state in
2158 2158 * order to re-enter VM context and continue
2159 2159 * this 'rep ins/outs'
2160 2160 */
2161 2161 vie_reset(vie);
2162 2162 err = 0;
2163 2163 }
2164 2164 }
2165 2165 break;
2166 2166 default:
2167 2167 return (EINVAL);
2168 2168 }
2169 2169 return (err);
2170 2170 }
2171 2171
2172 2172 static int
2173 2173 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2174 2174 {
2175 2175 struct vie *vie;
2176 2176
2177 2177 vie = vm->vcpu[vcpuid].vie_ctx;
2178 2178
2179 2179 if (vie_pending(vie)) {
2180 2180 /*
2181 2181 * Userspace has not fulfilled the pending needs of the
2182 2182 * instruction emulation, so bail back out.
2183 2183 */
2184 2184 vie_exitinfo(vie, vme);
2185 2185 return (-1);
2186 2186 }
2187 2187
2188 2188 return (0);
2189 2189 }
2190 2190
2191 2191 int
2192 2192 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2193 2193 {
2194 2194 int error;
2195 2195 struct vcpu *vcpu;
2196 2196 struct vm_exit *vme;
2197 2197 bool intr_disabled;
2198 2198 pmap_t pmap;
2199 2199 vm_thread_ctx_t vtc;
2200 2200 int affinity_type = CPU_CURRENT;
2201 2201
2202 2202 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2203 2203 return (EINVAL);
2204 2204 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2205 2205 return (EINVAL);
2206 2206 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2207 2207 return (EINVAL);
2208 2208
2209 2209 pmap = vmspace_pmap(vm->vmspace);
2210 2210 vcpu = &vm->vcpu[vcpuid];
2211 2211 vme = &vcpu->exitinfo;
2212 2212
2213 2213 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2214 2214
2215 2215 vtc.vtc_vm = vm;
2216 2216 vtc.vtc_vcpuid = vcpuid;
2217 2217 vtc.vtc_status = 0;
2218 2218 installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2219 - NULL, vmm_freectx);
2219 + NULL, vmm_freectx, NULL);
2220 2220
2221 2221 error = vm_entry_actions(vm, vcpuid, entry, vme);
2222 2222 if (error != 0) {
2223 2223 goto exit;
2224 2224 }
2225 2225
2226 2226 restart:
2227 2227 error = vm_loop_checks(vm, vcpuid, vme);
2228 2228 if (error != 0) {
2229 2229 goto exit;
2230 2230 }
2231 2231
2232 2232 thread_affinity_set(curthread, affinity_type);
2233 2233 /*
2234 2234 * Resource localization should happen after the CPU affinity for the
2235 2235 * thread has been set to ensure that access from restricted contexts,
2236 2236 * such as VMX-accelerated APIC operations, can occur without inducing
2237 2237 * cyclic cross-calls.
2238 2238 *
2239 2239 * This must be done prior to disabling kpreempt via critical_enter().
2240 2240 */
2241 2241 vm_localize_resources(vm, vcpu);
2242 2242 affinity_type = CPU_CURRENT;
2243 2243 critical_enter();
2244 2244
2245 2245 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2246 2246 ("vm_run: absurd pm_active"));
2247 2247
2248 2248 /* Force a trip through update_sregs to reload %fs/%gs and friends */
2249 2249 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2250 2250
2251 2251 if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2252 2252 restore_guest_fpustate(vcpu);
2253 2253 vtc.vtc_status |= VTCS_FPU_RESTORED;
2254 2254 }
2255 2255 vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2256 2256
2257 2257 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2258 2258 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2259 2259 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2260 2260
2261 2261 /*
2262 2262 * Once clear of the delicate contexts comprising the VM_RUN handler,
2263 2263 * thread CPU affinity can be loosened while other processing occurs.
2264 2264 */
2265 2265 vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2266 2266 thread_affinity_clear(curthread);
2267 2267 critical_exit();
2268 2268
2269 2269 if (error != 0) {
2270 2270 /* Communicate out any error from VMRUN() above */
2271 2271 goto exit;
2272 2272 }
2273 2273
2274 2274 vcpu->nextrip = vme->rip + vme->inst_length;
2275 2275 switch (vme->exitcode) {
2276 2276 case VM_EXITCODE_REQIDLE:
2277 2277 error = vm_handle_reqidle(vm, vcpuid);
2278 2278 break;
2279 2279 case VM_EXITCODE_RUN_STATE:
2280 2280 error = vm_handle_run_state(vm, vcpuid);
2281 2281 break;
2282 2282 case VM_EXITCODE_SUSPENDED:
2283 2283 error = vm_handle_suspend(vm, vcpuid);
2284 2284 break;
2285 2285 case VM_EXITCODE_IOAPIC_EOI:
2286 2286 vioapic_process_eoi(vm, vcpuid,
2287 2287 vme->u.ioapic_eoi.vector);
2288 2288 break;
2289 2289 case VM_EXITCODE_HLT:
2290 2290 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2291 2291 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2292 2292 break;
2293 2293 case VM_EXITCODE_PAGING:
2294 2294 error = vm_handle_paging(vm, vcpuid);
2295 2295 break;
2296 2296 case VM_EXITCODE_MMIO_EMUL:
2297 2297 error = vm_handle_mmio_emul(vm, vcpuid);
2298 2298 break;
2299 2299 case VM_EXITCODE_INOUT:
2300 2300 error = vm_handle_inout(vm, vcpuid, vme);
2301 2301 break;
2302 2302 case VM_EXITCODE_INST_EMUL:
2303 2303 error = vm_handle_inst_emul(vm, vcpuid);
2304 2304 break;
2305 2305 case VM_EXITCODE_MONITOR:
2306 2306 case VM_EXITCODE_MWAIT:
2307 2307 case VM_EXITCODE_VMINSN:
2308 2308 vm_inject_ud(vm, vcpuid);
2309 2309 break;
2310 2310 case VM_EXITCODE_RDMSR:
2311 2311 error = vm_handle_rdmsr(vm, vcpuid, vme);
2312 2312 break;
2313 2313 case VM_EXITCODE_WRMSR:
2314 2314 error = vm_handle_wrmsr(vm, vcpuid, vme);
2315 2315 break;
2316 2316 case VM_EXITCODE_HT:
2317 2317 affinity_type = CPU_BEST;
2318 2318 break;
2319 2319 case VM_EXITCODE_MTRAP:
2320 2320 vm_suspend_cpu(vm, vcpuid);
2321 2321 error = -1;
2322 2322 break;
2323 2323 default:
2324 2324 /* handled in userland */
2325 2325 error = -1;
2326 2326 break;
2327 2327 }
2328 2328
2329 2329 if (error == 0) {
2330 2330 /* VM exit conditions handled in-kernel, continue running */
2331 2331 goto restart;
2332 2332 }
2333 2333
2334 2334 exit:
2335 2335 removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2336 2336 NULL, vmm_freectx);
2337 2337
2338 2338 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2339 2339
2340 2340 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2341 2341 return (error);
2342 2342 }
2343 2343
2344 2344 int
2345 2345 vm_restart_instruction(void *arg, int vcpuid)
2346 2346 {
2347 2347 struct vm *vm;
2348 2348 struct vcpu *vcpu;
2349 2349 enum vcpu_state state;
2350 2350 uint64_t rip;
2351 2351 int error;
2352 2352
2353 2353 vm = arg;
2354 2354 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2355 2355 return (EINVAL);
2356 2356
2357 2357 vcpu = &vm->vcpu[vcpuid];
2358 2358 state = vcpu_get_state(vm, vcpuid, NULL);
2359 2359 if (state == VCPU_RUNNING) {
2360 2360 /*
2361 2361 * When a vcpu is "running" the next instruction is determined
2362 2362 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2363 2363 * Thus setting 'inst_length' to zero will cause the current
2364 2364 * instruction to be restarted.
2365 2365 */
2366 2366 vcpu->exitinfo.inst_length = 0;
2367 2367 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2368 2368 "setting inst_length to zero", vcpu->exitinfo.rip);
2369 2369 } else if (state == VCPU_FROZEN) {
2370 2370 /*
2371 2371 * When a vcpu is "frozen" it is outside the critical section
2372 2372 * around VMRUN() and 'nextrip' points to the next instruction.
2373 2373 * Thus instruction restart is achieved by setting 'nextrip'
2374 2374 * to the vcpu's %rip.
2375 2375 */
2376 2376 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2377 2377 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2378 2378 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2379 2379 "nextrip from %lx to %lx", vcpu->nextrip, rip);
2380 2380 vcpu->nextrip = rip;
2381 2381 } else {
2382 2382 panic("%s: invalid state %d", __func__, state);
2383 2383 }
2384 2384 return (0);
2385 2385 }
2386 2386
2387 2387 int
2388 2388 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2389 2389 {
2390 2390 struct vcpu *vcpu;
2391 2391 int type, vector;
2392 2392
2393 2393 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2394 2394 return (EINVAL);
2395 2395
2396 2396 vcpu = &vm->vcpu[vcpuid];
2397 2397
2398 2398 if (info & VM_INTINFO_VALID) {
2399 2399 type = info & VM_INTINFO_TYPE;
2400 2400 vector = info & 0xff;
2401 2401 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2402 2402 return (EINVAL);
2403 2403 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2404 2404 return (EINVAL);
2405 2405 if (info & VM_INTINFO_RSVD)
2406 2406 return (EINVAL);
2407 2407 } else {
2408 2408 info = 0;
2409 2409 }
2410 2410 VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2411 2411 vcpu->exitintinfo = info;
2412 2412 return (0);
2413 2413 }
2414 2414
2415 2415 enum exc_class {
2416 2416 EXC_BENIGN,
2417 2417 EXC_CONTRIBUTORY,
2418 2418 EXC_PAGEFAULT
2419 2419 };
2420 2420
2421 2421 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */
2422 2422
2423 2423 static enum exc_class
2424 2424 exception_class(uint64_t info)
2425 2425 {
2426 2426 int type, vector;
2427 2427
2428 2428 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2429 2429 type = info & VM_INTINFO_TYPE;
2430 2430 vector = info & 0xff;
2431 2431
2432 2432 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2433 2433 switch (type) {
2434 2434 case VM_INTINFO_HWINTR:
2435 2435 case VM_INTINFO_SWINTR:
2436 2436 case VM_INTINFO_NMI:
2437 2437 return (EXC_BENIGN);
2438 2438 default:
2439 2439 /*
2440 2440 * Hardware exception.
2441 2441 *
2442 2442 * SVM and VT-x use identical type values to represent NMI,
2443 2443 * hardware interrupt and software interrupt.
2444 2444 *
2445 2445 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2446 2446 * for exceptions except #BP and #OF. #BP and #OF use a type
2447 2447 * value of '5' or '6'. Therefore we don't check for explicit
2448 2448 * values of 'type' to classify 'intinfo' into a hardware
2449 2449 * exception.
2450 2450 */
2451 2451 break;
2452 2452 }
2453 2453
2454 2454 switch (vector) {
2455 2455 case IDT_PF:
2456 2456 case IDT_VE:
2457 2457 return (EXC_PAGEFAULT);
2458 2458 case IDT_DE:
2459 2459 case IDT_TS:
2460 2460 case IDT_NP:
2461 2461 case IDT_SS:
2462 2462 case IDT_GP:
2463 2463 return (EXC_CONTRIBUTORY);
2464 2464 default:
2465 2465 return (EXC_BENIGN);
2466 2466 }
2467 2467 }
2468 2468
2469 2469 static int
2470 2470 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2471 2471 uint64_t *retinfo)
2472 2472 {
2473 2473 enum exc_class exc1, exc2;
2474 2474 int type1, vector1;
2475 2475
2476 2476 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2477 2477 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2478 2478
2479 2479 /*
2480 2480 * If an exception occurs while attempting to call the double-fault
2481 2481 * handler the processor enters shutdown mode (aka triple fault).
2482 2482 */
2483 2483 type1 = info1 & VM_INTINFO_TYPE;
2484 2484 vector1 = info1 & 0xff;
2485 2485 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2486 2486 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2487 2487 info1, info2);
2488 2488 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2489 2489 *retinfo = 0;
2490 2490 return (0);
2491 2491 }
2492 2492
2493 2493 /*
2494 2494 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2495 2495 */
2496 2496 exc1 = exception_class(info1);
2497 2497 exc2 = exception_class(info2);
2498 2498 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2499 2499 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2500 2500 /* Convert nested fault into a double fault. */
2501 2501 *retinfo = IDT_DF;
2502 2502 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2503 2503 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2504 2504 } else {
2505 2505 /* Handle exceptions serially */
2506 2506 *retinfo = info2;
2507 2507 }
2508 2508 return (1);
2509 2509 }
2510 2510
2511 2511 static uint64_t
2512 2512 vcpu_exception_intinfo(struct vcpu *vcpu)
2513 2513 {
2514 2514 uint64_t info = 0;
2515 2515
2516 2516 if (vcpu->exception_pending) {
2517 2517 info = vcpu->exc_vector & 0xff;
2518 2518 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2519 2519 if (vcpu->exc_errcode_valid) {
2520 2520 info |= VM_INTINFO_DEL_ERRCODE;
2521 2521 info |= (uint64_t)vcpu->exc_errcode << 32;
2522 2522 }
2523 2523 }
2524 2524 return (info);
2525 2525 }
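
For reference, the intinfo packing used here keeps the vector in the low byte and the error code in the upper 32 bits, with validity/type flags in between. The sketch below uses placeholder flag values; the real bit positions come from the vmm headers and are assumptions here:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative placeholders; the real definitions live in the vmm headers. */
    #define XINFO_VALID             (1ULL << 31)
    #define XINFO_HWEXCEPTION       (3ULL << 8)
    #define XINFO_DEL_ERRCODE       (1ULL << 11)

    int
    main(void)
    {
        uint8_t vector = 13;        /* e.g. #GP */
        uint32_t errcode = 0;

        uint64_t info = vector & 0xff;
        info |= XINFO_VALID | XINFO_HWEXCEPTION;
        info |= XINFO_DEL_ERRCODE;
        info |= (uint64_t)errcode << 32;    /* error code occupies the upper 32 bits */

        (void) printf("intinfo=%#llx\n", (unsigned long long)info);
        return (0);
    }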
2526 2526
2527 2527 int
2528 2528 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2529 2529 {
2530 2530 struct vcpu *vcpu;
2531 2531 uint64_t info1, info2;
2532 2532 int valid;
2533 2533
2534 2534 KASSERT(vcpuid >= 0 &&
2535 2535 vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2536 2536
2537 2537 vcpu = &vm->vcpu[vcpuid];
2538 2538
2539 2539 info1 = vcpu->exitintinfo;
2540 2540 vcpu->exitintinfo = 0;
2541 2541
2542 2542 info2 = 0;
2543 2543 if (vcpu->exception_pending) {
2544 2544 info2 = vcpu_exception_intinfo(vcpu);
2545 2545 vcpu->exception_pending = 0;
2546 2546 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2547 2547 vcpu->exc_vector, info2);
2548 2548 }
2549 2549
2550 2550 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2551 2551 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2552 2552 } else if (info1 & VM_INTINFO_VALID) {
2553 2553 *retinfo = info1;
2554 2554 valid = 1;
2555 2555 } else if (info2 & VM_INTINFO_VALID) {
2556 2556 *retinfo = info2;
2557 2557 valid = 1;
2558 2558 } else {
2559 2559 valid = 0;
2560 2560 }
2561 2561
2562 2562 if (valid) {
2563 2563 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2564 2564 "retinfo(%lx)", __func__, info1, info2, *retinfo);
2565 2565 }
2566 2566
2567 2567 return (valid);
2568 2568 }
2569 2569
2570 2570 int
2571 2571 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2572 2572 {
2573 2573 struct vcpu *vcpu;
2574 2574
2575 2575 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2576 2576 return (EINVAL);
2577 2577
2578 2578 vcpu = &vm->vcpu[vcpuid];
2579 2579 *info1 = vcpu->exitintinfo;
2580 2580 *info2 = vcpu_exception_intinfo(vcpu);
2581 2581 return (0);
2582 2582 }
2583 2583
2584 2584 int
2585 2585 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2586 2586 uint32_t errcode, int restart_instruction)
2587 2587 {
2588 2588 struct vcpu *vcpu;
2589 2589 uint64_t regval;
2590 2590 int error;
2591 2591
2592 2592 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2593 2593 return (EINVAL);
2594 2594
2595 2595 if (vector < 0 || vector >= 32)
2596 2596 return (EINVAL);
2597 2597
2598 2598 /*
2599 2599 * NMIs (which bear an exception vector of 2) are to be injected via
2600 2600 * their own specialized path using vm_inject_nmi().
2601 2601 */
2602 2602 if (vector == 2) {
2603 2603 return (EINVAL);
2604 2604 }
2605 2605
2606 2606 /*
2607 2607 * A double fault exception should never be injected directly into
2608 2608 * the guest. It is a derived exception that results from specific
2609 2609 * combinations of nested faults.
2610 2610 */
2611 2611 if (vector == IDT_DF)
2612 2612 return (EINVAL);
2613 2613
2614 2614 vcpu = &vm->vcpu[vcpuid];
2615 2615
2616 2616 if (vcpu->exception_pending) {
2617 2617 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2618 2618 "pending exception %d", vector, vcpu->exc_vector);
2619 2619 return (EBUSY);
2620 2620 }
2621 2621
2622 2622 if (errcode_valid) {
2623 2623 /*
2624 2624 * Exceptions don't deliver an error code in real mode.
2625 2625 */
2626 2626 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2627 2627 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2628 2628 if (!(regval & CR0_PE))
2629 2629 errcode_valid = 0;
2630 2630 }
2631 2631
2632 2632 /*
2633 2633 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2634 2634 *
2635 2635 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2636 2636 * one instruction or incurs an exception.
2637 2637 */
2638 2638 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2639 2639 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2640 2640 __func__, error));
2641 2641
2642 2642 if (restart_instruction)
2643 2643 vm_restart_instruction(vm, vcpuid);
2644 2644
2645 2645 vcpu->exception_pending = 1;
2646 2646 vcpu->exc_vector = vector;
2647 2647 vcpu->exc_errcode = errcode;
2648 2648 vcpu->exc_errcode_valid = errcode_valid;
2649 2649 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2650 2650 return (0);
2651 2651 }
2652 2652
2653 2653 void
2654 2654 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2655 2655 int errcode)
2656 2656 {
2657 2657 int error;
2658 2658
2659 2659 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2660 2660 errcode, 1);
2661 2661 KASSERT(error == 0, ("vm_inject_exception error %d", error));
2662 2662 }
2663 2663
2664 2664 void
2665 2665 vm_inject_ud(struct vm *vm, int vcpuid)
2666 2666 {
2667 2667 vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2668 2668 }
2669 2669
2670 2670 void
2671 2671 vm_inject_gp(struct vm *vm, int vcpuid)
2672 2672 {
2673 2673 vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2674 2674 }
2675 2675
2676 2676 void
2677 2677 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2678 2678 {
2679 2679 vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2680 2680 }
2681 2681
2682 2682 void
2683 2683 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2684 2684 {
2685 2685 vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2686 2686 }
2687 2687
2688 2688 void
2689 2689 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2690 2690 {
2691 2691 int error;
2692 2692
2693 2693 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2694 2694 error_code, cr2);
2695 2695
2696 2696 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2697 2697 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2698 2698
2699 2699 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2700 2700 }
2701 2701
2702 2702 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2703 2703
2704 2704 int
2705 2705 vm_inject_nmi(struct vm *vm, int vcpuid)
2706 2706 {
2707 2707 struct vcpu *vcpu;
2708 2708
2709 2709 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2710 2710 return (EINVAL);
2711 2711
2712 2712 vcpu = &vm->vcpu[vcpuid];
2713 2713
2714 2714 vcpu->nmi_pending = 1;
2715 2715 vcpu_notify_event(vm, vcpuid);
2716 2716 return (0);
2717 2717 }
2718 2718
2719 2719 int
2720 2720 vm_nmi_pending(struct vm *vm, int vcpuid)
2721 2721 {
2722 2722 struct vcpu *vcpu;
2723 2723
2724 2724 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2725 2725 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2726 2726
2727 2727 vcpu = &vm->vcpu[vcpuid];
2728 2728
2729 2729 return (vcpu->nmi_pending);
2730 2730 }
2731 2731
2732 2732 void
2733 2733 vm_nmi_clear(struct vm *vm, int vcpuid)
2734 2734 {
2735 2735 struct vcpu *vcpu;
2736 2736
2737 2737 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2738 2738 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2739 2739
2740 2740 vcpu = &vm->vcpu[vcpuid];
2741 2741
2742 2742 if (vcpu->nmi_pending == 0)
2743 2743 panic("vm_nmi_clear: inconsistent nmi_pending state");
2744 2744
2745 2745 vcpu->nmi_pending = 0;
2746 2746 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2747 2747 }
2748 2748
2749 2749 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2750 2750
2751 2751 int
2752 2752 vm_inject_extint(struct vm *vm, int vcpuid)
2753 2753 {
2754 2754 struct vcpu *vcpu;
2755 2755
2756 2756 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2757 2757 return (EINVAL);
2758 2758
2759 2759 vcpu = &vm->vcpu[vcpuid];
2760 2760
2761 2761 vcpu->extint_pending = 1;
2762 2762 vcpu_notify_event(vm, vcpuid);
2763 2763 return (0);
2764 2764 }
2765 2765
2766 2766 int
2767 2767 vm_extint_pending(struct vm *vm, int vcpuid)
2768 2768 {
2769 2769 struct vcpu *vcpu;
2770 2770
2771 2771 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2772 2772 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2773 2773
2774 2774 vcpu = &vm->vcpu[vcpuid];
2775 2775
2776 2776 return (vcpu->extint_pending);
2777 2777 }
2778 2778
2779 2779 void
2780 2780 vm_extint_clear(struct vm *vm, int vcpuid)
2781 2781 {
2782 2782 struct vcpu *vcpu;
2783 2783
2784 2784 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2785 2785 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2786 2786
2787 2787 vcpu = &vm->vcpu[vcpuid];
2788 2788
2789 2789 if (vcpu->extint_pending == 0)
2790 2790 panic("vm_extint_clear: inconsistent extint_pending state");
2791 2791
2792 2792 vcpu->extint_pending = 0;
2793 2793 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2794 2794 }
2795 2795
2796 2796 int
2797 2797 vm_inject_init(struct vm *vm, int vcpuid)
2798 2798 {
2799 2799 struct vcpu *vcpu;
2800 2800
2801 2801 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2802 2802 return (EINVAL);
2803 2803
2804 2804 vcpu = &vm->vcpu[vcpuid];
2805 2805 vcpu_lock(vcpu);
2806 2806 vcpu->run_state |= VRS_PEND_INIT;
2807 2807 /*
2808 2808 * As part of queuing the INIT request, clear any pending SIPI. It
2809 2809 * would not otherwise survive across the reset of the vCPU when it
2810 2810 * undergoes the requested INIT. We would not want it to linger when it
2811 2811 * could be mistaken for a subsequent (after the INIT) SIPI request.
2812 2812 */
2813 2813 vcpu->run_state &= ~VRS_PEND_SIPI;
2814 2814 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2815 2815
2816 2816 vcpu_unlock(vcpu);
2817 2817 return (0);
2818 2818 }
2819 2819
2820 2820 int
2821 2821 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2822 2822 {
2823 2823 struct vcpu *vcpu;
2824 2824
2825 2825 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2826 2826 return (EINVAL);
2827 2827
2828 2828 vcpu = &vm->vcpu[vcpuid];
2829 2829 vcpu_lock(vcpu);
2830 2830 vcpu->run_state |= VRS_PEND_SIPI;
2831 2831 vcpu->sipi_vector = vector;
2832 2832 /* SIPI is only actionable if the CPU is waiting in INIT state */
2833 2833 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2834 2834 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2835 2835 }
2836 2836 vcpu_unlock(vcpu);
2837 2837 return (0);
2838 2838 }
2839 2839
2840 2840 bool
2841 2841 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2842 2842 {
2843 2843 struct vcpu *vcpu;
2844 2844
2845 2845 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2846 2846 vcpu = &vm->vcpu[vcpuid];
2847 2847
2848 2848 /* Of interest: vCPU not in running state or with pending INIT */
2849 2849 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2850 2850 }
2851 2851
2852 2852 int
2853 2853 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2854 2854 {
2855 2855 struct seg_desc desc;
2856 2856 const enum vm_reg_name clear_regs[] = {
2857 2857 VM_REG_GUEST_CR2,
2858 2858 VM_REG_GUEST_CR3,
2859 2859 VM_REG_GUEST_CR4,
2860 2860 VM_REG_GUEST_RAX,
2861 2861 VM_REG_GUEST_RBX,
2862 2862 VM_REG_GUEST_RCX,
2863 2863 VM_REG_GUEST_RSI,
2864 2864 VM_REG_GUEST_RDI,
2865 2865 VM_REG_GUEST_RBP,
2866 2866 VM_REG_GUEST_RSP,
2867 2867 VM_REG_GUEST_R8,
2868 2868 VM_REG_GUEST_R9,
2869 2869 VM_REG_GUEST_R10,
2870 2870 VM_REG_GUEST_R11,
2871 2871 VM_REG_GUEST_R12,
2872 2872 VM_REG_GUEST_R13,
2873 2873 VM_REG_GUEST_R14,
2874 2874 VM_REG_GUEST_R15,
2875 2875 VM_REG_GUEST_DR0,
2876 2876 VM_REG_GUEST_DR1,
2877 2877 VM_REG_GUEST_DR2,
2878 2878 VM_REG_GUEST_DR3,
2879 2879 VM_REG_GUEST_EFER,
2880 2880 };
2881 2881 const enum vm_reg_name data_segs[] = {
2882 2882 VM_REG_GUEST_SS,
2883 2883 VM_REG_GUEST_DS,
2884 2884 VM_REG_GUEST_ES,
2885 2885 VM_REG_GUEST_FS,
2886 2886 VM_REG_GUEST_GS,
2887 2887 };
2888 2888 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2889 2889
2890 2890 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2891 2891 return (EINVAL);
2892 2892
2893 2893 for (uint_t i = 0; i < nitems(clear_regs); i++) {
2894 2894 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2895 2895 }
2896 2896
2897 2897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2898 2898 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2899 2899 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2900 2900
2901 2901 /*
2902 2902 * The prescribed contents of %rdx differ slightly between the Intel and
2903 2903 * AMD architectural definitions. The former expects the Extended Model
2904 2904 * in bits 16-19 where the latter expects all the Family, Model, and
2905 2905 * Stepping to be there. Common boot ROMs appear to disregard this
2906 2906 * anyway, so we stick with a compromise value similar to what is
2907 2907 * spelled out in the Intel SDM.
2908 2908 */
2909 2909 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2910 2910
2911 2911 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2912 2912 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2913 2913
2914 2914 /* CS: Present, R/W, Accessed */
2915 2915 desc.access = 0x0093;
2916 2916 desc.base = 0xffff0000;
2917 2917 desc.limit = 0xffff;
2918 2918 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2919 2919 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2920 2920
2921 2921 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2922 2922 desc.access = 0x0093;
2923 2923 desc.base = 0;
2924 2924 desc.limit = 0xffff;
2925 2925 for (uint_t i = 0; i < nitems(data_segs); i++) {
2926 2926 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2927 2927 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2928 2928 }
2929 2929
2930 2930 /* GDTR, IDTR */
2931 2931 desc.base = 0;
2932 2932 desc.limit = 0xffff;
2933 2933 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2934 2934 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2935 2935
2936 2936 /* LDTR: Present, LDT */
2937 2937 desc.access = 0x0082;
2938 2938 desc.base = 0;
2939 2939 desc.limit = 0xffff;
2940 2940 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2941 2941 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2942 2942
2943 2943 /* TR: Present, 32-bit TSS */
2944 2944 desc.access = 0x008b;
2945 2945 desc.base = 0;
2946 2946 desc.limit = 0xffff;
2947 2947 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2948 2948 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2949 2949
2950 2950 vlapic_reset(vm_lapic(vm, vcpuid));
2951 2951
2952 2952 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2953 2953
2954 2954 vcpu->exitintinfo = 0;
2955 2955 vcpu->exception_pending = 0;
2956 2956 vcpu->nmi_pending = 0;
2957 2957 vcpu->extint_pending = 0;
2958 2958
2959 2959 /*
2960 2960 * A CPU reset caused by power-on or system reset clears more state than
2961 2961 * one which is triggered from an INIT IPI.
2962 2962 */
2963 2963 if (!init_only) {
2964 2964 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2965 2965 fpu_save_area_reset(vcpu->guestfpu);
2966 2966
2967 2967 /* XXX: clear MSRs and other pieces */
2968 2968 }
2969 2969
2970 2970 return (0);
2971 2971 }
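
The CS base (0xffff0000) and %rip (0xfff0) programmed above combine into the conventional x86 reset vector. A tiny sketch of that address arithmetic, for illustration only:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Values programmed by the reset path above */
        uint64_t cs_base = 0xffff0000;
        uint64_t rip = 0xfff0;

        /* In real mode the linear fetch address is simply segment base + %rip */
        (void) printf("first fetch at %#llx\n",
            (unsigned long long)(cs_base + rip));   /* prints 0xfffffff0 */
        return (0);
    }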
2972 2972
2973 2973 static int
2974 2974 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2975 2975 {
2976 2976 struct seg_desc desc;
2977 2977
2978 2978 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2979 2979 return (EINVAL);
2980 2980
2981 2981 /* CS: Present, R/W, Accessed */
2982 2982 desc.access = 0x0093;
2983 2983 desc.base = (uint64_t)vector << 12;
2984 2984 desc.limit = 0xffff;
2985 2985 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2986 2986 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2987 2987 (uint64_t)vector << 8));
2988 2988
2989 2989 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2990 2990
2991 2991 return (0);
2992 2992 }
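
The shifts above encode the usual SIPI start-up convention: the CS base is the vector shifted left by 12 and the selector by 8, so with %rip of zero the AP begins executing at vector * 4 KiB. An illustrative sketch (the vector value is arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint8_t vector = 0x9a;      /* hypothetical SIPI vector */

        uint64_t cs_base = (uint64_t)vector << 12;  /* segment base */
        uint64_t cs_sel = (uint64_t)vector << 8;    /* real-mode selector */
        uint64_t rip = 0;

        (void) printf("selector=%#llx start=%#llx\n",
            (unsigned long long)cs_sel,
            (unsigned long long)(cs_base + rip));   /* 0x9a00 and 0x9a000 */
        return (0);
    }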
2993 2993
2994 2994 int
2995 2995 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2996 2996 {
2997 2997 if (vcpu < 0 || vcpu >= vm->maxcpus)
2998 2998 return (EINVAL);
2999 2999
3000 3000 if (type < 0 || type >= VM_CAP_MAX)
3001 3001 return (EINVAL);
3002 3002
3003 3003 return (VMGETCAP(vm->cookie, vcpu, type, retval));
3004 3004 }
3005 3005
3006 3006 int
3007 3007 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3008 3008 {
3009 3009 if (vcpu < 0 || vcpu >= vm->maxcpus)
3010 3010 return (EINVAL);
3011 3011
3012 3012 if (type < 0 || type >= VM_CAP_MAX)
3013 3013 return (EINVAL);
3014 3014
3015 3015 return (VMSETCAP(vm->cookie, vcpu, type, val));
3016 3016 }
3017 3017
3018 3018 struct vlapic *
3019 3019 vm_lapic(struct vm *vm, int cpu)
3020 3020 {
3021 3021 return (vm->vcpu[cpu].vlapic);
3022 3022 }
3023 3023
3024 3024 struct vioapic *
3025 3025 vm_ioapic(struct vm *vm)
3026 3026 {
3027 3027
3028 3028 return (vm->vioapic);
3029 3029 }
3030 3030
3031 3031 struct vhpet *
3032 3032 vm_hpet(struct vm *vm)
3033 3033 {
3034 3034
3035 3035 return (vm->vhpet);
3036 3036 }
3037 3037
3038 3038 void *
3039 3039 vm_iommu_domain(struct vm *vm)
3040 3040 {
3041 3041
3042 3042 return (vm->iommu);
3043 3043 }
3044 3044
3045 3045 int
3046 3046 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3047 3047 bool from_idle)
3048 3048 {
3049 3049 int error;
3050 3050 struct vcpu *vcpu;
3051 3051
3052 3052 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3053 3053 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3054 3054
3055 3055 vcpu = &vm->vcpu[vcpuid];
3056 3056
3057 3057 vcpu_lock(vcpu);
3058 3058 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3059 3059 vcpu_unlock(vcpu);
3060 3060
3061 3061 return (error);
3062 3062 }
3063 3063
3064 3064 enum vcpu_state
3065 3065 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3066 3066 {
3067 3067 struct vcpu *vcpu;
3068 3068 enum vcpu_state state;
3069 3069
3070 3070 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3071 3071 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3072 3072
3073 3073 vcpu = &vm->vcpu[vcpuid];
3074 3074
3075 3075 vcpu_lock(vcpu);
3076 3076 state = vcpu->state;
3077 3077 if (hostcpu != NULL)
3078 3078 *hostcpu = vcpu->hostcpu;
3079 3079 vcpu_unlock(vcpu);
3080 3080
3081 3081 return (state);
3082 3082 }
3083 3083
3084 3084 uint64_t
3085 3085 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3086 3086 {
3087 3087 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3088 3088
3089 3089 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3090 3090
3091 3091 if (phys_adj) {
3092 3092 /* Include any offset for the current physical CPU too */
3093 3093 extern hrtime_t tsc_gethrtime_tick_delta(void);
3094 3094 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3095 3095 }
3096 3096
3097 3097 return (vcpu_off);
3098 3098 }
3099 3099
3100 3100 int
3101 3101 vm_activate_cpu(struct vm *vm, int vcpuid)
3102 3102 {
3103 3103
3104 3104 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3105 3105 return (EINVAL);
3106 3106
3107 3107 if (CPU_ISSET(vcpuid, &vm->active_cpus))
3108 3108 return (EBUSY);
3109 3109
3110 3110 VCPU_CTR0(vm, vcpuid, "activated");
3111 3111 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3112 3112 return (0);
3113 3113 }
3114 3114
3115 3115 int
3116 3116 vm_suspend_cpu(struct vm *vm, int vcpuid)
3117 3117 {
3118 3118 int i;
3119 3119
3120 3120 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3121 3121 return (EINVAL);
3122 3122
3123 3123 if (vcpuid == -1) {
3124 3124 vm->debug_cpus = vm->active_cpus;
3125 3125 for (i = 0; i < vm->maxcpus; i++) {
3126 3126 if (CPU_ISSET(i, &vm->active_cpus))
3127 3127 vcpu_notify_event(vm, i);
3128 3128 }
3129 3129 } else {
3130 3130 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3131 3131 return (EINVAL);
3132 3132
3133 3133 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3134 3134 vcpu_notify_event(vm, vcpuid);
3135 3135 }
3136 3136 return (0);
3137 3137 }
3138 3138
3139 3139 int
3140 3140 vm_resume_cpu(struct vm *vm, int vcpuid)
3141 3141 {
3142 3142
3143 3143 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3144 3144 return (EINVAL);
3145 3145
3146 3146 if (vcpuid == -1) {
3147 3147 CPU_ZERO(&vm->debug_cpus);
3148 3148 } else {
3149 3149 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3150 3150 return (EINVAL);
3151 3151
3152 3152 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3153 3153 }
3154 3154 return (0);
3155 3155 }
3156 3156
3157 3157 static bool
3158 3158 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3159 3159 uint64_t entry_rip)
3160 3160 {
3161 3161 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3162 3162 struct vm_exit *vme = &vcpu->exitinfo;
3163 3163 bool bail = false;
3164 3164
3165 3165 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3166 3166
3167 3167 if (vm->suspend) {
3168 3168 if (on_entry) {
3169 3169 VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3170 3170 vm->suspend < VM_SUSPEND_LAST);
3171 3171
3172 3172 vme->exitcode = VM_EXITCODE_SUSPENDED;
3173 3173 vme->u.suspended.how = vm->suspend;
3174 3174 } else {
3175 3175 /*
3176 3176 * Handling VM suspend is complicated, so if that
3177 3177 * condition is detected outside of VM-entry itself,
3178 3178 * just emit a BOGUS exitcode so we take a lap to pick
3179 3179 * up the event during an entry and are directed into
3180 3180 * the vm_handle_suspend() logic.
3181 3181 */
3182 3182 vme->exitcode = VM_EXITCODE_BOGUS;
3183 3183 }
3184 3184 bail = true;
3185 3185 }
3186 3186 if (vcpu->reqidle) {
3187 3187 vme->exitcode = VM_EXITCODE_REQIDLE;
3188 3188 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3189 3189
3190 3190 if (!on_entry) {
3191 3191 /*
3192 3192 * A reqidle request detected outside of VM-entry can be
3193 3193 * handled directly by clearing the request (and taking
3194 3194 * a lap to userspace).
3195 3195 */
3196 3196 vcpu_assert_locked(vcpu);
3197 3197 vcpu->reqidle = 0;
3198 3198 }
3199 3199 bail = true;
3200 3200 }
3201 3201 if (vcpu_should_yield(vm, vcpuid)) {
3202 3202 vme->exitcode = VM_EXITCODE_BOGUS;
3203 3203 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3204 3204 bail = true;
3205 3205 }
3206 3206 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3207 3207 vme->exitcode = VM_EXITCODE_DEBUG;
3208 3208 bail = true;
3209 3209 }
3210 3210
3211 3211 if (bail) {
3212 3212 if (on_entry) {
3213 3213 /*
3214 3214 * If bailing out during VM-entry, the current %rip must
3215 3215 * be recorded in the exitinfo.
3216 3216 */
3217 3217 vme->rip = entry_rip;
3218 3218 }
3219 3219 vme->inst_length = 0;
3220 3220 }
3221 3221 return (bail);
3222 3222 }
3223 3223
3224 3224 static bool
3225 3225 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3226 3226 {
3227 3227 /*
3228 3228 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3229 3229 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3230 3230 * structure, and we would only modify the exitcode.
3231 3231 */
3232 3232 return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3233 3233 }
3234 3234
3235 3235 bool
3236 3236 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3237 3237 {
3238 3238 /*
3239 3239 * Bail-out checks done as part of VM entry require an updated %rip to
3240 3240 * populate the vm_exit struct if any of the conditions of interest are
3241 3241 * matched in the check.
3242 3242 */
3243 3243 return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3244 3244 }
3245 3245
3246 3246 cpuset_t
3247 3247 vm_active_cpus(struct vm *vm)
3248 3248 {
3249 3249
3250 3250 return (vm->active_cpus);
3251 3251 }
3252 3252
3253 3253 cpuset_t
3254 3254 vm_debug_cpus(struct vm *vm)
3255 3255 {
3256 3256
3257 3257 return (vm->debug_cpus);
3258 3258 }
3259 3259
3260 3260 cpuset_t
3261 3261 vm_suspended_cpus(struct vm *vm)
3262 3262 {
3263 3263
3264 3264 return (vm->suspended_cpus);
3265 3265 }
3266 3266
3267 3267 void *
3268 3268 vcpu_stats(struct vm *vm, int vcpuid)
3269 3269 {
3270 3270
3271 3271 return (vm->vcpu[vcpuid].stats);
3272 3272 }
3273 3273
3274 3274 int
3275 3275 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3276 3276 {
3277 3277 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3278 3278 return (EINVAL);
3279 3279
3280 3280 *state = vm->vcpu[vcpuid].x2apic_state;
3281 3281
3282 3282 return (0);
3283 3283 }
3284 3284
3285 3285 int
3286 3286 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3287 3287 {
3288 3288 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3289 3289 return (EINVAL);
3290 3290
3291 3291 if (state >= X2APIC_STATE_LAST)
3292 3292 return (EINVAL);
3293 3293
3294 3294 vm->vcpu[vcpuid].x2apic_state = state;
3295 3295
3296 3296 vlapic_set_x2apic_state(vm, vcpuid, state);
3297 3297
3298 3298 return (0);
3299 3299 }
3300 3300
3301 3301 /*
3302 3302 * This function is called to ensure that a vcpu "sees" a pending event
3303 3303 * as soon as possible:
3304 3304 * - If the vcpu thread is sleeping then it is woken up.
3305 3305 * - If the vcpu is running on a different host_cpu then an IPI will be directed
3306 3306 * to the host_cpu to cause the vcpu to trap into the hypervisor.
3307 3307 */
3308 3308 static void
3309 3309 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3310 3310 {
3311 3311 int hostcpu;
3312 3312
3313 3313 ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3314 3314
3315 3315 hostcpu = vcpu->hostcpu;
3316 3316 if (vcpu->state == VCPU_RUNNING) {
3317 3317 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3318 3318 if (hostcpu != curcpu) {
3319 3319 if (ntype == VCPU_NOTIFY_APIC) {
3320 3320 vlapic_post_intr(vcpu->vlapic, hostcpu,
3321 3321 vmm_ipinum);
3322 3322 } else {
3323 3323 ipi_cpu(hostcpu, vmm_ipinum);
3324 3324 }
3325 3325 } else {
3326 3326 /*
3327 3327 * If the 'vcpu' is running on 'curcpu' then it must
3328 3328 * be sending a notification to itself (e.g. SELF_IPI).
3329 3329 * The pending event will be picked up when the vcpu
3330 3330 * transitions back to guest context.
3331 3331 */
3332 3332 }
3333 3333 } else {
3334 3334 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3335 3335 "with hostcpu %d", vcpu->state, hostcpu));
3336 3336 if (vcpu->state == VCPU_SLEEPING) {
3337 3337 cv_signal(&vcpu->vcpu_cv);
3338 3338 }
3339 3339 }
3340 3340 }
3341 3341
3342 3342 void
3343 3343 vcpu_notify_event(struct vm *vm, int vcpuid)
3344 3344 {
3345 3345 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3346 3346
3347 3347 vcpu_lock(vcpu);
3348 3348 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3349 3349 vcpu_unlock(vcpu);
3350 3350 }
3351 3351
3352 3352 void
3353 3353 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3354 3354 {
3355 3355 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3356 3356
3357 3357 if (ntype == VCPU_NOTIFY_NONE) {
3358 3358 return;
3359 3359 }
3360 3360
3361 3361 vcpu_lock(vcpu);
3362 3362 vcpu_notify_event_locked(vcpu, ntype);
3363 3363 vcpu_unlock(vcpu);
3364 3364 }
3365 3365
3366 3366 void
3367 3367 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3368 3368 {
3369 3369 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3370 3370 hrtime_t now = gethrtime();
3371 3371
3372 3372 ASSERT3U(ustate, !=, vcpu->ustate);
3373 3373 ASSERT3S(ustate, <, VU_MAX);
3374 3374 ASSERT3S(ustate, >=, VU_INIT);
3375 3375
3376 3376 hrtime_t delta = now - vcpu->ustate_when;
3377 3377 vcpu->ustate_total[vcpu->ustate] += delta;
3378 3378
3379 3379 membar_producer();
3380 3380
3381 3381 vcpu->ustate_when = now;
3382 3382 vcpu->ustate = ustate;
3383 3383 }
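
The microstate update charges the time elapsed since the previous transition to the state being left, then records the new state and timestamp. A rough user-space analogue, substituting clock_gettime() for gethrtime(); names and states here are invented for illustration:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    enum ustate { U_IDLE, U_RUN, U_MAX };

    static uint64_t total[U_MAX];
    static enum ustate cur = U_IDLE;
    static uint64_t when;

    static uint64_t
    now_ns(void)
    {
        struct timespec ts;
        (void) clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec);
    }

    static void
    ustate_change(enum ustate next)
    {
        uint64_t now = now_ns();

        total[cur] += now - when;   /* charge elapsed time to the state being left */
        when = now;
        cur = next;
    }

    int
    main(void)
    {
        when = now_ns();
        ustate_change(U_RUN);
        ustate_change(U_IDLE);
        (void) printf("idle=%llu run=%llu\n",
            (unsigned long long)total[U_IDLE], (unsigned long long)total[U_RUN]);
        return (0);
    }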
3384 3384
3385 3385 struct vmspace *
3386 3386 vm_get_vmspace(struct vm *vm)
3387 3387 {
3388 3388
3389 3389 return (vm->vmspace);
3390 3390 }
3391 3391
3392 3392 int
3393 3393 vm_apicid2vcpuid(struct vm *vm, int apicid)
3394 3394 {
3395 3395 /*
3396 3396 * XXX apic id is assumed to be numerically identical to vcpu id
3397 3397 */
3398 3398 return (apicid);
3399 3399 }
3400 3400
3401 3401 struct vatpic *
3402 3402 vm_atpic(struct vm *vm)
3403 3403 {
3404 3404 return (vm->vatpic);
3405 3405 }
3406 3406
3407 3407 struct vatpit *
3408 3408 vm_atpit(struct vm *vm)
3409 3409 {
3410 3410 return (vm->vatpit);
3411 3411 }
3412 3412
3413 3413 struct vpmtmr *
3414 3414 vm_pmtmr(struct vm *vm)
3415 3415 {
3416 3416
3417 3417 return (vm->vpmtmr);
3418 3418 }
3419 3419
3420 3420 struct vrtc *
3421 3421 vm_rtc(struct vm *vm)
3422 3422 {
3423 3423
3424 3424 return (vm->vrtc);
3425 3425 }
3426 3426
3427 3427 enum vm_reg_name
3428 3428 vm_segment_name(int seg)
3429 3429 {
3430 3430 static enum vm_reg_name seg_names[] = {
3431 3431 VM_REG_GUEST_ES,
3432 3432 VM_REG_GUEST_CS,
3433 3433 VM_REG_GUEST_SS,
3434 3434 VM_REG_GUEST_DS,
3435 3435 VM_REG_GUEST_FS,
3436 3436 VM_REG_GUEST_GS
3437 3437 };
3438 3438
3439 3439 KASSERT(seg >= 0 && seg < nitems(seg_names),
3440 3440 ("%s: invalid segment encoding %d", __func__, seg));
3441 3441 return (seg_names[seg]);
3442 3442 }
3443 3443
3444 3444 void
3445 3445 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3446 3446 int num_copyinfo)
3447 3447 {
3448 3448 int idx;
3449 3449
3450 3450 for (idx = 0; idx < num_copyinfo; idx++) {
3451 3451 if (copyinfo[idx].cookie != NULL)
3452 3452 vm_gpa_release(copyinfo[idx].cookie);
3453 3453 }
3454 3454 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3455 3455 }
3456 3456
3457 3457 int
3458 3458 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3459 3459 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3460 3460 int num_copyinfo, int *fault)
3461 3461 {
3462 3462 int error, idx, nused;
3463 3463 size_t n, off, remaining;
3464 3464 void *hva, *cookie;
3465 3465 uint64_t gpa;
3466 3466
3467 3467 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3468 3468
3469 3469 nused = 0;
3470 3470 remaining = len;
3471 3471 while (remaining > 0) {
3472 3472 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3473 3473 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3474 3474 if (error || *fault)
3475 3475 return (error);
3476 3476 off = gpa & PAGE_MASK;
3477 3477 n = min(remaining, PAGE_SIZE - off);
3478 3478 copyinfo[nused].gpa = gpa;
3479 3479 copyinfo[nused].len = n;
3480 3480 remaining -= n;
3481 3481 gla += n;
3482 3482 nused++;
3483 3483 }
3484 3484
3485 3485 for (idx = 0; idx < nused; idx++) {
3486 3486 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3487 3487 copyinfo[idx].len, prot, &cookie);
3488 3488 if (hva == NULL)
3489 3489 break;
3490 3490 copyinfo[idx].hva = hva;
3491 3491 copyinfo[idx].cookie = cookie;
3492 3492 }
3493 3493
3494 3494 if (idx != nused) {
3495 3495 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3496 3496 return (EFAULT);
3497 3497 } else {
3498 3498 *fault = 0;
3499 3499 return (0);
3500 3500 }
3501 3501 }
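
The setup loop above carves the guest-linear range into page-bounded chunks: the first chunk is limited by the offset within its page, later ones by the page size. A standalone sketch of just that chunking, assuming a 4 KiB page and, unlike the real code (which re-translates the advanced GLA each iteration), a physically contiguous range:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE   4096
    #define PAGE_MASK   (PAGE_SIZE - 1)

    static size_t
    min_sz(size_t a, size_t b)
    {
        return (a < b ? a : b);
    }

    int
    main(void)
    {
        uint64_t gpa = 0x10ff8;     /* hypothetical translated address */
        size_t remaining = 0x2010;  /* hypothetical copy length */

        while (remaining > 0) {
            size_t off = gpa & PAGE_MASK;
            size_t n = min_sz(remaining, PAGE_SIZE - off);

            (void) printf("chunk gpa=%#llx len=%#zx\n",
                (unsigned long long)gpa, n);
            remaining -= n;
            gpa += n;   /* contiguity assumed here purely for illustration */
        }
        return (0);
    }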
3502 3502
3503 3503 void
3504 3504 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3505 3505 size_t len)
3506 3506 {
3507 3507 char *dst;
3508 3508 int idx;
3509 3509
3510 3510 dst = kaddr;
3511 3511 idx = 0;
3512 3512 while (len > 0) {
3513 3513 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3514 3514 len -= copyinfo[idx].len;
3515 3515 dst += copyinfo[idx].len;
3516 3516 idx++;
3517 3517 }
3518 3518 }
3519 3519
3520 3520 void
3521 3521 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3522 3522 struct vm_copyinfo *copyinfo, size_t len)
3523 3523 {
3524 3524 const char *src;
3525 3525 int idx;
3526 3526
3527 3527 src = kaddr;
3528 3528 idx = 0;
3529 3529 while (len > 0) {
3530 3530 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3531 3531 len -= copyinfo[idx].len;
3532 3532 src += copyinfo[idx].len;
3533 3533 idx++;
3534 3534 }
3535 3535 }
3536 3536
3537 3537 /*
3538 3538 * Return the amount of in-use and wired memory for the VM. Since
3539 3539 * these are global stats, only return the values for vCPU 0.
3540 3540 */
3541 3541 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3542 3542 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3543 3543
3544 3544 static void
3545 3545 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3546 3546 {
3547 3547
3548 3548 if (vcpu == 0) {
3549 3549 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3550 3550 PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3551 3551 }
3552 3552 }
3553 3553
3554 3554 static void
3555 3555 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3556 3556 {
3557 3557
3558 3558 if (vcpu == 0) {
3559 3559 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3560 3560 PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3561 3561 }
3562 3562 }
3563 3563
3564 3564 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3565 3565 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
3566 3566
3567 3567 int
3568 3568 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3569 3569 uint8_t bytes, uint32_t *val)
3570 3570 {
3571 3571 return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3572 3572 }
3573 3573
3574 3574 /*
3575 3575 * bhyve-internal interfaces to attach or detach IO port handlers.
3576 3576 * Must be called with VM write lock held for safety.
3577 3577 */
3578 3578 int
3579 3579 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3580 3580 void **cookie)
3581 3581 {
3582 3582 int err;
3583 3583 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3584 3584 if (err == 0) {
3585 3585 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3586 3586 }
3587 3587 return (err);
3588 3588 }
3589 3589 int
3590 3590 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3591 3591 void **old_arg)
3592 3592 {
3593 3593 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3594 3594 int err;
3595 3595
3596 3596 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3597 3597 if (err == 0) {
3598 3598 *cookie = NULL;
3599 3599 }
3600 3600 return (err);
3601 3601 }
3602 3602
3603 3603 /*
3604 3604 * External driver interfaces to attach or detach IO port handlers.
3605 3605 * Must be called with VM write lock held for safety.
3606 3606 */
3607 3607 int
3608 3608 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3609 3609 void *arg, void **cookie)
3610 3610 {
3611 3611 int err;
3612 3612
3613 3613 if (port == 0) {
3614 3614 return (EINVAL);
3615 3615 }
3616 3616
3617 3617 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3618 3618 if (err == 0) {
3619 3619 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3620 3620 }
3621 3621 return (err);
3622 3622 }
3623 3623 void
3624 3624 vm_ioport_unhook(struct vm *vm, void **cookie)
3625 3625 {
3626 3626 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3627 3627 ioport_handler_t old_func;
3628 3628 void *old_arg;
3629 3629 int err;
3630 3630
3631 3631 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3632 3632
3633 3633 /* ioport-hook-using drivers are expected to be well-behaved */
3634 3634 VERIFY0(err);
3635 3635 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3636 3636
3637 3637 *cookie = NULL;
3638 3638 }
3639 3639
3640 3640 int
3641 3641 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3642 3642 {
3643 3643 struct vm *vm = ksp->ks_private;
3644 3644 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3645 3645 const int vcpuid = vvk->vvk_vcpu.value.ui32;
3646 3646 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3647 3647
3648 3648 ASSERT3U(vcpuid, <, VM_MAXCPU);
3649 3649
3650 3650 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3651 3651 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3652 3652 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3653 3653 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3654 3654 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3655 3655 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3656 3656
3657 3657 return (0);
3658 3658 }