Revert "OS-8005 bhyve memory pressure needs to target ARC better (#354)"
This reverts commit a6033573eedd94118d2b9e65f45deca0bf4b42f7.
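For context, the change being reverted wired bhyve guest memory accounting into the ZFS ARC through arc_virt_machine_reserve() and arc_virt_machine_release(), with the per-VM tally kept in the arc_resv field that the hunks below remove. The following is a minimal, self-contained C sketch of that reserve/release bookkeeping pattern only; the pool counter, struct vm_model, and the reserve call site are illustrative assumptions, since this diff shows just the extern declarations, the arc_resv field, and the release in vm_cleanup().

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical model of the ARC page pool (not the real ZFS ARC). */
static size_t arc_pages_avail = 1024;

/* Reserve 'npages' from the pool; fail with ENOMEM if they are not available. */
static int
arc_virt_machine_reserve(size_t npages)
{
	if (npages > arc_pages_avail)
		return (ENOMEM);
	arc_pages_avail -= npages;
	return (0);
}

/* Return a prior reservation to the pool. */
static void
arc_virt_machine_release(size_t npages)
{
	arc_pages_avail += npages;
}

struct vm_model {
	size_t arc_resv;	/* # of pages taken from ARC */
};

int
main(void)
{
	struct vm_model vm = { 0 };
	size_t guest_pages = 256;

	/* On guest memory setup: charge the reservation (call site assumed). */
	if (arc_virt_machine_reserve(guest_pages) != 0) {
		fprintf(stderr, "ARC reservation failed\n");
		return (1);
	}
	vm.arc_resv = guest_pages;

	/* On VM destroy (cf. vm_cleanup() below): release it. */
	arc_virt_machine_release(vm.arc_resv);
	vm.arc_resv = 0;

	printf("pages available: %zu\n", arc_pages_avail);
	return (0);
}

The point of the pattern is symmetry: pages are charged when guest memory is established and released unconditionally when the VM is destroyed, so a reservation cannot outlive its VM. The revert removes exactly that bookkeeping from vmm.c.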
--- old/usr/src/uts/i86pc/io/vmm/vmm.c
+++ new/usr/src/uts/i86pc/io/vmm/vmm.c
1 1 /*-
2 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 3 *
4 4 * Copyright (c) 2011 NetApp, Inc.
5 5 * All rights reserved.
6 6 *
7 7 * Redistribution and use in source and binary forms, with or without
8 8 * modification, are permitted provided that the following conditions
9 9 * are met:
10 10 * 1. Redistributions of source code must retain the above copyright
11 11 * notice, this list of conditions and the following disclaimer.
12 12 * 2. Redistributions in binary form must reproduce the above copyright
13 13 * notice, this list of conditions and the following disclaimer in the
14 14 * documentation and/or other materials provided with the distribution.
15 15 *
16 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 26 * SUCH DAMAGE.
27 27 *
28 28 * $FreeBSD$
29 29 */
30 30 /*
31 31 * This file and its contents are supplied under the terms of the
32 32 * Common Development and Distribution License ("CDDL"), version 1.0.
33 33 * You may only use this file in accordance with the terms of version
34 34 * 1.0 of the CDDL.
35 35 *
36 36 * A full copy of the text of the CDDL should have accompanied this
37 37 * source. A copy of the CDDL is also available via the Internet at
38 38 * http://www.illumos.org/license/CDDL.
39 39 *
40 40 * Copyright 2015 Pluribus Networks Inc.
41 - * Copyright 2021 Joyent, Inc.
41 + * Copyright 2018 Joyent, Inc.
42 42 * Copyright 2021 Oxide Computer Company
43 43 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
44 44 */
45 45
46 46 #include <sys/cdefs.h>
47 47 __FBSDID("$FreeBSD$");
48 48
49 49 #include <sys/param.h>
50 50 #include <sys/systm.h>
51 51 #include <sys/kernel.h>
52 52 #include <sys/module.h>
53 53 #include <sys/sysctl.h>
54 54 #include <sys/malloc.h>
55 55 #include <sys/pcpu.h>
56 56 #include <sys/lock.h>
57 57 #include <sys/mutex.h>
58 58 #include <sys/proc.h>
59 59 #include <sys/rwlock.h>
60 60 #include <sys/sched.h>
61 61 #include <sys/smp.h>
62 62 #include <sys/systm.h>
63 63
64 64 #include <machine/pcb.h>
65 65 #include <machine/smp.h>
66 66 #include <machine/md_var.h>
67 67 #include <x86/psl.h>
68 68 #include <x86/apicreg.h>
69 69
70 70 #include <machine/specialreg.h>
71 71 #include <machine/vmm.h>
72 72 #include <machine/vmm_dev.h>
73 73 #include <machine/vmparam.h>
74 74 #include <sys/vmm_instruction_emul.h>
75 75 #include <sys/vmm_vm.h>
76 76
77 77 #include "vmm_ioport.h"
78 78 #include "vmm_ktr.h"
79 79 #include "vmm_host.h"
80 80 #include "vmm_mem.h"
81 81 #include "vmm_util.h"
82 82 #include "vatpic.h"
83 83 #include "vatpit.h"
84 84 #include "vhpet.h"
85 85 #include "vioapic.h"
86 86 #include "vlapic.h"
87 87 #include "vpmtmr.h"
88 88 #include "vrtc.h"
89 89 #include "vmm_stat.h"
90 90 #include "vmm_lapic.h"
91 91
92 92 #include "io/ppt.h"
93 93 #include "io/iommu.h"
94 94
95 95 struct vlapic;
96 96
97 97 /*
98 98 * Initialization:
99 99 * (a) allocated when vcpu is created
100 100 * (i) initialized when vcpu is created and when it is reinitialized
101 101 * (o) initialized the first time the vcpu is created
102 102 * (x) initialized before use
103 103 */
104 104 struct vcpu {
105 105 /* (o) protects state, run_state, hostcpu, sipi_vector */
106 106 struct mtx mtx;
107 107
108 108 enum vcpu_state state; /* (o) vcpu state */
109 109 enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */
110 110 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */
111 111 kcondvar_t state_cv; /* (o) IDLE-transition cv */
112 112 int hostcpu; /* (o) vcpu's current host cpu */
113 113 int lastloccpu; /* (o) last host cpu localized to */
114 114 int reqidle; /* (i) request vcpu to idle */
115 115 struct vlapic *vlapic; /* (i) APIC device model */
116 116 enum x2apic_state x2apic_state; /* (i) APIC mode */
117 117 uint64_t exitintinfo; /* (i) events pending at VM exit */
118 118 int nmi_pending; /* (i) NMI pending */
119 119 int extint_pending; /* (i) INTR pending */
120 120 int exception_pending; /* (i) exception pending */
121 121 int exc_vector; /* (x) exception collateral */
122 122 int exc_errcode_valid;
123 123 uint32_t exc_errcode;
124 124 uint8_t sipi_vector; /* (i) SIPI vector */
125 125 struct savefpu *guestfpu; /* (a,i) guest fpu state */
126 126 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
127 127 void *stats; /* (a,i) statistics */
128 128 struct vm_exit exitinfo; /* (x) exit reason and collateral */
129 129 uint64_t nextrip; /* (x) next instruction to execute */
130 130 struct vie *vie_ctx; /* (x) instruction emulation context */
131 131 uint64_t tsc_offset; /* (x) offset from host TSC */
132 132
133 133 enum vcpu_ustate ustate; /* (i) microstate for the vcpu */
134 134 hrtime_t ustate_when; /* (i) time of last ustate change */
135 135 uint64_t ustate_total[VU_MAX]; /* (o) total time spent in ustates */
136 136 };
137 137
138 138 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
139 139 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
140 140 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
141 141 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
142 142 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
143 143
144 144 struct mem_seg {
145 145 size_t len;
146 146 bool sysmem;
147 147 struct vm_object *object;
148 148 };
149 149 #define VM_MAX_MEMSEGS 4
150 150
151 151 struct mem_map {
152 152 vm_paddr_t gpa;
153 153 size_t len;
154 154 vm_ooffset_t segoff;
155 155 int segid;
156 156 int prot;
157 157 int flags;
158 158 };
159 159 #define VM_MAX_MEMMAPS 8
160 160
161 161 /*
162 162 * Initialization:
163 163 * (o) initialized the first time the VM is created
164 164 * (i) initialized when VM is created and when it is reinitialized
165 165 * (x) initialized before use
166 166 */
167 167 struct vm {
168 168 void *cookie; /* (i) cpu-specific data */
169 169 void *iommu; /* (x) iommu-specific data */
170 170 struct vhpet *vhpet; /* (i) virtual HPET */
171 171 struct vioapic *vioapic; /* (i) virtual ioapic */
172 172 struct vatpic *vatpic; /* (i) virtual atpic */
173 173 struct vatpit *vatpit; /* (i) virtual atpit */
174 174 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
175 175 struct vrtc *vrtc; /* (o) virtual RTC */
176 176 volatile cpuset_t active_cpus; /* (i) active vcpus */
177 177 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */
178 178 int suspend; /* (i) stop VM execution */
179 179 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
180 180 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
181 181 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
182 182 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
183 183 struct vmspace *vmspace; /* (o) guest's address space */
184 184 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
185 185 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
186 186 /* The following describe the vm cpu topology */
187 187 uint16_t sockets; /* (o) num of sockets */
188 188 uint16_t cores; /* (o) num of cores/socket */
189 189 uint16_t threads; /* (o) num of threads/core */
190 190 uint16_t maxcpus; /* (o) max pluggable cpus */
191 191 uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */
192 - size_t arc_resv; /* # of pages take from ARC */
193 192
194 193 struct ioport_config ioports; /* (o) ioport handling */
195 194 };
196 195
197 196 static int vmm_initialized;
198 197
199 198
200 199 static void
201 200 nullop_panic(void)
202 201 {
203 202 panic("null vmm operation call");
204 203 }
205 204
206 205 /* Do not allow use of an un-set `ops` to do anything but panic */
207 206 static struct vmm_ops vmm_ops_null = {
208 207 .init = (vmm_init_func_t)nullop_panic,
209 208 .cleanup = (vmm_cleanup_func_t)nullop_panic,
210 209 .resume = (vmm_resume_func_t)nullop_panic,
211 210 .vminit = (vmi_init_func_t)nullop_panic,
212 211 .vmrun = (vmi_run_func_t)nullop_panic,
213 212 .vmcleanup = (vmi_cleanup_func_t)nullop_panic,
214 213 .vmgetreg = (vmi_get_register_t)nullop_panic,
215 214 .vmsetreg = (vmi_set_register_t)nullop_panic,
216 215 .vmgetdesc = (vmi_get_desc_t)nullop_panic,
217 216 .vmsetdesc = (vmi_set_desc_t)nullop_panic,
218 217 .vmgetcap = (vmi_get_cap_t)nullop_panic,
219 218 .vmsetcap = (vmi_set_cap_t)nullop_panic,
220 219 .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic,
221 220 .vmspace_free = (vmi_vmspace_free)nullop_panic,
222 221 .vlapic_init = (vmi_vlapic_init)nullop_panic,
223 222 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
224 223 .vmsavectx = (vmi_savectx)nullop_panic,
225 224 .vmrestorectx = (vmi_restorectx)nullop_panic,
226 225 };
227 226
228 227 static struct vmm_ops *ops = &vmm_ops_null;
229 228
230 229 #define VMM_INIT(num) ((*ops->init)(num))
231 230 #define VMM_CLEANUP() ((*ops->cleanup)())
232 231 #define VMM_RESUME() ((*ops->resume)())
233 232
234 233 #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
235 234 #define VMRUN(vmi, vcpu, rip, pmap) \
236 235 ((*ops->vmrun)(vmi, vcpu, rip, pmap))
237 236 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
238 237 #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
239 238 #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
240 239
241 240 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
242 241 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
243 242 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
244 243 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
245 244 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv))
246 245 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val))
247 246 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu))
248 247 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic))
249 248
250 249 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
251 250 #define fpu_stop_emulating() clts()
252 251
253 252 SDT_PROVIDER_DEFINE(vmm);
254 253
255 254 static MALLOC_DEFINE(M_VM, "vm", "vm");
256 255
257 256 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
258 257 NULL);
259 258
260 259 /*
261 260 * Halt the guest if all vcpus are executing a HLT instruction with
262 261 * interrupts disabled.
263 262 */
264 263 static int halt_detection_enabled = 1;
265 264
266 265 /* IPI vector used for vcpu notifications */
267 266 static int vmm_ipinum;
268 267
269 268 /* Trap into hypervisor on all guest exceptions and reflect them back */
270 269 static int trace_guest_exceptions;
271 270
272 271 static void vm_free_memmap(struct vm *vm, int ident);
273 272 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
274 273 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
275 274 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
276 275 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
277 276
278 -extern int arc_virt_machine_reserve(size_t);
279 -extern void arc_virt_machine_release(size_t);
280 -
281 277 /* Flags for vtc_status */
282 278 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */
283 279 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */
284 280
285 281 typedef struct vm_thread_ctx {
286 282 struct vm *vtc_vm;
287 283 int vtc_vcpuid;
288 284 uint_t vtc_status;
289 285 enum vcpu_ustate vtc_ustate;
290 286 } vm_thread_ctx_t;
291 287
292 288 #ifdef KTR
293 289 static const char *
294 290 vcpu_state2str(enum vcpu_state state)
295 291 {
296 292
297 293 switch (state) {
298 294 case VCPU_IDLE:
299 295 return ("idle");
300 296 case VCPU_FROZEN:
301 297 return ("frozen");
302 298 case VCPU_RUNNING:
303 299 return ("running");
304 300 case VCPU_SLEEPING:
305 301 return ("sleeping");
306 302 default:
307 303 return ("unknown");
308 304 }
309 305 }
310 306 #endif
311 307
312 308 static void
313 309 vcpu_cleanup(struct vm *vm, int i, bool destroy)
314 310 {
315 311 struct vcpu *vcpu = &vm->vcpu[i];
316 312
317 313 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
318 314 if (destroy) {
319 315 vmm_stat_free(vcpu->stats);
320 316 fpu_save_area_free(vcpu->guestfpu);
321 317 vie_free(vcpu->vie_ctx);
322 318 vcpu->vie_ctx = NULL;
323 319 }
324 320 }
325 321
326 322 static void
327 323 vcpu_init(struct vm *vm, int vcpu_id, bool create)
328 324 {
329 325 struct vcpu *vcpu;
330 326
331 327 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
332 328 ("vcpu_init: invalid vcpu %d", vcpu_id));
333 329
334 330 vcpu = &vm->vcpu[vcpu_id];
335 331
336 332 if (create) {
337 333 vcpu_lock_init(vcpu);
338 334 vcpu->state = VCPU_IDLE;
339 335 vcpu->hostcpu = NOCPU;
340 336 vcpu->lastloccpu = NOCPU;
341 337 vcpu->guestfpu = fpu_save_area_alloc();
342 338 vcpu->stats = vmm_stat_alloc();
343 339 vcpu->vie_ctx = vie_alloc();
344 340
345 341 vcpu->ustate = VU_INIT;
346 342 vcpu->ustate_when = gethrtime();
347 343 } else {
348 344 vie_reset(vcpu->vie_ctx);
349 345 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
350 346 if (vcpu->ustate != VU_INIT) {
351 347 vcpu_ustate_change(vm, vcpu_id, VU_INIT);
352 348 }
353 349 }
354 350
355 351 vcpu->run_state = VRS_HALT;
356 352 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
357 353 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
358 354 vcpu->reqidle = 0;
359 355 vcpu->exitintinfo = 0;
360 356 vcpu->nmi_pending = 0;
361 357 vcpu->extint_pending = 0;
362 358 vcpu->exception_pending = 0;
363 359 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
364 360 fpu_save_area_reset(vcpu->guestfpu);
365 361 vmm_stat_init(vcpu->stats);
366 362 vcpu->tsc_offset = 0;
367 363 }
368 364
369 365 int
370 366 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
371 367 {
372 368
373 369 return (trace_guest_exceptions);
374 370 }
375 371
376 372 struct vm_exit *
377 373 vm_exitinfo(struct vm *vm, int cpuid)
378 374 {
379 375 struct vcpu *vcpu;
380 376
381 377 if (cpuid < 0 || cpuid >= vm->maxcpus)
382 378 panic("vm_exitinfo: invalid cpuid %d", cpuid);
383 379
384 380 vcpu = &vm->vcpu[cpuid];
385 381
386 382 return (&vcpu->exitinfo);
387 383 }
388 384
389 385 struct vie *
390 386 vm_vie_ctx(struct vm *vm, int cpuid)
391 387 {
392 388 if (cpuid < 0 || cpuid >= vm->maxcpus)
393 389 panic("vm_vie_ctx: invalid cpuid %d", cpuid);
394 390
395 391 return (vm->vcpu[cpuid].vie_ctx);
396 392 }
397 393
398 394 static int
399 395 vmm_init(void)
400 396 {
401 397 int error;
402 398
403 399 vmm_host_state_init();
404 400
405 401 /* We use cpu_poke() for IPIs */
406 402 vmm_ipinum = 0;
407 403
408 404 error = vmm_mem_init();
409 405 if (error)
410 406 return (error);
411 407
412 408 if (vmm_is_intel())
413 409 ops = &vmm_ops_intel;
414 410 else if (vmm_is_svm())
415 411 ops = &vmm_ops_amd;
416 412 else
417 413 return (ENXIO);
418 414
419 415 return (VMM_INIT(vmm_ipinum));
420 416 }
421 417
422 418 int
423 419 vmm_mod_load()
424 420 {
425 421 int error;
426 422
427 423 VERIFY(vmm_initialized == 0);
428 424
429 425 error = vmm_init();
430 426 if (error == 0)
431 427 vmm_initialized = 1;
432 428
433 429 return (error);
434 430 }
435 431
436 432 int
437 433 vmm_mod_unload()
438 434 {
439 435 int error;
440 436
441 437 VERIFY(vmm_initialized == 1);
442 438
443 439 iommu_cleanup();
444 440 error = VMM_CLEANUP();
445 441 if (error)
446 442 return (error);
447 443 vmm_initialized = 0;
448 444
449 445 return (0);
450 446 }
451 447
452 448 static void
453 449 vm_init(struct vm *vm, bool create)
454 450 {
455 451 int i;
456 452
457 453 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
458 454 vm->iommu = NULL;
459 455 vm->vioapic = vioapic_init(vm);
460 456 vm->vhpet = vhpet_init(vm);
461 457 vm->vatpic = vatpic_init(vm);
462 458 vm->vatpit = vatpit_init(vm);
463 459 vm->vpmtmr = vpmtmr_init(vm);
464 460 if (create)
465 461 vm->vrtc = vrtc_init(vm);
466 462
467 463 vm_inout_init(vm, &vm->ioports);
468 464
469 465 CPU_ZERO(&vm->active_cpus);
470 466 CPU_ZERO(&vm->debug_cpus);
471 467
472 468 vm->suspend = 0;
473 469 CPU_ZERO(&vm->suspended_cpus);
474 470
475 471 for (i = 0; i < vm->maxcpus; i++)
476 472 vcpu_init(vm, i, create);
477 473
478 474 /*
479 475 * Configure the VM-wide TSC offset so that the call to vm_init()
480 476 * represents the boot time (when the TSC(s) read 0). Each vCPU will
481 477 * have its own offset from this, which is altered if/when the guest
482 478 * writes to MSR_TSC.
483 479 *
484 480 * The TSC offsetting math is all unsigned, using overflow for negative
485 481 * offsets. A reading of the TSC is negated to form the boot offset.
486 482 */
487 483 vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
488 484 }
489 485
490 486 /*
491 487 * The default CPU topology is a single thread per package.
492 488 */
493 489 uint_t cores_per_package = 1;
494 490 uint_t threads_per_core = 1;
495 491
496 492 int
497 493 vm_create(const char *name, struct vm **retvm)
498 494 {
499 495 struct vm *vm;
500 496 struct vmspace *vmspace;
501 497
502 498 /*
503 499 * If vmm.ko could not be successfully initialized then don't attempt
504 500 * to create the virtual machine.
505 501 */
506 502 if (!vmm_initialized)
507 503 return (ENXIO);
508 504
509 505 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
510 506 return (EINVAL);
511 507
512 508 vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
513 509 if (vmspace == NULL)
514 510 return (ENOMEM);
515 511
516 512 vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
517 513 strcpy(vm->name, name);
518 514 vm->vmspace = vmspace;
519 515
520 516 vm->sockets = 1;
521 517 vm->cores = cores_per_package; /* XXX backwards compatibility */
522 518 vm->threads = threads_per_core; /* XXX backwards compatibility */
523 519 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
524 520
525 521 vm_init(vm, true);
526 522
527 523 *retvm = vm;
528 524 return (0);
529 525 }
530 526
531 527 void
532 528 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
533 529 uint16_t *threads, uint16_t *maxcpus)
534 530 {
535 531 *sockets = vm->sockets;
536 532 *cores = vm->cores;
537 533 *threads = vm->threads;
538 534 *maxcpus = vm->maxcpus;
539 535 }
540 536
541 537 uint16_t
542 538 vm_get_maxcpus(struct vm *vm)
543 539 {
544 540 return (vm->maxcpus);
545 541 }
546 542
547 543 int
548 544 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
549 545 uint16_t threads, uint16_t maxcpus)
550 546 {
551 547 if (maxcpus != 0)
552 548 return (EINVAL); /* XXX remove when supported */
553 549 if ((sockets * cores * threads) > vm->maxcpus)
554 550 return (EINVAL);
555 551 /* XXX need to check sockets * cores * threads == vCPU, how? */
556 552 vm->sockets = sockets;
557 553 vm->cores = cores;
558 554 vm->threads = threads;
559 555 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
560 556 return (0);
561 557 }
562 558
563 559 static void
564 560 vm_cleanup(struct vm *vm, bool destroy)
565 561 {
566 562 struct mem_map *mm;
567 563 int i;
568 564
569 565 ppt_unassign_all(vm);
570 566
571 567 if (vm->iommu != NULL)
572 568 iommu_destroy_domain(vm->iommu);
573 569
574 570 /*
575 571 * Devices which attach their own ioport hooks should be cleaned up
576 572 * first so they can tear down those registrations.
577 573 */
578 574 vpmtmr_cleanup(vm->vpmtmr);
579 575
580 576 vm_inout_cleanup(vm, &vm->ioports);
581 577
582 578 if (destroy)
583 579 vrtc_cleanup(vm->vrtc);
584 580 else
585 581 vrtc_reset(vm->vrtc);
586 582
587 583 vatpit_cleanup(vm->vatpit);
588 584 vhpet_cleanup(vm->vhpet);
589 585 vatpic_cleanup(vm->vatpic);
590 586 vioapic_cleanup(vm->vioapic);
591 587
592 588 for (i = 0; i < vm->maxcpus; i++)
593 589 vcpu_cleanup(vm, i, destroy);
594 590
595 591 VMCLEANUP(vm->cookie);
596 592
597 593 /*
598 594 * System memory is removed from the guest address space only when
599 595 * the VM is destroyed. This is because the mapping remains the same
600 596 * across VM reset.
601 597 *
602 598 * Device memory can be relocated by the guest (e.g. using PCI BARs)
603 599 * so those mappings are removed on a VM reset.
604 600 */
605 601 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
606 602 mm = &vm->mem_maps[i];
607 603 if (destroy || !sysmem_mapping(vm, mm)) {
608 604 vm_free_memmap(vm, i);
609 605 } else {
610 606 /*
611 607 * We need to reset the IOMMU flag so this mapping can
612 608 * be reused when a VM is rebooted. Since the IOMMU
613 609 * domain has already been destroyed we can just reset
614 610 * the flag here.
615 611 */
616 612 mm->flags &= ~VM_MEMMAP_F_IOMMU;
617 613 }
618 614 }
619 615
620 616 if (destroy) {
621 617 for (i = 0; i < VM_MAX_MEMSEGS; i++)
622 618 vm_free_memseg(vm, i);
623 619
624 620 VMSPACE_FREE(vm->vmspace);
625 621 vm->vmspace = NULL;
626 -
627 - arc_virt_machine_release(vm->arc_resv);
628 - vm->arc_resv = 0;
629 622 }
630 623 }
631 624
632 625 void
633 626 vm_destroy(struct vm *vm)
634 627 {
635 628 vm_cleanup(vm, true);
636 629 free(vm, M_VM);
637 630 }
638 631
639 632 int
640 633 vm_reinit(struct vm *vm)
641 634 {
642 635 int error;
643 636
644 637 /*
645 638 * A virtual machine can be reset only if all vcpus are suspended.
646 639 */
647 640 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
648 641 vm_cleanup(vm, false);
649 642 vm_init(vm, false);
650 643 error = 0;
651 644 } else {
652 645 error = EBUSY;
653 646 }
654 647
655 648 return (error);
656 649 }
657 650
658 651 const char *
659 652 vm_name(struct vm *vm)
660 653 {
661 654 return (vm->name);
662 655 }
663 656
664 657 int
665 658 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
666 659 {
667 660 vm_object_t obj;
668 661
669 662 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
670 663 return (ENOMEM);
671 664 else
672 665 return (0);
673 666 }
674 667
675 668 int
676 669 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
677 670 {
678 671 return (vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len));
679 672 }
680 673
681 674 /*
682 675 * Return 'true' if 'gpa' is allocated in the guest address space.
683 676 *
684 677 * This function is called in the context of a running vcpu which acts as
685 678 * an implicit lock on 'vm->mem_maps[]'.
686 679 */
687 680 bool
688 681 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
689 682 {
690 683 struct mem_map *mm;
691 684 int i;
692 685
693 686 #ifdef INVARIANTS
694 687 int hostcpu, state;
695 688 state = vcpu_get_state(vm, vcpuid, &hostcpu);
696 689 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
697 690 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
698 691 #endif
699 692
700 693 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
701 694 mm = &vm->mem_maps[i];
702 695 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
703 696 return (true); /* 'gpa' is sysmem or devmem */
704 697 }
705 698
706 699 if (ppt_is_mmio(vm, gpa))
707 700 return (true); /* 'gpa' is pci passthru mmio */
708 701
709 702 return (false);
710 703 }
711 704
712 705 int
713 706 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
714 707 {
715 708 struct mem_seg *seg;
716 709 vm_object_t obj;
717 710
718 711 #ifndef __FreeBSD__
719 712 extern pgcnt_t get_max_page_get(void);
720 713 #endif
721 714
722 715 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
723 716 return (EINVAL);
724 717
725 718 if (len == 0 || (len & PAGE_MASK))
726 719 return (EINVAL);
727 720
728 721 #ifndef __FreeBSD__
729 722 if (len > ptob(get_max_page_get()))
730 723 return (EINVAL);
731 724 #endif
732 725
733 726 seg = &vm->mem_segs[ident];
734 727 if (seg->object != NULL) {
735 728 if (seg->len == len && seg->sysmem == sysmem)
736 729 return (EEXIST);
737 730 else
738 731 return (EINVAL);
739 732 }
740 733
741 734 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
742 735 if (obj == NULL)
743 736 return (ENOMEM);
744 737
745 738 seg->len = len;
746 739 seg->object = obj;
747 740 seg->sysmem = sysmem;
748 741 return (0);
749 742 }
750 743
751 744 int
752 745 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
753 746 vm_object_t *objptr)
754 747 {
755 748 struct mem_seg *seg;
756 749
757 750 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
758 751 return (EINVAL);
759 752
760 753 seg = &vm->mem_segs[ident];
761 754 if (len)
762 755 *len = seg->len;
763 756 if (sysmem)
764 757 *sysmem = seg->sysmem;
765 758 if (objptr)
766 759 *objptr = seg->object;
767 760 return (0);
768 761 }
769 762
770 763 void
771 764 vm_free_memseg(struct vm *vm, int ident)
772 765 {
773 766 struct mem_seg *seg;
774 767
775 768 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
776 769 ("%s: invalid memseg ident %d", __func__, ident));
777 770
778 771 seg = &vm->mem_segs[ident];
779 772 if (seg->object != NULL) {
780 773 vm_object_deallocate(seg->object);
781 774 bzero(seg, sizeof (struct mem_seg));
782 775 }
783 776 }
784 777
785 778 int
786 779 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
787 780 size_t len, int prot, int flags)
788 781 {
789 782 struct mem_seg *seg;
790 783 struct mem_map *m, *map;
791 784 vm_ooffset_t last;
792 785 int i, error;
793 786
794 787 if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
795 788 return (EINVAL);
796 789
797 790 if (flags & ~VM_MEMMAP_F_WIRED)
798 791 return (EINVAL);
799 792
800 793 if (segid < 0 || segid >= VM_MAX_MEMSEGS)
801 794 return (EINVAL);
802 795
803 796 seg = &vm->mem_segs[segid];
804 797 if (seg->object == NULL)
805 798 return (EINVAL);
806 799
807 800 last = first + len;
808 801 if (first < 0 || first >= last || last > seg->len)
809 802 return (EINVAL);
810 803
811 804 if ((gpa | first | last) & PAGE_MASK)
812 805 return (EINVAL);
813 806
814 807 map = NULL;
815 808 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
816 809 m = &vm->mem_maps[i];
817 810 if (m->len == 0) {
818 811 map = m;
819 812 break;
820 813 }
821 814 }
822 815
823 816 if (map == NULL)
824 817 return (ENOSPC);
825 818
826 819 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
827 820 len, 0, VMFS_NO_SPACE, prot, prot, 0);
828 821 if (error != 0)
829 822 return (EFAULT);
830 823
831 824 vm_object_reference(seg->object);
832 825
833 826 if ((flags & VM_MEMMAP_F_WIRED) != 0) {
834 827 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
835 828 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
836 829 if (error != 0) {
837 830 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
838 831 return (EFAULT);
839 832 }
840 833 }
841 834
842 835 map->gpa = gpa;
843 836 map->len = len;
844 837 map->segoff = first;
845 838 map->segid = segid;
846 839 map->prot = prot;
847 840 map->flags = flags;
848 841 return (0);
849 842 }
850 843
851 844 int
852 845 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
853 846 {
854 847 struct mem_map *m;
855 848 int i;
856 849
857 850 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
858 851 m = &vm->mem_maps[i];
859 852 if (m->gpa == gpa && m->len == len &&
860 853 (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
861 854 vm_free_memmap(vm, i);
862 855 return (0);
863 856 }
864 857 }
865 858
866 859 return (EINVAL);
867 860 }
868 861
869 862 int
870 863 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
871 864 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
872 865 {
873 866 struct mem_map *mm, *mmnext;
874 867 int i;
875 868
876 869 mmnext = NULL;
877 870 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
878 871 mm = &vm->mem_maps[i];
879 872 if (mm->len == 0 || mm->gpa < *gpa)
880 873 continue;
881 874 if (mmnext == NULL || mm->gpa < mmnext->gpa)
882 875 mmnext = mm;
883 876 }
884 877
885 878 if (mmnext != NULL) {
886 879 *gpa = mmnext->gpa;
887 880 if (segid)
888 881 *segid = mmnext->segid;
889 882 if (segoff)
890 883 *segoff = mmnext->segoff;
891 884 if (len)
892 885 *len = mmnext->len;
893 886 if (prot)
894 887 *prot = mmnext->prot;
895 888 if (flags)
896 889 *flags = mmnext->flags;
897 890 return (0);
898 891 } else {
899 892 return (ENOENT);
900 893 }
901 894 }
902 895
903 896 static void
904 897 vm_free_memmap(struct vm *vm, int ident)
905 898 {
906 899 struct mem_map *mm;
907 900 int error;
908 901
909 902 mm = &vm->mem_maps[ident];
910 903 if (mm->len) {
911 904 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
912 905 mm->gpa + mm->len);
913 906 KASSERT(error == 0, ("%s: vm_map_remove error %d",
914 907 __func__, error));
915 908 bzero(mm, sizeof (struct mem_map));
916 909 }
917 910 }
918 911
919 912 static __inline bool
920 913 sysmem_mapping(struct vm *vm, struct mem_map *mm)
921 914 {
922 915
923 916 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
924 917 return (true);
925 918 else
926 919 return (false);
927 920 }
928 921
929 922 vm_paddr_t
930 923 vmm_sysmem_maxaddr(struct vm *vm)
931 924 {
932 925 struct mem_map *mm;
933 926 vm_paddr_t maxaddr;
934 927 int i;
935 928
936 929 maxaddr = 0;
937 930 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
938 931 mm = &vm->mem_maps[i];
939 932 if (sysmem_mapping(vm, mm)) {
940 933 if (maxaddr < mm->gpa + mm->len)
941 934 maxaddr = mm->gpa + mm->len;
942 935 }
943 936 }
944 937 return (maxaddr);
945 938 }
946 939
947 940 static void
948 941 vm_iommu_modify(struct vm *vm, bool map)
949 942 {
950 943 int i, sz;
951 944 vm_paddr_t gpa, hpa;
952 945 struct mem_map *mm;
953 946 #ifdef __FreeBSD__
954 947 void *vp, *cookie, *host_domain;
955 948 #else
956 949 void *vp, *cookie, *host_domain __unused;
957 950 #endif
958 951
959 952 sz = PAGE_SIZE;
960 953 host_domain = iommu_host_domain();
961 954
962 955 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
963 956 mm = &vm->mem_maps[i];
964 957 if (!sysmem_mapping(vm, mm))
965 958 continue;
966 959
967 960 if (map) {
968 961 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
969 962 ("iommu map found invalid memmap %lx/%lx/%x",
970 963 mm->gpa, mm->len, mm->flags));
971 964 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
972 965 continue;
973 966 mm->flags |= VM_MEMMAP_F_IOMMU;
974 967 } else {
975 968 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
976 969 continue;
977 970 mm->flags &= ~VM_MEMMAP_F_IOMMU;
978 971 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
979 972 ("iommu unmap found invalid memmap %lx/%lx/%x",
980 973 mm->gpa, mm->len, mm->flags));
981 974 }
982 975
983 976 gpa = mm->gpa;
984 977 while (gpa < mm->gpa + mm->len) {
985 978 vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, PROT_WRITE,
986 979 &cookie);
987 980 KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
988 981 vm_name(vm), gpa));
989 982
990 983 vm_gpa_release(cookie);
991 984
992 985 hpa = DMAP_TO_PHYS((uintptr_t)vp);
993 986 if (map) {
994 987 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
995 988 #ifdef __FreeBSD__
996 989 iommu_remove_mapping(host_domain, hpa, sz);
997 990 #endif
998 991 } else {
999 992 iommu_remove_mapping(vm->iommu, gpa, sz);
1000 993 #ifdef __FreeBSD__
1001 994 iommu_create_mapping(host_domain, hpa, hpa, sz);
1002 995 #endif
1003 996 }
1004 997
1005 998 gpa += PAGE_SIZE;
1006 999 }
1007 1000 }
1008 1001
1009 1002 /*
1010 1003 * Invalidate the cached translations associated with the domain
1011 1004 * from which pages were removed.
1012 1005 */
1013 1006 #ifdef __FreeBSD__
1014 1007 if (map)
1015 1008 iommu_invalidate_tlb(host_domain);
1016 1009 else
1017 1010 iommu_invalidate_tlb(vm->iommu);
1018 1011 #else
1019 1012 iommu_invalidate_tlb(vm->iommu);
1020 1013 #endif
1021 1014 }
1022 1015
1023 1016 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false)
1024 1017 #define vm_iommu_map(vm) vm_iommu_modify((vm), true)
1025 1018
1026 1019 int
1027 1020 vm_unassign_pptdev(struct vm *vm, int pptfd)
1028 1021 {
1029 1022 int error;
1030 1023
1031 1024 error = ppt_unassign_device(vm, pptfd);
1032 1025 if (error)
1033 1026 return (error);
1034 1027
1035 1028 if (ppt_assigned_devices(vm) == 0)
1036 1029 vm_iommu_unmap(vm);
1037 1030
1038 1031 return (0);
1039 1032 }
1040 1033
1041 1034 int
1042 1035 vm_assign_pptdev(struct vm *vm, int pptfd)
1043 1036 {
1044 1037 int error;
1045 1038 vm_paddr_t maxaddr;
1046 1039
1047 1040 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1048 1041 if (ppt_assigned_devices(vm) == 0) {
1049 1042 KASSERT(vm->iommu == NULL,
1050 1043 ("vm_assign_pptdev: iommu must be NULL"));
1051 1044 maxaddr = vmm_sysmem_maxaddr(vm);
1052 1045 vm->iommu = iommu_create_domain(maxaddr);
1053 1046 if (vm->iommu == NULL)
1054 1047 return (ENXIO);
1055 1048 vm_iommu_map(vm);
1056 1049 }
1057 1050
1058 1051 error = ppt_assign_device(vm, pptfd);
1059 1052 return (error);
1060 1053 }
1061 1054
1062 1055 void *
1063 1056 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
1064 1057 void **cookie)
1065 1058 {
1066 1059 int i, count, pageoff;
1067 1060 struct mem_map *mm;
1068 1061 vm_page_t m;
1069 1062 #ifdef INVARIANTS
1070 1063 /*
1071 1064 * All vcpus are frozen by ioctls that modify the memory map
1072 1065 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
1073 1066 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
1074 1067 */
1075 1068 int state;
1076 1069 KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
1077 1070 __func__, vcpuid));
1078 1071 for (i = 0; i < vm->maxcpus; i++) {
1079 1072 if (vcpuid != -1 && vcpuid != i)
1080 1073 continue;
1081 1074 state = vcpu_get_state(vm, i, NULL);
1082 1075 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1083 1076 __func__, state));
1084 1077 }
1085 1078 #endif
1086 1079 pageoff = gpa & PAGE_MASK;
1087 1080 if (len > PAGE_SIZE - pageoff)
1088 1081 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1089 1082
1090 1083 count = 0;
1091 1084 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1092 1085 mm = &vm->mem_maps[i];
1093 1086 if (mm->len == 0) {
1094 1087 continue;
1095 1088 }
1096 1089 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1097 1090 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1098 1091 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1099 1092 break;
1100 1093 }
1101 1094 }
1102 1095
1103 1096 if (count == 1) {
1104 1097 *cookie = m;
1105 1098 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1106 1099 } else {
1107 1100 *cookie = NULL;
1108 1101 return (NULL);
1109 1102 }
1110 1103 }
1111 1104
1112 1105 void
1113 1106 vm_gpa_release(void *cookie)
1114 1107 {
1115 1108 vm_page_t m = cookie;
1116 1109
1117 1110 vm_page_unwire(m, PQ_ACTIVE);
1118 1111 }
1119 1112
1120 1113 int
1121 1114 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1122 1115 {
1123 1116
1124 1117 if (vcpu < 0 || vcpu >= vm->maxcpus)
1125 1118 return (EINVAL);
1126 1119
1127 1120 if (reg >= VM_REG_LAST)
1128 1121 return (EINVAL);
1129 1122
1130 1123 return (VMGETREG(vm->cookie, vcpu, reg, retval));
1131 1124 }
1132 1125
1133 1126 int
1134 1127 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1135 1128 {
1136 1129 struct vcpu *vcpu;
1137 1130 int error;
1138 1131
1139 1132 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1140 1133 return (EINVAL);
1141 1134
1142 1135 if (reg >= VM_REG_LAST)
1143 1136 return (EINVAL);
1144 1137
1145 1138 error = VMSETREG(vm->cookie, vcpuid, reg, val);
1146 1139 if (error || reg != VM_REG_GUEST_RIP)
1147 1140 return (error);
1148 1141
1149 1142 /* Set 'nextrip' to match the value of %rip */
1150 1143 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1151 1144 vcpu = &vm->vcpu[vcpuid];
1152 1145 vcpu->nextrip = val;
1153 1146 return (0);
1154 1147 }
1155 1148
1156 1149 static bool
1157 1150 is_descriptor_table(int reg)
1158 1151 {
1159 1152 switch (reg) {
1160 1153 case VM_REG_GUEST_IDTR:
1161 1154 case VM_REG_GUEST_GDTR:
1162 1155 return (true);
1163 1156 default:
1164 1157 return (false);
1165 1158 }
1166 1159 }
1167 1160
1168 1161 static bool
1169 1162 is_segment_register(int reg)
1170 1163 {
1171 1164 switch (reg) {
1172 1165 case VM_REG_GUEST_ES:
1173 1166 case VM_REG_GUEST_CS:
1174 1167 case VM_REG_GUEST_SS:
1175 1168 case VM_REG_GUEST_DS:
1176 1169 case VM_REG_GUEST_FS:
1177 1170 case VM_REG_GUEST_GS:
1178 1171 case VM_REG_GUEST_TR:
1179 1172 case VM_REG_GUEST_LDTR:
1180 1173 return (true);
1181 1174 default:
1182 1175 return (false);
1183 1176 }
1184 1177 }
1185 1178
1186 1179 int
1187 1180 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1188 1181 {
1189 1182
1190 1183 if (vcpu < 0 || vcpu >= vm->maxcpus)
1191 1184 return (EINVAL);
1192 1185
1193 1186 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1194 1187 return (EINVAL);
1195 1188
1196 1189 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1197 1190 }
1198 1191
1199 1192 int
1200 1193 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1201 1194 {
1202 1195 if (vcpu < 0 || vcpu >= vm->maxcpus)
1203 1196 return (EINVAL);
1204 1197
1205 1198 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1206 1199 return (EINVAL);
1207 1200
1208 1201 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1209 1202 }
1210 1203
1211 1204 int
1212 1205 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1213 1206 {
1214 1207 struct vcpu *vcpu;
1215 1208
1216 1209 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1217 1210 return (EINVAL);
1218 1211 }
1219 1212
1220 1213 vcpu = &vm->vcpu[vcpuid];
1221 1214
1222 1215 vcpu_lock(vcpu);
1223 1216 *state = vcpu->run_state;
1224 1217 *sipi_vec = vcpu->sipi_vector;
1225 1218 vcpu_unlock(vcpu);
1226 1219
1227 1220 return (0);
1228 1221 }
1229 1222
1230 1223 int
1231 1224 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1232 1225 {
1233 1226 struct vcpu *vcpu;
1234 1227
1235 1228 if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1236 1229 return (EINVAL);
1237 1230 }
1238 1231 if (!VRS_IS_VALID(state)) {
1239 1232 return (EINVAL);
1240 1233 }
1241 1234
1242 1235 vcpu = &vm->vcpu[vcpuid];
1243 1236
1244 1237 vcpu_lock(vcpu);
1245 1238 vcpu->run_state = state;
1246 1239 vcpu->sipi_vector = sipi_vec;
1247 1240 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1248 1241 vcpu_unlock(vcpu);
1249 1242
1250 1243 return (0);
1251 1244 }
1252 1245
1253 1246
1254 1247 static void
1255 1248 restore_guest_fpustate(struct vcpu *vcpu)
1256 1249 {
1257 1250
1258 1251 /* flush host state to the pcb */
1259 1252 fpuexit(curthread);
1260 1253
1261 1254 /* restore guest FPU state */
1262 1255 fpu_stop_emulating();
1263 1256 fpurestore(vcpu->guestfpu);
1264 1257
1265 1258 /* restore guest XCR0 if XSAVE is enabled in the host */
1266 1259 if (rcr4() & CR4_XSAVE)
1267 1260 load_xcr(0, vcpu->guest_xcr0);
1268 1261
1269 1262 /*
1270 1263 * The FPU is now "dirty" with the guest's state so turn on emulation
1271 1264 * to trap any access to the FPU by the host.
1272 1265 */
1273 1266 fpu_start_emulating();
1274 1267 }
1275 1268
1276 1269 static void
1277 1270 save_guest_fpustate(struct vcpu *vcpu)
1278 1271 {
1279 1272
1280 1273 if ((rcr0() & CR0_TS) == 0)
1281 1274 panic("fpu emulation not enabled in host!");
1282 1275
1283 1276 /* save guest XCR0 and restore host XCR0 */
1284 1277 if (rcr4() & CR4_XSAVE) {
1285 1278 vcpu->guest_xcr0 = rxcr(0);
1286 1279 load_xcr(0, vmm_get_host_xcr0());
1287 1280 }
1288 1281
1289 1282 /* save guest FPU state */
1290 1283 fpu_stop_emulating();
1291 1284 fpusave(vcpu->guestfpu);
1292 1285 /*
1293 1286 * When the host state has been restored, we should not re-enable
1294 1287 * CR0.TS on illumos for eager FPU.
1295 1288 */
1296 1289 }
1297 1290
1298 1291 static int
1299 1292 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1300 1293 bool from_idle)
1301 1294 {
1302 1295 struct vcpu *vcpu;
1303 1296 int error;
1304 1297
1305 1298 vcpu = &vm->vcpu[vcpuid];
1306 1299 vcpu_assert_locked(vcpu);
1307 1300
1308 1301 /*
1309 1302 * State transitions from the vmmdev_ioctl() must always begin from
1310 1303 * the VCPU_IDLE state. This guarantees that there is only a single
1311 1304 * ioctl() operating on a vcpu at any point.
1312 1305 */
1313 1306 if (from_idle) {
1314 1307 while (vcpu->state != VCPU_IDLE) {
1315 1308 vcpu->reqidle = 1;
1316 1309 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1317 1310 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1318 1311 "idle requested", vcpu_state2str(vcpu->state));
1319 1312 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1320 1313 }
1321 1314 } else {
1322 1315 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1323 1316 "vcpu idle state"));
1324 1317 }
1325 1318
1326 1319 if (vcpu->state == VCPU_RUNNING) {
1327 1320 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1328 1321 "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1329 1322 } else {
1330 1323 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1331 1324 "vcpu that is not running", vcpu->hostcpu));
1332 1325 }
1333 1326
1334 1327 /*
1335 1328 * The following state transitions are allowed:
1336 1329 * IDLE -> FROZEN -> IDLE
1337 1330 * FROZEN -> RUNNING -> FROZEN
1338 1331 * FROZEN -> SLEEPING -> FROZEN
1339 1332 */
1340 1333 switch (vcpu->state) {
1341 1334 case VCPU_IDLE:
1342 1335 case VCPU_RUNNING:
1343 1336 case VCPU_SLEEPING:
1344 1337 error = (newstate != VCPU_FROZEN);
1345 1338 break;
1346 1339 case VCPU_FROZEN:
1347 1340 error = (newstate == VCPU_FROZEN);
1348 1341 break;
1349 1342 default:
1350 1343 error = 1;
1351 1344 break;
1352 1345 }
1353 1346
1354 1347 if (error)
1355 1348 return (EBUSY);
1356 1349
1357 1350 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1358 1351 vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1359 1352
1360 1353 vcpu->state = newstate;
1361 1354 if (newstate == VCPU_RUNNING)
1362 1355 vcpu->hostcpu = curcpu;
1363 1356 else
1364 1357 vcpu->hostcpu = NOCPU;
1365 1358
1366 1359 if (newstate == VCPU_IDLE) {
1367 1360 cv_broadcast(&vcpu->state_cv);
1368 1361 }
1369 1362
1370 1363 return (0);
1371 1364 }
1372 1365
1373 1366 static void
1374 1367 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1375 1368 {
1376 1369 int error;
1377 1370
1378 1371 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1379 1372 panic("Error %d setting state to %d\n", error, newstate);
1380 1373 }
1381 1374
1382 1375 static void
1383 1376 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1384 1377 {
1385 1378 int error;
1386 1379
1387 1380 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1388 1381 panic("Error %d setting state to %d", error, newstate);
1389 1382 }
1390 1383
1391 1384 /*
1392 1385 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1393 1386 */
1394 1387 static int
1395 1388 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1396 1389 {
1397 1390 struct vcpu *vcpu;
1398 1391 int vcpu_halted, vm_halted;
1399 1392 bool userspace_exit = false;
1400 1393
1401 1394 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1402 1395
1403 1396 vcpu = &vm->vcpu[vcpuid];
1404 1397 vcpu_halted = 0;
1405 1398 vm_halted = 0;
1406 1399
1407 1400 vcpu_lock(vcpu);
1408 1401 while (1) {
1409 1402 /*
1410 1403 * Do a final check for pending interrupts (including NMI and
1411 1404 * INIT) before putting this thread to sleep.
1412 1405 */
1413 1406 if (vm_nmi_pending(vm, vcpuid))
1414 1407 break;
1415 1408 if (vcpu_run_state_pending(vm, vcpuid))
1416 1409 break;
1417 1410 if (!intr_disabled) {
1418 1411 if (vm_extint_pending(vm, vcpuid) ||
1419 1412 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1420 1413 break;
1421 1414 }
1422 1415 }
1423 1416
1424 1417 /*
1425 1418 * Also check for software events which would cause a wake-up.
1426 1419 * This will set the appropriate exitcode directly, rather than
1427 1420 * requiring a trip through VM_RUN().
1428 1421 */
1429 1422 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1430 1423 userspace_exit = true;
1431 1424 break;
1432 1425 }
1433 1426
1434 1427 /*
1435 1428 * Some Linux guests implement "halt" by having all vcpus
1436 1429 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1437 1430 * track of the vcpus that have entered this state. When all
1438 1431 * vcpus enter the halted state the virtual machine is halted.
1439 1432 */
1440 1433 if (intr_disabled) {
1441 1434 if (!vcpu_halted && halt_detection_enabled) {
1442 1435 vcpu_halted = 1;
1443 1436 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1444 1437 }
1445 1438 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1446 1439 vm_halted = 1;
1447 1440 break;
1448 1441 }
1449 1442 }
1450 1443
1451 1444 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1452 1445 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1453 1446 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1454 1447 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1455 1448 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1456 1449 }
1457 1450
1458 1451 if (vcpu_halted)
1459 1452 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1460 1453
1461 1454 vcpu_unlock(vcpu);
1462 1455
1463 1456 if (vm_halted)
1464 1457 vm_suspend(vm, VM_SUSPEND_HALT);
1465 1458
1466 1459 return (userspace_exit ? -1 : 0);
1467 1460 }
1468 1461
1469 1462 static int
1470 1463 vm_handle_paging(struct vm *vm, int vcpuid)
1471 1464 {
1472 1465 int rv, ftype;
1473 1466 struct vm_map *map;
1474 1467 struct vcpu *vcpu;
1475 1468 struct vm_exit *vme;
1476 1469
1477 1470 vcpu = &vm->vcpu[vcpuid];
1478 1471 vme = &vcpu->exitinfo;
1479 1472
1480 1473 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1481 1474 __func__, vme->inst_length));
1482 1475
1483 1476 ftype = vme->u.paging.fault_type;
1484 1477 KASSERT(ftype == PROT_READ ||
1485 1478 ftype == PROT_WRITE || ftype == PROT_EXEC,
1486 1479 ("vm_handle_paging: invalid fault_type %d", ftype));
1487 1480
1488 1481 if (ftype == PROT_READ || ftype == PROT_WRITE) {
1489 1482 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1490 1483 vme->u.paging.gpa, ftype);
1491 1484 if (rv == 0) {
1492 1485 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
1493 1486 ftype == PROT_READ ? "accessed" : "dirty",
1494 1487 vme->u.paging.gpa);
1495 1488 goto done;
1496 1489 }
1497 1490 }
1498 1491
1499 1492 map = &vm->vmspace->vm_map;
1500 1493 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1501 1494
1502 1495 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1503 1496 "ftype = %d", rv, vme->u.paging.gpa, ftype);
1504 1497
1505 1498 if (rv != 0)
1506 1499 return (EFAULT);
1507 1500 done:
1508 1501 return (0);
1509 1502 }
1510 1503
1511 1504 int
1512 1505 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1513 1506 int rsize)
1514 1507 {
1515 1508 int err = ESRCH;
1516 1509
1517 1510 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1518 1511 err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1519 1512 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1520 1513 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1521 1514 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1522 1515 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1523 1516 }
1524 1517
1525 1518 return (err);
1526 1519 }
1527 1520
1528 1521 int
1529 1522 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1530 1523 int wsize)
1531 1524 {
1532 1525 int err = ESRCH;
1533 1526
1534 1527 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1535 1528 err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1536 1529 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1537 1530 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1538 1531 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1539 1532 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1540 1533 }
1541 1534
1542 1535 return (err);
1543 1536 }
1544 1537
1545 1538 static int
1546 1539 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1547 1540 {
1548 1541 struct vie *vie;
1549 1542 struct vcpu *vcpu;
1550 1543 struct vm_exit *vme;
1551 1544 uint64_t inst_addr;
1552 1545 int error, fault, cs_d;
1553 1546
1554 1547 vcpu = &vm->vcpu[vcpuid];
1555 1548 vme = &vcpu->exitinfo;
1556 1549 vie = vcpu->vie_ctx;
1557 1550
1558 1551 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1559 1552 __func__, vme->inst_length));
1560 1553
1561 1554 inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1562 1555 cs_d = vme->u.mmio_emul.cs_d;
1563 1556
1564 1557 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1565 1558 vme->u.mmio_emul.gpa);
1566 1559
1567 1560 /* Fetch the faulting instruction */
1568 1561 if (vie_needs_fetch(vie)) {
1569 1562 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1570 1563 &fault);
1571 1564 if (error != 0) {
1572 1565 return (error);
1573 1566 } else if (fault) {
1574 1567 /*
1575 1568 * If a fault during instruction fetch was encountered,
1576 1569 * it will have asserted that the appropriate exception
1577 1570 * be injected at next entry.
1578 1571 * No further work is required.
1579 1572 */
1580 1573 return (0);
1581 1574 }
1582 1575 }
1583 1576
1584 1577 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1585 1578 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1586 1579 inst_addr);
1587 1580 /* Dump (unrecognized) instruction bytes in userspace */
1588 1581 vie_fallback_exitinfo(vie, vme);
1589 1582 return (-1);
1590 1583 }
1591 1584 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1592 1585 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1593 1586 /* Decoded GLA does not match GLA from VM exit state */
1594 1587 vie_fallback_exitinfo(vie, vme);
1595 1588 return (-1);
1596 1589 }
1597 1590
1598 1591 repeat:
1599 1592 error = vie_emulate_mmio(vie, vm, vcpuid);
1600 1593 if (error < 0) {
1601 1594 /*
1602 1595 * MMIO not handled by any of the in-kernel-emulated devices, so
1603 1596 * make a trip out to userspace for it.
1604 1597 */
1605 1598 vie_exitinfo(vie, vme);
1606 1599 } else if (error == EAGAIN) {
1607 1600 /*
1608 1601 * Continue emulating the rep-prefixed instruction, which has
1609 1602 * not completed its iterations.
1610 1603 *
1611 1604 * In case this can be emulated in-kernel and has a high
1612 1605 * repetition count (causing a tight spin), it should be
1613 1606 * deferential to yield conditions.
1614 1607 */
1615 1608 if (!vcpu_should_yield(vm, vcpuid)) {
1616 1609 goto repeat;
1617 1610 } else {
1618 1611 /*
1619 1612 * Defer to the contending load by making a trip to
1620 1613 * userspace with a no-op (BOGUS) exit reason.
1621 1614 */
1622 1615 vie_reset(vie);
1623 1616 vme->exitcode = VM_EXITCODE_BOGUS;
1624 1617 return (-1);
1625 1618 }
1626 1619 } else if (error == 0) {
1627 1620 /* Update %rip now that instruction has been emulated */
1628 1621 vie_advance_pc(vie, &vcpu->nextrip);
1629 1622 }
1630 1623 return (error);
1631 1624 }
1632 1625
1633 1626 static int
1634 1627 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1635 1628 {
1636 1629 struct vcpu *vcpu;
1637 1630 struct vie *vie;
1638 1631 int err;
1639 1632
1640 1633 vcpu = &vm->vcpu[vcpuid];
1641 1634 vie = vcpu->vie_ctx;
1642 1635
1643 1636 repeat:
1644 1637 err = vie_emulate_inout(vie, vm, vcpuid);
1645 1638
1646 1639 if (err < 0) {
1647 1640 /*
1648 1641 * In/out not handled by any of the in-kernel-emulated devices,
1649 1642 * so make a trip out to userspace for it.
1650 1643 */
1651 1644 vie_exitinfo(vie, vme);
1652 1645 return (err);
1653 1646 } else if (err == EAGAIN) {
1654 1647 /*
1655 1648 * Continue emulating the rep-prefixed ins/outs, which has not
1656 1649 * completed its iterations.
1657 1650 *
1658 1651 * In case this can be emulated in-kernel and has a high
1659 1652 * repetition count (causing a tight spin), it should be
1660 1653 * deferential to yield conditions.
1661 1654 */
1662 1655 if (!vcpu_should_yield(vm, vcpuid)) {
1663 1656 goto repeat;
1664 1657 } else {
1665 1658 /*
1666 1659 * Defer to the contending load by making a trip to
1667 1660 * userspace with a no-op (BOGUS) exit reason.
1668 1661 */
1669 1662 vie_reset(vie);
1670 1663 vme->exitcode = VM_EXITCODE_BOGUS;
1671 1664 return (-1);
1672 1665 }
1673 1666 } else if (err != 0) {
1674 1667 /* Emulation failure. Bail all the way out to userspace. */
1675 1668 vme->exitcode = VM_EXITCODE_INST_EMUL;
1676 1669 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1677 1670 return (-1);
1678 1671 }
1679 1672
1680 1673 vie_advance_pc(vie, &vcpu->nextrip);
1681 1674 return (0);
1682 1675 }
1683 1676
1684 1677 static int
1685 1678 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1686 1679 {
1687 1680 struct vie *vie;
1688 1681 struct vcpu *vcpu;
1689 1682 struct vm_exit *vme;
1690 1683 uint64_t cs_base;
1691 1684 int error, fault, cs_d;
1692 1685
1693 1686 vcpu = &vm->vcpu[vcpuid];
1694 1687 vme = &vcpu->exitinfo;
1695 1688 vie = vcpu->vie_ctx;
1696 1689
1697 1690 vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1698 1691
1699 1692 /* Fetch the faulting instruction */
1700 1693 ASSERT(vie_needs_fetch(vie));
1701 1694 error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1702 1695 &fault);
1703 1696 if (error != 0) {
1704 1697 return (error);
1705 1698 } else if (fault) {
1706 1699 /*
1707 1700 * If a fault during instruction fetch was encountered, it will
1708 1701 * have asserted that the appropriate exception be injected at
1709 1702 * next entry. No further work is required.
1710 1703 */
1711 1704 return (0);
1712 1705 }
1713 1706
1714 1707 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1715 1708 /* Dump (unrecognized) instruction bytes in userspace */
1716 1709 vie_fallback_exitinfo(vie, vme);
1717 1710 return (-1);
1718 1711 }
1719 1712
1720 1713 error = vie_emulate_other(vie, vm, vcpuid);
1721 1714 if (error != 0) {
1722 1715 /*
1723 1716 * Instruction emulation was unable to complete successfully, so
1724 1717 * kick it out to userspace for handling.
1725 1718 */
1726 1719 vie_fallback_exitinfo(vie, vme);
1727 1720 } else {
1728 1721 /* Update %rip now that instruction has been emulated */
1729 1722 vie_advance_pc(vie, &vcpu->nextrip);
1730 1723 }
1731 1724 return (error);
1732 1725 }
1733 1726
1734 1727 static int
1735 1728 vm_handle_suspend(struct vm *vm, int vcpuid)
1736 1729 {
1737 1730 int i;
1738 1731 struct vcpu *vcpu;
1739 1732
1740 1733 vcpu = &vm->vcpu[vcpuid];
1741 1734
1742 1735 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1743 1736
1744 1737 /*
1745 1738 * Wait until all 'active_cpus' have suspended themselves.
1746 1739 */
1747 1740 vcpu_lock(vcpu);
1748 1741 vcpu_ustate_change(vm, vcpuid, VU_INIT);
1749 1742 while (1) {
1750 1743 int rc;
1751 1744
1752 1745 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1753 1746 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1754 1747 break;
1755 1748 }
1756 1749
1757 1750 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1758 1751 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1759 1752 TR_CLOCK_TICK);
1760 1753 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1761 1754
1762 1755 /*
1763 1756 * If the userspace process driving the instance is killed, any
1764 1757 * vCPUs yet to be marked suspended (because they are not
1765 1758 * VM_RUN-ing in the kernel presently) will never reach that
1766 1759 * state.
1767 1760 *
1768 1761 * To avoid vm_handle_suspend() getting stuck in the kernel
1769 1762 * waiting for those vCPUs, offer a bail-out even though it
1770 1763 * means returning without all vCPUs in a suspended state.
1771 1764 */
1772 1765 if (rc <= 0) {
1773 1766 if ((curproc->p_flag & SEXITING) != 0) {
1774 1767 break;
1775 1768 }
1776 1769 }
1777 1770 }
1778 1771 vcpu_unlock(vcpu);
1779 1772
1780 1773 /*
1781 1774 * Wakeup the other sleeping vcpus and return to userspace.
1782 1775 */
1783 1776 for (i = 0; i < vm->maxcpus; i++) {
1784 1777 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1785 1778 vcpu_notify_event(vm, i);
1786 1779 }
1787 1780 }
1788 1781
1789 1782 return (-1);
1790 1783 }
1791 1784
1792 1785 static int
1793 1786 vm_handle_reqidle(struct vm *vm, int vcpuid)
1794 1787 {
1795 1788 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1796 1789
1797 1790 vcpu_lock(vcpu);
1798 1791 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1799 1792 vcpu->reqidle = 0;
1800 1793 vcpu_unlock(vcpu);
1801 1794 return (-1);
1802 1795 }
1803 1796
1804 1797 static int
1805 1798 vm_handle_run_state(struct vm *vm, int vcpuid)
1806 1799 {
1807 1800 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1808 1801 bool handled = false;
1809 1802
1810 1803 vcpu_lock(vcpu);
1811 1804 while (1) {
1812 1805 if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1813 1806 vcpu_unlock(vcpu);
1814 1807 VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1815 1808 vcpu_lock(vcpu);
1816 1809
1817 1810 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1818 1811 vcpu->run_state |= VRS_INIT;
1819 1812 }
1820 1813
1821 1814 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1822 1815 (VRS_INIT | VRS_PEND_SIPI)) {
1823 1816 const uint8_t vector = vcpu->sipi_vector;
1824 1817
1825 1818 vcpu_unlock(vcpu);
1826 1819 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1827 1820 vcpu_lock(vcpu);
1828 1821
1829 1822 vcpu->run_state &= ~VRS_PEND_SIPI;
1830 1823 vcpu->run_state |= VRS_RUN;
1831 1824 }
1832 1825
1833 1826 /*
1834 1827 * If the vCPU is now in the running state, there is no need to
1835 1828 * wait for anything prior to re-entry.
1836 1829 */
1837 1830 if ((vcpu->run_state & VRS_RUN) != 0) {
1838 1831 handled = true;
1839 1832 break;
1840 1833 }
1841 1834
1842 1835 /*
1843 1836 * Also check for software events which would cause a wake-up.
1844 1837 * This will set the appropriate exitcode directly, rather than
1845 1838 * requiring a trip through VM_RUN().
1846 1839 */
1847 1840 if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1848 1841 break;
1849 1842 }
1850 1843
1851 1844 vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1852 1845 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1853 1846 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1854 1847 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1855 1848 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1856 1849 }
1857 1850 vcpu_unlock(vcpu);
1858 1851
1859 1852 return (handled ? 0 : -1);
1860 1853 }
1861 1854
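The loop above implements the INIT/SIPI startup handshake for a vCPU: a pending INIT forces an architectural reset into the wait-for-SIPI state, and a SIPI arriving in that state supplies the start vector and marks the vCPU runnable. A purely illustrative walk of the run_state flags (names taken from the code above) makes the transitions easier to follow:

/*
 * Illustrative vcpu->run_state transitions through vm_handle_run_state():
 *
 *   ... | VRS_PEND_INIT       --vcpu_arch_reset()-->   VRS_INIT
 *   VRS_INIT | VRS_PEND_SIPI  --vcpu_vector_sipi()-->  VRS_INIT | VRS_RUN
 *
 * Once VRS_RUN is set, the loop exits with handled == true and the vCPU
 * re-enters guest context; otherwise it sleeps on vcpu_cv until notified.
 */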
1862 1855 static int
1863 1856 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1864 1857 {
1865 1858 const uint32_t code = vme->u.msr.code;
1866 1859 uint64_t val = 0;
1867 1860
1868 1861 switch (code) {
1869 1862 case MSR_MCG_CAP:
1870 1863 case MSR_MCG_STATUS:
1871 1864 val = 0;
1872 1865 break;
1873 1866
1874 1867 case MSR_MTRRcap:
1875 1868 case MSR_MTRRdefType:
1876 1869 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
1877 1870 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1878 1871 case MSR_MTRR64kBase:
1879 1872 val = 0;
1880 1873 break;
1881 1874
1882 1875 case MSR_TSC:
1883 1876 /*
1884 1877 * In all likelihood, this should always be handled in guest
1885 1878 * context by VMX/SVM rather than taking an exit. (Both VMX and
1886 1879 * SVM pass through read-only access to MSR_TSC to the guest.)
1887 1880 *
1888 1881 * No physical offset is requested of vcpu_tsc_offset() since
1889 1882 * rdtsc_offset() takes care of that instead.
1890 1883 */
1891 1884 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
1892 1885 break;
1893 1886
1894 1887 default:
1895 1888 /*
1896 1889 * Anything not handled at this point will be kicked out to
1897 1890 * userspace for attempted processing there.
1898 1891 */
1899 1892 return (-1);
1900 1893 }
1901 1894
1902 1895 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
1903 1896 val & 0xffffffff));
1904 1897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
1905 1898 val >> 32));
1906 1899 return (0);
1907 1900 }
1908 1901
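As with the hardware RDMSR instruction, the 64-bit result is handed back to the guest split across %edx:%eax, which is what the two vm_set_register() calls above do. A quick worked example with a made-up value:

/*
 * val              = 0x0000001234abcdef   (hypothetical MSR contents)
 * val & 0xffffffff = 0x34abcdef           -> %rax (low 32 bits)
 * val >> 32        = 0x00000012           -> %rdx (high 32 bits)
 */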
1909 1902 static int
1910 1903 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1911 1904 {
1912 1905 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1913 1906 const uint32_t code = vme->u.msr.code;
1914 1907 const uint64_t val = vme->u.msr.wval;
1915 1908
1916 1909 switch (code) {
1917 1910 case MSR_MCG_CAP:
1918 1911 case MSR_MCG_STATUS:
1919 1912 /* Ignore writes */
1920 1913 break;
1921 1914
1922 1915 case MSR_MTRRcap:
1923 1916 vm_inject_gp(vm, vcpuid);
1924 1917 break;
1925 1918 case MSR_MTRRdefType:
1926 1919 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
1927 1920 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1928 1921 case MSR_MTRR64kBase:
1929 1922 /* Ignore writes */
1930 1923 break;
1931 1924
1932 1925 case MSR_TSC:
1933 1926 /*
1934 1927 * The effect of writing the TSC MSR is that a subsequent read
1935 1928 * of the TSC would report that value written (plus any time
1936 1929 * elapsed between the write and the read). The guest TSC value
1937 1930 * is calculated from a global offset for the guest (which
1938 1931 * effectively makes its TSC read 0 at guest boot) and a
1939 1932 * per-vCPU offset to handle these writes to the MSR.
1940 1933 *
1941 1934 * To calculate that per-vCPU offset, we can work backwards from
1942 1935 * the guest value at the time of write:
1943 1936 *
1944 1937 * value = host TSC + VM boot offset + vCPU offset
1945 1938 *
1946 1939 * so therefore:
1947 1940 *
1948 1941 * value - host TSC - VM boot offset = vCPU offset
1949 1942 */
1950 1943 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
1951 1944 break;
1952 1945
1953 1946 default:
1954 1947 /*
1955 1948 * Anything not handled at this point will be kicked out to
1956 1949 * userspace for attempted processing there.
1957 1950 */
1958 1951 return (-1);
1959 1952 }
1960 1953
1961 1954 return (0);
1962 1955 }
1963 1956
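The per-vCPU offset computed in the MSR_TSC case above can be checked with a short worked example. The numbers are invented; only the relationship between them comes from the comments in vm_handle_rdmsr() and vm_handle_wrmsr(), and since the fields are unsigned the negative values simply wrap modulo 2^64:

/*
 * At the moment of the guest WRMSR, suppose:
 *   rdtsc_offset()      = 1,000,000    (host TSC reading)
 *   vm->boot_tsc_offset =  -400,000    (zeroes the guest TSC at boot)
 *   val (guest wrote)   =    250,000
 *
 *   tsc_offset = val - boot_tsc_offset - rdtsc_offset()
 *              = 250,000 - (-400,000) - 1,000,000 = -350,000
 *
 * A subsequent guest RDTSC (see the MSR_TSC case in vm_handle_rdmsr())
 * then observes:
 *   rdtsc_offset() + boot_tsc_offset + tsc_offset
 *     = 1,000,000 + (-400,000) + (-350,000) = 250,000
 * i.e. the value just written, plus whatever time elapses in between.
 */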
1964 1957 int
1965 1958 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1966 1959 {
1967 1960 int i;
1968 1961
1969 1962 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1970 1963 return (EINVAL);
1971 1964
1972 1965 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1973 1966 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1974 1967 vm->suspend, how);
1975 1968 return (EALREADY);
1976 1969 }
1977 1970
1978 1971 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1979 1972
1980 1973 /*
1981 1974 * Notify all active vcpus that they are now suspended.
1982 1975 */
1983 1976 for (i = 0; i < vm->maxcpus; i++) {
1984 1977 if (CPU_ISSET(i, &vm->active_cpus))
1985 1978 vcpu_notify_event(vm, i);
1986 1979 }
1987 1980
1988 1981 return (0);
1989 1982 }
1990 1983
1991 1984 void
1992 1985 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1993 1986 {
1994 1987 struct vm_exit *vmexit;
1995 1988
1996 1989 vmexit = vm_exitinfo(vm, vcpuid);
1997 1990 vmexit->rip = rip;
1998 1991 vmexit->inst_length = 0;
1999 1992 vmexit->exitcode = VM_EXITCODE_RUN_STATE;
2000 1993 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
2001 1994 }
2002 1995
2003 1996 /*
2004 1997 * Some vmm resources, such as the lapic, may have CPU-specific resources
2005 1998 * allocated to them which would benefit from migration onto the host CPU which
2006 1999 * is processing the vcpu state.
2007 2000 */
2008 2001 static void
2009 2002 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2010 2003 {
2011 2004 /*
2012 2005 * Localizing cyclic resources requires acquisition of cpu_lock, and
2013 2006 * doing so with kpreempt disabled is a recipe for deadlock disaster.
2014 2007 */
2015 2008 VERIFY(curthread->t_preempt == 0);
2016 2009
2017 2010 /*
2018 2011 * Do not bother with localization if this vCPU is about to return to
2019 2012 * the host CPU it was last localized to.
2020 2013 */
2021 2014 if (vcpu->lastloccpu == curcpu)
2022 2015 return;
2023 2016
2024 2017 /*
2025 2018 * Localize system-wide resources to the primary boot vCPU. While any
2026 2019 * of the other vCPUs may access them, it keeps the potential interrupt
2027 2020 * footprint constrained to CPUs involved with this instance.
2028 2021 */
2029 2022 if (vcpu == &vm->vcpu[0]) {
2030 2023 vhpet_localize_resources(vm->vhpet);
2031 2024 vrtc_localize_resources(vm->vrtc);
2032 2025 vatpit_localize_resources(vm->vatpit);
2033 2026 }
2034 2027
2035 2028 vlapic_localize_resources(vcpu->vlapic);
2036 2029
2037 2030 vcpu->lastloccpu = curcpu;
2038 2031 }
2039 2032
2040 2033 static void
2041 2034 vmm_savectx(void *arg)
2042 2035 {
2043 2036 vm_thread_ctx_t *vtc = arg;
2044 2037 struct vm *vm = vtc->vtc_vm;
2045 2038 const int vcpuid = vtc->vtc_vcpuid;
2046 2039
2047 2040 if (ops->vmsavectx != NULL) {
2048 2041 ops->vmsavectx(vm->cookie, vcpuid);
2049 2042 }
2050 2043
2051 2044 /*
2052 2045 	 * Account for going off-cpu, unless the vCPU is idle, in which case
2053 2046 	 * being off-cpu is the explicit point.
2054 2047 */
2055 2048 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2056 2049 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2057 2050 vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2058 2051 }
2059 2052
2060 2053 /*
2061 2054 * If the CPU holds the restored guest FPU state, save it and restore
2062 2055 * the host FPU state before this thread goes off-cpu.
2063 2056 */
2064 2057 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2065 2058 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2066 2059
2067 2060 save_guest_fpustate(vcpu);
2068 2061 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2069 2062 }
2070 2063 }
2071 2064
2072 2065 static void
2073 2066 vmm_restorectx(void *arg)
2074 2067 {
2075 2068 vm_thread_ctx_t *vtc = arg;
2076 2069 struct vm *vm = vtc->vtc_vm;
2077 2070 const int vcpuid = vtc->vtc_vcpuid;
2078 2071
2079 2072 /* Complete microstate accounting for vCPU being off-cpu */
2080 2073 if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2081 2074 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2082 2075 }
2083 2076
2084 2077 /*
2085 2078 * When coming back on-cpu, only restore the guest FPU status if the
2086 2079 * thread is in a context marked as requiring it. This should be rare,
2087 2080 * occurring only when a future logic error results in a voluntary
2088 2081 * sleep during the VMRUN critical section.
2089 2082 *
2090 2083 * The common case will result in elision of the guest FPU state
2091 2084 * restoration, deferring that action until it is clearly necessary
2092 2085 * during vm_run.
2093 2086 */
2094 2087 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2095 2088 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2096 2089 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2097 2090
2098 2091 restore_guest_fpustate(vcpu);
2099 2092 vtc->vtc_status |= VTCS_FPU_RESTORED;
2100 2093 }
2101 2094
2102 2095 if (ops->vmrestorectx != NULL) {
2103 2096 ops->vmrestorectx(vm->cookie, vcpuid);
2104 2097 }
2105 2098
2106 2099 }
2107 2100
2108 2101 /*
2109 2102 * If we're in removectx(), we might still have state to tidy up.
2110 2103 */
2111 2104 static void
2112 2105 vmm_freectx(void *arg, int isexec)
2113 2106 {
2114 2107 vmm_savectx(arg);
2115 2108 }
2116 2109
2117 2110 static int
2118 2111 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2119 2112 struct vm_exit *vme)
2120 2113 {
2121 2114 struct vcpu *vcpu;
2122 2115 struct vie *vie;
2123 2116 int err;
2124 2117
2125 2118 vcpu = &vm->vcpu[vcpuid];
2126 2119 vie = vcpu->vie_ctx;
2127 2120 err = 0;
2128 2121
2129 2122 switch (entry->cmd) {
2130 2123 case VEC_DEFAULT:
2131 2124 return (0);
2132 2125 case VEC_DISCARD_INSTR:
2133 2126 vie_reset(vie);
2134 2127 return (0);
2135 2128 case VEC_FULFILL_MMIO:
2136 2129 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2137 2130 if (err == 0) {
2138 2131 err = vie_emulate_mmio(vie, vm, vcpuid);
2139 2132 if (err == 0) {
2140 2133 vie_advance_pc(vie, &vcpu->nextrip);
2141 2134 } else if (err < 0) {
2142 2135 vie_exitinfo(vie, vme);
2143 2136 } else if (err == EAGAIN) {
2144 2137 /*
2145 2138 * Clear the instruction emulation state in
2146 2139 * order to re-enter VM context and continue
2147 2140 * this 'rep <instruction>'
2148 2141 */
2149 2142 vie_reset(vie);
2150 2143 err = 0;
2151 2144 }
2152 2145 }
2153 2146 break;
2154 2147 case VEC_FULFILL_INOUT:
2155 2148 err = vie_fulfill_inout(vie, &entry->u.inout);
2156 2149 if (err == 0) {
2157 2150 err = vie_emulate_inout(vie, vm, vcpuid);
2158 2151 if (err == 0) {
2159 2152 vie_advance_pc(vie, &vcpu->nextrip);
2160 2153 } else if (err < 0) {
2161 2154 vie_exitinfo(vie, vme);
2162 2155 } else if (err == EAGAIN) {
2163 2156 /*
2164 2157 * Clear the instruction emulation state in
2165 2158 * order to re-enter VM context and continue
2166 2159 * this 'rep ins/outs'
2167 2160 */
2168 2161 vie_reset(vie);
2169 2162 err = 0;
2170 2163 }
2171 2164 }
2172 2165 break;
2173 2166 default:
2174 2167 return (EINVAL);
2175 2168 }
2176 2169 return (err);
2177 2170 }
2178 2171
2179 2172 static int
2180 2173 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2181 2174 {
2182 2175 struct vie *vie;
2183 2176
2184 2177 vie = vm->vcpu[vcpuid].vie_ctx;
2185 2178
2186 2179 if (vie_pending(vie)) {
2187 2180 /*
2188 2181 * Userspace has not fulfilled the pending needs of the
2189 2182 * instruction emulation, so bail back out.
2190 2183 */
2191 2184 vie_exitinfo(vie, vme);
2192 2185 return (-1);
2193 2186 }
2194 2187
2195 2188 return (0);
2196 2189 }
2197 2190
2198 2191 int
2199 2192 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2200 2193 {
2201 2194 int error;
2202 2195 struct vcpu *vcpu;
2203 2196 struct vm_exit *vme;
2204 2197 bool intr_disabled;
2205 2198 pmap_t pmap;
2206 2199 vm_thread_ctx_t vtc;
2207 2200 int affinity_type = CPU_CURRENT;
2208 2201
2209 2202 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2210 2203 return (EINVAL);
2211 2204 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2212 2205 return (EINVAL);
2213 2206 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2214 2207 return (EINVAL);
2215 2208
2216 2209 pmap = vmspace_pmap(vm->vmspace);
2217 2210 vcpu = &vm->vcpu[vcpuid];
2218 2211 vme = &vcpu->exitinfo;
2219 2212
2220 2213 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2221 2214
2222 2215 vtc.vtc_vm = vm;
2223 2216 vtc.vtc_vcpuid = vcpuid;
2224 2217 vtc.vtc_status = 0;
2225 2218 installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2226 2219 NULL, vmm_freectx, NULL);
2227 2220
2228 2221 error = vm_entry_actions(vm, vcpuid, entry, vme);
2229 2222 if (error != 0) {
2230 2223 goto exit;
2231 2224 }
2232 2225
2233 2226 restart:
2234 2227 error = vm_loop_checks(vm, vcpuid, vme);
2235 2228 if (error != 0) {
2236 2229 goto exit;
2237 2230 }
2238 2231
2239 2232 thread_affinity_set(curthread, affinity_type);
2240 2233 /*
2241 2234 * Resource localization should happen after the CPU affinity for the
2242 2235 * thread has been set to ensure that access from restricted contexts,
2243 2236 * such as VMX-accelerated APIC operations, can occur without inducing
2244 2237 * cyclic cross-calls.
2245 2238 *
2246 2239 * This must be done prior to disabling kpreempt via critical_enter().
2247 2240 */
2248 2241 vm_localize_resources(vm, vcpu);
2249 2242 affinity_type = CPU_CURRENT;
2250 2243 critical_enter();
2251 2244
2252 2245 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2253 2246 ("vm_run: absurd pm_active"));
2254 2247
2255 2248 /* Force a trip through update_sregs to reload %fs/%gs and friends */
2256 2249 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2257 2250
2258 2251 if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2259 2252 restore_guest_fpustate(vcpu);
2260 2253 vtc.vtc_status |= VTCS_FPU_RESTORED;
2261 2254 }
2262 2255 vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2263 2256
2264 2257 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2265 2258 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2266 2259 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2267 2260
2268 2261 /*
2269 2262 * Once clear of the delicate contexts comprising the VM_RUN handler,
2270 2263 * thread CPU affinity can be loosened while other processing occurs.
2271 2264 */
2272 2265 vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2273 2266 thread_affinity_clear(curthread);
2274 2267 critical_exit();
2275 2268
2276 2269 if (error != 0) {
2277 2270 /* Communicate out any error from VMRUN() above */
2278 2271 goto exit;
2279 2272 }
2280 2273
2281 2274 vcpu->nextrip = vme->rip + vme->inst_length;
2282 2275 switch (vme->exitcode) {
2283 2276 case VM_EXITCODE_REQIDLE:
2284 2277 error = vm_handle_reqidle(vm, vcpuid);
2285 2278 break;
2286 2279 case VM_EXITCODE_RUN_STATE:
2287 2280 error = vm_handle_run_state(vm, vcpuid);
2288 2281 break;
2289 2282 case VM_EXITCODE_SUSPENDED:
2290 2283 error = vm_handle_suspend(vm, vcpuid);
2291 2284 break;
2292 2285 case VM_EXITCODE_IOAPIC_EOI:
2293 2286 vioapic_process_eoi(vm, vcpuid,
2294 2287 vme->u.ioapic_eoi.vector);
2295 2288 break;
2296 2289 case VM_EXITCODE_HLT:
2297 2290 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2298 2291 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2299 2292 break;
2300 2293 case VM_EXITCODE_PAGING:
2301 2294 error = vm_handle_paging(vm, vcpuid);
2302 2295 break;
2303 2296 case VM_EXITCODE_MMIO_EMUL:
2304 2297 error = vm_handle_mmio_emul(vm, vcpuid);
2305 2298 break;
2306 2299 case VM_EXITCODE_INOUT:
2307 2300 error = vm_handle_inout(vm, vcpuid, vme);
2308 2301 break;
2309 2302 case VM_EXITCODE_INST_EMUL:
2310 2303 error = vm_handle_inst_emul(vm, vcpuid);
2311 2304 break;
2312 2305 case VM_EXITCODE_MONITOR:
2313 2306 case VM_EXITCODE_MWAIT:
2314 2307 case VM_EXITCODE_VMINSN:
2315 2308 vm_inject_ud(vm, vcpuid);
2316 2309 break;
2317 2310 case VM_EXITCODE_RDMSR:
2318 2311 error = vm_handle_rdmsr(vm, vcpuid, vme);
2319 2312 break;
2320 2313 case VM_EXITCODE_WRMSR:
2321 2314 error = vm_handle_wrmsr(vm, vcpuid, vme);
2322 2315 break;
2323 2316 case VM_EXITCODE_HT:
2324 2317 affinity_type = CPU_BEST;
2325 2318 break;
2326 2319 case VM_EXITCODE_MTRAP:
2327 2320 vm_suspend_cpu(vm, vcpuid);
2328 2321 error = -1;
2329 2322 break;
2330 2323 default:
2331 2324 /* handled in userland */
2332 2325 error = -1;
2333 2326 break;
2334 2327 }
2335 2328
2336 2329 if (error == 0) {
2337 2330 /* VM exit conditions handled in-kernel, continue running */
2338 2331 goto restart;
2339 2332 }
2340 2333
2341 2334 exit:
2342 2335 removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2343 2336 NULL, vmm_freectx);
2344 2337
2345 2338 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2346 2339
2347 2340 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2348 2341 return (error);
2349 2342 }
2350 2343
2351 2344 int
2352 2345 vm_restart_instruction(void *arg, int vcpuid)
2353 2346 {
2354 2347 struct vm *vm;
2355 2348 struct vcpu *vcpu;
2356 2349 enum vcpu_state state;
2357 2350 uint64_t rip;
2358 2351 int error;
2359 2352
2360 2353 vm = arg;
2361 2354 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2362 2355 return (EINVAL);
2363 2356
2364 2357 vcpu = &vm->vcpu[vcpuid];
2365 2358 state = vcpu_get_state(vm, vcpuid, NULL);
2366 2359 if (state == VCPU_RUNNING) {
2367 2360 /*
2368 2361 * When a vcpu is "running" the next instruction is determined
2369 2362 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2370 2363 * Thus setting 'inst_length' to zero will cause the current
2371 2364 * instruction to be restarted.
2372 2365 */
2373 2366 vcpu->exitinfo.inst_length = 0;
2374 2367 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2375 2368 "setting inst_length to zero", vcpu->exitinfo.rip);
2376 2369 } else if (state == VCPU_FROZEN) {
2377 2370 /*
2378 2371 * When a vcpu is "frozen" it is outside the critical section
2379 2372 * around VMRUN() and 'nextrip' points to the next instruction.
2380 2373 * Thus instruction restart is achieved by setting 'nextrip'
2381 2374 * to the vcpu's %rip.
2382 2375 */
2383 2376 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2384 2377 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2385 2378 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2386 2379 "nextrip from %lx to %lx", vcpu->nextrip, rip);
2387 2380 vcpu->nextrip = rip;
2388 2381 } else {
2389 2382 panic("%s: invalid state %d", __func__, state);
2390 2383 }
2391 2384 return (0);
2392 2385 }
2393 2386
2394 2387 int
2395 2388 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2396 2389 {
2397 2390 struct vcpu *vcpu;
2398 2391 int type, vector;
2399 2392
2400 2393 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2401 2394 return (EINVAL);
2402 2395
2403 2396 vcpu = &vm->vcpu[vcpuid];
2404 2397
2405 2398 if (info & VM_INTINFO_VALID) {
2406 2399 type = info & VM_INTINFO_TYPE;
2407 2400 vector = info & 0xff;
2408 2401 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2409 2402 return (EINVAL);
2410 2403 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2411 2404 return (EINVAL);
2412 2405 if (info & VM_INTINFO_RSVD)
2413 2406 return (EINVAL);
2414 2407 } else {
2415 2408 info = 0;
2416 2409 }
2417 2410 VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2418 2411 vcpu->exitintinfo = info;
2419 2412 return (0);
2420 2413 }
2421 2414
2422 2415 enum exc_class {
2423 2416 EXC_BENIGN,
2424 2417 EXC_CONTRIBUTORY,
2425 2418 EXC_PAGEFAULT
2426 2419 };
2427 2420
2428 2421 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */
2429 2422
2430 2423 static enum exc_class
2431 2424 exception_class(uint64_t info)
2432 2425 {
2433 2426 int type, vector;
2434 2427
2435 2428 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2436 2429 type = info & VM_INTINFO_TYPE;
2437 2430 vector = info & 0xff;
2438 2431
2439 2432 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2440 2433 switch (type) {
2441 2434 case VM_INTINFO_HWINTR:
2442 2435 case VM_INTINFO_SWINTR:
2443 2436 case VM_INTINFO_NMI:
2444 2437 return (EXC_BENIGN);
2445 2438 default:
2446 2439 /*
2447 2440 * Hardware exception.
2448 2441 *
2449 2442 * SVM and VT-x use identical type values to represent NMI,
2450 2443 * hardware interrupt and software interrupt.
2451 2444 *
2452 2445 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2453 2446 * for exceptions except #BP and #OF. #BP and #OF use a type
2454 2447 * value of '5' or '6'. Therefore we don't check for explicit
2455 2448 * values of 'type' to classify 'intinfo' into a hardware
2456 2449 * exception.
2457 2450 */
2458 2451 break;
2459 2452 }
2460 2453
2461 2454 switch (vector) {
2462 2455 case IDT_PF:
2463 2456 case IDT_VE:
2464 2457 return (EXC_PAGEFAULT);
2465 2458 case IDT_DE:
2466 2459 case IDT_TS:
2467 2460 case IDT_NP:
2468 2461 case IDT_SS:
2469 2462 case IDT_GP:
2470 2463 return (EXC_CONTRIBUTORY);
2471 2464 default:
2472 2465 return (EXC_BENIGN);
2473 2466 }
2474 2467 }
2475 2468
2476 2469 static int
2477 2470 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2478 2471 uint64_t *retinfo)
2479 2472 {
2480 2473 enum exc_class exc1, exc2;
2481 2474 int type1, vector1;
2482 2475
2483 2476 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2484 2477 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2485 2478
2486 2479 /*
2487 2480 * If an exception occurs while attempting to call the double-fault
2488 2481 * handler the processor enters shutdown mode (aka triple fault).
2489 2482 */
2490 2483 type1 = info1 & VM_INTINFO_TYPE;
2491 2484 vector1 = info1 & 0xff;
2492 2485 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2493 2486 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2494 2487 info1, info2);
2495 2488 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2496 2489 *retinfo = 0;
2497 2490 return (0);
2498 2491 }
2499 2492
2500 2493 /*
2501 2494 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2502 2495 */
2503 2496 exc1 = exception_class(info1);
2504 2497 exc2 = exception_class(info2);
2505 2498 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2506 2499 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2507 2500 /* Convert nested fault into a double fault. */
2508 2501 *retinfo = IDT_DF;
2509 2502 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2510 2503 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2511 2504 } else {
2512 2505 /* Handle exceptions serially */
2513 2506 *retinfo = info2;
2514 2507 }
2515 2508 return (1);
2516 2509 }
2517 2510
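Since the classification above mirrors the Intel SDM's double-fault table, a few concrete combinations make the behaviour easy to verify. The snippet is only an illustration of the static helpers in this file; the intinfo encodings reuse the same VM_INTINFO_* bits that vcpu_exception_intinfo() below assembles:

/* vector | valid | hw-exception, as in vcpu_exception_intinfo() */
const uint64_t gp = IDT_GP | VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
const uint64_t pf = IDT_PF | VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
uint64_t out;

/* #GP raised while delivering a #GP: contributory + contributory -> #DF */
(void) nested_fault(vm, vcpuid, gp, gp, &out);	/* out encodes IDT_DF */

/* #GP raised while delivering a #PF: page fault + contributory -> #DF */
(void) nested_fault(vm, vcpuid, pf, gp, &out);	/* out encodes IDT_DF */

/* #PF raised while delivering a #GP: handled serially, no #DF */
(void) nested_fault(vm, vcpuid, gp, pf, &out);	/* out == pf */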
2518 2511 static uint64_t
2519 2512 vcpu_exception_intinfo(struct vcpu *vcpu)
2520 2513 {
2521 2514 uint64_t info = 0;
2522 2515
2523 2516 if (vcpu->exception_pending) {
2524 2517 info = vcpu->exc_vector & 0xff;
2525 2518 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2526 2519 if (vcpu->exc_errcode_valid) {
2527 2520 info |= VM_INTINFO_DEL_ERRCODE;
2528 2521 info |= (uint64_t)vcpu->exc_errcode << 32;
2529 2522 }
2530 2523 }
2531 2524 return (info);
2532 2525 }
2533 2526
2534 2527 int
2535 2528 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2536 2529 {
2537 2530 struct vcpu *vcpu;
2538 2531 uint64_t info1, info2;
2539 2532 int valid;
2540 2533
2541 2534 KASSERT(vcpuid >= 0 &&
2542 2535 vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2543 2536
2544 2537 vcpu = &vm->vcpu[vcpuid];
2545 2538
2546 2539 info1 = vcpu->exitintinfo;
2547 2540 vcpu->exitintinfo = 0;
2548 2541
2549 2542 info2 = 0;
2550 2543 if (vcpu->exception_pending) {
2551 2544 info2 = vcpu_exception_intinfo(vcpu);
2552 2545 vcpu->exception_pending = 0;
2553 2546 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2554 2547 vcpu->exc_vector, info2);
2555 2548 }
2556 2549
2557 2550 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2558 2551 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2559 2552 } else if (info1 & VM_INTINFO_VALID) {
2560 2553 *retinfo = info1;
2561 2554 valid = 1;
2562 2555 } else if (info2 & VM_INTINFO_VALID) {
2563 2556 *retinfo = info2;
2564 2557 valid = 1;
2565 2558 } else {
2566 2559 valid = 0;
2567 2560 }
2568 2561
2569 2562 if (valid) {
2570 2563 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2571 2564 "retinfo(%lx)", __func__, info1, info2, *retinfo);
2572 2565 }
2573 2566
2574 2567 return (valid);
2575 2568 }
2576 2569
2577 2570 int
2578 2571 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2579 2572 {
2580 2573 struct vcpu *vcpu;
2581 2574
2582 2575 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2583 2576 return (EINVAL);
2584 2577
2585 2578 vcpu = &vm->vcpu[vcpuid];
2586 2579 *info1 = vcpu->exitintinfo;
2587 2580 *info2 = vcpu_exception_intinfo(vcpu);
2588 2581 return (0);
2589 2582 }
2590 2583
2591 2584 int
2592 2585 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2593 2586 uint32_t errcode, int restart_instruction)
2594 2587 {
2595 2588 struct vcpu *vcpu;
2596 2589 uint64_t regval;
2597 2590 int error;
2598 2591
2599 2592 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2600 2593 return (EINVAL);
2601 2594
2602 2595 if (vector < 0 || vector >= 32)
2603 2596 return (EINVAL);
2604 2597
2605 2598 /*
2606 2599 * NMIs (which bear an exception vector of 2) are to be injected via
2607 2600 * their own specialized path using vm_inject_nmi().
2608 2601 */
2609 2602 if (vector == 2) {
2610 2603 return (EINVAL);
2611 2604 }
2612 2605
2613 2606 /*
2614 2607 * A double fault exception should never be injected directly into
2615 2608 * the guest. It is a derived exception that results from specific
2616 2609 * combinations of nested faults.
2617 2610 */
2618 2611 if (vector == IDT_DF)
2619 2612 return (EINVAL);
2620 2613
2621 2614 vcpu = &vm->vcpu[vcpuid];
2622 2615
2623 2616 if (vcpu->exception_pending) {
2624 2617 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2625 2618 "pending exception %d", vector, vcpu->exc_vector);
2626 2619 return (EBUSY);
2627 2620 }
2628 2621
2629 2622 if (errcode_valid) {
2630 2623 /*
2631 2624 * Exceptions don't deliver an error code in real mode.
2632 2625 */
2633 2626 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2634 2627 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2635 2628 if (!(regval & CR0_PE))
2636 2629 errcode_valid = 0;
2637 2630 }
2638 2631
2639 2632 /*
2640 2633 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2641 2634 *
2642 2635 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2643 2636 * one instruction or incurs an exception.
2644 2637 */
2645 2638 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2646 2639 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2647 2640 __func__, error));
2648 2641
2649 2642 if (restart_instruction)
2650 2643 vm_restart_instruction(vm, vcpuid);
2651 2644
2652 2645 vcpu->exception_pending = 1;
2653 2646 vcpu->exc_vector = vector;
2654 2647 vcpu->exc_errcode = errcode;
2655 2648 vcpu->exc_errcode_valid = errcode_valid;
2656 2649 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2657 2650 return (0);
2658 2651 }
2659 2652
2660 2653 void
2661 2654 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2662 2655 int errcode)
2663 2656 {
2664 2657 int error;
2665 2658
2666 2659 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2667 2660 errcode, 1);
2668 2661 KASSERT(error == 0, ("vm_inject_exception error %d", error));
2669 2662 }
2670 2663
2671 2664 void
2672 2665 vm_inject_ud(struct vm *vm, int vcpuid)
2673 2666 {
2674 2667 vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2675 2668 }
2676 2669
2677 2670 void
2678 2671 vm_inject_gp(struct vm *vm, int vcpuid)
2679 2672 {
2680 2673 vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2681 2674 }
2682 2675
2683 2676 void
2684 2677 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2685 2678 {
2686 2679 vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2687 2680 }
2688 2681
2689 2682 void
2690 2683 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2691 2684 {
2692 2685 vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2693 2686 }
2694 2687
2695 2688 void
2696 2689 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2697 2690 {
2698 2691 int error;
2699 2692
2700 2693 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2701 2694 error_code, cr2);
2702 2695
2703 2696 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2704 2697 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2705 2698
2706 2699 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2707 2700 }
2708 2701
2709 2702 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2710 2703
2711 2704 int
2712 2705 vm_inject_nmi(struct vm *vm, int vcpuid)
2713 2706 {
2714 2707 struct vcpu *vcpu;
2715 2708
2716 2709 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2717 2710 return (EINVAL);
2718 2711
2719 2712 vcpu = &vm->vcpu[vcpuid];
2720 2713
2721 2714 vcpu->nmi_pending = 1;
2722 2715 vcpu_notify_event(vm, vcpuid);
2723 2716 return (0);
2724 2717 }
2725 2718
2726 2719 int
2727 2720 vm_nmi_pending(struct vm *vm, int vcpuid)
2728 2721 {
2729 2722 struct vcpu *vcpu;
2730 2723
2731 2724 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2732 2725 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2733 2726
2734 2727 vcpu = &vm->vcpu[vcpuid];
2735 2728
2736 2729 return (vcpu->nmi_pending);
2737 2730 }
2738 2731
2739 2732 void
2740 2733 vm_nmi_clear(struct vm *vm, int vcpuid)
2741 2734 {
2742 2735 struct vcpu *vcpu;
2743 2736
2744 2737 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2745 2738 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2746 2739
2747 2740 vcpu = &vm->vcpu[vcpuid];
2748 2741
2749 2742 if (vcpu->nmi_pending == 0)
2750 2743 panic("vm_nmi_clear: inconsistent nmi_pending state");
2751 2744
2752 2745 vcpu->nmi_pending = 0;
2753 2746 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2754 2747 }
2755 2748
2756 2749 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2757 2750
2758 2751 int
2759 2752 vm_inject_extint(struct vm *vm, int vcpuid)
2760 2753 {
2761 2754 struct vcpu *vcpu;
2762 2755
2763 2756 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2764 2757 return (EINVAL);
2765 2758
2766 2759 vcpu = &vm->vcpu[vcpuid];
2767 2760
2768 2761 vcpu->extint_pending = 1;
2769 2762 vcpu_notify_event(vm, vcpuid);
2770 2763 return (0);
2771 2764 }
2772 2765
2773 2766 int
2774 2767 vm_extint_pending(struct vm *vm, int vcpuid)
2775 2768 {
2776 2769 struct vcpu *vcpu;
2777 2770
2778 2771 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2779 2772 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2780 2773
2781 2774 vcpu = &vm->vcpu[vcpuid];
2782 2775
2783 2776 return (vcpu->extint_pending);
2784 2777 }
2785 2778
2786 2779 void
2787 2780 vm_extint_clear(struct vm *vm, int vcpuid)
2788 2781 {
2789 2782 struct vcpu *vcpu;
2790 2783
2791 2784 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2792 2785 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2793 2786
2794 2787 vcpu = &vm->vcpu[vcpuid];
2795 2788
2796 2789 if (vcpu->extint_pending == 0)
2797 2790 panic("vm_extint_clear: inconsistent extint_pending state");
2798 2791
2799 2792 vcpu->extint_pending = 0;
2800 2793 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2801 2794 }
2802 2795
2803 2796 int
2804 2797 vm_inject_init(struct vm *vm, int vcpuid)
2805 2798 {
2806 2799 struct vcpu *vcpu;
2807 2800
2808 2801 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2809 2802 return (EINVAL);
2810 2803
2811 2804 vcpu = &vm->vcpu[vcpuid];
2812 2805 vcpu_lock(vcpu);
2813 2806 vcpu->run_state |= VRS_PEND_INIT;
2814 2807 /*
2815 2808 * As part of queuing the INIT request, clear any pending SIPI. It
2816 2809 * would not otherwise survive across the reset of the vCPU when it
2817 2810 * undergoes the requested INIT. We would not want it to linger when it
2818 2811 * could be mistaken as a subsequent (after the INIT) SIPI request.
2819 2812 */
2820 2813 vcpu->run_state &= ~VRS_PEND_SIPI;
2821 2814 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2822 2815
2823 2816 vcpu_unlock(vcpu);
2824 2817 return (0);
2825 2818 }
2826 2819
2827 2820 int
2828 2821 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2829 2822 {
2830 2823 struct vcpu *vcpu;
2831 2824
2832 2825 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2833 2826 return (EINVAL);
2834 2827
2835 2828 vcpu = &vm->vcpu[vcpuid];
2836 2829 vcpu_lock(vcpu);
2837 2830 vcpu->run_state |= VRS_PEND_SIPI;
2838 2831 vcpu->sipi_vector = vector;
2839 2832 /* SIPI is only actionable if the CPU is waiting in INIT state */
2840 2833 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2841 2834 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2842 2835 }
2843 2836 vcpu_unlock(vcpu);
2844 2837 return (0);
2845 2838 }
2846 2839
2847 2840 bool
2848 2841 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2849 2842 {
2850 2843 struct vcpu *vcpu;
2851 2844
2852 2845 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2853 2846 vcpu = &vm->vcpu[vcpuid];
2854 2847
2855 2848 /* Of interest: vCPU not in running state or with pending INIT */
2856 2849 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2857 2850 }
2858 2851
2859 2852 int
2860 2853 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2861 2854 {
2862 2855 struct seg_desc desc;
2863 2856 const enum vm_reg_name clear_regs[] = {
2864 2857 VM_REG_GUEST_CR2,
2865 2858 VM_REG_GUEST_CR3,
2866 2859 VM_REG_GUEST_CR4,
2867 2860 VM_REG_GUEST_RAX,
2868 2861 VM_REG_GUEST_RBX,
2869 2862 VM_REG_GUEST_RCX,
2870 2863 VM_REG_GUEST_RSI,
2871 2864 VM_REG_GUEST_RDI,
2872 2865 VM_REG_GUEST_RBP,
2873 2866 VM_REG_GUEST_RSP,
2874 2867 VM_REG_GUEST_R8,
2875 2868 VM_REG_GUEST_R9,
2876 2869 VM_REG_GUEST_R10,
2877 2870 VM_REG_GUEST_R11,
2878 2871 VM_REG_GUEST_R12,
2879 2872 VM_REG_GUEST_R13,
2880 2873 VM_REG_GUEST_R14,
2881 2874 VM_REG_GUEST_R15,
2882 2875 VM_REG_GUEST_DR0,
2883 2876 VM_REG_GUEST_DR1,
2884 2877 VM_REG_GUEST_DR2,
2885 2878 VM_REG_GUEST_DR3,
2886 2879 VM_REG_GUEST_EFER,
2887 2880 };
2888 2881 const enum vm_reg_name data_segs[] = {
2889 2882 VM_REG_GUEST_SS,
2890 2883 VM_REG_GUEST_DS,
2891 2884 VM_REG_GUEST_ES,
2892 2885 VM_REG_GUEST_FS,
2893 2886 VM_REG_GUEST_GS,
2894 2887 };
2895 2888 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2896 2889
2897 2890 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2898 2891 return (EINVAL);
2899 2892
2900 2893 for (uint_t i = 0; i < nitems(clear_regs); i++) {
2901 2894 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2902 2895 }
2903 2896
2904 2897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2905 2898 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2906 2899 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2907 2900
2908 2901 /*
2909 2902 * The prescribed contents of %rdx differ slightly between the Intel and
2910 2903 * AMD architectural definitions. The former expects the Extended Model
2911 2904 * in bits 16-19 where the latter expects all the Family, Model, and
2912 2905 	 * Stepping to be there. Common boot ROMs appear to disregard this
2913 2906 	 * anyway, so we stick with a compromise value similar to what is
2914 2907 * spelled out in the Intel SDM.
2915 2908 */
2916 2909 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2917 2910
2918 2911 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2919 2912 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2920 2913
2921 2914 /* CS: Present, R/W, Accessed */
2922 2915 desc.access = 0x0093;
2923 2916 desc.base = 0xffff0000;
2924 2917 desc.limit = 0xffff;
2925 2918 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2926 2919 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2927 2920
2928 2921 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2929 2922 desc.access = 0x0093;
2930 2923 desc.base = 0;
2931 2924 desc.limit = 0xffff;
2932 2925 for (uint_t i = 0; i < nitems(data_segs); i++) {
2933 2926 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2934 2927 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2935 2928 }
2936 2929
2937 2930 /* GDTR, IDTR */
2938 2931 desc.base = 0;
2939 2932 desc.limit = 0xffff;
2940 2933 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2941 2934 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2942 2935
2943 2936 /* LDTR: Present, LDT */
2944 2937 desc.access = 0x0082;
2945 2938 desc.base = 0;
2946 2939 desc.limit = 0xffff;
2947 2940 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2948 2941 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2949 2942
2950 2943 /* TR: Present, 32-bit TSS */
2951 2944 desc.access = 0x008b;
2952 2945 desc.base = 0;
2953 2946 desc.limit = 0xffff;
2954 2947 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2955 2948 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2956 2949
2957 2950 vlapic_reset(vm_lapic(vm, vcpuid));
2958 2951
2959 2952 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2960 2953
2961 2954 vcpu->exitintinfo = 0;
2962 2955 vcpu->exception_pending = 0;
2963 2956 vcpu->nmi_pending = 0;
2964 2957 vcpu->extint_pending = 0;
2965 2958
2966 2959 /*
2967 2960 * A CPU reset caused by power-on or system reset clears more state than
2968 2961 	 * one which is triggered from an INIT IPI.
2969 2962 */
2970 2963 if (!init_only) {
2971 2964 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2972 2965 fpu_save_area_reset(vcpu->guestfpu);
2973 2966
2974 2967 /* XXX: clear MSRs and other pieces */
2975 2968 }
2976 2969
2977 2970 return (0);
2978 2971 }
2979 2972
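The CS descriptor and %rip values written above combine to put the first instruction fetch at the architectural reset vector; a one-line check using only numbers that appear in the function:

/*
 * CS.base + %rip = 0xffff0000 + 0xfff0 = 0xfffffff0,
 * the x86 reset vector, 16 bytes below the top of 4 GiB.
 */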
2980 2973 static int
2981 2974 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2982 2975 {
2983 2976 struct seg_desc desc;
2984 2977
2985 2978 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2986 2979 return (EINVAL);
2987 2980
2988 2981 /* CS: Present, R/W, Accessed */
2989 2982 desc.access = 0x0093;
2990 2983 desc.base = (uint64_t)vector << 12;
2991 2984 desc.limit = 0xffff;
2992 2985 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2993 2986 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2994 2987 (uint64_t)vector << 8));
2995 2988
2996 2989 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2997 2990
2998 2991 return (0);
2999 2992 }
3000 2993
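The shifts above encode the usual SIPI convention: the 8-bit startup vector selects a 4 KiB-aligned real-mode entry point. A worked example with a hypothetical vector value:

/*
 * For a SIPI vector of 0x9a (hypothetical):
 *   CS.base     = 0x9a << 12 = 0x9a000
 *   CS selector = 0x9a <<  8 = 0x9a00    (selector * 16 == base, real mode)
 *   %rip        = 0
 * so the AP begins execution at physical address 0x9a000.
 */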
3001 2994 int
3002 2995 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
3003 2996 {
3004 2997 if (vcpu < 0 || vcpu >= vm->maxcpus)
3005 2998 return (EINVAL);
3006 2999
3007 3000 if (type < 0 || type >= VM_CAP_MAX)
3008 3001 return (EINVAL);
3009 3002
3010 3003 return (VMGETCAP(vm->cookie, vcpu, type, retval));
3011 3004 }
3012 3005
3013 3006 int
3014 3007 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3015 3008 {
3016 3009 if (vcpu < 0 || vcpu >= vm->maxcpus)
3017 3010 return (EINVAL);
3018 3011
3019 3012 if (type < 0 || type >= VM_CAP_MAX)
3020 3013 return (EINVAL);
3021 3014
3022 3015 return (VMSETCAP(vm->cookie, vcpu, type, val));
3023 3016 }
3024 3017
3025 3018 struct vlapic *
3026 3019 vm_lapic(struct vm *vm, int cpu)
3027 3020 {
3028 3021 return (vm->vcpu[cpu].vlapic);
3029 3022 }
3030 3023
3031 3024 struct vioapic *
3032 3025 vm_ioapic(struct vm *vm)
3033 3026 {
3034 3027
3035 3028 return (vm->vioapic);
3036 3029 }
3037 3030
3038 3031 struct vhpet *
3039 3032 vm_hpet(struct vm *vm)
3040 3033 {
3041 3034
3042 3035 return (vm->vhpet);
3043 3036 }
3044 3037
3045 3038 void *
3046 3039 vm_iommu_domain(struct vm *vm)
3047 3040 {
3048 3041
3049 3042 return (vm->iommu);
3050 3043 }
3051 3044
3052 3045 int
3053 3046 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3054 3047 bool from_idle)
3055 3048 {
3056 3049 int error;
3057 3050 struct vcpu *vcpu;
3058 3051
3059 3052 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3060 3053 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3061 3054
3062 3055 vcpu = &vm->vcpu[vcpuid];
3063 3056
3064 3057 vcpu_lock(vcpu);
3065 3058 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3066 3059 vcpu_unlock(vcpu);
3067 3060
3068 3061 return (error);
3069 3062 }
3070 3063
3071 3064 enum vcpu_state
3072 3065 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3073 3066 {
3074 3067 struct vcpu *vcpu;
3075 3068 enum vcpu_state state;
3076 3069
3077 3070 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3078 3071 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3079 3072
3080 3073 vcpu = &vm->vcpu[vcpuid];
3081 3074
3082 3075 vcpu_lock(vcpu);
3083 3076 state = vcpu->state;
3084 3077 if (hostcpu != NULL)
3085 3078 *hostcpu = vcpu->hostcpu;
3086 3079 vcpu_unlock(vcpu);
3087 3080
3088 3081 return (state);
3089 3082 }
3090 3083
3091 3084 uint64_t
3092 3085 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3093 3086 {
3094 3087 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3095 3088
3096 3089 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3097 3090
3098 3091 if (phys_adj) {
3099 3092 /* Include any offset for the current physical CPU too */
3100 3093 extern hrtime_t tsc_gethrtime_tick_delta(void);
3101 3094 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3102 3095 }
3103 3096
3104 3097 return (vcpu_off);
3105 3098 }
3106 3099
3107 3100 int
3108 3101 vm_activate_cpu(struct vm *vm, int vcpuid)
3109 3102 {
3110 3103
3111 3104 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3112 3105 return (EINVAL);
3113 3106
3114 3107 if (CPU_ISSET(vcpuid, &vm->active_cpus))
3115 3108 return (EBUSY);
3116 3109
3117 3110 VCPU_CTR0(vm, vcpuid, "activated");
3118 3111 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3119 3112 return (0);
3120 3113 }
3121 3114
3122 3115 int
3123 3116 vm_suspend_cpu(struct vm *vm, int vcpuid)
3124 3117 {
3125 3118 int i;
3126 3119
3127 3120 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3128 3121 return (EINVAL);
3129 3122
3130 3123 if (vcpuid == -1) {
3131 3124 vm->debug_cpus = vm->active_cpus;
3132 3125 for (i = 0; i < vm->maxcpus; i++) {
3133 3126 if (CPU_ISSET(i, &vm->active_cpus))
3134 3127 vcpu_notify_event(vm, i);
3135 3128 }
3136 3129 } else {
3137 3130 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3138 3131 return (EINVAL);
3139 3132
3140 3133 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3141 3134 vcpu_notify_event(vm, vcpuid);
3142 3135 }
3143 3136 return (0);
3144 3137 }
3145 3138
3146 3139 int
3147 3140 vm_resume_cpu(struct vm *vm, int vcpuid)
3148 3141 {
3149 3142
3150 3143 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3151 3144 return (EINVAL);
3152 3145
3153 3146 if (vcpuid == -1) {
3154 3147 CPU_ZERO(&vm->debug_cpus);
3155 3148 } else {
3156 3149 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3157 3150 return (EINVAL);
3158 3151
3159 3152 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3160 3153 }
3161 3154 return (0);
3162 3155 }
3163 3156
3164 3157 static bool
3165 3158 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3166 3159 uint64_t entry_rip)
3167 3160 {
3168 3161 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3169 3162 struct vm_exit *vme = &vcpu->exitinfo;
3170 3163 bool bail = false;
3171 3164
3172 3165 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3173 3166
3174 3167 if (vm->suspend) {
3175 3168 if (on_entry) {
3176 3169 VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3177 3170 vm->suspend < VM_SUSPEND_LAST);
3178 3171
3179 3172 vme->exitcode = VM_EXITCODE_SUSPENDED;
3180 3173 vme->u.suspended.how = vm->suspend;
3181 3174 } else {
3182 3175 /*
3183 3176 * Handling VM suspend is complicated, so if that
3184 3177 * condition is detected outside of VM-entry itself,
3185 3178 * just emit a BOGUS exitcode so we take a lap to pick
3186 3179 * up the event during an entry and are directed into
3187 3180 * the vm_handle_suspend() logic.
3188 3181 */
3189 3182 vme->exitcode = VM_EXITCODE_BOGUS;
3190 3183 }
3191 3184 bail = true;
3192 3185 }
3193 3186 if (vcpu->reqidle) {
3194 3187 vme->exitcode = VM_EXITCODE_REQIDLE;
3195 3188 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3196 3189
3197 3190 if (!on_entry) {
3198 3191 /*
3199 3192 * A reqidle request detected outside of VM-entry can be
3200 3193 * handled directly by clearing the request (and taking
3201 3194 * a lap to userspace).
3202 3195 */
3203 3196 vcpu_assert_locked(vcpu);
3204 3197 vcpu->reqidle = 0;
3205 3198 }
3206 3199 bail = true;
3207 3200 }
3208 3201 if (vcpu_should_yield(vm, vcpuid)) {
3209 3202 vme->exitcode = VM_EXITCODE_BOGUS;
3210 3203 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3211 3204 bail = true;
3212 3205 }
3213 3206 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3214 3207 vme->exitcode = VM_EXITCODE_DEBUG;
3215 3208 bail = true;
3216 3209 }
3217 3210
3218 3211 if (bail) {
3219 3212 if (on_entry) {
3220 3213 /*
3221 3214 * If bailing out during VM-entry, the current %rip must
3222 3215 * be recorded in the exitinfo.
3223 3216 */
3224 3217 vme->rip = entry_rip;
3225 3218 }
3226 3219 vme->inst_length = 0;
3227 3220 }
3228 3221 return (bail);
3229 3222 }
3230 3223
3231 3224 static bool
3232 3225 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3233 3226 {
3234 3227 /*
3235 3228 	 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3236 3229 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3237 3230 * structure, and we would only modify the exitcode.
3238 3231 */
3239 3232 return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3240 3233 }
3241 3234
3242 3235 bool
3243 3236 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3244 3237 {
3245 3238 /*
3246 3239 * Bail-out checks done as part of VM entry require an updated %rip to
3247 3240 * populate the vm_exit struct if any of the conditions of interest are
3248 3241 * matched in the check.
3249 3242 */
3250 3243 return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3251 3244 }
3252 3245
3253 3246 cpuset_t
3254 3247 vm_active_cpus(struct vm *vm)
3255 3248 {
3256 3249
3257 3250 return (vm->active_cpus);
3258 3251 }
3259 3252
3260 3253 cpuset_t
3261 3254 vm_debug_cpus(struct vm *vm)
3262 3255 {
3263 3256
3264 3257 return (vm->debug_cpus);
3265 3258 }
3266 3259
3267 3260 cpuset_t
3268 3261 vm_suspended_cpus(struct vm *vm)
3269 3262 {
3270 3263
3271 3264 return (vm->suspended_cpus);
3272 3265 }
3273 3266
3274 3267 void *
3275 3268 vcpu_stats(struct vm *vm, int vcpuid)
3276 3269 {
3277 3270
3278 3271 return (vm->vcpu[vcpuid].stats);
3279 3272 }
3280 3273
3281 3274 int
3282 3275 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3283 3276 {
3284 3277 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3285 3278 return (EINVAL);
3286 3279
3287 3280 *state = vm->vcpu[vcpuid].x2apic_state;
3288 3281
3289 3282 return (0);
3290 3283 }
3291 3284
3292 3285 int
3293 3286 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3294 3287 {
3295 3288 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3296 3289 return (EINVAL);
3297 3290
3298 3291 if (state >= X2APIC_STATE_LAST)
3299 3292 return (EINVAL);
3300 3293
3301 3294 vm->vcpu[vcpuid].x2apic_state = state;
3302 3295
3303 3296 vlapic_set_x2apic_state(vm, vcpuid, state);
3304 3297
3305 3298 return (0);
3306 3299 }
3307 3300
3308 3301 /*
3309 3302 * This function is called to ensure that a vcpu "sees" a pending event
3310 3303 * as soon as possible:
3311 3304 * - If the vcpu thread is sleeping then it is woken up.
3312 3305 * - If the vcpu is running on a different host_cpu then an IPI will be directed
3313 3306 * to the host_cpu to cause the vcpu to trap into the hypervisor.
3314 3307 */
3315 3308 static void
3316 3309 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3317 3310 {
3318 3311 int hostcpu;
3319 3312
3320 3313 	ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3321 3314
3322 3315 hostcpu = vcpu->hostcpu;
3323 3316 if (vcpu->state == VCPU_RUNNING) {
3324 3317 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3325 3318 if (hostcpu != curcpu) {
3326 3319 if (ntype == VCPU_NOTIFY_APIC) {
3327 3320 vlapic_post_intr(vcpu->vlapic, hostcpu,
3328 3321 vmm_ipinum);
3329 3322 } else {
3330 3323 ipi_cpu(hostcpu, vmm_ipinum);
3331 3324 }
3332 3325 } else {
3333 3326 /*
3334 3327 * If the 'vcpu' is running on 'curcpu' then it must
3335 3328 * be sending a notification to itself (e.g. SELF_IPI).
3336 3329 * The pending event will be picked up when the vcpu
3337 3330 * transitions back to guest context.
3338 3331 */
3339 3332 }
3340 3333 } else {
3341 3334 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3342 3335 "with hostcpu %d", vcpu->state, hostcpu));
3343 3336 if (vcpu->state == VCPU_SLEEPING) {
3344 3337 cv_signal(&vcpu->vcpu_cv);
3345 3338 }
3346 3339 }
3347 3340 }
3348 3341
3349 3342 void
3350 3343 vcpu_notify_event(struct vm *vm, int vcpuid)
3351 3344 {
3352 3345 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3353 3346
3354 3347 vcpu_lock(vcpu);
3355 3348 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3356 3349 vcpu_unlock(vcpu);
3357 3350 }
3358 3351
3359 3352 void
3360 3353 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3361 3354 {
3362 3355 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3363 3356
3364 3357 if (ntype == VCPU_NOTIFY_NONE) {
3365 3358 return;
3366 3359 }
3367 3360
3368 3361 vcpu_lock(vcpu);
3369 3362 vcpu_notify_event_locked(vcpu, ntype);
3370 3363 vcpu_unlock(vcpu);
3371 3364 }
3372 3365
3373 3366 void
3374 3367 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3375 3368 {
3376 3369 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3377 3370 hrtime_t now = gethrtime();
3378 3371
3379 3372 ASSERT3U(ustate, !=, vcpu->ustate);
3380 3373 ASSERT3S(ustate, <, VU_MAX);
3381 3374 ASSERT3S(ustate, >=, VU_INIT);
3382 3375
3383 3376 hrtime_t delta = now - vcpu->ustate_when;
3384 3377 vcpu->ustate_total[vcpu->ustate] += delta;
3385 3378
3386 3379 membar_producer();
3387 3380
3388 3381 vcpu->ustate_when = now;
3389 3382 vcpu->ustate = ustate;
3390 3383 }
3391 3384
3392 3385 struct vmspace *
3393 3386 vm_get_vmspace(struct vm *vm)
3394 3387 {
3395 3388
3396 3389 return (vm->vmspace);
3397 3390 }
3398 3391
3399 3392 int
3400 3393 vm_apicid2vcpuid(struct vm *vm, int apicid)
3401 3394 {
3402 3395 /*
3403 3396 * XXX apic id is assumed to be numerically identical to vcpu id
3404 3397 */
3405 3398 return (apicid);
3406 3399 }
3407 3400
3408 3401 struct vatpic *
3409 3402 vm_atpic(struct vm *vm)
3410 3403 {
3411 3404 return (vm->vatpic);
3412 3405 }
3413 3406
3414 3407 struct vatpit *
3415 3408 vm_atpit(struct vm *vm)
3416 3409 {
3417 3410 return (vm->vatpit);
3418 3411 }
3419 3412
3420 3413 struct vpmtmr *
3421 3414 vm_pmtmr(struct vm *vm)
3422 3415 {
3423 3416
3424 3417 return (vm->vpmtmr);
3425 3418 }
3426 3419
3427 3420 struct vrtc *
3428 3421 vm_rtc(struct vm *vm)
3429 3422 {
3430 3423
3431 3424 return (vm->vrtc);
3432 3425 }
3433 3426
3434 3427 enum vm_reg_name
3435 3428 vm_segment_name(int seg)
3436 3429 {
3437 3430 static enum vm_reg_name seg_names[] = {
3438 3431 VM_REG_GUEST_ES,
3439 3432 VM_REG_GUEST_CS,
3440 3433 VM_REG_GUEST_SS,
3441 3434 VM_REG_GUEST_DS,
3442 3435 VM_REG_GUEST_FS,
3443 3436 VM_REG_GUEST_GS
3444 3437 };
3445 3438
3446 3439 KASSERT(seg >= 0 && seg < nitems(seg_names),
3447 3440 ("%s: invalid segment encoding %d", __func__, seg));
3448 3441 return (seg_names[seg]);
3449 3442 }
3450 3443
3451 3444 void
3452 3445 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3453 3446 int num_copyinfo)
3454 3447 {
3455 3448 int idx;
3456 3449
3457 3450 for (idx = 0; idx < num_copyinfo; idx++) {
3458 3451 if (copyinfo[idx].cookie != NULL)
3459 3452 vm_gpa_release(copyinfo[idx].cookie);
3460 3453 }
3461 3454 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3462 3455 }
3463 3456
3464 3457 int
3465 3458 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3466 3459 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3467 3460 int num_copyinfo, int *fault)
3468 3461 {
3469 3462 int error, idx, nused;
3470 3463 size_t n, off, remaining;
3471 3464 void *hva, *cookie;
3472 3465 uint64_t gpa;
3473 3466
3474 3467 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3475 3468
3476 3469 nused = 0;
3477 3470 remaining = len;
3478 3471 while (remaining > 0) {
3479 3472 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3480 3473 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3481 3474 if (error || *fault)
3482 3475 return (error);
3483 3476 off = gpa & PAGE_MASK;
3484 3477 n = min(remaining, PAGE_SIZE - off);
3485 3478 copyinfo[nused].gpa = gpa;
3486 3479 copyinfo[nused].len = n;
3487 3480 remaining -= n;
3488 3481 gla += n;
3489 3482 nused++;
3490 3483 }
3491 3484
3492 3485 for (idx = 0; idx < nused; idx++) {
3493 3486 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3494 3487 copyinfo[idx].len, prot, &cookie);
3495 3488 if (hva == NULL)
3496 3489 break;
3497 3490 copyinfo[idx].hva = hva;
3498 3491 copyinfo[idx].cookie = cookie;
3499 3492 }
3500 3493
3501 3494 if (idx != nused) {
3502 3495 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3503 3496 return (EFAULT);
3504 3497 } else {
3505 3498 *fault = 0;
3506 3499 return (0);
3507 3500 }
3508 3501 }
3509 3502
3510 3503 void
3511 3504 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3512 3505 size_t len)
3513 3506 {
3514 3507 char *dst;
3515 3508 int idx;
3516 3509
3517 3510 dst = kaddr;
3518 3511 idx = 0;
3519 3512 while (len > 0) {
3520 3513 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3521 3514 len -= copyinfo[idx].len;
3522 3515 dst += copyinfo[idx].len;
3523 3516 idx++;
3524 3517 }
3525 3518 }
3526 3519
3527 3520 void
3528 3521 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3529 3522 struct vm_copyinfo *copyinfo, size_t len)
3530 3523 {
3531 3524 const char *src;
3532 3525 int idx;
3533 3526
3534 3527 src = kaddr;
3535 3528 idx = 0;
3536 3529 while (len > 0) {
3537 3530 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3538 3531 len -= copyinfo[idx].len;
3539 3532 src += copyinfo[idx].len;
3540 3533 idx++;
3541 3534 }
3542 3535 }
3543 3536
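A sketch of how the copy helpers above are typically combined by an emulation path: map the guest-linear range, bail out if translation faulted (the exception has already been queued for the guest), move the bytes, then drop the holds. The buffer size, copyinfo count, and surrounding variables (vm, vcpuid, paging, gla) are assumptions of the sketch; only the function signatures come from the code above.

struct vm_copyinfo copyinfo[2];		/* covers a range crossing one page */
uint8_t buf[16];
int error, fault;

error = vm_copy_setup(vm, vcpuid, paging, gla, sizeof (buf), PROT_READ,
    copyinfo, nitems(copyinfo), &fault);
if (error != 0 || fault != 0)
	return (error);

/* Read guest memory into buf; vm_copyout() is the mirror image for writes */
vm_copyin(vm, vcpuid, copyinfo, buf, sizeof (buf));
vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));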
3544 3537 /*
3545 3538 * Return the amount of in-use and wired memory for the VM. Since
3546 3539  * these are global stats, only return the values for vCPU 0
3547 3540 */
3548 3541 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3549 3542 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3550 3543
3551 3544 static void
3552 3545 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3553 3546 {
3554 3547
3555 3548 if (vcpu == 0) {
3556 3549 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3557 3550 PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3558 3551 }
3559 3552 }
3560 3553
3561 3554 static void
3562 3555 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3563 3556 {
3564 3557
3565 3558 if (vcpu == 0) {
3566 3559 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3567 3560 PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3568 3561 }
3569 3562 }
3570 3563
3571 3564 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3572 3565 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
3573 3566
3574 3567 int
3575 3568 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3576 3569 uint8_t bytes, uint32_t *val)
3577 3570 {
3578 3571 return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3579 3572 }
3580 3573
3581 3574 /*
3582 3575 * bhyve-internal interfaces to attach or detach IO port handlers.
3583 3576 * Must be called with VM write lock held for safety.
3584 3577 */
3585 3578 int
3586 3579 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3587 3580 void **cookie)
3588 3581 {
3589 3582 int err;
3590 3583 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3591 3584 if (err == 0) {
3592 3585 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3593 3586 }
3594 3587 return (err);
3595 3588 }
3596 3589 int
3597 3590 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3598 3591 void **old_arg)
3599 3592 {
3600 3593 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3601 3594 int err;
3602 3595
3603 3596 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3604 3597 if (err == 0) {
3605 3598 *cookie = NULL;
3606 3599 }
3607 3600 return (err);
3608 3601 }
3609 3602
3610 3603 /*
3611 3604 * External driver interfaces to attach or detach IO port handlers.
3612 3605 * Must be called with VM write lock held for safety.
3613 3606 */
3614 3607 int
3615 3608 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3616 3609 void *arg, void **cookie)
3617 3610 {
3618 3611 int err;
3619 3612
3620 3613 if (port == 0) {
3621 3614 return (EINVAL);
3622 3615 }
3623 3616
3624 3617 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3625 3618 if (err == 0) {
3626 3619 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3627 3620 }
3628 3621 return (err);
3629 3622 }
3630 3623 void
3631 3624 vm_ioport_unhook(struct vm *vm, void **cookie)
3632 3625 {
3633 3626 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3634 3627 ioport_handler_t old_func;
3635 3628 void *old_arg;
3636 3629 int err;
3637 3630
3638 3631 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3639 3632
3640 3633 /* ioport-hook-using drivers are expected to be well-behaved */
3641 3634 VERIFY0(err);
3642 3635 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3643 3636
3644 3637 *cookie = NULL;
3645 3638 }
3646 3639
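For an external driver, the hook/unhook pair above is used roughly as sketched below. The handler prototype is an assumption inferred from vm_ioport_access()/vm_inout_access() (arg, direction, port, width, value pointer), and the port number and driver names are hypothetical; treat this as a sketch, not the definitive ioport_handler_t definition.

/* Hypothetical handler for one emulated I/O port; prototype is assumed. */
static int
mydrv_ioport_handler(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	if (in)
		*val = 0;	/* reads return zero in this sketch */
	return (0);		/* writes are ignored */
}

static int
mydrv_attach_port(struct vm *vm, void *arg, void **cookiep)
{
	/* Caller must hold the VM write lock, per the comment above. */
	return (vm_ioport_hook(vm, 0x510, mydrv_ioport_handler, arg, cookiep));
}

static void
mydrv_detach_port(struct vm *vm, void **cookiep)
{
	vm_ioport_unhook(vm, cookiep);	/* *cookiep is cleared on return */
}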
3647 3640 int
3648 3641 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3649 3642 {
3650 3643 struct vm *vm = ksp->ks_private;
3651 3644 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3652 3645 const int vcpuid = vvk->vvk_vcpu.value.ui32;
3653 3646 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3654 3647
3655 3648 ASSERT3U(vcpuid, <, VM_MAXCPU);
3656 3649
3657 3650 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3658 3651 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3659 3652 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3660 3653 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3661 3654 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3662 3655 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3663 3656
3664 3657 return (0);
3665 3658 }
3666 -
3667 -int
3668 -vm_arc_resv(struct vm *vm, uint64_t len)
3669 -{
3670 - /* Since we already have the compat macros included, we use those */
3671 - size_t pages = (size_t)roundup2(len, PAGE_SIZE) >> PAGE_SHIFT;
3672 - int err = 0;
3673 -
3674 - err = arc_virt_machine_reserve(pages);
3675 - if (err != 0)
3676 - return (err);
3677 -
3678 - vm->arc_resv += pages;
3679 - return (0);
3680 -}