13275 bhyve needs richer INIT/SIPI support
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
--- old/usr/src/uts/i86pc/io/vmm/vmm.c
+++ new/usr/src/uts/i86pc/io/vmm/vmm.c
1 1 /*-
2 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 3 *
4 4 * Copyright (c) 2011 NetApp, Inc.
5 5 * All rights reserved.
6 6 *
7 7 * Redistribution and use in source and binary forms, with or without
8 8 * modification, are permitted provided that the following conditions
9 9 * are met:
10 10 * 1. Redistributions of source code must retain the above copyright
11 11 * notice, this list of conditions and the following disclaimer.
12 12 * 2. Redistributions in binary form must reproduce the above copyright
13 13 * notice, this list of conditions and the following disclaimer in the
14 14 * documentation and/or other materials provided with the distribution.
15 15 *
16 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 26 * SUCH DAMAGE.
27 27 *
28 28 * $FreeBSD$
29 29 */
30 30 /*
31 31 * This file and its contents are supplied under the terms of the
32 32 * Common Development and Distribution License ("CDDL"), version 1.0.
33 33 * You may only use this file in accordance with the terms of version
34 34 * 1.0 of the CDDL.
35 35 *
36 36 * A full copy of the text of the CDDL should have accompanied this
37 37 * source. A copy of the CDDL is also available via the Internet at
38 38 * http://www.illumos.org/license/CDDL.
39 39 *
40 40 * Copyright 2015 Pluribus Networks Inc.
41 41 * Copyright 2018 Joyent, Inc.
42 42 * Copyright 2020 Oxide Computer Company
43 43 */
44 44
45 45 #include <sys/cdefs.h>
46 46 __FBSDID("$FreeBSD$");
47 47
48 48 #include <sys/param.h>
49 49 #include <sys/systm.h>
50 50 #include <sys/kernel.h>
51 51 #include <sys/module.h>
52 52 #include <sys/sysctl.h>
53 53 #include <sys/malloc.h>
54 54 #include <sys/pcpu.h>
55 55 #include <sys/lock.h>
56 56 #include <sys/mutex.h>
57 57 #include <sys/proc.h>
58 58 #include <sys/rwlock.h>
59 59 #include <sys/sched.h>
60 60 #include <sys/smp.h>
61 61 #include <sys/systm.h>
62 62
63 63 #include <vm/vm.h>
64 64 #include <vm/vm_object.h>
65 65 #include <vm/vm_map.h>
66 66 #include <vm/vm_page.h>
67 67 #include <vm/pmap.h>
68 68 #include <vm/vm_extern.h>
69 69 #include <vm/vm_param.h>
70 70
71 71 #ifdef __FreeBSD__
72 72 #include <machine/cpu.h>
73 73 #endif
74 74 #include <machine/pcb.h>
75 75 #include <machine/smp.h>
76 76 #include <machine/md_var.h>
77 77 #include <x86/psl.h>
78 78 #include <x86/apicreg.h>
79 79
80 80 #include <machine/vmm.h>
81 81 #include <machine/vmm_dev.h>
82 82 #include <sys/vmm_instruction_emul.h>
83 83
84 84 #include "vmm_ioport.h"
85 85 #include "vmm_ktr.h"
86 86 #include "vmm_host.h"
87 87 #include "vmm_mem.h"
88 88 #include "vmm_util.h"
89 89 #include "vatpic.h"
90 90 #include "vatpit.h"
91 91 #include "vhpet.h"
92 92 #include "vioapic.h"
93 93 #include "vlapic.h"
94 94 #include "vpmtmr.h"
95 95 #include "vrtc.h"
96 96 #include "vmm_stat.h"
97 97 #include "vmm_lapic.h"
98 98
99 99 #include "io/ppt.h"
100 100 #include "io/iommu.h"
101 101
( 101 lines elided )
102 102 struct vlapic;
103 103
104 104 /*
105 105 * Initialization:
106 106 * (a) allocated when vcpu is created
107 107 * (i) initialized when vcpu is created and when it is reinitialized
108 108 * (o) initialized the first time the vcpu is created
109 109 * (x) initialized before use
110 110 */
111 111 struct vcpu {
112 - struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */
112 + /* (o) protects state, run_state, hostcpu, sipi_vector */
113 + struct mtx mtx;
114 +
113 115 enum vcpu_state state; /* (o) vcpu state */
114 -#ifndef __FreeBSD__
116 + enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */
115 117 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */
116 118 kcondvar_t state_cv; /* (o) IDLE-transition cv */
117 -#endif /* __FreeBSD__ */
118 119 int hostcpu; /* (o) vcpu's current host cpu */
119 -#ifndef __FreeBSD__
120 120 int lastloccpu; /* (o) last host cpu localized to */
121 -#endif
122 - uint_t runblock; /* (i) block vcpu from run state */
123 121 int reqidle; /* (i) request vcpu to idle */
124 122 struct vlapic *vlapic; /* (i) APIC device model */
125 123 enum x2apic_state x2apic_state; /* (i) APIC mode */
126 124 uint64_t exitintinfo; /* (i) events pending at VM exit */
127 125 int nmi_pending; /* (i) NMI pending */
128 126 int extint_pending; /* (i) INTR pending */
129 127 int exception_pending; /* (i) exception pending */
130 128 int exc_vector; /* (x) exception collateral */
131 129 int exc_errcode_valid;
132 130 uint32_t exc_errcode;
131 + uint8_t sipi_vector; /* (i) SIPI vector */
133 132 struct savefpu *guestfpu; /* (a,i) guest fpu state */
134 133 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
135 134 void *stats; /* (a,i) statistics */
136 135 struct vm_exit exitinfo; /* (x) exit reason and collateral */
137 136 uint64_t nextrip; /* (x) next instruction to execute */
138 137 struct vie *vie_ctx; /* (x) instruction emulation context */
139 138 #ifndef __FreeBSD__
140 139 uint64_t tsc_offset; /* (x) offset from host TSC */
141 140 #endif
142 141 };
143 142
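Review note: the new run_state / sipi_vector pair replaces the VM-wide SIPI bookkeeping that is removed from struct vm below, and both fields are covered by the vcpu spin lock per the updated comment on mtx. The VRS_* constants themselves live in the vmm headers rather than in this file. For orientation only, a flag set of the following shape would satisfy the names used later in this diff (VRS_HALT, VRS_INIT, VRS_RUN, VRS_PEND_INIT); the exact bit values and the VRS_PEND_SIPI name are assumptions, not taken from this change:

	/* Hypothetical sketch of the vcpu run-state flags (see vmm headers). */
	#define	VRS_HALT	0		/* powered on, waiting for INIT */
	#define	VRS_INIT	(1 << 0)	/* INIT has been received */
	#define	VRS_RUN		(1 << 1)	/* SIPI received, vcpu runnable */
	#define	VRS_PEND_INIT	(1 << 14)	/* INIT delivery pending */
	#define	VRS_PEND_SIPI	(1 << 15)	/* SIPI delivery pending (assumed) */
	#define	VRS_IS_VALID(v)	\
	    (((v) & ~(VRS_INIT | VRS_RUN | VRS_PEND_INIT | VRS_PEND_SIPI)) == 0)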
144 143 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
145 144 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
146 145 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
147 146 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
148 147 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
149 148
150 149 struct mem_seg {
151 150 size_t len;
152 151 bool sysmem;
153 152 struct vm_object *object;
154 153 };
155 154 #ifdef __FreeBSD__
156 155 #define VM_MAX_MEMSEGS 3
157 156 #else
158 157 #define VM_MAX_MEMSEGS 4
159 158 #endif
160 159
161 160 struct mem_map {
162 161 vm_paddr_t gpa;
163 162 size_t len;
164 163 vm_ooffset_t segoff;
165 164 int segid;
166 165 int prot;
167 166 int flags;
168 167 };
169 168 #define VM_MAX_MEMMAPS 8
170 169
171 170 /*
172 171 * Initialization:
173 172 * (o) initialized the first time the VM is created
174 173 * (i) initialized when VM is created and when it is reinitialized
175 174 * (x) initialized before use
176 175 */
177 176 struct vm {
178 177 void *cookie; /* (i) cpu-specific data */
179 178 void *iommu; /* (x) iommu-specific data */
180 179 struct vhpet *vhpet; /* (i) virtual HPET */
181 180 struct vioapic *vioapic; /* (i) virtual ioapic */
182 181 struct vatpic *vatpic; /* (i) virtual atpic */
183 182 struct vatpit *vatpit; /* (i) virtual atpit */
184 183 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
185 184 struct vrtc *vrtc; /* (o) virtual RTC */
186 185 volatile cpuset_t active_cpus; /* (i) active vcpus */
187 186 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */
188 187 int suspend; /* (i) stop VM execution */
189 188 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
190 189 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
191 190 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
192 191 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
( 50 lines elided )
193 192 struct vmspace *vmspace; /* (o) guest's address space */
194 193 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
195 194 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
196 195 /* The following describe the vm cpu topology */
197 196 uint16_t sockets; /* (o) num of sockets */
198 197 uint16_t cores; /* (o) num of cores/socket */
199 198 uint16_t threads; /* (o) num of threads/core */
200 199 uint16_t maxcpus; /* (o) max pluggable cpus */
201 200
202 201 struct ioport_config ioports; /* (o) ioport handling */
203 -
204 - bool sipi_req; /* (i) SIPI requested */
205 - int sipi_req_vcpu; /* (i) SIPI destination */
206 - uint64_t sipi_req_rip; /* (i) SIPI start %rip */
207 -
208 - /* Miscellaneous VM-wide statistics and counters */
209 - struct vm_wide_stats {
210 - uint64_t sipi_supersede;
211 - } stats;
212 202 };
213 203
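Review note: the VM-wide SIPI request fields (sipi_req, sipi_req_vcpu, sipi_req_rip) and the sipi_supersede counter are dropped here; with richer INIT/SIPI support the pending work is recorded on the target vcpu instead. An illustrative fragment of what queuing a SIPI on a vcpu looks like under the new model (not code from this diff; VRS_PEND_SIPI is an assumed flag name, and target/vector stand in for caller-supplied values):

	struct vcpu *vcpu = &vm->vcpu[target];

	vcpu_lock(vcpu);
	vcpu->run_state |= VRS_PEND_SIPI;	/* mark SIPI pending on this vcpu */
	vcpu->sipi_vector = vector;		/* start vector for the AP */
	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	vcpu_unlock(vcpu);

This mirrors the body of vm_set_run_state() added further down.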
214 204 static int vmm_initialized;
215 205
216 206
217 207 static void
218 208 nullop_panic(void)
219 209 {
220 210 panic("null vmm operation call");
221 211 }
222 212
223 213 /* Do not allow use of an un-set `ops` to do anything but panic */
224 214 static struct vmm_ops vmm_ops_null = {
225 215 .init = (vmm_init_func_t)nullop_panic,
226 216 .cleanup = (vmm_cleanup_func_t)nullop_panic,
227 217 .resume = (vmm_resume_func_t)nullop_panic,
228 218 .vminit = (vmi_init_func_t)nullop_panic,
229 219 .vmrun = (vmi_run_func_t)nullop_panic,
230 220 .vmcleanup = (vmi_cleanup_func_t)nullop_panic,
231 221 .vmgetreg = (vmi_get_register_t)nullop_panic,
232 222 .vmsetreg = (vmi_set_register_t)nullop_panic,
233 223 .vmgetdesc = (vmi_get_desc_t)nullop_panic,
234 224 .vmsetdesc = (vmi_set_desc_t)nullop_panic,
235 225 .vmgetcap = (vmi_get_cap_t)nullop_panic,
236 226 .vmsetcap = (vmi_set_cap_t)nullop_panic,
237 227 .vmspace_alloc = (vmi_vmspace_alloc)nullop_panic,
238 228 .vmspace_free = (vmi_vmspace_free)nullop_panic,
239 229 .vlapic_init = (vmi_vlapic_init)nullop_panic,
240 230 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
241 231 .vmsavectx = (vmi_savectx)nullop_panic,
( 20 lines elided )
242 232 .vmrestorectx = (vmi_restorectx)nullop_panic,
243 233 };
244 234
245 235 static struct vmm_ops *ops = &vmm_ops_null;
246 236
247 237 #define VMM_INIT(num) ((*ops->init)(num))
248 238 #define VMM_CLEANUP() ((*ops->cleanup)())
249 239 #define VMM_RESUME() ((*ops->resume)())
250 240
251 241 #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap))
252 -#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \
253 - ((*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo))
242 +#define VMRUN(vmi, vcpu, rip, pmap) \
243 + ((*ops->vmrun)(vmi, vcpu, rip, pmap))
254 244 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
255 245 #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max))
256 246 #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace))
257 247
258 248 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
259 249 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
260 250 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
261 251 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
262 252 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv))
263 253 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val))
264 254 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu))
265 255 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic))
266 256
267 257 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
268 258 #define fpu_stop_emulating() clts()
269 259
270 260 SDT_PROVIDER_DEFINE(vmm);
271 261
272 262 static MALLOC_DEFINE(M_VM, "vm", "vm");
273 263
274 264 /* statistics */
275 265 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
276 266
277 267 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
278 268 NULL);
279 269
280 270 /*
281 271 * Halt the guest if all vcpus are executing a HLT instruction with
282 272 * interrupts disabled.
283 273 */
284 274 static int halt_detection_enabled = 1;
( 21 lines elided )
285 275
286 276 /* IPI vector used for vcpu notifications */
287 277 static int vmm_ipinum;
288 278
289 279 /* Trap into hypervisor on all guest exceptions and reflect them back */
290 280 static int trace_guest_exceptions;
291 281
292 282 static void vm_free_memmap(struct vm *vm, int ident);
293 283 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
294 284 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
285 +static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
286 +static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
295 287
296 288 #ifndef __FreeBSD__
297 289 static void vm_clear_memseg(struct vm *, int);
298 290
299 291 /* Flags for vtc_status */
300 292 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */
301 293 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */
302 294
303 295 typedef struct vm_thread_ctx {
304 296 struct vm *vtc_vm;
305 297 int vtc_vcpuid;
306 298 uint_t vtc_status;
307 299 } vm_thread_ctx_t;
308 300 #endif /* __FreeBSD__ */
309 301
310 302 #ifdef KTR
311 303 static const char *
312 304 vcpu_state2str(enum vcpu_state state)
313 305 {
314 306
315 307 switch (state) {
316 308 case VCPU_IDLE:
317 309 return ("idle");
318 310 case VCPU_FROZEN:
319 311 return ("frozen");
320 312 case VCPU_RUNNING:
321 313 return ("running");
322 314 case VCPU_SLEEPING:
323 315 return ("sleeping");
324 316 default:
325 317 return ("unknown");
326 318 }
327 319 }
328 320 #endif
329 321
330 322 static void
331 323 vcpu_cleanup(struct vm *vm, int i, bool destroy)
332 324 {
333 325 struct vcpu *vcpu = &vm->vcpu[i];
334 326
335 327 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
336 328 if (destroy) {
337 329 vmm_stat_free(vcpu->stats);
338 330 fpu_save_area_free(vcpu->guestfpu);
339 331 vie_free(vcpu->vie_ctx);
340 332 vcpu->vie_ctx = NULL;
341 333 }
342 334 }
343 335
344 336 static void
345 337 vcpu_init(struct vm *vm, int vcpu_id, bool create)
346 338 {
347 339 struct vcpu *vcpu;
348 340
349 341 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
350 342 ("vcpu_init: invalid vcpu %d", vcpu_id));
351 343
352 344 vcpu = &vm->vcpu[vcpu_id];
353 345
354 346 if (create) {
355 347 #ifdef __FreeBSD__
356 348 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
357 349 "initialized", vcpu_id));
358 350 #endif
359 351 vcpu_lock_init(vcpu);
360 352 vcpu->state = VCPU_IDLE;
361 353 vcpu->hostcpu = NOCPU;
362 354 #ifndef __FreeBSD__
( 58 lines elided )
363 355 vcpu->lastloccpu = NOCPU;
364 356 #endif
365 357 vcpu->guestfpu = fpu_save_area_alloc();
366 358 vcpu->stats = vmm_stat_alloc();
367 359 vcpu->vie_ctx = vie_alloc();
368 360 } else {
369 361 vie_reset(vcpu->vie_ctx);
370 362 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
371 363 }
372 364
365 + vcpu->run_state = VRS_HALT;
373 366 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
374 367 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
375 - vcpu->runblock = 0;
376 368 vcpu->reqidle = 0;
377 369 vcpu->exitintinfo = 0;
378 370 vcpu->nmi_pending = 0;
379 371 vcpu->extint_pending = 0;
380 372 vcpu->exception_pending = 0;
381 373 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
382 374 fpu_save_area_reset(vcpu->guestfpu);
383 375 vmm_stat_init(vcpu->stats);
384 376 }
385 377
386 378 int
387 379 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
388 380 {
389 381
390 382 return (trace_guest_exceptions);
391 383 }
392 384
393 385 struct vm_exit *
394 386 vm_exitinfo(struct vm *vm, int cpuid)
395 387 {
396 388 struct vcpu *vcpu;
397 389
398 390 if (cpuid < 0 || cpuid >= vm->maxcpus)
399 391 panic("vm_exitinfo: invalid cpuid %d", cpuid);
400 392
401 393 vcpu = &vm->vcpu[cpuid];
402 394
403 395 return (&vcpu->exitinfo);
404 396 }
405 397
406 398 struct vie *
407 399 vm_vie_ctx(struct vm *vm, int cpuid)
408 400 {
409 401 if (cpuid < 0 || cpuid >= vm->maxcpus)
410 402 panic("vm_vie_ctx: invalid cpuid %d", cpuid);
411 403
412 404 return (vm->vcpu[cpuid].vie_ctx);
413 405 }
414 406
415 407 static int
416 408 vmm_init(void)
417 409 {
418 410 int error;
419 411
420 412 vmm_host_state_init();
421 413
422 414 #ifdef __FreeBSD__
423 415 vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
424 416 &IDTVEC(justreturn));
425 417 if (vmm_ipinum < 0)
426 418 vmm_ipinum = IPI_AST;
427 419 #else
428 420 /* We use cpu_poke() for IPIs */
429 421 vmm_ipinum = 0;
430 422 #endif
431 423
432 424 error = vmm_mem_init();
433 425 if (error)
434 426 return (error);
435 427
436 428 if (vmm_is_intel())
437 429 ops = &vmm_ops_intel;
438 430 else if (vmm_is_svm())
439 431 ops = &vmm_ops_amd;
440 432 else
441 433 return (ENXIO);
442 434
443 435 #ifdef __FreeBSD__
444 436 vmm_resume_p = vmm_resume;
445 437 #endif
446 438
447 439 return (VMM_INIT(vmm_ipinum));
448 440 }
449 441
450 442 int
451 443 vmm_mod_load()
452 444 {
453 445 int error;
454 446
455 447 VERIFY(vmm_initialized == 0);
456 448
457 449 error = vmm_init();
458 450 if (error == 0)
459 451 vmm_initialized = 1;
460 452
461 453 return (error);
462 454 }
463 455
464 456 int
465 457 vmm_mod_unload()
466 458 {
467 459 int error;
468 460
469 461 VERIFY(vmm_initialized == 1);
470 462
471 463 iommu_cleanup();
472 464 error = VMM_CLEANUP();
473 465 if (error)
474 466 return (error);
475 467 vmm_initialized = 0;
476 468
477 469 return (0);
478 470 }
479 471
480 472 static void
481 473 vm_init(struct vm *vm, bool create)
482 474 {
483 475 int i;
484 476 #ifndef __FreeBSD__
485 477 uint64_t tsc_off;
486 478 #endif
487 479
488 480 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
489 481 vm->iommu = NULL;
490 482 vm->vioapic = vioapic_init(vm);
491 483 vm->vhpet = vhpet_init(vm);
492 484 vm->vatpic = vatpic_init(vm);
493 485 vm->vatpit = vatpit_init(vm);
494 486 vm->vpmtmr = vpmtmr_init(vm);
495 487 if (create)
496 488 vm->vrtc = vrtc_init(vm);
497 489
498 490 vm_inout_init(vm, &vm->ioports);
499 491
500 492 CPU_ZERO(&vm->active_cpus);
501 493 CPU_ZERO(&vm->debug_cpus);
502 494
503 495 vm->suspend = 0;
504 496 CPU_ZERO(&vm->suspended_cpus);
505 497
506 498 for (i = 0; i < vm->maxcpus; i++)
507 499 vcpu_init(vm, i, create);
508 500
509 501 #ifndef __FreeBSD__
510 502 tsc_off = (uint64_t)(-(int64_t)rdtsc());
511 503 for (i = 0; i < vm->maxcpus; i++) {
512 504 vm->vcpu[i].tsc_offset = tsc_off;
513 505 }
514 506 #endif /* __FreeBSD__ */
515 507 }
516 508
517 509 /*
518 510 * The default CPU topology is a single thread per package.
519 511 */
520 512 uint_t cores_per_package = 1;
521 513 uint_t threads_per_core = 1;
522 514
523 515 int
524 516 vm_create(const char *name, struct vm **retvm)
525 517 {
526 518 struct vm *vm;
527 519 struct vmspace *vmspace;
528 520
529 521 /*
530 522 * If vmm.ko could not be successfully initialized then don't attempt
531 523 * to create the virtual machine.
532 524 */
533 525 if (!vmm_initialized)
534 526 return (ENXIO);
535 527
536 528 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
537 529 return (EINVAL);
538 530
539 531 vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
540 532 if (vmspace == NULL)
541 533 return (ENOMEM);
542 534
543 535 vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
544 536 strcpy(vm->name, name);
545 537 vm->vmspace = vmspace;
546 538
547 539 vm->sockets = 1;
548 540 vm->cores = cores_per_package; /* XXX backwards compatibility */
549 541 vm->threads = threads_per_core; /* XXX backwards compatibility */
550 542 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
551 543
552 544 vm_init(vm, true);
553 545
554 546 *retvm = vm;
555 547 return (0);
556 548 }
557 549
558 550 void
559 551 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
560 552 uint16_t *threads, uint16_t *maxcpus)
561 553 {
562 554 *sockets = vm->sockets;
563 555 *cores = vm->cores;
564 556 *threads = vm->threads;
565 557 *maxcpus = vm->maxcpus;
566 558 }
567 559
568 560 uint16_t
569 561 vm_get_maxcpus(struct vm *vm)
570 562 {
571 563 return (vm->maxcpus);
572 564 }
573 565
574 566 int
575 567 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
576 568 uint16_t threads, uint16_t maxcpus)
577 569 {
578 570 if (maxcpus != 0)
579 571 return (EINVAL); /* XXX remove when supported */
580 572 if ((sockets * cores * threads) > vm->maxcpus)
581 573 return (EINVAL);
582 574 /* XXX need to check sockets * cores * threads == vCPU, how? */
583 575 vm->sockets = sockets;
584 576 vm->cores = cores;
585 577 vm->threads = threads;
586 578 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
587 579 return (0);
588 580 }
589 581
590 582 static void
591 583 vm_cleanup(struct vm *vm, bool destroy)
592 584 {
593 585 struct mem_map *mm;
594 586 int i;
595 587
596 588 ppt_unassign_all(vm);
597 589
598 590 if (vm->iommu != NULL)
599 591 iommu_destroy_domain(vm->iommu);
600 592
601 593 /*
602 594 * Devices which attach their own ioport hooks should be cleaned up
603 595 * first so they can tear down those registrations.
604 596 */
605 597 vpmtmr_cleanup(vm->vpmtmr);
606 598
607 599 vm_inout_cleanup(vm, &vm->ioports);
608 600
609 601 if (destroy)
610 602 vrtc_cleanup(vm->vrtc);
611 603 else
612 604 vrtc_reset(vm->vrtc);
613 605
614 606 vatpit_cleanup(vm->vatpit);
615 607 vhpet_cleanup(vm->vhpet);
616 608 vatpic_cleanup(vm->vatpic);
617 609 vioapic_cleanup(vm->vioapic);
618 610
619 611 for (i = 0; i < vm->maxcpus; i++)
620 612 vcpu_cleanup(vm, i, destroy);
621 613
622 614 VMCLEANUP(vm->cookie);
623 615
624 616 /*
625 617 * System memory is removed from the guest address space only when
626 618 * the VM is destroyed. This is because the mapping remains the same
627 619 * across VM reset.
628 620 *
629 621 * Device memory can be relocated by the guest (e.g. using PCI BARs)
630 622 * so those mappings are removed on a VM reset.
631 623 */
632 624 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
633 625 mm = &vm->mem_maps[i];
634 626 if (destroy || !sysmem_mapping(vm, mm))
635 627 vm_free_memmap(vm, i);
636 628 #ifndef __FreeBSD__
637 629 else {
638 630 /*
639 631 * We need to reset the IOMMU flag so this mapping can
640 632 * be reused when a VM is rebooted. Since the IOMMU
641 633 * domain has already been destroyed we can just reset
642 634 * the flag here.
643 635 */
644 636 mm->flags &= ~VM_MEMMAP_F_IOMMU;
645 637 }
646 638 #endif
647 639 }
648 640
649 641 if (destroy) {
650 642 for (i = 0; i < VM_MAX_MEMSEGS; i++)
651 643 vm_free_memseg(vm, i);
652 644
653 645 VMSPACE_FREE(vm->vmspace);
654 646 vm->vmspace = NULL;
655 647 }
656 648 #ifndef __FreeBSD__
657 649 else {
658 650 /*
659 651 * Clear the first memory segment (low mem), old memory contents
660 652 * could confuse the UEFI firmware.
661 653 */
662 654 vm_clear_memseg(vm, 0);
663 655 }
664 656 #endif
665 657 }
666 658
667 659 void
668 660 vm_destroy(struct vm *vm)
669 661 {
670 662 vm_cleanup(vm, true);
671 663 free(vm, M_VM);
672 664 }
673 665
674 666 int
675 667 vm_reinit(struct vm *vm)
676 668 {
677 669 int error;
678 670
679 671 /*
680 672 * A virtual machine can be reset only if all vcpus are suspended.
681 673 */
682 674 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
683 675 vm_cleanup(vm, false);
684 676 vm_init(vm, false);
685 677 error = 0;
686 678 } else {
687 679 error = EBUSY;
688 680 }
689 681
690 682 return (error);
691 683 }
692 684
693 685 const char *
694 686 vm_name(struct vm *vm)
695 687 {
696 688 return (vm->name);
697 689 }
698 690
699 691 int
700 692 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
701 693 {
702 694 vm_object_t obj;
703 695
704 696 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
705 697 return (ENOMEM);
706 698 else
707 699 return (0);
708 700 }
709 701
710 702 int
711 703 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
712 704 {
713 705
714 706 vmm_mmio_free(vm->vmspace, gpa, len);
715 707 return (0);
716 708 }
717 709
718 710 /*
719 711 * Return 'true' if 'gpa' is allocated in the guest address space.
720 712 *
721 713 * This function is called in the context of a running vcpu which acts as
722 714 * an implicit lock on 'vm->mem_maps[]'.
723 715 */
724 716 bool
725 717 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
726 718 {
727 719 struct mem_map *mm;
728 720 int i;
729 721
730 722 #ifdef INVARIANTS
731 723 int hostcpu, state;
732 724 state = vcpu_get_state(vm, vcpuid, &hostcpu);
733 725 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
734 726 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
735 727 #endif
736 728
737 729 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
738 730 mm = &vm->mem_maps[i];
739 731 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
740 732 return (true); /* 'gpa' is sysmem or devmem */
741 733 }
742 734
743 735 if (ppt_is_mmio(vm, gpa))
744 736 return (true); /* 'gpa' is pci passthru mmio */
745 737
746 738 return (false);
747 739 }
748 740
749 741 int
750 742 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
751 743 {
752 744 struct mem_seg *seg;
753 745 vm_object_t obj;
754 746
755 747 #ifndef __FreeBSD__
756 748 extern pgcnt_t get_max_page_get(void);
757 749 #endif
758 750
759 751 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
760 752 return (EINVAL);
761 753
762 754 if (len == 0 || (len & PAGE_MASK))
763 755 return (EINVAL);
764 756
765 757 #ifndef __FreeBSD__
766 758 if (len > ptob(get_max_page_get()))
767 759 return (EINVAL);
768 760 #endif
769 761
770 762 seg = &vm->mem_segs[ident];
771 763 if (seg->object != NULL) {
772 764 if (seg->len == len && seg->sysmem == sysmem)
773 765 return (EEXIST);
774 766 else
775 767 return (EINVAL);
776 768 }
777 769
778 770 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
779 771 if (obj == NULL)
780 772 return (ENOMEM);
781 773
782 774 seg->len = len;
783 775 seg->object = obj;
784 776 seg->sysmem = sysmem;
785 777 return (0);
786 778 }
787 779
788 780 int
789 781 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
790 782 vm_object_t *objptr)
791 783 {
792 784 struct mem_seg *seg;
793 785
794 786 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
795 787 return (EINVAL);
796 788
797 789 seg = &vm->mem_segs[ident];
798 790 if (len)
799 791 *len = seg->len;
800 792 if (sysmem)
801 793 *sysmem = seg->sysmem;
802 794 if (objptr)
803 795 *objptr = seg->object;
804 796 return (0);
805 797 }
806 798
807 799 #ifndef __FreeBSD__
808 800 static void
809 801 vm_clear_memseg(struct vm *vm, int ident)
810 802 {
811 803 struct mem_seg *seg;
812 804
813 805 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
814 806 ("%s: invalid memseg ident %d", __func__, ident));
815 807
816 808 seg = &vm->mem_segs[ident];
817 809
818 810 if (seg->object != NULL)
819 811 vm_object_clear(seg->object);
820 812 }
821 813 #endif
822 814
823 815 void
824 816 vm_free_memseg(struct vm *vm, int ident)
825 817 {
826 818 struct mem_seg *seg;
827 819
828 820 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
829 821 ("%s: invalid memseg ident %d", __func__, ident));
830 822
831 823 seg = &vm->mem_segs[ident];
832 824 if (seg->object != NULL) {
833 825 vm_object_deallocate(seg->object);
834 826 bzero(seg, sizeof (struct mem_seg));
835 827 }
836 828 }
837 829
838 830 int
839 831 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
840 832 size_t len, int prot, int flags)
841 833 {
842 834 struct mem_seg *seg;
843 835 struct mem_map *m, *map;
844 836 vm_ooffset_t last;
845 837 int i, error;
846 838
847 839 if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
848 840 return (EINVAL);
849 841
850 842 if (flags & ~VM_MEMMAP_F_WIRED)
851 843 return (EINVAL);
852 844
853 845 if (segid < 0 || segid >= VM_MAX_MEMSEGS)
854 846 return (EINVAL);
855 847
856 848 seg = &vm->mem_segs[segid];
857 849 if (seg->object == NULL)
858 850 return (EINVAL);
859 851
860 852 last = first + len;
861 853 if (first < 0 || first >= last || last > seg->len)
862 854 return (EINVAL);
863 855
864 856 if ((gpa | first | last) & PAGE_MASK)
865 857 return (EINVAL);
866 858
867 859 map = NULL;
868 860 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
869 861 m = &vm->mem_maps[i];
870 862 if (m->len == 0) {
871 863 map = m;
872 864 break;
873 865 }
874 866 }
875 867
876 868 if (map == NULL)
877 869 return (ENOSPC);
878 870
879 871 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
880 872 len, 0, VMFS_NO_SPACE, prot, prot, 0);
881 873 if (error != KERN_SUCCESS)
882 874 return (EFAULT);
883 875
884 876 vm_object_reference(seg->object);
885 877
886 878 if ((flags & VM_MEMMAP_F_WIRED) != 0) {
887 879 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
888 880 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
889 881 if (error != KERN_SUCCESS) {
890 882 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
891 883 return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
892 884 EFAULT);
893 885 }
894 886 }
895 887
896 888 map->gpa = gpa;
897 889 map->len = len;
898 890 map->segoff = first;
899 891 map->segid = segid;
900 892 map->prot = prot;
901 893 map->flags = flags;
902 894 return (0);
903 895 }
904 896
905 897 int
906 898 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
907 899 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
908 900 {
909 901 struct mem_map *mm, *mmnext;
910 902 int i;
911 903
912 904 mmnext = NULL;
913 905 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
914 906 mm = &vm->mem_maps[i];
915 907 if (mm->len == 0 || mm->gpa < *gpa)
916 908 continue;
917 909 if (mmnext == NULL || mm->gpa < mmnext->gpa)
918 910 mmnext = mm;
919 911 }
920 912
921 913 if (mmnext != NULL) {
922 914 *gpa = mmnext->gpa;
923 915 if (segid)
924 916 *segid = mmnext->segid;
925 917 if (segoff)
926 918 *segoff = mmnext->segoff;
927 919 if (len)
928 920 *len = mmnext->len;
929 921 if (prot)
930 922 *prot = mmnext->prot;
931 923 if (flags)
932 924 *flags = mmnext->flags;
933 925 return (0);
934 926 } else {
935 927 return (ENOENT);
936 928 }
937 929 }
938 930
939 931 static void
940 932 vm_free_memmap(struct vm *vm, int ident)
941 933 {
942 934 struct mem_map *mm;
943 935 int error;
944 936
945 937 mm = &vm->mem_maps[ident];
946 938 if (mm->len) {
947 939 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
948 940 mm->gpa + mm->len);
949 941 KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
950 942 __func__, error));
951 943 bzero(mm, sizeof (struct mem_map));
952 944 }
953 945 }
954 946
955 947 static __inline bool
956 948 sysmem_mapping(struct vm *vm, struct mem_map *mm)
957 949 {
958 950
959 951 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
960 952 return (true);
961 953 else
962 954 return (false);
963 955 }
964 956
965 957 vm_paddr_t
966 958 vmm_sysmem_maxaddr(struct vm *vm)
967 959 {
968 960 struct mem_map *mm;
969 961 vm_paddr_t maxaddr;
970 962 int i;
971 963
972 964 maxaddr = 0;
973 965 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
974 966 mm = &vm->mem_maps[i];
975 967 if (sysmem_mapping(vm, mm)) {
976 968 if (maxaddr < mm->gpa + mm->len)
977 969 maxaddr = mm->gpa + mm->len;
978 970 }
979 971 }
980 972 return (maxaddr);
981 973 }
982 974
983 975 static void
984 976 vm_iommu_modify(struct vm *vm, bool map)
985 977 {
986 978 int i, sz;
987 979 vm_paddr_t gpa, hpa;
988 980 struct mem_map *mm;
989 981 #ifdef __FreeBSD__
990 982 void *vp, *cookie, *host_domain;
991 983 #else
992 984 void *vp, *cookie, *host_domain __unused;
993 985 #endif
994 986
995 987 sz = PAGE_SIZE;
996 988 host_domain = iommu_host_domain();
997 989
998 990 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
999 991 mm = &vm->mem_maps[i];
1000 992 if (!sysmem_mapping(vm, mm))
1001 993 continue;
1002 994
1003 995 if (map) {
1004 996 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
1005 997 ("iommu map found invalid memmap %lx/%lx/%x",
1006 998 mm->gpa, mm->len, mm->flags));
1007 999 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
1008 1000 continue;
1009 1001 mm->flags |= VM_MEMMAP_F_IOMMU;
1010 1002 } else {
1011 1003 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
1012 1004 continue;
1013 1005 mm->flags &= ~VM_MEMMAP_F_IOMMU;
1014 1006 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
1015 1007 ("iommu unmap found invalid memmap %lx/%lx/%x",
1016 1008 mm->gpa, mm->len, mm->flags));
1017 1009 }
1018 1010
1019 1011 gpa = mm->gpa;
1020 1012 while (gpa < mm->gpa + mm->len) {
1021 1013 vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
1022 1014 &cookie);
1023 1015 KASSERT(vp != NULL, ("vm(%s) could not map gpa %lx",
1024 1016 vm_name(vm), gpa));
1025 1017
1026 1018 vm_gpa_release(cookie);
1027 1019
1028 1020 hpa = DMAP_TO_PHYS((uintptr_t)vp);
1029 1021 if (map) {
1030 1022 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
1031 1023 #ifdef __FreeBSD__
1032 1024 iommu_remove_mapping(host_domain, hpa, sz);
1033 1025 #endif
1034 1026 } else {
1035 1027 iommu_remove_mapping(vm->iommu, gpa, sz);
1036 1028 #ifdef __FreeBSD__
1037 1029 iommu_create_mapping(host_domain, hpa, hpa, sz);
1038 1030 #endif
1039 1031 }
1040 1032
1041 1033 gpa += PAGE_SIZE;
1042 1034 }
1043 1035 }
1044 1036
1045 1037 /*
1046 1038 * Invalidate the cached translations associated with the domain
1047 1039 * from which pages were removed.
1048 1040 */
1049 1041 #ifdef __FreeBSD__
1050 1042 if (map)
1051 1043 iommu_invalidate_tlb(host_domain);
1052 1044 else
1053 1045 iommu_invalidate_tlb(vm->iommu);
1054 1046 #else
1055 1047 iommu_invalidate_tlb(vm->iommu);
1056 1048 #endif
1057 1049 }
1058 1050
1059 1051 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false)
1060 1052 #define vm_iommu_map(vm) vm_iommu_modify((vm), true)
1061 1053
1062 1054 int
1063 1055 vm_unassign_pptdev(struct vm *vm, int pptfd)
1064 1056 {
1065 1057 int error;
1066 1058
1067 1059 error = ppt_unassign_device(vm, pptfd);
1068 1060 if (error)
1069 1061 return (error);
1070 1062
1071 1063 if (ppt_assigned_devices(vm) == 0)
1072 1064 vm_iommu_unmap(vm);
1073 1065
1074 1066 return (0);
1075 1067 }
1076 1068
1077 1069 int
1078 1070 vm_assign_pptdev(struct vm *vm, int pptfd)
1079 1071 {
1080 1072 int error;
1081 1073 vm_paddr_t maxaddr;
1082 1074
1083 1075 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1084 1076 if (ppt_assigned_devices(vm) == 0) {
1085 1077 KASSERT(vm->iommu == NULL,
1086 1078 ("vm_assign_pptdev: iommu must be NULL"));
1087 1079 maxaddr = vmm_sysmem_maxaddr(vm);
1088 1080 vm->iommu = iommu_create_domain(maxaddr);
1089 1081 if (vm->iommu == NULL)
1090 1082 return (ENXIO);
1091 1083 vm_iommu_map(vm);
1092 1084 }
1093 1085
1094 1086 error = ppt_assign_device(vm, pptfd);
1095 1087 return (error);
1096 1088 }
1097 1089
1098 1090 void *
1099 1091 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
1100 1092 void **cookie)
1101 1093 {
1102 1094 int i, count, pageoff;
1103 1095 struct mem_map *mm;
1104 1096 vm_page_t m;
1105 1097 #ifdef INVARIANTS
1106 1098 /*
1107 1099 * All vcpus are frozen by ioctls that modify the memory map
1108 1100 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
1109 1101 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
1110 1102 */
1111 1103 int state;
1112 1104 KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
1113 1105 __func__, vcpuid));
1114 1106 for (i = 0; i < vm->maxcpus; i++) {
1115 1107 if (vcpuid != -1 && vcpuid != i)
1116 1108 continue;
1117 1109 state = vcpu_get_state(vm, i, NULL);
1118 1110 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1119 1111 __func__, state));
1120 1112 }
1121 1113 #endif
1122 1114 pageoff = gpa & PAGE_MASK;
1123 1115 if (len > PAGE_SIZE - pageoff)
1124 1116 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1125 1117
1126 1118 count = 0;
1127 1119 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1128 1120 mm = &vm->mem_maps[i];
1129 1121 if (mm->len == 0) {
1130 1122 continue;
1131 1123 }
1132 1124 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1133 1125 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1134 1126 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1135 1127 break;
1136 1128 }
1137 1129 }
1138 1130
1139 1131 if (count == 1) {
1140 1132 *cookie = m;
1141 1133 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1142 1134 } else {
1143 1135 *cookie = NULL;
1144 1136 return (NULL);
1145 1137 }
1146 1138 }
1147 1139
1148 1140 void
1149 1141 vm_gpa_release(void *cookie)
1150 1142 {
1151 1143 vm_page_t m = cookie;
1152 1144
1153 1145 vm_page_unwire(m, PQ_ACTIVE);
1154 1146 }
1155 1147
1156 1148 int
1157 1149 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1158 1150 {
1159 1151
1160 1152 if (vcpu < 0 || vcpu >= vm->maxcpus)
1161 1153 return (EINVAL);
1162 1154
1163 1155 if (reg >= VM_REG_LAST)
1164 1156 return (EINVAL);
1165 1157
1166 1158 return (VMGETREG(vm->cookie, vcpu, reg, retval));
1167 1159 }
1168 1160
1169 1161 int
1170 1162 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1171 1163 {
1172 1164 struct vcpu *vcpu;
1173 1165 int error;
1174 1166
1175 1167 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1176 1168 return (EINVAL);
1177 1169
1178 1170 if (reg >= VM_REG_LAST)
1179 1171 return (EINVAL);
1180 1172
1181 1173 error = VMSETREG(vm->cookie, vcpuid, reg, val);
1182 1174 if (error || reg != VM_REG_GUEST_RIP)
1183 1175 return (error);
1184 1176
1185 1177 /* Set 'nextrip' to match the value of %rip */
1186 1178 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1187 1179 vcpu = &vm->vcpu[vcpuid];
1188 1180 vcpu->nextrip = val;
1189 1181 return (0);
1190 1182 }
1191 1183
1192 1184 static bool
1193 1185 is_descriptor_table(int reg)
1194 1186 {
1195 1187 switch (reg) {
1196 1188 case VM_REG_GUEST_IDTR:
1197 1189 case VM_REG_GUEST_GDTR:
1198 1190 return (true);
1199 1191 default:
1200 1192 return (false);
1201 1193 }
1202 1194 }
1203 1195
1204 1196 static bool
1205 1197 is_segment_register(int reg)
1206 1198 {
1207 1199 switch (reg) {
1208 1200 case VM_REG_GUEST_ES:
1209 1201 case VM_REG_GUEST_CS:
1210 1202 case VM_REG_GUEST_SS:
1211 1203 case VM_REG_GUEST_DS:
1212 1204 case VM_REG_GUEST_FS:
1213 1205 case VM_REG_GUEST_GS:
1214 1206 case VM_REG_GUEST_TR:
1215 1207 case VM_REG_GUEST_LDTR:
1216 1208 return (true);
1217 1209 default:
1218 1210 return (false);
1219 1211 }
1220 1212 }
1221 1213
1222 1214 int
1223 1215 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1224 1216 {
1225 1217
( 840 lines elided )
1226 1218 if (vcpu < 0 || vcpu >= vm->maxcpus)
1227 1219 return (EINVAL);
1228 1220
1229 1221 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1230 1222 return (EINVAL);
1231 1223
1232 1224 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1233 1225 }
1234 1226
1235 1227 int
1236 -vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1228 +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1237 1229 {
1238 1230 if (vcpu < 0 || vcpu >= vm->maxcpus)
1239 1231 return (EINVAL);
1240 1232
1241 1233 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1242 1234 return (EINVAL);
1243 1235
1244 1236 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1245 1237 }
1246 1238
1239 +int
1240 +vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1241 +{
1242 + struct vcpu *vcpu;
1243 +
1244 + if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1245 + return (EINVAL);
1246 + }
1247 +
1248 + vcpu = &vm->vcpu[vcpuid];
1249 +
1250 + vcpu_lock(vcpu);
1251 + *state = vcpu->run_state;
1252 + *sipi_vec = vcpu->sipi_vector;
1253 + vcpu_unlock(vcpu);
1254 +
1255 + return (0);
1256 +}
1257 +
1258 +int
1259 +vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1260 +{
1261 + struct vcpu *vcpu;
1262 +
1263 + if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1264 + return (EINVAL);
1265 + }
1266 + if (!VRS_IS_VALID(state)) {
1267 + return (EINVAL);
1268 + }
1269 +
1270 + vcpu = &vm->vcpu[vcpuid];
1271 +
1272 + vcpu_lock(vcpu);
1273 + vcpu->run_state = state;
1274 + vcpu->sipi_vector = sipi_vec;
1275 + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1276 + vcpu_unlock(vcpu);
1277 +
1278 + return (0);
1279 +}
1280 +
1281 +
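Review note: vm_get_run_state() and vm_set_run_state() expose the per-vcpu INIT/SIPI state to the vmm device interface, so userspace can inspect an AP's boot state or seed it (for instance when starting an AP directly or restoring saved state). A minimal in-kernel caller might look like the sketch below; the helper name and the use of VRS_PEND_SIPI are hypothetical, not part of this diff:

	/*
	 * Hypothetical caller: park an AP in "INIT received, SIPI pending"
	 * with the given start vector, leaving delivery to the vcpu loop.
	 */
	static int
	example_start_ap(struct vm *vm, int vcpuid, uint8_t vector)
	{
		return (vm_set_run_state(vm, vcpuid,
		    VRS_INIT | VRS_PEND_SIPI, vector));
	}

Note that vm_set_run_state() also notifies the vcpu (VCPU_NOTIFY_EXIT), so a running or sleeping vcpu picks the change up promptly.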
1247 1282 static void
1248 1283 restore_guest_fpustate(struct vcpu *vcpu)
1249 1284 {
1250 1285
1251 1286 /* flush host state to the pcb */
1252 1287 fpuexit(curthread);
1253 1288
1254 1289 /* restore guest FPU state */
1255 1290 fpu_stop_emulating();
1256 1291 fpurestore(vcpu->guestfpu);
1257 1292
1258 1293 /* restore guest XCR0 if XSAVE is enabled in the host */
1259 1294 if (rcr4() & CR4_XSAVE)
1260 1295 load_xcr(0, vcpu->guest_xcr0);
1261 1296
1262 1297 /*
1263 1298 * The FPU is now "dirty" with the guest's state so turn on emulation
1264 1299 * to trap any access to the FPU by the host.
1265 1300 */
1266 1301 fpu_start_emulating();
1267 1302 }
1268 1303
1269 1304 static void
1270 1305 save_guest_fpustate(struct vcpu *vcpu)
1271 1306 {
1272 1307
1273 1308 if ((rcr0() & CR0_TS) == 0)
1274 1309 panic("fpu emulation not enabled in host!");
1275 1310
1276 1311 /* save guest XCR0 and restore host XCR0 */
1277 1312 if (rcr4() & CR4_XSAVE) {
1278 1313 vcpu->guest_xcr0 = rxcr(0);
1279 1314 load_xcr(0, vmm_get_host_xcr0());
1280 1315 }
1281 1316
1282 1317 /* save guest FPU state */
1283 1318 fpu_stop_emulating();
1284 1319 fpusave(vcpu->guestfpu);
1285 1320 #ifdef __FreeBSD__
1286 1321 fpu_start_emulating();
1287 1322 #else
1288 1323 /*
1289 1324 * When the host state has been restored, we should not re-enable
1290 1325 * CR0.TS on illumos for eager FPU.
1291 1326 */
1292 1327 #endif
1293 1328 }
1294 1329
1295 1330 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
1296 1331
1297 1332 static int
1298 1333 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1299 1334 bool from_idle)
1300 1335 {
1301 1336 struct vcpu *vcpu;
1302 1337 int error;
1303 1338
1304 1339 vcpu = &vm->vcpu[vcpuid];
1305 1340 vcpu_assert_locked(vcpu);
1306 1341
1307 1342 /*
1308 1343 * State transitions from the vmmdev_ioctl() must always begin from
1309 1344 * the VCPU_IDLE state. This guarantees that there is only a single
1310 1345 * ioctl() operating on a vcpu at any point.
1311 1346 */
1312 1347 if (from_idle) {
1313 1348 while (vcpu->state != VCPU_IDLE) {
1314 1349 vcpu->reqidle = 1;
1315 1350 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1316 1351 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1317 1352 "idle requested", vcpu_state2str(vcpu->state));
1318 1353 #ifdef __FreeBSD__
1319 1354 msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
1320 1355 #else
1321 1356 cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1322 1357 #endif
1323 1358 }
1324 1359 } else {
1325 1360 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1326 1361 "vcpu idle state"));
1327 1362 }
1328 1363
1329 1364 if (vcpu->state == VCPU_RUNNING) {
1330 1365 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1331 1366 "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1332 1367 } else {
1333 1368 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1334 1369 "vcpu that is not running", vcpu->hostcpu));
1335 1370 }
1336 1371
1337 1372 /*
1338 1373 * The following state transitions are allowed:
1339 1374 * IDLE -> FROZEN -> IDLE
1340 1375 * FROZEN -> RUNNING -> FROZEN
1341 1376 * FROZEN -> SLEEPING -> FROZEN
1342 1377 */
1343 1378 switch (vcpu->state) {
1344 1379 case VCPU_IDLE:
1345 1380 case VCPU_RUNNING:
1346 1381 case VCPU_SLEEPING:
( 90 lines elided )
1347 1382 error = (newstate != VCPU_FROZEN);
1348 1383 break;
1349 1384 case VCPU_FROZEN:
1350 1385 error = (newstate == VCPU_FROZEN);
1351 1386 break;
1352 1387 default:
1353 1388 error = 1;
1354 1389 break;
1355 1390 }
1356 1391
1357 - if (newstate == VCPU_RUNNING) {
1358 - while (vcpu->runblock != 0) {
1359 -#ifdef __FreeBSD__
1360 - msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
1361 -#else
1362 - cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
1363 -#endif
1364 - }
1365 - }
1366 -
1367 1392 if (error)
1368 1393 return (EBUSY);
1369 1394
1370 1395 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1371 1396 vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1372 1397
1373 1398 vcpu->state = newstate;
1374 1399 if (newstate == VCPU_RUNNING)
1375 1400 vcpu->hostcpu = curcpu;
1376 1401 else
1377 1402 vcpu->hostcpu = NOCPU;
1378 1403
1379 - if (newstate == VCPU_IDLE ||
1380 - (newstate == VCPU_FROZEN && vcpu->runblock != 0)) {
1404 + if (newstate == VCPU_IDLE) {
1381 1405 #ifdef __FreeBSD__
1382 1406 wakeup(&vcpu->state);
1383 1407 #else
1384 1408 cv_broadcast(&vcpu->state_cv);
1385 1409 #endif
1386 1410 }
1387 1411
1388 1412 return (0);
1389 1413 }
1390 1414
1391 1415 static void
1392 1416 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1393 1417 {
1394 1418 int error;
1395 1419
1396 1420 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1397 1421 panic("Error %d setting state to %d\n", error, newstate);
1398 1422 }
1399 1423
1400 1424 static void
1401 1425 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1402 1426 {
1403 1427 int error;
1404 1428
1405 1429 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
( 15 lines elided )
1406 1430 panic("Error %d setting state to %d", error, newstate);
1407 1431 }
1408 1432
1409 1433 /*
1410 1434 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1411 1435 */
1412 1436 static int
1413 1437 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1414 1438 {
1415 1439 struct vcpu *vcpu;
1416 -#ifdef __FreeBSD__
1417 - const char *wmesg;
1418 -#else
1419 - const char *wmesg __unused;
1420 -#endif
1421 1440 int t, vcpu_halted, vm_halted;
1441 + bool userspace_exit = false;
1422 1442
1423 1443 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1424 1444
1425 1445 vcpu = &vm->vcpu[vcpuid];
1426 1446 vcpu_halted = 0;
1427 1447 vm_halted = 0;
1428 1448
1429 1449 vcpu_lock(vcpu);
1430 1450 while (1) {
1431 1451 /*
1432 - * Do a final check for pending NMI or interrupts before
1433 - * really putting this thread to sleep. Also check for
1434 - * software events that would cause this vcpu to wakeup.
1435 - *
1436 - * These interrupts/events could have happened after the
1437 - * vcpu returned from VMRUN() and before it acquired the
1438 - * vcpu lock above.
1452 + * Do a final check for pending interrupts (including NMI and
1453 + * INIT) before putting this thread to sleep.
1439 1454 */
1440 - if (vm->suspend || vcpu->reqidle)
1441 - break;
1442 1455 if (vm_nmi_pending(vm, vcpuid))
1443 1456 break;
1457 + if (vcpu_run_state_pending(vm, vcpuid))
1458 + break;
1444 1459 if (!intr_disabled) {
1445 1460 if (vm_extint_pending(vm, vcpuid) ||
1446 1461 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1447 1462 break;
1448 1463 }
1449 1464 }
1450 1465
1451 - /* Don't go to sleep if the vcpu thread needs to yield */
1452 - if (vcpu_should_yield(vm, vcpuid))
1466 + /*
1467 + * Also check for software events which would cause a wake-up.
1468 + * This will set the appropriate exitcode directly, rather than
1469 + * requiring a trip through VM_RUN().
1470 + */
1471 + if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1472 + userspace_exit = true;
1453 1473 break;
1474 + }
1454 1475
1455 - if (vcpu_debugged(vm, vcpuid))
1456 - break;
1457 -
1458 1476 /*
1459 1477 * Some Linux guests implement "halt" by having all vcpus
1460 1478 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1461 1479 * track of the vcpus that have entered this state. When all
1462 1480 * vcpus enter the halted state the virtual machine is halted.
1463 1481 */
1464 1482 if (intr_disabled) {
1465 - wmesg = "vmhalt";
1466 - VCPU_CTR0(vm, vcpuid, "Halted");
1467 1483 if (!vcpu_halted && halt_detection_enabled) {
1468 1484 vcpu_halted = 1;
1469 1485 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1470 1486 }
1471 1487 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1472 1488 vm_halted = 1;
1473 1489 break;
1474 1490 }
1475 - } else {
1476 - wmesg = "vmidle";
1477 1491 }
1478 1492
1479 1493 t = ticks;
1480 1494 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1481 -#ifdef __FreeBSD__
1482 - /*
1483 - * XXX msleep_spin() cannot be interrupted by signals so
1484 - * wake up periodically to check pending signals.
1485 - */
1486 - msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
1487 -#else
1488 - /*
1489 - * Fortunately, cv_wait_sig can be interrupted by signals, so
1490 - * there is no need to periodically wake up.
1491 - */
1492 1495 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1493 -#endif
1494 1496 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1495 1497 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1496 1498 }
1497 1499
1498 1500 if (vcpu_halted)
1499 1501 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1500 1502
1501 1503 vcpu_unlock(vcpu);
1502 1504
1503 1505 if (vm_halted)
1504 1506 vm_suspend(vm, VM_SUSPEND_HALT);
1505 1507
1506 - return (0);
1508 + return (userspace_exit ? -1 : 0);
1507 1509 }
1508 1510
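Review note: the HLT handler now defers its userspace-exit decision to vcpu_sleep_bailout_checks() (forward-declared near the top of the file), which, per the new comment above, sets the exitcode itself rather than forcing another trip through VM_RUN; returning -1 here is what propagates that exit. A simplified sketch of the kind of conditions such a helper consolidates, based on the checks removed from the loop above (suspend, reqidle, debug, yield) and not on its actual implementation:

	static bool
	sleep_bailout_sketch(struct vm *vm, int vcpuid)
	{
		struct vcpu *vcpu = &vm->vcpu[vcpuid];

		/* VM suspend or an idle request should push us to userspace. */
		if (vm->suspend || vcpu->reqidle)
			return (true);
		/* A debugger hold or a contending load also ends the sleep. */
		if (vcpu_debugged(vm, vcpuid) || vcpu_should_yield(vm, vcpuid))
			return (true);
		return (false);
	}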
1509 1511 static int
1510 1512 vm_handle_paging(struct vm *vm, int vcpuid)
1511 1513 {
1512 1514 int rv, ftype;
1513 1515 struct vm_map *map;
1514 1516 struct vcpu *vcpu;
1515 1517 struct vm_exit *vme;
1516 1518
1517 1519 vcpu = &vm->vcpu[vcpuid];
1518 1520 vme = &vcpu->exitinfo;
1519 1521
1520 1522 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1521 1523 __func__, vme->inst_length));
1522 1524
1523 1525 ftype = vme->u.paging.fault_type;
1524 1526 KASSERT(ftype == VM_PROT_READ ||
1525 1527 ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1526 1528 ("vm_handle_paging: invalid fault_type %d", ftype));
1527 1529
1528 1530 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1529 1531 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1530 1532 vme->u.paging.gpa, ftype);
1531 1533 if (rv == 0) {
1532 1534 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %lx",
1533 1535 ftype == VM_PROT_READ ? "accessed" : "dirty",
1534 1536 vme->u.paging.gpa);
1535 1537 goto done;
1536 1538 }
1537 1539 }
1538 1540
1539 1541 map = &vm->vmspace->vm_map;
1540 1542 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1541 1543
1542 1544 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1543 1545 "ftype = %d", rv, vme->u.paging.gpa, ftype);
1544 1546
1545 1547 if (rv != KERN_SUCCESS)
1546 1548 return (EFAULT);
1547 1549 done:
1548 1550 return (0);
1549 1551 }
1550 1552
1551 1553 int
1552 1554 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1553 1555 int rsize)
1554 1556 {
1555 1557 int err = ESRCH;
1556 1558
1557 1559 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1558 1560 err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1559 1561 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1560 1562 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1561 1563 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1562 1564 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1563 1565 }
1564 1566
1565 1567 return (err);
1566 1568 }
1567 1569
1568 1570 int
1569 1571 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1570 1572 int wsize)
1571 1573 {
1572 1574 int err = ESRCH;
1573 1575
1574 1576 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1575 1577 err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1576 1578 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1577 1579 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1578 1580 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1579 1581 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1580 1582 }
1581 1583
1582 1584 return (err);
1583 1585 }
1584 1586
1585 1587 static int
1586 1588 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1587 1589 {
1588 1590 struct vie *vie;
1589 1591 struct vcpu *vcpu;
1590 1592 struct vm_exit *vme;
1591 1593 uint64_t inst_addr;
1592 1594 int error, fault, cs_d;
1593 1595
1594 1596 vcpu = &vm->vcpu[vcpuid];
1595 1597 vme = &vcpu->exitinfo;
1596 1598 vie = vcpu->vie_ctx;
1597 1599
1598 1600 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1599 1601 __func__, vme->inst_length));
1600 1602
1601 1603 inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1602 1604 cs_d = vme->u.mmio_emul.cs_d;
1603 1605
1604 1606 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1605 1607 vme->u.mmio_emul.gpa);
1606 1608
1607 1609 /* Fetch the faulting instruction */
1608 1610 if (vie_needs_fetch(vie)) {
1609 1611 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1610 1612 &fault);
1611 1613 if (error != 0) {
1612 1614 return (error);
1613 1615 } else if (fault) {
1614 1616 /*
 1615 1617 			 * If a fault during instruction fetch was encountered, it
1616 1618 * will have asserted that the appropriate exception be
1617 1619 * injected at next entry. No further work is required.
1618 1620 */
1619 1621 return (0);
1620 1622 }
1621 1623 }
1622 1624
1623 1625 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1624 1626 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1625 1627 inst_addr);
1626 1628 /* Dump (unrecognized) instruction bytes in userspace */
1627 1629 vie_fallback_exitinfo(vie, vme);
1628 1630 return (-1);
1629 1631 }
1630 1632 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1631 1633 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1632 1634 /* Decoded GLA does not match GLA from VM exit state */
1633 1635 vie_fallback_exitinfo(vie, vme);
1634 1636 return (-1);
1635 1637 }
1636 1638
1637 1639 repeat:
1638 1640 error = vie_emulate_mmio(vie, vm, vcpuid);
1639 1641 if (error < 0) {
1640 1642 /*
1641 1643 * MMIO not handled by any of the in-kernel-emulated devices, so
1642 1644 * make a trip out to userspace for it.
1643 1645 */
1644 1646 vie_exitinfo(vie, vme);
1645 1647 } else if (error == EAGAIN) {
1646 1648 /*
1647 1649 * Continue emulating the rep-prefixed instruction, which has
1648 1650 * not completed its iterations.
1649 1651 *
1650 1652 * In case this can be emulated in-kernel and has a high
1651 1653 * repetition count (causing a tight spin), it should be
1652 1654 * deferential to yield conditions.
1653 1655 */
1654 1656 if (!vcpu_should_yield(vm, vcpuid)) {
1655 1657 goto repeat;
1656 1658 } else {
1657 1659 /*
1658 1660 * Defer to the contending load by making a trip to
1659 1661 * userspace with a no-op (BOGUS) exit reason.
1660 1662 */
1661 1663 vie_reset(vie);
1662 1664 vme->exitcode = VM_EXITCODE_BOGUS;
1663 1665 return (-1);
1664 1666 }
1665 1667 } else if (error == 0) {
1666 1668 /* Update %rip now that instruction has been emulated */
1667 1669 vie_advance_pc(vie, &vcpu->nextrip);
1668 1670 }
1669 1671 return (error);
1670 1672 }
1671 1673
1672 1674 static int
1673 1675 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1674 1676 {
1675 1677 struct vcpu *vcpu;
1676 1678 struct vie *vie;
1677 1679 int err;
1678 1680
1679 1681 vcpu = &vm->vcpu[vcpuid];
1680 1682 vie = vcpu->vie_ctx;
1681 1683
1682 1684 repeat:
1683 1685 err = vie_emulate_inout(vie, vm, vcpuid);
1684 1686
1685 1687 if (err < 0) {
1686 1688 /*
1687 1689 * In/out not handled by any of the in-kernel-emulated devices,
1688 1690 * so make a trip out to userspace for it.
1689 1691 */
1690 1692 vie_exitinfo(vie, vme);
1691 1693 return (err);
1692 1694 } else if (err == EAGAIN) {
1693 1695 /*
1694 1696 * Continue emulating the rep-prefixed ins/outs, which has not
1695 1697 * completed its iterations.
1696 1698 *
1697 1699 * In case this can be emulated in-kernel and has a high
1698 1700 * repetition count (causing a tight spin), it should be
1699 1701 * deferential to yield conditions.
1700 1702 */
1701 1703 if (!vcpu_should_yield(vm, vcpuid)) {
1702 1704 goto repeat;
1703 1705 } else {
1704 1706 /*
1705 1707 * Defer to the contending load by making a trip to
1706 1708 * userspace with a no-op (BOGUS) exit reason.
1707 1709 */
1708 1710 vie_reset(vie);
1709 1711 vme->exitcode = VM_EXITCODE_BOGUS;
1710 1712 return (-1);
1711 1713 }
1712 1714 } else if (err != 0) {
1713 1715 /* Emulation failure. Bail all the way out to userspace. */
1714 1716 vme->exitcode = VM_EXITCODE_INST_EMUL;
1715 1717 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1716 1718 return (-1);
1717 1719 }
1718 1720
1719 1721 vie_advance_pc(vie, &vcpu->nextrip);
1720 1722 return (0);
1721 1723 }
1722 1724
1723 1725 static int
1724 1726 vm_handle_suspend(struct vm *vm, int vcpuid)
1725 1727 {
1726 1728 #ifdef __FreeBSD__
1727 1729 int error, i;
1728 1730 struct vcpu *vcpu;
1729 1731 struct thread *td;
1730 1732
1731 1733 error = 0;
1732 1734 vcpu = &vm->vcpu[vcpuid];
1733 1735 td = curthread;
1734 1736 #else
1735 1737 int i;
1736 1738 struct vcpu *vcpu;
1737 1739
1738 1740 vcpu = &vm->vcpu[vcpuid];
1739 1741 #endif
1740 1742
1741 1743 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1742 1744
1743 1745 #ifdef __FreeBSD__
1744 1746 /*
1745 1747 * Wait until all 'active_cpus' have suspended themselves.
1746 1748 *
1747 1749 * Since a VM may be suspended at any time including when one or
1748 1750 * more vcpus are doing a rendezvous we need to call the rendezvous
1749 1751 * handler while we are waiting to prevent a deadlock.
1750 1752 */
1751 1753 vcpu_lock(vcpu);
1752 1754 while (error == 0) {
1753 1755 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1754 1756 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1755 1757 break;
1756 1758 }
1757 1759
1758 1760 if (vm->rendezvous_func == NULL) {
1759 1761 VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1760 1762 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1761 1763 msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1762 1764 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1763 1765 if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
1764 1766 vcpu_unlock(vcpu);
1765 1767 error = thread_check_susp(td, false);
1766 1768 vcpu_lock(vcpu);
1767 1769 }
1768 1770 } else {
1769 1771 VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1770 1772 vcpu_unlock(vcpu);
1771 1773 error = vm_handle_rendezvous(vm, vcpuid);
1772 1774 vcpu_lock(vcpu);
1773 1775 }
1774 1776 }
1775 1777 vcpu_unlock(vcpu);
1776 1778 #else
1777 1779 vcpu_lock(vcpu);
1778 1780 while (1) {
1779 1781 int rc;
1780 1782
1781 1783 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1782 1784 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1783 1785 break;
1784 1786 }
1785 1787
1786 1788 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1787 1789 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, hz,
1788 1790 TR_CLOCK_TICK);
1789 1791 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1790 1792
1791 1793 /*
1792 1794 * If the userspace process driving the instance is killed, any
1793 1795 * vCPUs yet to be marked suspended (because they are not
1794 1796 * VM_RUN-ing in the kernel presently) will never reach that
1795 1797 * state.
1796 1798 *
1797 1799 * To avoid vm_handle_suspend() getting stuck in the kernel
1798 1800 * waiting for those vCPUs, offer a bail-out even though it
1799 1801 * means returning without all vCPUs in a suspended state.
1800 1802 */
1801 1803 if (rc <= 0) {
1802 1804 if ((curproc->p_flag & SEXITING) != 0) {
1803 1805 break;
1804 1806 }
1805 1807 }
1806 1808 }
1807 1809 vcpu_unlock(vcpu);
1808 1810
1809 1811 #endif
1810 1812
1811 1813 /*
1812 1814 * Wakeup the other sleeping vcpus and return to userspace.
1813 1815 */
1814 1816 for (i = 0; i < vm->maxcpus; i++) {
1815 1817 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1816 1818 vcpu_notify_event(vm, i);
1817 1819 }
1818 1820 }
1819 1821
1820 1822 return (-1);
1821 1823 }
1822 1824
1823 1825 static int
1824 1826 vm_handle_reqidle(struct vm *vm, int vcpuid)
1825 1827 {
1826 1828 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1827 1829
1828 1830 vcpu_lock(vcpu);
1829 1831 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1830 1832 vcpu->reqidle = 0;
1831 1833 vcpu_unlock(vcpu);
1832 1834 return (-1);
1833 1835 }
1834 1836
1837 +static int
1838 +vm_handle_run_state(struct vm *vm, int vcpuid)
1839 +{
1840 + struct vcpu *vcpu = &vm->vcpu[vcpuid];
1841 + bool handled = false;
1842 +
1843 + vcpu_lock(vcpu);
1844 + while (1) {
1845 + if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1846 + vcpu_unlock(vcpu);
1847 + VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1848 + vcpu_lock(vcpu);
1849 +
1850 + vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1851 + vcpu->run_state |= VRS_INIT;
1852 + }
1853 +
1854 + if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1855 + (VRS_INIT | VRS_PEND_SIPI)) {
1856 + const uint8_t vector = vcpu->sipi_vector;
1857 +
1858 + vcpu_unlock(vcpu);
1859 + VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1860 + vcpu_lock(vcpu);
1861 +
1862 + vcpu->run_state &= ~VRS_PEND_SIPI;
1863 + vcpu->run_state |= VRS_RUN;
1864 + }
1865 +
1866 + /*
1867 + * If the vCPU is now in the running state, there is no need to
1868 + * wait for anything prior to re-entry.
1869 + */
1870 + if ((vcpu->run_state & VRS_RUN) != 0) {
1871 + handled = true;
1872 + break;
1873 + }
1874 +
1875 + /*
1876 + * Also check for software events which would cause a wake-up.
1877 + * This will set the appropriate exitcode directly, rather than
1878 + * requiring a trip through VM_RUN().
1879 + */
1880 + if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1881 + break;
1882 + }
1883 +
1884 + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1885 + (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
1886 + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1887 + }
1888 + vcpu_unlock(vcpu);
1889 +
1890 + return (handled ? 0 : -1);
1891 +}
1892 +
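For illustration, a minimal standalone sketch of the run-state transitions that vm_handle_run_state() performs above: a pending INIT forces an architectural reset and parks the vCPU in wait-for-SIPI, and a SIPI received in that state moves it to running. The VRS_* values below are local stand-ins for this sketch, not the kernel definitions.

#include <stdio.h>

/* Illustrative stand-in flag values; the kernel's VRS_* definitions differ. */
#define VRS_INIT        0x1
#define VRS_RUN         0x2
#define VRS_PEND_INIT   0x4
#define VRS_PEND_SIPI   0x8

static unsigned int
settle_run_state(unsigned int rs)
{
        if ((rs & VRS_PEND_INIT) != 0) {
                /* INIT: architectural reset, then wait-for-SIPI */
                rs &= ~(VRS_RUN | VRS_PEND_INIT);
                rs |= VRS_INIT;
        }
        if ((rs & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
            (VRS_INIT | VRS_PEND_SIPI)) {
                /* SIPI received while waiting in INIT: start running */
                rs &= ~VRS_PEND_SIPI;
                rs |= VRS_RUN;
        }
        return (rs);
}

int
main(void)
{
        /* An AP that is sent INIT and then SIPI ends up in VRS_INIT|VRS_RUN. */
        unsigned int rs = settle_run_state(VRS_RUN | VRS_PEND_INIT);

        rs = settle_run_state(rs | VRS_PEND_SIPI);
        printf("run_state = %#x (VRS_INIT|VRS_RUN = %#x)\n",
            rs, VRS_INIT | VRS_RUN);
        return (0);
}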
1835 1893 #ifndef __FreeBSD__
1836 1894 static int
1837 1895 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1838 1896 {
1839 1897 struct vcpu *cpu = &vm->vcpu[vcpuid];
1840 1898 const uint32_t code = vme->u.msr.code;
1841 1899 const uint64_t val = vme->u.msr.wval;
1842 1900
1843 1901 switch (code) {
1844 1902 case MSR_TSC:
1845 1903 cpu->tsc_offset = val - rdtsc();
1846 1904 return (0);
1847 1905 }
1848 1906
1849 1907 return (-1);
1850 1908 }
1851 1909 #endif /* __FreeBSD__ */
1852 1910
1853 -void
1854 -vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip)
1855 -{
1856 - if (vm->sipi_req) {
1857 - /* This should never occur if userspace is doing its job. */
1858 - vm->stats.sipi_supersede++;
1859 - }
1860 - vm->sipi_req = true;
1861 - vm->sipi_req_vcpu = req_vcpuid;
1862 - vm->sipi_req_rip = req_rip;
1863 -}
1864 -
1865 1911 int
1866 1912 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1867 1913 {
1868 1914 int i;
1869 1915
1870 1916 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1871 1917 return (EINVAL);
1872 1918
1873 1919 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
1874 1920 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1875 1921 vm->suspend, how);
1876 1922 return (EALREADY);
1877 1923 }
1878 1924
1879 1925 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1880 1926
1881 1927 /*
1882 1928 * Notify all active vcpus that they are now suspended.
1883 1929 */
1884 1930 for (i = 0; i < vm->maxcpus; i++) {
1885 1931 if (CPU_ISSET(i, &vm->active_cpus))
1886 1932 vcpu_notify_event(vm, i);
1887 1933 }
1888 1934
1889 1935 return (0);
1890 1936 }
1891 1937
1892 1938 void
1893 -vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1939 +vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
1894 1940 {
1895 1941 struct vm_exit *vmexit;
1896 1942
1897 - KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1898 - ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1899 -
1900 1943 vmexit = vm_exitinfo(vm, vcpuid);
1901 1944 vmexit->rip = rip;
1902 1945 vmexit->inst_length = 0;
1903 - vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1904 - vmexit->u.suspended.how = vm->suspend;
1946 + vmexit->exitcode = VM_EXITCODE_RUN_STATE;
1947 + vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
1905 1948 }
1906 1949
1907 -void
1908 -vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
1909 -{
1910 - struct vm_exit *vmexit;
1911 1950
1912 - vmexit = vm_exitinfo(vm, vcpuid);
1913 - vmexit->rip = rip;
1914 - vmexit->inst_length = 0;
1915 - vmexit->exitcode = VM_EXITCODE_DEBUG;
1916 -}
1917 -
1918 -void
1919 -vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip)
1920 -{
1921 - struct vm_exit *vmexit;
1922 -
1923 - vmexit = vm_exitinfo(vm, vcpuid);
1924 - vmexit->rip = rip;
1925 - vmexit->inst_length = 0;
1926 - vmexit->exitcode = VM_EXITCODE_RUNBLOCK;
1927 - vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1);
1928 -}
1929 -
1930 -void
1931 -vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
1932 -{
1933 - struct vm_exit *vmexit;
1934 -
1935 - vmexit = vm_exitinfo(vm, vcpuid);
1936 - vmexit->rip = rip;
1937 - vmexit->inst_length = 0;
1938 - vmexit->exitcode = VM_EXITCODE_REQIDLE;
1939 - vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
1940 -}
1941 -
1942 -void
1943 -vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
1944 -{
1945 - struct vm_exit *vmexit;
1946 -
1947 - vmexit = vm_exitinfo(vm, vcpuid);
1948 - vmexit->rip = rip;
1949 - vmexit->inst_length = 0;
1950 - vmexit->exitcode = VM_EXITCODE_BOGUS;
1951 - vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
1952 -}
1953 -
1954 1951 #ifndef __FreeBSD__
1955 1952 /*
1956 1953 * Some vmm resources, such as the lapic, may have CPU-specific resources
1957 1954 * allocated to them which would benefit from migration onto the host CPU which
1958 1955 * is processing the vcpu state.
1959 1956 */
1960 1957 static void
1961 1958 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
1962 1959 {
1963 1960 /*
1964 1961 * Localizing cyclic resources requires acquisition of cpu_lock, and
1965 1962 * doing so with kpreempt disabled is a recipe for deadlock disaster.
1966 1963 */
1967 1964 VERIFY(curthread->t_preempt == 0);
1968 1965
1969 1966 /*
1970 1967 * Do not bother with localization if this vCPU is about to return to
1971 1968 * the host CPU it was last localized to.
1972 1969 */
1973 1970 if (vcpu->lastloccpu == curcpu)
1974 1971 return;
1975 1972
1976 1973 /*
1977 1974 * Localize system-wide resources to the primary boot vCPU. While any
1978 1975 * of the other vCPUs may access them, it keeps the potential interrupt
1979 1976 * footprint constrained to CPUs involved with this instance.
1980 1977 */
1981 1978 if (vcpu == &vm->vcpu[0]) {
1982 1979 vhpet_localize_resources(vm->vhpet);
1983 1980 vrtc_localize_resources(vm->vrtc);
1984 1981 vatpit_localize_resources(vm->vatpit);
1985 1982 }
1986 1983
1987 1984 vlapic_localize_resources(vcpu->vlapic);
1988 1985
1989 1986 vcpu->lastloccpu = curcpu;
1990 1987 }
1991 1988
1992 1989 static void
1993 1990 vmm_savectx(void *arg)
1994 1991 {
1995 1992 vm_thread_ctx_t *vtc = arg;
1996 1993 struct vm *vm = vtc->vtc_vm;
1997 1994 const int vcpuid = vtc->vtc_vcpuid;
1998 1995
1999 1996 if (ops->vmsavectx != NULL) {
2000 1997 ops->vmsavectx(vm->cookie, vcpuid);
2001 1998 }
2002 1999
2003 2000 /*
2004 2001 * If the CPU holds the restored guest FPU state, save it and restore
2005 2002 * the host FPU state before this thread goes off-cpu.
2006 2003 */
2007 2004 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2008 2005 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2009 2006
2010 2007 save_guest_fpustate(vcpu);
2011 2008 vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2012 2009 }
2013 2010 }
2014 2011
2015 2012 static void
2016 2013 vmm_restorectx(void *arg)
2017 2014 {
2018 2015 vm_thread_ctx_t *vtc = arg;
2019 2016 struct vm *vm = vtc->vtc_vm;
2020 2017 const int vcpuid = vtc->vtc_vcpuid;
2021 2018
2022 2019 /*
2023 2020 * When coming back on-cpu, only restore the guest FPU status if the
2024 2021 * thread is in a context marked as requiring it. This should be rare,
2025 2022 * occurring only when a future logic error results in a voluntary
2026 2023 * sleep during the VMRUN critical section.
2027 2024 *
2028 2025 * The common case will result in elision of the guest FPU state
2029 2026 * restoration, deferring that action until it is clearly necessary
2030 2027 * during vm_run.
2031 2028 */
2032 2029 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2033 2030 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2034 2031 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2035 2032
2036 2033 restore_guest_fpustate(vcpu);
2037 2034 vtc->vtc_status |= VTCS_FPU_RESTORED;
2038 2035 }
2039 2036
2040 2037 if (ops->vmrestorectx != NULL) {
2041 2038 ops->vmrestorectx(vm->cookie, vcpuid);
2042 2039 }
2043 2040
2044 2041 }
2045 2042
2046 2043 /*
2047 2044 * If we're in removectx(), we might still have state to tidy up.
2048 2045 */
2049 2046 static void
2050 2047 vmm_freectx(void *arg, int isexec)
2051 2048 {
2052 2049 vmm_savectx(arg);
2053 2050 }
2054 2051
2055 2052 #endif /* __FreeBSD */
2056 2053
2057 2054 static int
2058 2055 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2059 2056 struct vm_exit *vme)
2060 2057 {
2061 2058 struct vcpu *vcpu;
2062 2059 struct vie *vie;
2063 2060 int err;
2064 2061
2065 2062 vcpu = &vm->vcpu[vcpuid];
2066 2063 vie = vcpu->vie_ctx;
2067 2064 err = 0;
2068 2065
2069 2066 switch (entry->cmd) {
2070 2067 case VEC_DEFAULT:
2071 2068 return (0);
2072 2069 case VEC_DISCARD_INSTR:
2073 2070 vie_reset(vie);
2074 2071 return (0);
2075 - case VEC_COMPLETE_MMIO:
2072 + case VEC_FULFILL_MMIO:
2076 2073 err = vie_fulfill_mmio(vie, &entry->u.mmio);
2077 2074 if (err == 0) {
2078 2075 err = vie_emulate_mmio(vie, vm, vcpuid);
2079 2076 if (err == 0) {
2080 2077 vie_advance_pc(vie, &vcpu->nextrip);
2081 2078 } else if (err < 0) {
2082 2079 vie_exitinfo(vie, vme);
2083 2080 } else if (err == EAGAIN) {
2084 2081 /*
2085 2082 * Clear the instruction emulation state in
2086 2083 * order to re-enter VM context and continue
2087 2084 * this 'rep <instruction>'
2088 2085 */
2089 2086 vie_reset(vie);
2090 2087 err = 0;
2091 2088 }
2092 2089 }
2093 2090 break;
2094 - case VEC_COMPLETE_INOUT:
2091 + case VEC_FULFILL_INOUT:
2095 2092 err = vie_fulfill_inout(vie, &entry->u.inout);
2096 2093 if (err == 0) {
2097 2094 err = vie_emulate_inout(vie, vm, vcpuid);
2098 2095 if (err == 0) {
2099 2096 vie_advance_pc(vie, &vcpu->nextrip);
2100 2097 } else if (err < 0) {
2101 2098 vie_exitinfo(vie, vme);
2102 2099 } else if (err == EAGAIN) {
2103 2100 /*
2104 2101 * Clear the instruction emulation state in
2105 2102 * order to re-enter VM context and continue
2106 2103 * this 'rep ins/outs'
2107 2104 */
2108 2105 vie_reset(vie);
2109 2106 err = 0;
2110 2107 }
2111 2108 }
2112 2109 break;
2113 2110 default:
2114 2111 return (EINVAL);
2115 2112 }
2116 2113 return (err);
2117 2114 }
2118 2115
2119 2116 static int
2120 2117 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2121 2118 {
2122 2119 struct vie *vie;
2123 2120
2124 2121 vie = vm->vcpu[vcpuid].vie_ctx;
2125 2122
2126 2123 if (vie_pending(vie)) {
2127 2124 /*
2128 2125 * Userspace has not fulfilled the pending needs of the
2129 2126 * instruction emulation, so bail back out.
2130 2127 */
2131 2128 vie_exitinfo(vie, vme);
2132 2129 return (-1);
2133 2130 }
2134 2131
2135 - if (vcpuid == 0 && vm->sipi_req) {
2136 - /* The boot vCPU has sent a SIPI to one of the other CPUs */
2137 - vme->exitcode = VM_EXITCODE_SPINUP_AP;
2138 - vme->u.spinup_ap.vcpu = vm->sipi_req_vcpu;
2139 - vme->u.spinup_ap.rip = vm->sipi_req_rip;
2140 -
2141 - vm->sipi_req = false;
2142 - vm->sipi_req_vcpu = 0;
2143 - vm->sipi_req_rip = 0;
2144 - return (-1);
2145 - }
2146 -
2147 2132 return (0);
2148 2133 }
2149 2134
2150 2135 int
2151 2136 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2152 2137 {
2153 - struct vm_eventinfo evinfo;
2154 2138 int error;
2155 2139 struct vcpu *vcpu;
2156 2140 #ifdef __FreeBSD__
2157 2141 struct pcb *pcb;
2158 2142 #endif
2159 2143 uint64_t tscval;
2160 2144 struct vm_exit *vme;
2161 2145 bool intr_disabled;
2162 2146 pmap_t pmap;
2163 2147 #ifndef __FreeBSD__
2164 2148 vm_thread_ctx_t vtc;
2165 2149 int affinity_type = CPU_CURRENT;
2166 2150 #endif
2167 2151
2168 2152 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2169 2153 return (EINVAL);
2170 2154
2171 2155 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2172 2156 return (EINVAL);
2173 2157
2174 2158 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
2175 2159 return (EINVAL);
2176 2160
2177 2161 pmap = vmspace_pmap(vm->vmspace);
2178 2162 vcpu = &vm->vcpu[vcpuid];
2179 2163 vme = &vcpu->exitinfo;
2180 - evinfo.rptr = &vcpu->runblock;
2181 - evinfo.sptr = &vm->suspend;
2182 - evinfo.iptr = &vcpu->reqidle;
2183 2164
2184 2165 #ifndef __FreeBSD__
2185 2166 vtc.vtc_vm = vm;
2186 2167 vtc.vtc_vcpuid = vcpuid;
2187 2168 vtc.vtc_status = 0;
2188 2169
2189 2170 installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2190 2171 NULL, vmm_freectx);
2191 2172 #endif
2192 2173
2193 2174 error = vm_entry_actions(vm, vcpuid, entry, vme);
2194 2175 if (error != 0) {
2195 2176 goto exit;
2196 2177 }
2197 2178
2198 2179 restart:
2199 2180 error = vm_loop_checks(vm, vcpuid, vme);
2200 2181 if (error != 0) {
2201 2182 goto exit;
2202 2183 }
2203 2184
2204 2185 #ifndef __FreeBSD__
2205 2186 thread_affinity_set(curthread, affinity_type);
2206 2187 /*
2207 2188 * Resource localization should happen after the CPU affinity for the
2208 2189 * thread has been set to ensure that access from restricted contexts,
2209 2190 * such as VMX-accelerated APIC operations, can occur without inducing
2210 2191 * cyclic cross-calls.
2211 2192 *
2212 2193 * This must be done prior to disabling kpreempt via critical_enter().
2213 2194 */
2214 2195 vm_localize_resources(vm, vcpu);
2215 2196
2216 2197 affinity_type = CPU_CURRENT;
2217 2198 #endif
2218 2199
2219 2200 critical_enter();
2220 2201
2221 2202 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
2222 2203 ("vm_run: absurd pm_active"));
2223 2204
2224 2205 tscval = rdtsc();
2225 2206
2226 2207 #ifdef __FreeBSD__
2227 2208 pcb = PCPU_GET(curpcb);
2228 2209 set_pcb_flags(pcb, PCB_FULL_IRET);
2229 2210 #else
2230 2211 /* Force a trip through update_sregs to reload %fs/%gs and friends */
2231 2212 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2232 2213 #endif
2233 2214
2234 2215 #ifdef __FreeBSD__
2235 2216 restore_guest_fpustate(vcpu);
2236 2217 #else
2237 2218 if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2238 2219 restore_guest_fpustate(vcpu);
2239 2220 vtc.vtc_status |= VTCS_FPU_RESTORED;
2240 2221 }
2241 2222 vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2242 2223 #endif
2243 2224
2244 2225 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2245 - error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
2226 + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
2246 2227 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2247 2228
2248 2229 #ifdef __FreeBSD__
2249 2230 save_guest_fpustate(vcpu);
2250 2231 #else
2251 2232 vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2252 2233 #endif
2253 2234
2254 2235 #ifndef __FreeBSD__
2255 2236 /*
2256 2237 * Once clear of the delicate contexts comprising the VM_RUN handler,
2257 2238 * thread CPU affinity can be loosened while other processing occurs.
2258 2239 */
2259 2240 thread_affinity_clear(curthread);
2260 2241 #endif
2261 2242
2262 2243 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
2263 2244
2264 2245 critical_exit();
2265 2246
2266 2247 if (error != 0) {
2267 2248 /* Communicate out any error from VMRUN() above */
2268 2249 goto exit;
2269 2250 }
2270 2251
2271 2252 vcpu->nextrip = vme->rip + vme->inst_length;
2272 2253 switch (vme->exitcode) {
2273 2254 case VM_EXITCODE_REQIDLE:
2274 2255 error = vm_handle_reqidle(vm, vcpuid);
2275 2256 break;
2257 + case VM_EXITCODE_RUN_STATE:
2258 + error = vm_handle_run_state(vm, vcpuid);
2259 + break;
2276 2260 case VM_EXITCODE_SUSPENDED:
2277 2261 error = vm_handle_suspend(vm, vcpuid);
2278 2262 break;
2279 2263 case VM_EXITCODE_IOAPIC_EOI:
2280 2264 vioapic_process_eoi(vm, vcpuid,
2281 2265 vme->u.ioapic_eoi.vector);
2282 2266 break;
2283 - case VM_EXITCODE_RUNBLOCK:
2284 - break;
2285 2267 case VM_EXITCODE_HLT:
2286 2268 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2287 2269 error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2288 2270 break;
2289 2271 case VM_EXITCODE_PAGING:
2290 2272 error = vm_handle_paging(vm, vcpuid);
2291 2273 break;
2292 2274 case VM_EXITCODE_MMIO_EMUL:
2293 2275 error = vm_handle_mmio_emul(vm, vcpuid);
2294 2276 break;
2295 2277 case VM_EXITCODE_INOUT:
2296 2278 error = vm_handle_inout(vm, vcpuid, vme);
2297 2279 break;
2298 2280 case VM_EXITCODE_MONITOR:
2299 2281 case VM_EXITCODE_MWAIT:
2300 2282 case VM_EXITCODE_VMINSN:
2301 2283 vm_inject_ud(vm, vcpuid);
2302 2284 break;
2303 2285 #ifndef __FreeBSD__
2304 2286 case VM_EXITCODE_WRMSR:
2305 2287 if (vm_handle_wrmsr(vm, vcpuid, vme) != 0) {
2306 2288 error = -1;
2307 2289 }
2308 2290 break;
2309 2291
2310 2292 case VM_EXITCODE_HT: {
2311 2293 affinity_type = CPU_BEST;
2312 2294 break;
2313 2295 }
2314 2296 #endif
2315 2297
2316 2298 case VM_EXITCODE_MTRAP:
2317 2299 vm_suspend_cpu(vm, vcpuid);
2318 2300 error = -1;
2319 2301 break;
2320 2302 default:
2321 2303 /* handled in userland */
2322 2304 error = -1;
2323 2305 break;
2324 2306 }
2325 2307
2326 2308 if (error == 0) {
2327 2309 /* VM exit conditions handled in-kernel, continue running */
2328 2310 goto restart;
2329 2311 }
2330 2312
2331 2313 exit:
2332 2314 #ifndef __FreeBSD__
2333 2315 removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL,
2334 2316 NULL, vmm_freectx);
2335 2317 #endif
2336 2318
2337 2319 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2338 2320
2339 2321 return (error);
2340 2322 }
2341 2323
2342 2324 int
2343 2325 vm_restart_instruction(void *arg, int vcpuid)
2344 2326 {
2345 2327 struct vm *vm;
2346 2328 struct vcpu *vcpu;
2347 2329 enum vcpu_state state;
2348 2330 uint64_t rip;
2349 2331 int error;
2350 2332
2351 2333 vm = arg;
2352 2334 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2353 2335 return (EINVAL);
2354 2336
2355 2337 vcpu = &vm->vcpu[vcpuid];
2356 2338 state = vcpu_get_state(vm, vcpuid, NULL);
2357 2339 if (state == VCPU_RUNNING) {
2358 2340 /*
2359 2341 * When a vcpu is "running" the next instruction is determined
2360 2342 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2361 2343 * Thus setting 'inst_length' to zero will cause the current
2362 2344 * instruction to be restarted.
2363 2345 */
2364 2346 vcpu->exitinfo.inst_length = 0;
2365 2347 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2366 2348 "setting inst_length to zero", vcpu->exitinfo.rip);
2367 2349 } else if (state == VCPU_FROZEN) {
2368 2350 /*
2369 2351 * When a vcpu is "frozen" it is outside the critical section
2370 2352 * around VMRUN() and 'nextrip' points to the next instruction.
2371 2353 * Thus instruction restart is achieved by setting 'nextrip'
2372 2354 * to the vcpu's %rip.
2373 2355 */
2374 2356 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2375 2357 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2376 2358 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2377 2359 "nextrip from %lx to %lx", vcpu->nextrip, rip);
2378 2360 vcpu->nextrip = rip;
2379 2361 } else {
2380 2362 panic("%s: invalid state %d", __func__, state);
2381 2363 }
2382 2364 return (0);
2383 2365 }
2384 2366
2385 2367 int
2386 2368 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2387 2369 {
2388 2370 struct vcpu *vcpu;
2389 2371 int type, vector;
2390 2372
2391 2373 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2392 2374 return (EINVAL);
2393 2375
2394 2376 vcpu = &vm->vcpu[vcpuid];
2395 2377
2396 2378 if (info & VM_INTINFO_VALID) {
2397 2379 type = info & VM_INTINFO_TYPE;
2398 2380 vector = info & 0xff;
2399 2381 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2400 2382 return (EINVAL);
2401 2383 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
2402 2384 return (EINVAL);
2403 2385 if (info & VM_INTINFO_RSVD)
2404 2386 return (EINVAL);
2405 2387 } else {
2406 2388 info = 0;
2407 2389 }
2408 2390 VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
2409 2391 vcpu->exitintinfo = info;
2410 2392 return (0);
2411 2393 }
2412 2394
2413 2395 enum exc_class {
2414 2396 EXC_BENIGN,
2415 2397 EXC_CONTRIBUTORY,
2416 2398 EXC_PAGEFAULT
2417 2399 };
2418 2400
2419 2401 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */
2420 2402
2421 2403 static enum exc_class
2422 2404 exception_class(uint64_t info)
2423 2405 {
2424 2406 int type, vector;
2425 2407
2426 2408 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
2427 2409 type = info & VM_INTINFO_TYPE;
2428 2410 vector = info & 0xff;
2429 2411
2430 2412 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2431 2413 switch (type) {
2432 2414 case VM_INTINFO_HWINTR:
2433 2415 case VM_INTINFO_SWINTR:
2434 2416 case VM_INTINFO_NMI:
2435 2417 return (EXC_BENIGN);
2436 2418 default:
2437 2419 /*
2438 2420 * Hardware exception.
2439 2421 *
2440 2422 * SVM and VT-x use identical type values to represent NMI,
2441 2423 * hardware interrupt and software interrupt.
2442 2424 *
2443 2425 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2444 2426 * for exceptions except #BP and #OF. #BP and #OF use a type
2445 2427 * value of '5' or '6'. Therefore we don't check for explicit
2446 2428 * values of 'type' to classify 'intinfo' into a hardware
2447 2429 * exception.
2448 2430 */
2449 2431 break;
2450 2432 }
2451 2433
2452 2434 switch (vector) {
2453 2435 case IDT_PF:
2454 2436 case IDT_VE:
2455 2437 return (EXC_PAGEFAULT);
2456 2438 case IDT_DE:
2457 2439 case IDT_TS:
2458 2440 case IDT_NP:
2459 2441 case IDT_SS:
2460 2442 case IDT_GP:
2461 2443 return (EXC_CONTRIBUTORY);
2462 2444 default:
2463 2445 return (EXC_BENIGN);
2464 2446 }
2465 2447 }
2466 2448
2467 2449 static int
2468 2450 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
2469 2451 uint64_t *retinfo)
2470 2452 {
2471 2453 enum exc_class exc1, exc2;
2472 2454 int type1, vector1;
2473 2455
2474 2456 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
2475 2457 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
2476 2458
2477 2459 /*
2478 2460 * If an exception occurs while attempting to call the double-fault
2479 2461 * handler the processor enters shutdown mode (aka triple fault).
2480 2462 */
2481 2463 type1 = info1 & VM_INTINFO_TYPE;
2482 2464 vector1 = info1 & 0xff;
2483 2465 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2484 2466 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
2485 2467 info1, info2);
2486 2468 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2487 2469 *retinfo = 0;
2488 2470 return (0);
2489 2471 }
2490 2472
2491 2473 /*
2492 2474 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
2493 2475 */
2494 2476 exc1 = exception_class(info1);
2495 2477 exc2 = exception_class(info2);
2496 2478 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2497 2479 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2498 2480 /* Convert nested fault into a double fault. */
2499 2481 *retinfo = IDT_DF;
2500 2482 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2501 2483 *retinfo |= VM_INTINFO_DEL_ERRCODE;
2502 2484 } else {
2503 2485 /* Handle exceptions serially */
2504 2486 *retinfo = info2;
2505 2487 }
2506 2488 return (1);
2507 2489 }
2508 2490
2509 2491 static uint64_t
2510 2492 vcpu_exception_intinfo(struct vcpu *vcpu)
2511 2493 {
2512 2494 uint64_t info = 0;
2513 2495
2514 2496 if (vcpu->exception_pending) {
2515 2497 info = vcpu->exc_vector & 0xff;
2516 2498 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2517 2499 if (vcpu->exc_errcode_valid) {
2518 2500 info |= VM_INTINFO_DEL_ERRCODE;
2519 2501 info |= (uint64_t)vcpu->exc_errcode << 32;
2520 2502 }
2521 2503 }
2522 2504 return (info);
2523 2505 }
2524 2506
2525 2507 int
2526 2508 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2527 2509 {
2528 2510 struct vcpu *vcpu;
2529 2511 uint64_t info1, info2;
2530 2512 int valid;
2531 2513
2532 2514 KASSERT(vcpuid >= 0 &&
2533 2515 vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
2534 2516
2535 2517 vcpu = &vm->vcpu[vcpuid];
2536 2518
2537 2519 info1 = vcpu->exitintinfo;
2538 2520 vcpu->exitintinfo = 0;
2539 2521
2540 2522 info2 = 0;
2541 2523 if (vcpu->exception_pending) {
2542 2524 info2 = vcpu_exception_intinfo(vcpu);
2543 2525 vcpu->exception_pending = 0;
2544 2526 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
2545 2527 vcpu->exc_vector, info2);
2546 2528 }
2547 2529
2548 2530 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2549 2531 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
2550 2532 } else if (info1 & VM_INTINFO_VALID) {
2551 2533 *retinfo = info1;
2552 2534 valid = 1;
2553 2535 } else if (info2 & VM_INTINFO_VALID) {
2554 2536 *retinfo = info2;
2555 2537 valid = 1;
2556 2538 } else {
2557 2539 valid = 0;
2558 2540 }
2559 2541
2560 2542 if (valid) {
2561 2543 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
2562 2544 "retinfo(%lx)", __func__, info1, info2, *retinfo);
2563 2545 }
2564 2546
2565 2547 return (valid);
2566 2548 }
2567 2549
2568 2550 int
2569 2551 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2570 2552 {
2571 2553 struct vcpu *vcpu;
2572 2554
2573 2555 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2574 2556 return (EINVAL);
2575 2557
2576 2558 vcpu = &vm->vcpu[vcpuid];
2577 2559 *info1 = vcpu->exitintinfo;
2578 2560 *info2 = vcpu_exception_intinfo(vcpu);
2579 2561 return (0);
2580 2562 }
2581 2563
2582 2564 int
2583 2565 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2584 2566 uint32_t errcode, int restart_instruction)
2585 2567 {
2586 2568 struct vcpu *vcpu;
2587 2569 uint64_t regval;
2588 2570 int error;
2589 2571
2590 2572 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2591 2573 return (EINVAL);
2592 2574
2593 2575 if (vector < 0 || vector >= 32)
2594 2576 return (EINVAL);
2595 2577
2596 2578 /*
2597 2579 * NMIs (which bear an exception vector of 2) are to be injected via
2598 2580 * their own specialized path using vm_inject_nmi().
2599 2581 */
2600 2582 if (vector == 2) {
2601 2583 return (EINVAL);
2602 2584 }
2603 2585
2604 2586 /*
2605 2587 * A double fault exception should never be injected directly into
2606 2588 * the guest. It is a derived exception that results from specific
2607 2589 * combinations of nested faults.
2608 2590 */
2609 2591 if (vector == IDT_DF)
2610 2592 return (EINVAL);
2611 2593
2612 2594 vcpu = &vm->vcpu[vcpuid];
2613 2595
2614 2596 if (vcpu->exception_pending) {
2615 2597 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2616 2598 "pending exception %d", vector, vcpu->exc_vector);
2617 2599 return (EBUSY);
2618 2600 }
2619 2601
2620 2602 if (errcode_valid) {
2621 2603 /*
2622 2604 * Exceptions don't deliver an error code in real mode.
2623 2605 */
2624 2606 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2625 2607 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2626 2608 if (!(regval & CR0_PE))
2627 2609 errcode_valid = 0;
2628 2610 }
2629 2611
2630 2612 /*
2631 2613 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2632 2614 *
2633 2615 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2634 2616 * one instruction or incurs an exception.
2635 2617 */
2636 2618 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2637 2619 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2638 2620 __func__, error));
2639 2621
2640 2622 if (restart_instruction)
2641 2623 vm_restart_instruction(vm, vcpuid);
2642 2624
2643 2625 vcpu->exception_pending = 1;
2644 2626 vcpu->exc_vector = vector;
2645 2627 vcpu->exc_errcode = errcode;
2646 2628 vcpu->exc_errcode_valid = errcode_valid;
2647 2629 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2648 2630 return (0);
2649 2631 }
2650 2632
2651 2633 void
2652 2634 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
2653 2635 int errcode)
2654 2636 {
2655 2637 int error;
2656 2638
2657 2639 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2658 2640 errcode, 1);
2659 2641 KASSERT(error == 0, ("vm_inject_exception error %d", error));
2660 2642 }
2661 2643
2662 2644 void
2663 2645 vm_inject_ud(struct vm *vm, int vcpuid)
2664 2646 {
2665 2647 vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
2666 2648 }
2667 2649
2668 2650 void
2669 2651 vm_inject_gp(struct vm *vm, int vcpuid)
2670 2652 {
2671 2653 vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
2672 2654 }
2673 2655
2674 2656 void
2675 2657 vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
2676 2658 {
2677 2659 vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
2678 2660 }
2679 2661
2680 2662 void
2681 2663 vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
2682 2664 {
2683 2665 vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
2684 2666 }
2685 2667
2686 2668 void
2687 2669 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
2688 2670 {
2689 2671 int error;
2690 2672
2691 2673 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
2692 2674 error_code, cr2);
2693 2675
2694 2676 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2695 2677 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2696 2678
2697 2679 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2698 2680 }
2699 2681
2700 2682 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2701 2683
2702 2684 int
2703 2685 vm_inject_nmi(struct vm *vm, int vcpuid)
2704 2686 {
2705 2687 struct vcpu *vcpu;
2706 2688
2707 2689 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2708 2690 return (EINVAL);
2709 2691
2710 2692 vcpu = &vm->vcpu[vcpuid];
2711 2693
2712 2694 vcpu->nmi_pending = 1;
2713 2695 vcpu_notify_event(vm, vcpuid);
2714 2696 return (0);
2715 2697 }
2716 2698
2717 2699 int
2718 2700 vm_nmi_pending(struct vm *vm, int vcpuid)
2719 2701 {
2720 2702 struct vcpu *vcpu;
2721 2703
2722 2704 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2723 2705 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2724 2706
2725 2707 vcpu = &vm->vcpu[vcpuid];
2726 2708
2727 2709 return (vcpu->nmi_pending);
2728 2710 }
2729 2711
2730 2712 void
2731 2713 vm_nmi_clear(struct vm *vm, int vcpuid)
2732 2714 {
2733 2715 struct vcpu *vcpu;
2734 2716
2735 2717 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2736 2718 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2737 2719
2738 2720 vcpu = &vm->vcpu[vcpuid];
2739 2721
2740 2722 if (vcpu->nmi_pending == 0)
2741 2723 panic("vm_nmi_clear: inconsistent nmi_pending state");
2742 2724
2743 2725 vcpu->nmi_pending = 0;
2744 2726 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2745 2727 }
2746 2728
2747 2729 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2748 2730
2749 2731 int
2750 2732 vm_inject_extint(struct vm *vm, int vcpuid)
2751 2733 {
2752 2734 struct vcpu *vcpu;
2753 2735
2754 2736 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2755 2737 return (EINVAL);
2756 2738
2757 2739 vcpu = &vm->vcpu[vcpuid];
2758 2740
2759 2741 vcpu->extint_pending = 1;
2760 2742 vcpu_notify_event(vm, vcpuid);
2761 2743 return (0);
2762 2744 }
2763 2745
2764 2746 int
2765 2747 vm_extint_pending(struct vm *vm, int vcpuid)
2766 2748 {
2767 2749 struct vcpu *vcpu;
2768 2750
2769 2751 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2770 2752 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2771 2753
2772 2754 vcpu = &vm->vcpu[vcpuid];
2773 2755
2774 2756 return (vcpu->extint_pending);
2775 2757 }
2776 2758
2777 2759 void
2778 2760 vm_extint_clear(struct vm *vm, int vcpuid)
2779 2761 {
2780 2762 struct vcpu *vcpu;
2781 2763
2782 2764 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2783 2765 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2784 2766
2785 2767 vcpu = &vm->vcpu[vcpuid];
2786 2768
2787 2769 if (vcpu->extint_pending == 0)
2788 2770 panic("vm_extint_clear: inconsistent extint_pending state");
2789 2771
2790 2772 vcpu->extint_pending = 0;
2791 2773 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2792 2774 }
2793 2775
2794 2776 int
2777 +vm_inject_init(struct vm *vm, int vcpuid)
2778 +{
2779 + struct vcpu *vcpu;
2780 +
2781 + if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2782 + return (EINVAL);
2783 +
2784 + vcpu = &vm->vcpu[vcpuid];
2785 + vcpu_lock(vcpu);
2786 + vcpu->run_state |= VRS_PEND_INIT;
2787 + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2788 + vcpu_unlock(vcpu);
2789 + return (0);
2790 +}
2791 +
2792 +int
2793 +vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2794 +{
2795 + struct vcpu *vcpu;
2796 +
2797 + if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2798 + return (EINVAL);
2799 +
2800 + vcpu = &vm->vcpu[vcpuid];
2801 + vcpu_lock(vcpu);
2802 + vcpu->run_state |= VRS_PEND_SIPI;
2803 + vcpu->sipi_vector = vector;
2804 + /* SIPI is only actionable if the CPU is waiting in INIT state */
2805 + if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2806 + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2807 + }
2808 + vcpu_unlock(vcpu);
2809 + return (0);
2810 +}
2811 +
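For context, a rough sketch (not part of this change) of the xAPIC ICR values a guest BSP would write to perform the INIT/SIPI sequence that the vLAPIC ultimately turns into the vm_inject_init() and vm_inject_sipi() calls above. The APIC_* names and the chosen vector are illustrative stand-ins, not taken from the kernel headers.

#include <inttypes.h>
#include <stdio.h>

#define APIC_DELMODE_INIT       (0x5u << 8)     /* delivery mode 101b */
#define APIC_DELMODE_STARTUP    (0x6u << 8)     /* delivery mode 110b */
#define APIC_LEVEL_ASSERT       (1u << 14)

int
main(void)
{
        const uint8_t sipi_vector = 0x10;       /* arbitrary example vector */
        const uint32_t icr_init = APIC_DELMODE_INIT | APIC_LEVEL_ASSERT;
        const uint32_t icr_sipi = APIC_DELMODE_STARTUP | sipi_vector;

        /* The vector byte of the startup IPI is what vm_inject_sipi() latches. */
        printf("ICR(INIT) = 0x%08" PRIx32 "\n", icr_init);
        printf("ICR(SIPI) = 0x%08" PRIx32 ", vector 0x%02x\n",
            icr_sipi, (unsigned)sipi_vector);
        return (0);
}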
2812 +bool
2813 +vcpu_run_state_pending(struct vm *vm, int vcpuid)
2814 +{
2815 + struct vcpu *vcpu;
2816 +
2817 + ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2818 + vcpu = &vm->vcpu[vcpuid];
2819 +
2820 + /* Of interest: vCPU not in running state or with pending INIT */
2821 + return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2822 +}
2823 +
2824 +int
2825 +vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2826 +{
2827 + struct seg_desc desc;
2828 + const enum vm_reg_name clear_regs[] = {
2829 + VM_REG_GUEST_CR2,
2830 + VM_REG_GUEST_CR3,
2831 + VM_REG_GUEST_CR4,
2832 + VM_REG_GUEST_RAX,
2833 + VM_REG_GUEST_RBX,
2834 + VM_REG_GUEST_RCX,
2835 + VM_REG_GUEST_RSI,
2836 + VM_REG_GUEST_RDI,
2837 + VM_REG_GUEST_RBP,
2838 + VM_REG_GUEST_RSP,
2839 + VM_REG_GUEST_R8,
2840 + VM_REG_GUEST_R9,
2841 + VM_REG_GUEST_R10,
2842 + VM_REG_GUEST_R11,
2843 + VM_REG_GUEST_R12,
2844 + VM_REG_GUEST_R13,
2845 + VM_REG_GUEST_R14,
2846 + VM_REG_GUEST_R15,
2847 + VM_REG_GUEST_DR0,
2848 + VM_REG_GUEST_DR1,
2849 + VM_REG_GUEST_DR2,
2850 + VM_REG_GUEST_DR3,
2851 + VM_REG_GUEST_EFER,
2852 + };
2853 + const enum vm_reg_name data_segs[] = {
2854 + VM_REG_GUEST_SS,
2855 + VM_REG_GUEST_DS,
2856 + VM_REG_GUEST_ES,
2857 + VM_REG_GUEST_FS,
2858 + VM_REG_GUEST_GS,
2859 + };
2860 + struct vcpu *vcpu = &vm->vcpu[vcpuid];
2861 +
2862 + if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2863 + return (EINVAL);
2864 +
2865 + for (uint_t i = 0; i < nitems(clear_regs); i++) {
2866 + VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2867 + }
2868 +
2869 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2870 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2871 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2872 +
2873 + /*
2874 + * The prescribed contents of %rdx differ slightly between the Intel and
2875 + * AMD architectural definitions. The former expects the Extended Model
2876 + * in bits 16-19 where the latter expects all the Family, Model, and
2877 + * Stepping be there. Common boot ROMs appear to disregard this
2878 + * anyways, so we stick with a compromise value similar to what is
2879 + * spelled out in the Intel SDM.
2880 + */
2881 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2882 +
2883 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2884 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2885 +
2886 + /* CS: Present, R/W, Accessed */
2887 + desc.access = 0x0093;
2888 + desc.base = 0xffff0000;
2889 + desc.limit = 0xffff;
2890 + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2891 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2892 +
2893 + /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2894 + desc.access = 0x0093;
2895 + desc.base = 0;
2896 + desc.limit = 0xffff;
2897 + for (uint_t i = 0; i < nitems(data_segs); i++) {
2898 + VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2899 + VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2900 + }
2901 +
2902 + /* GDTR, IDTR */
2903 + desc.base = 0;
2904 + desc.limit = 0xffff;
2905 + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2906 + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2907 +
2908 + /* LDTR: Present, LDT */
2909 + desc.access = 0x0082;
2910 + desc.base = 0;
2911 + desc.limit = 0xffff;
2912 + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2913 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2914 +
2915 + /* TR: Present, 32-bit TSS */
2916 + desc.access = 0x008b;
2917 + desc.base = 0;
2918 + desc.limit = 0xffff;
2919 + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2920 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2921 +
2922 + vlapic_reset(vm_lapic(vm, vcpuid));
2923 +
2924 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2925 +
2926 + vcpu->exitintinfo = 0;
2927 + vcpu->exception_pending = 0;
2928 + vcpu->nmi_pending = 0;
2929 + vcpu->extint_pending = 0;
2930 +
2931 + /*
2932 + * A CPU reset caused by power-on or system reset clears more state than
2933 + * one which is triggered from an INIT IPI.
2934 + */
2935 + if (!init_only) {
2936 + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2937 + fpu_save_area_reset(vcpu->guestfpu);
2938 +
2939 + /* XXX: clear MSRs and other pieces */
2940 + }
2941 +
2942 + return (0);
2943 +}
2944 +
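As a quick check of the reset values programmed above, a tiny sketch showing how the CS base of 0xffff0000 and %rip of 0xfff0 combine into the familiar x86 reset vector.

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
        /* Values written by the reset logic above */
        const uint64_t cs_base = 0xffff0000;
        const uint64_t rip = 0xfff0;

        /* In real mode the fetch address is the CS base plus %rip. */
        printf("first fetch after reset: 0x%" PRIx64 "\n", cs_base + rip);
        return (0);
}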
2945 +static int
2946 +vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2947 +{
2948 + struct seg_desc desc;
2949 +
2950 + if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2951 + return (EINVAL);
2952 +
2953 + /* CS: Present, R/W, Accessed */
2954 + desc.access = 0x0093;
2955 + desc.base = (uint64_t)vector << 12;
2956 + desc.limit = 0xffff;
2957 + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2958 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2959 + (uint64_t)vector << 8));
2960 +
2961 + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2962 +
2963 + return (0);
2964 +}
2965 +
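Similarly, a small sketch of how the SIPI vector programmed by vcpu_vector_sipi() above maps to a real-mode start address: the CS selector is the vector shifted left by 8 and the CS base by 12, with %rip cleared, so the AP begins executing at a 4 KiB-aligned address. The example vector is arbitrary.

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
        const uint8_t vector = 0x9a;    /* arbitrary example SIPI vector */
        const uint64_t cs_sel = (uint64_t)vector << 8;
        const uint64_t cs_base = (uint64_t)vector << 12;
        const uint64_t rip = 0;

        printf("CS selector 0x%04" PRIx64 ", start address 0x%" PRIx64 "\n",
            cs_sel, cs_base + rip);
        return (0);
}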
2966 +int
2795 2967 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2796 2968 {
2797 2969 if (vcpu < 0 || vcpu >= vm->maxcpus)
2798 2970 return (EINVAL);
2799 2971
2800 2972 if (type < 0 || type >= VM_CAP_MAX)
2801 2973 return (EINVAL);
2802 2974
2803 2975 return (VMGETCAP(vm->cookie, vcpu, type, retval));
2804 2976 }
2805 2977
2806 2978 int
2807 2979 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
2808 2980 {
2809 2981 if (vcpu < 0 || vcpu >= vm->maxcpus)
2810 2982 return (EINVAL);
2811 2983
2812 2984 if (type < 0 || type >= VM_CAP_MAX)
2813 2985 return (EINVAL);
2814 2986
2815 2987 return (VMSETCAP(vm->cookie, vcpu, type, val));
2816 2988 }
2817 2989
2818 2990 struct vlapic *
2819 2991 vm_lapic(struct vm *vm, int cpu)
2820 2992 {
2821 2993 return (vm->vcpu[cpu].vlapic);
2822 2994 }
2823 2995
2824 2996 struct vioapic *
2825 2997 vm_ioapic(struct vm *vm)
2826 2998 {
2827 2999
2828 3000 return (vm->vioapic);
2829 3001 }
2830 3002
2831 3003 struct vhpet *
2832 3004 vm_hpet(struct vm *vm)
2833 3005 {
2834 3006
2835 3007 return (vm->vhpet);
2836 3008 }
2837 3009
2838 3010 #ifdef __FreeBSD__
2839 3011 bool
2840 3012 vmm_is_pptdev(int bus, int slot, int func)
2841 3013 {
2842 3014 int b, f, i, n, s;
2843 3015 char *val, *cp, *cp2;
2844 3016 bool found;
2845 3017
2846 3018 /*
2847 3019 * XXX
2848 3020 * The length of an environment variable is limited to 128 bytes which
2849 3021 * puts an upper limit on the number of passthru devices that may be
2850 3022 * specified using a single environment variable.
2851 3023 *
2852 3024 * Work around this by scanning multiple environment variable
2853 3025 * names instead of a single one - yuck!
2854 3026 */
2855 3027 const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
2856 3028
2857 3029 /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
2858 3030 found = false;
2859 3031 for (i = 0; names[i] != NULL && !found; i++) {
2860 3032 cp = val = kern_getenv(names[i]);
2861 3033 while (cp != NULL && *cp != '\0') {
2862 3034 if ((cp2 = strchr(cp, ' ')) != NULL)
2863 3035 *cp2 = '\0';
2864 3036
2865 3037 n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
2866 3038 if (n == 3 && bus == b && slot == s && func == f) {
2867 3039 found = true;
2868 3040 break;
2869 3041 }
2870 3042
2871 3043 if (cp2 != NULL)
2872 3044 *cp2++ = ' ';
2873 3045
2874 3046 cp = cp2;
2875 3047 }
2876 3048 freeenv(val);
2877 3049 }
2878 3050 return (found);
2879 3051 }
2880 3052 #endif
2881 3053
2882 3054 void *
2883 3055 vm_iommu_domain(struct vm *vm)
2884 3056 {
2885 3057
2886 3058 return (vm->iommu);
2887 3059 }
2888 3060
2889 3061 int
2890 3062 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
2891 3063 bool from_idle)
2892 3064 {
2893 3065 int error;
2894 3066 struct vcpu *vcpu;
2895 3067
2896 3068 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2897 - panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
3069 + panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
2898 3070
2899 3071 vcpu = &vm->vcpu[vcpuid];
2900 3072
2901 3073 vcpu_lock(vcpu);
2902 3074 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
2903 3075 vcpu_unlock(vcpu);
2904 3076
2905 3077 return (error);
2906 3078 }
2907 3079
2908 3080 enum vcpu_state
2909 3081 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
2910 3082 {
2911 3083 struct vcpu *vcpu;
2912 3084 enum vcpu_state state;
2913 3085
2914 3086 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2915 - panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
3087 + panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
2916 3088
2917 3089 vcpu = &vm->vcpu[vcpuid];
2918 3090
2919 3091 vcpu_lock(vcpu);
2920 3092 state = vcpu->state;
2921 3093 if (hostcpu != NULL)
2922 3094 *hostcpu = vcpu->hostcpu;
2923 3095 vcpu_unlock(vcpu);
2924 3096
2925 3097 return (state);
2926 3098 }
2927 3099
2928 -void
2929 -vcpu_block_run(struct vm *vm, int vcpuid)
2930 -{
2931 - struct vcpu *vcpu;
2932 -
2933 - if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2934 - panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
2935 -
2936 - vcpu = &vm->vcpu[vcpuid];
2937 -
2938 - vcpu_lock(vcpu);
2939 - vcpu->runblock++;
2940 - if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) {
2941 - vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2942 - }
2943 - while (vcpu->state == VCPU_RUNNING) {
2944 -#ifdef __FreeBSD__
2945 - msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
2946 -#else
2947 - cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
2948 -#endif
2949 - }
2950 - vcpu_unlock(vcpu);
2951 -}
2952 -
2953 -void
2954 -vcpu_unblock_run(struct vm *vm, int vcpuid)
2955 -{
2956 - struct vcpu *vcpu;
2957 -
2958 - if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2959 - panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
2960 -
2961 - vcpu = &vm->vcpu[vcpuid];
2962 -
2963 - vcpu_lock(vcpu);
2964 - KASSERT(vcpu->runblock != 0, ("expected non-zero runblock"));
2965 - vcpu->runblock--;
2966 - if (vcpu->runblock == 0) {
2967 -#ifdef __FreeBSD__
2968 - wakeup(&vcpu->state);
2969 -#else
2970 - cv_broadcast(&vcpu->state_cv);
2971 -#endif
2972 - }
2973 - vcpu_unlock(vcpu);
2974 -}
2975 -
2976 3100 #ifndef __FreeBSD__
2977 3101 uint64_t
2978 3102 vcpu_tsc_offset(struct vm *vm, int vcpuid)
2979 3103 {
2980 3104 return (vm->vcpu[vcpuid].tsc_offset);
2981 3105 }
2982 3106 #endif /* __FreeBSD__ */
2983 3107
2984 3108 int
2985 3109 vm_activate_cpu(struct vm *vm, int vcpuid)
2986 3110 {
2987 3111
2988 3112 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2989 3113 return (EINVAL);
2990 3114
2991 3115 if (CPU_ISSET(vcpuid, &vm->active_cpus))
2992 3116 return (EBUSY);
2993 3117
2994 3118 VCPU_CTR0(vm, vcpuid, "activated");
2995 3119 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
2996 3120 return (0);
2997 3121 }
2998 3122
2999 3123 int
3000 3124 vm_suspend_cpu(struct vm *vm, int vcpuid)
3001 3125 {
3002 3126 int i;
3003 3127
3004 3128 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3005 3129 return (EINVAL);
3006 3130
3007 3131 if (vcpuid == -1) {
3008 3132 vm->debug_cpus = vm->active_cpus;
3009 3133 for (i = 0; i < vm->maxcpus; i++) {
3010 3134 if (CPU_ISSET(i, &vm->active_cpus))
3011 3135 vcpu_notify_event(vm, i);
3012 3136 }
3013 3137 } else {
3014 3138 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3015 3139 return (EINVAL);
3016 3140
3017 3141 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3018 3142 vcpu_notify_event(vm, vcpuid);
3019 3143 }
3020 3144 return (0);
3021 3145 }
3022 3146
3023 3147 int
3024 3148 vm_resume_cpu(struct vm *vm, int vcpuid)
3025 3149 {
3026 3150
3027 3151 if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3028 3152 return (EINVAL);
3029 3153
3030 3154 if (vcpuid == -1) {
3031 3155 CPU_ZERO(&vm->debug_cpus);
3032 3156 } else {
3033 3157 if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3034 3158 return (EINVAL);
3035 3159
3036 3160 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3037 3161 }
3038 3162 return (0);
3039 3163 }
3040 3164
3041 -int
3042 -vcpu_debugged(struct vm *vm, int vcpuid)
3165 +static bool
3166 +vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3167 + uint64_t entry_rip)
3043 3168 {
3169 + struct vcpu *vcpu = &vm->vcpu[vcpuid];
3170 + struct vm_exit *vme = &vcpu->exitinfo;
3171 + bool bail = false;
3044 3172
3045 - return (CPU_ISSET(vcpuid, &vm->debug_cpus));
3173 + ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3174 +
3175 + if (vm->suspend) {
3176 + if (on_entry) {
3177 + VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3178 + vm->suspend < VM_SUSPEND_LAST);
3179 +
3180 + vme->exitcode = VM_EXITCODE_SUSPENDED;
3181 + vme->u.suspended.how = vm->suspend;
3182 + } else {
3183 + /*
3184 + * Handling VM suspend is complicated, so if that
3185 + * condition is detected outside of VM-entry itself,
3186 + * just emit a BOGUS exitcode so we take a lap to pick
3187 + * up the event during an entry and are directed into
3188 + * the vm_handle_suspend() logic.
3189 + */
3190 + vme->exitcode = VM_EXITCODE_BOGUS;
3191 + }
3192 + bail = true;
3193 + }
3194 + if (vcpu->reqidle) {
3195 + vme->exitcode = VM_EXITCODE_REQIDLE;
3196 + vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3197 +
3198 + if (!on_entry) {
3199 + /*
3200 + * A reqidle request detected outside of VM-entry can be
3201 + * handled directly by clearing the request (and taking
3202 + * a lap to userspace).
3203 + */
3204 + vcpu_assert_locked(vcpu);
3205 + vcpu->reqidle = 0;
3206 + }
3207 + bail = true;
3208 + }
3209 + if (vcpu_should_yield(vm, vcpuid)) {
3210 + vme->exitcode = VM_EXITCODE_BOGUS;
3211 + vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3212 + bail = true;
3213 + }
3214 + if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3215 + vme->exitcode = VM_EXITCODE_DEBUG;
3216 + bail = true;
3217 + }
3218 +
3219 + if (bail) {
3220 + if (on_entry) {
3221 + /*
3222 + * If bailing out during VM-entry, the current %rip must
3223 + * be recorded in the exitinfo.
3224 + */
3225 + vme->rip = entry_rip;
3226 + }
3227 + vme->inst_length = 0;
3228 + }
3229 + return (bail);
3046 3230 }
3047 3231
3232 +static bool
3233 +vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3234 +{
3235 + /*
3236 + * Bail-out check done prior to sleeping (in vCPU contexts like HLT or
3237 + * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3238 + * structure, and we would only modify the exitcode.
3239 + */
3240 + return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3241 +}
3242 +
3243 +bool
3244 +vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3245 +{
3246 + /*
3247 + * Bail-out checks done as part of VM entry require an updated %rip to
3248 + * populate the vm_exit struct if any of the conditions of interest are
3249 + * matched in the check.
3250 + */
3251 + return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3252 +}
3253 +
3048 3254 cpuset_t
3049 3255 vm_active_cpus(struct vm *vm)
3050 3256 {
3051 3257
3052 3258 return (vm->active_cpus);
3053 3259 }
3054 3260
3055 3261 cpuset_t
3056 3262 vm_debug_cpus(struct vm *vm)
3057 3263 {
3058 3264
3059 3265 return (vm->debug_cpus);
3060 3266 }
3061 3267
3062 3268 cpuset_t
3063 3269 vm_suspended_cpus(struct vm *vm)
3064 3270 {
3065 3271
3066 3272 return (vm->suspended_cpus);
3067 3273 }
3068 3274
3069 3275 void *
3070 3276 vcpu_stats(struct vm *vm, int vcpuid)
3071 3277 {
3072 3278
3073 3279 return (vm->vcpu[vcpuid].stats);
3074 3280 }
3075 3281
3076 3282 int
3077 3283 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3078 3284 {
3079 3285 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3080 3286 return (EINVAL);
3081 3287
3082 3288 *state = vm->vcpu[vcpuid].x2apic_state;
3083 3289
3084 3290 return (0);
3085 3291 }
3086 3292
3087 3293 int
3088 3294 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3089 3295 {
3090 3296 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3091 3297 return (EINVAL);
3092 3298
3093 3299 if (state >= X2APIC_STATE_LAST)
3094 3300 return (EINVAL);
3095 3301
3096 3302 vm->vcpu[vcpuid].x2apic_state = state;
3097 3303
3098 3304 vlapic_set_x2apic_state(vm, vcpuid, state);
3099 3305
3100 3306 return (0);
3101 3307 }
3102 3308
3103 3309 /*
3104 3310 * This function is called to ensure that a vcpu "sees" a pending event
3105 3311 * as soon as possible:
3106 3312 * - If the vcpu thread is sleeping then it is woken up.
3107 3313 * - If the vcpu is running on a different host_cpu then an IPI will be directed
3108 3314 * to the host_cpu to cause the vcpu to trap into the hypervisor.
3109 3315 */
3110 3316 static void
3111 3317 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3112 3318 {
3113 3319 int hostcpu;
3114 3320
3115 3321 	ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3116 3322
3117 3323 hostcpu = vcpu->hostcpu;
3118 3324 if (vcpu->state == VCPU_RUNNING) {
3119 3325 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3120 3326 if (hostcpu != curcpu) {
3121 3327 if (ntype == VCPU_NOTIFY_APIC) {
3122 3328 vlapic_post_intr(vcpu->vlapic, hostcpu,
3123 3329 vmm_ipinum);
3124 3330 } else {
3125 3331 ipi_cpu(hostcpu, vmm_ipinum);
3126 3332 }
3127 3333 } else {
3128 3334 /*
3129 3335 * If the 'vcpu' is running on 'curcpu' then it must
3130 3336 * be sending a notification to itself (e.g. SELF_IPI).
3131 3337 * The pending event will be picked up when the vcpu
3132 3338 * transitions back to guest context.
3133 3339 */
3134 3340 }
3135 3341 } else {
3136 3342 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3137 3343 "with hostcpu %d", vcpu->state, hostcpu));
3138 3344 if (vcpu->state == VCPU_SLEEPING) {
3139 3345 #ifdef __FreeBSD__
3140 3346 wakeup_one(vcpu);
3141 3347 #else
3142 3348 cv_signal(&vcpu->vcpu_cv);
3143 3349 #endif
3144 3350 }
3145 3351 }
3146 3352 }
3147 3353
3148 3354 void
3149 3355 vcpu_notify_event(struct vm *vm, int vcpuid)
3150 3356 {
3151 3357 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3152 3358
3153 3359 vcpu_lock(vcpu);
3154 3360 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3155 3361 vcpu_unlock(vcpu);
3156 3362 }
3157 3363
3158 3364 void
3159 3365 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3160 3366 {
3161 3367 struct vcpu *vcpu = &vm->vcpu[vcpuid];
3162 3368
3163 3369 if (ntype == VCPU_NOTIFY_NONE) {
3164 3370 return;
3165 3371 }
3166 3372
3167 3373 vcpu_lock(vcpu);
3168 3374 vcpu_notify_event_locked(vcpu, ntype);
3169 3375 vcpu_unlock(vcpu);
3170 3376 }
3171 3377
3172 3378 struct vmspace *
3173 3379 vm_get_vmspace(struct vm *vm)
3174 3380 {
3175 3381
3176 3382 return (vm->vmspace);
3177 3383 }
3178 3384
3179 3385 int
3180 3386 vm_apicid2vcpuid(struct vm *vm, int apicid)
3181 3387 {
3182 3388 /*
3183 3389 * XXX apic id is assumed to be numerically identical to vcpu id
3184 3390 */
3185 3391 return (apicid);
3186 3392 }
3187 3393
3188 3394 struct vatpic *
3189 3395 vm_atpic(struct vm *vm)
3190 3396 {
3191 3397 return (vm->vatpic);
3192 3398 }
3193 3399
3194 3400 struct vatpit *
3195 3401 vm_atpit(struct vm *vm)
3196 3402 {
3197 3403 return (vm->vatpit);
3198 3404 }
3199 3405
3200 3406 struct vpmtmr *
3201 3407 vm_pmtmr(struct vm *vm)
3202 3408 {
3203 3409
3204 3410 return (vm->vpmtmr);
3205 3411 }
3206 3412
3207 3413 struct vrtc *
3208 3414 vm_rtc(struct vm *vm)
3209 3415 {
3210 3416
3211 3417 return (vm->vrtc);
3212 3418 }
3213 3419
3214 3420 enum vm_reg_name
3215 3421 vm_segment_name(int seg)
3216 3422 {
3217 3423 static enum vm_reg_name seg_names[] = {
3218 3424 VM_REG_GUEST_ES,
3219 3425 VM_REG_GUEST_CS,
3220 3426 VM_REG_GUEST_SS,
3221 3427 VM_REG_GUEST_DS,
3222 3428 VM_REG_GUEST_FS,
3223 3429 VM_REG_GUEST_GS
3224 3430 };
3225 3431
3226 3432 KASSERT(seg >= 0 && seg < nitems(seg_names),
3227 3433 ("%s: invalid segment encoding %d", __func__, seg));
3228 3434 return (seg_names[seg]);
3229 3435 }
3230 3436
3231 3437 void
3232 3438 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3233 3439 int num_copyinfo)
3234 3440 {
3235 3441 int idx;
3236 3442
3237 3443 for (idx = 0; idx < num_copyinfo; idx++) {
3238 3444 if (copyinfo[idx].cookie != NULL)
3239 3445 vm_gpa_release(copyinfo[idx].cookie);
3240 3446 }
3241 3447 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3242 3448 }
3243 3449
3244 3450 int
3245 3451 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3246 3452 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3247 3453 int num_copyinfo, int *fault)
3248 3454 {
3249 3455 int error, idx, nused;
3250 3456 size_t n, off, remaining;
3251 3457 void *hva, *cookie;
3252 3458 uint64_t gpa;
3253 3459
3254 3460 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3255 3461
3256 3462 nused = 0;
3257 3463 remaining = len;
3258 3464 while (remaining > 0) {
3259 3465 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3260 3466 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3261 3467 if (error || *fault)
3262 3468 return (error);
3263 3469 off = gpa & PAGE_MASK;
3264 3470 n = min(remaining, PAGE_SIZE - off);
3265 3471 copyinfo[nused].gpa = gpa;
3266 3472 copyinfo[nused].len = n;
3267 3473 remaining -= n;
3268 3474 gla += n;
3269 3475 nused++;
3270 3476 }
3271 3477
3272 3478 for (idx = 0; idx < nused; idx++) {
3273 3479 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
3274 3480 copyinfo[idx].len, prot, &cookie);
3275 3481 if (hva == NULL)
3276 3482 break;
3277 3483 copyinfo[idx].hva = hva;
3278 3484 copyinfo[idx].cookie = cookie;
3279 3485 }
3280 3486
3281 3487 if (idx != nused) {
3282 3488 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3283 3489 return (EFAULT);
3284 3490 } else {
3285 3491 *fault = 0;
3286 3492 return (0);
3287 3493 }
3288 3494 }
3289 3495
3290 3496 void
3291 3497 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3292 3498 size_t len)
3293 3499 {
3294 3500 char *dst;
3295 3501 int idx;
3296 3502
3297 3503 dst = kaddr;
3298 3504 idx = 0;
3299 3505 while (len > 0) {
3300 3506 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3301 3507 len -= copyinfo[idx].len;
3302 3508 dst += copyinfo[idx].len;
3303 3509 idx++;
3304 3510 }
3305 3511 }
3306 3512
3307 3513 void
3308 3514 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3309 3515 struct vm_copyinfo *copyinfo, size_t len)
3310 3516 {
3311 3517 const char *src;
3312 3518 int idx;
3313 3519
3314 3520 src = kaddr;
3315 3521 idx = 0;
3316 3522 while (len > 0) {
3317 3523 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3318 3524 len -= copyinfo[idx].len;
3319 3525 src += copyinfo[idx].len;
3320 3526 idx++;
3321 3527 }
3322 3528 }
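
As a usage sketch (not part of this change): emulation code typically pairs these helpers as setup, copy, then teardown. The wrapper below is hypothetical -- the name, the two-entry copyinfo array (sized for at most one page crossing), and the PROT_READ protection are assumptions -- but the calls match the signatures above.

	/*
	 * Hypothetical helper: read 'len' bytes from guest linear address
	 * 'gla' into 'buf'.  Two copyinfo entries cover a buffer that
	 * crosses at most one page boundary.  Returns 0 on success, EFAULT
	 * if the guest pages cannot be held, or leaves *fault set when the
	 * guest should instead see an exception.
	 */
	static int
	example_read_guest(struct vm *vm, int vcpuid,
	    struct vm_guest_paging *paging, uint64_t gla, void *buf,
	    size_t len, int *fault)
	{
		struct vm_copyinfo copyinfo[2];
		int error;

		error = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
		    copyinfo, nitems(copyinfo), fault);
		if (error != 0 || *fault != 0)
			return (error);

		vm_copyin(vm, vcpuid, copyinfo, buf, len);
		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		return (0);
	}

On failure vm_copy_setup() tears down any pages it already held and returns EFAULT itself, so the caller only needs the explicit teardown after a successful setup.
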
3323 3529
3324 3530 /*
3325 3531 * Return the amount of in-use and wired memory for the VM. Since
3326 3532  * these are global stats, only return the values for vCPU 0.
3327 3533 */
3328 3534 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3329 3535 VMM_STAT_DECLARE(VMM_MEM_WIRED);
3330 3536
3331 3537 static void
3332 3538 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3333 3539 {
3334 3540
3335 3541 if (vcpu == 0) {
3336 3542 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3337 3543 PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3338 3544 }
3339 3545 }
3340 3546
3341 3547 static void
3342 3548 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3343 3549 {
3344 3550
3345 3551 if (vcpu == 0) {
3346 3552 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
3347 3553 PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
3348 3554 }
3349 3555 }
3350 3556
3351 3557 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3352 3558 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
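
For illustration (not part of this change), another VM-global statistic could follow the same declare/fetch/register pattern shown above; VMM_MEM_EXAMPLE and vm_get_example_cnt are hypothetical names, and the 0 passed to vmm_stat_set() is only a placeholder for whatever global quantity would be tracked.

	/*
	 * Hypothetical stat mirroring the pattern above: a VM-global value
	 * is refreshed at read time and attributed to vCPU 0 only.
	 */
	VMM_STAT_DECLARE(VMM_MEM_EXAMPLE);

	static void
	vm_get_example_cnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
	{
		if (vcpu == 0) {
			/* Placeholder value standing in for the real count. */
			vmm_stat_set(vm, vcpu, VMM_MEM_EXAMPLE, 0);
		}
	}

	VMM_STAT_FUNC(VMM_MEM_EXAMPLE, "Example memory stat", vm_get_example_cnt);
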
3353 3559
3354 3560 int
3355 3561 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3356 3562 uint8_t bytes, uint32_t *val)
3357 3563 {
3358 3564 return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3359 3565 }
3360 3566
3361 3567 /*
3362 3568 * bhyve-internal interfaces to attach or detach IO port handlers.
3363 3569 * Must be called with VM write lock held for safety.
3364 3570 */
3365 3571 int
3366 3572 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3367 3573 void **cookie)
3368 3574 {
3369 3575 int err;
3370 3576 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3371 3577 if (err == 0) {
3372 3578 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3373 3579 }
3374 3580 return (err);
3375 3581 }
3376 3582 int
3377 3583 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3378 3584 void **old_arg)
3379 3585 {
3380 3586 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3381 3587 int err;
3382 3588
3383 3589 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3384 3590 if (err == 0) {
3385 3591 *cookie = NULL;
3386 3592 }
3387 3593 return (err);
3388 3594 }
3389 3595
3390 3596 /*
3391 3597 * External driver interfaces to attach or detach IO port handlers.
3392 3598 * Must be called with VM write lock held for safety.
3393 3599 */
3394 3600 int
3395 3601 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3396 3602 void *arg, void **cookie)
3397 3603 {
3398 3604 int err;
3399 3605
3400 3606 if (port == 0) {
3401 3607 return (EINVAL);
3402 3608 }
3403 3609
3404 3610 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3405 3611 if (err == 0) {
3406 3612 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3407 3613 }
3408 3614 return (err);
3409 3615 }
3410 3616 void
3411 3617 vm_ioport_unhook(struct vm *vm, void **cookie)
3412 3618 {
3413 3619 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3414 3620 ioport_handler_t old_func;
3415 3621 void *old_arg;
3416 3622 int err;
3417 3623
3418 3624 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3419 3625
3420 3626 /* ioport-hook-using drivers are expected to be well-behaved */
3421 3627 VERIFY0(err);
3422 3628 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3423 3629
3424 3630 *cookie = NULL;
3425 3631 }
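
As a usage sketch (not part of this change) for the driver-facing hook interface: the handler prototype below is an assumption inferred from vm_ioport_access() above, and the function names, port number, and cookie variable are all hypothetical; consult the ioport_handler_t typedef in vmm_ioport.h for the authoritative signature.

	/*
	 * Assumed handler shape: (arg, in, port, bytes, val).  Verify this
	 * against the ioport_handler_t typedef in vmm_ioport.h.
	 */
	static int
	example_drv_handler(void *arg, bool in, uint16_t port, uint8_t bytes,
	    uint32_t *val)
	{
		/* Emulate reads (in) and writes (!in) of 'bytes' at 'port'. */
		return (0);
	}

	static void *example_cookie;

	static int
	example_drv_attach(struct vm *vm)
	{
		/* Port 0 is rejected; the VM write lock must be held. */
		return (vm_ioport_hook(vm, 0x510, example_drv_handler, NULL,
		    &example_cookie));
	}

	static void
	example_drv_detach(struct vm *vm)
	{
		/*
		 * vm_ioport_unhook() VERIFYs success, so only call it for a
		 * port that was successfully hooked.
		 */
		vm_ioport_unhook(vm, &example_cookie);
	}
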