Print this page
13275 bhyve needs richer INIT/SIPI support
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
| Split | Close | Expand all | Collapse all |
--- old/usr/src/cmd/bhyve/bhyverun.c
+++ new/usr/src/cmd/bhyve/bhyverun.c
1 1 /*-
2 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 3 *
4 4 * Copyright (c) 2011 NetApp, Inc.
5 5 * All rights reserved.
6 6 *
7 7 * Redistribution and use in source and binary forms, with or without
8 8 * modification, are permitted provided that the following conditions
9 9 * are met:
10 10 * 1. Redistributions of source code must retain the above copyright
11 11 * notice, this list of conditions and the following disclaimer.
12 12 * 2. Redistributions in binary form must reproduce the above copyright
13 13 * notice, this list of conditions and the following disclaimer in the
14 14 * documentation and/or other materials provided with the distribution.
15 15 *
16 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 26 * SUCH DAMAGE.
27 27 *
28 28 * $FreeBSD$
29 29 */
30 30 /*
31 31 * This file and its contents are supplied under the terms of the
32 32 * Common Development and Distribution License ("CDDL"), version 1.0.
33 33 * You may only use this file in accordance with the terms of version
34 34 * 1.0 of the CDDL.
35 35 *
36 36 * A full copy of the text of the CDDL should have accompanied this
37 37 * source. A copy of the CDDL is also available via the Internet at
38 38 * http://www.illumos.org/license/CDDL.
39 39 *
40 40 * Copyright 2015 Pluribus Networks Inc.
41 41 * Copyright 2018 Joyent, Inc.
42 42 * Copyright 2020 Oxide Computer Company
43 43 */
44 44
45 45 #include <sys/cdefs.h>
46 46 __FBSDID("$FreeBSD$");
47 47
48 48 #include <sys/types.h>
49 49 #ifndef WITHOUT_CAPSICUM
50 50 #include <sys/capsicum.h>
51 51 #endif
52 52 #include <sys/mman.h>
53 53 #include <sys/time.h>
54 54 #include <sys/cpuset.h>
55 55
56 56 #ifdef __FreeBSD__
57 57 #include <amd64/vmm/intel/vmcs.h>
58 58 #else
59 59 #include <intel/vmcs.h>
60 60 #endif
61 61
62 62 #include <machine/atomic.h>
63 63 #include <machine/segments.h>
64 64
65 65 #ifndef WITHOUT_CAPSICUM
66 66 #include <capsicum_helpers.h>
67 67 #endif
68 68 #include <stdio.h>
69 69 #include <stdlib.h>
70 70 #include <string.h>
71 71 #include <err.h>
72 72 #include <errno.h>
73 73 #include <libgen.h>
74 74 #include <unistd.h>
75 75 #include <assert.h>
76 76 #include <pthread.h>
77 77 #include <pthread_np.h>
78 78 #include <sysexits.h>
79 79 #include <stdbool.h>
80 80 #include <stdint.h>
81 81
82 82 #include <machine/vmm.h>
83 83 #ifndef WITHOUT_CAPSICUM
84 84 #include <machine/vmm_dev.h>
85 85 #endif
86 86 #include <vmmapi.h>
87 87
88 88 #ifndef __FreeBSD__
89 89 #include <sys/stat.h>
90 90 #endif
91 91
92 92 #include "bhyverun.h"
93 93 #include "acpi.h"
94 94 #include "atkbdc.h"
95 95 #include "console.h"
96 96 #include "bootrom.h"
97 97 #include "inout.h"
98 98 #include "dbgport.h"
99 99 #include "debug.h"
100 100 #include "fwctl.h"
101 101 #include "gdb.h"
102 102 #include "ioapic.h"
103 103 #include "kernemu_dev.h"
104 104 #include "mem.h"
105 105 #include "mevent.h"
106 106 #include "mptbl.h"
107 107 #include "pci_emul.h"
108 108 #include "pci_irq.h"
109 109 #include "pci_lpc.h"
110 110 #include "smbiostbl.h"
111 111 #include "xmsr.h"
112 112 #include "spinup_ap.h"
113 113 #include "rfb.h"
114 114 #include "rtc.h"
115 115 #include "vga.h"
116 116 #include "vmgenc.h"
117 117
118 118 #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
119 119
120 120 #define MB (1024UL * 1024)
121 121 #define GB (1024UL * MB)
122 122
123 123 static const char * const vmx_exit_reason_desc[] = {
124 124 [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
125 125 [EXIT_REASON_EXT_INTR] = "External interrupt",
126 126 [EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
127 127 [EXIT_REASON_INIT] = "INIT signal",
128 128 [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
129 129 [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
130 130 [EXIT_REASON_SMI] = "Other SMI",
131 131 [EXIT_REASON_INTR_WINDOW] = "Interrupt window",
132 132 [EXIT_REASON_NMI_WINDOW] = "NMI window",
133 133 [EXIT_REASON_TASK_SWITCH] = "Task switch",
134 134 [EXIT_REASON_CPUID] = "CPUID",
135 135 [EXIT_REASON_GETSEC] = "GETSEC",
136 136 [EXIT_REASON_HLT] = "HLT",
137 137 [EXIT_REASON_INVD] = "INVD",
138 138 [EXIT_REASON_INVLPG] = "INVLPG",
139 139 [EXIT_REASON_RDPMC] = "RDPMC",
140 140 [EXIT_REASON_RDTSC] = "RDTSC",
141 141 [EXIT_REASON_RSM] = "RSM",
142 142 [EXIT_REASON_VMCALL] = "VMCALL",
143 143 [EXIT_REASON_VMCLEAR] = "VMCLEAR",
144 144 [EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
145 145 [EXIT_REASON_VMPTRLD] = "VMPTRLD",
146 146 [EXIT_REASON_VMPTRST] = "VMPTRST",
147 147 [EXIT_REASON_VMREAD] = "VMREAD",
148 148 [EXIT_REASON_VMRESUME] = "VMRESUME",
149 149 [EXIT_REASON_VMWRITE] = "VMWRITE",
150 150 [EXIT_REASON_VMXOFF] = "VMXOFF",
151 151 [EXIT_REASON_VMXON] = "VMXON",
152 152 [EXIT_REASON_CR_ACCESS] = "Control-register accesses",
153 153 [EXIT_REASON_DR_ACCESS] = "MOV DR",
154 154 [EXIT_REASON_INOUT] = "I/O instruction",
155 155 [EXIT_REASON_RDMSR] = "RDMSR",
156 156 [EXIT_REASON_WRMSR] = "WRMSR",
157 157 [EXIT_REASON_INVAL_VMCS] =
158 158 "VM-entry failure due to invalid guest state",
159 159 [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
160 160 [EXIT_REASON_MWAIT] = "MWAIT",
161 161 [EXIT_REASON_MTF] = "Monitor trap flag",
162 162 [EXIT_REASON_MONITOR] = "MONITOR",
163 163 [EXIT_REASON_PAUSE] = "PAUSE",
164 164 [EXIT_REASON_MCE_DURING_ENTRY] =
165 165 "VM-entry failure due to machine-check event",
166 166 [EXIT_REASON_TPR] = "TPR below threshold",
167 167 [EXIT_REASON_APIC_ACCESS] = "APIC access",
168 168 [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
169 169 [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
170 170 [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
171 171 [EXIT_REASON_EPT_FAULT] = "EPT violation",
172 172 [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
173 173 [EXIT_REASON_INVEPT] = "INVEPT",
174 174 [EXIT_REASON_RDTSCP] = "RDTSCP",
175 175 [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
176 176 [EXIT_REASON_INVVPID] = "INVVPID",
177 177 [EXIT_REASON_WBINVD] = "WBINVD",
178 178 [EXIT_REASON_XSETBV] = "XSETBV",
179 179 [EXIT_REASON_APIC_WRITE] = "APIC write",
180 180 [EXIT_REASON_RDRAND] = "RDRAND",
181 181 [EXIT_REASON_INVPCID] = "INVPCID",
182 182 [EXIT_REASON_VMFUNC] = "VMFUNC",
183 183 [EXIT_REASON_ENCLS] = "ENCLS",
184 184 [EXIT_REASON_RDSEED] = "RDSEED",
185 185 [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
186 186 [EXIT_REASON_XSAVES] = "XSAVES",
187 187 [EXIT_REASON_XRSTORS] = "XRSTORS"
188 188 };
189 189
190 190 typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
191 191 extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
192 192
193 193 char *vmname;
194 194
195 195 int guest_ncpus;
196 196 uint16_t cores, maxcpus, sockets, threads;
197 197
198 198 char *guest_uuid_str;
199 199
200 200 int raw_stdio = 0;
201 201
202 202 static int gdb_port = 0;
203 203 static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
204 204 static int virtio_msix = 1;
205 205 static int x2apic_mode = 0; /* default is xAPIC */
206 206
207 207 static int strictio;
208 208 static int strictmsr = 1;
209 209
210 210 static int acpi;
211 211
212 212 static char *progname;
213 213 static const int BSP = 0;
214 214
215 215 static cpuset_t cpumask;
216 216
217 217 static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
218 218
219 219 static struct vm_exit vmexit[VM_MAXCPU];
220 220 static struct vm_entry vmentry[VM_MAXCPU];
221 221
222 222 struct bhyvestats {
223 223 uint64_t vmexit_bogus;
224 224 uint64_t vmexit_reqidle;
225 225 uint64_t vmexit_hlt;
226 226 uint64_t vmexit_pause;
227 227 uint64_t vmexit_mtrap;
228 228 uint64_t vmexit_mmio;
229 229 uint64_t vmexit_inout;
230 230 uint64_t cpu_switch_rotate;
231 231 uint64_t cpu_switch_direct;
232 232 uint64_t mmio_unhandled;
233 233 } stats;
234 234
235 235 struct mt_vmm_info {
236 236 pthread_t mt_thr;
237 237 struct vmctx *mt_ctx;
238 238 int mt_vcpu;
239 239 uint64_t mt_startrip;
240 240 } mt_vmm_info[VM_MAXCPU];
241 241
242 242 #ifdef __FreeBSD__
243 243 static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
244 244 #endif
245 245
246 246 static void
247 247 usage(int code)
248 248 {
249 249
250 250 fprintf(stderr,
251 251 "Usage: %s [-abehuwxACHPSWY]\n"
252 252 " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
253 253 " %*s [-g <gdb port>] [-l <lpc>]\n"
254 254 #ifdef __FreeBSD__
255 255 " %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"
256 256 #else
257 257 " %*s [-m mem] [-s <pci>] [-U uuid] <vm>\n"
258 258 #endif
259 259 " -a: local apic is in xAPIC mode (deprecated)\n"
260 260 " -A: create ACPI tables\n"
261 261 " -c: number of cpus and/or topology specification\n"
262 262 " -C: include guest memory in core file\n"
263 263 #ifndef __FreeBSD__
264 264 " -d: suspend cpu at boot\n"
265 265 #endif
266 266 " -e: exit on unhandled I/O access\n"
267 267 " -g: gdb port\n"
268 268 " -h: help\n"
269 269 " -H: vmexit from the guest on hlt\n"
270 270 " -l: LPC device configuration\n"
271 271 " -m: memory size\n"
272 272 #ifdef __FreeBSD__
273 273 " -p: pin 'vcpu' to 'hostcpu'\n"
274 274 #endif
275 275 " -P: vmexit from the guest on pause\n"
276 276 " -s: <slot,driver,configinfo> PCI slot config\n"
277 277 " -S: guest memory cannot be swapped\n"
278 278 " -u: RTC keeps UTC time\n"
279 279 " -U: uuid\n"
280 280 " -w: ignore unimplemented MSRs\n"
281 281 " -W: force virtio to use single-vector MSI\n"
282 282 " -x: local apic is in x2APIC mode\n"
283 283 " -Y: disable MPtable generation\n",
284 284 progname, (int)strlen(progname), "", (int)strlen(progname), "",
285 285 (int)strlen(progname), "");
286 286
287 287 exit(code);
288 288 }
289 289
290 290 /*
291 291 * XXX This parser is known to have the following issues:
292 292 * 1. It accepts null key=value tokens ",,".
293 293 * 2. It accepts whitespace after = and before value.
294 294 * 3. Values out of range of INT are silently wrapped.
295 295 * 4. It doesn't check non-final values.
296 296 * 5. The apparently bogus limits of UINT16_MAX are for future expansion.
297 297 *
298 298 * The acceptance of a null specification ('-c ""') is by design to match the
299 299 * manual page syntax specification; this results in a topology of 1 vCPU.
300 300 */
301 301 static int
302 302 topology_parse(const char *opt)
303 303 {
304 304 uint64_t ncpus;
305 305 int c, chk, n, s, t, tmp;
306 306 char *cp, *str;
307 307 bool ns, scts;
308 308
309 309 c = 1, n = 1, s = 1, t = 1;
310 310 ns = false, scts = false;
311 311 str = strdup(opt);
312 312 if (str == NULL)
313 313 goto out;
314 314
315 315 while ((cp = strsep(&str, ",")) != NULL) {
316 316 if (sscanf(cp, "%i%n", &tmp, &chk) == 1) {
317 317 n = tmp;
318 318 ns = true;
319 319 } else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) {
320 320 n = tmp;
321 321 ns = true;
322 322 } else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) {
323 323 s = tmp;
324 324 scts = true;
325 325 } else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) {
326 326 c = tmp;
327 327 scts = true;
328 328 } else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) {
329 329 t = tmp;
330 330 scts = true;
331 331 #ifdef notyet /* Do not expose this until vmm.ko implements it */
332 332 } else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) {
333 333 m = tmp;
334 334 #endif
335 335 /* Skip the empty argument case from -c "" */
336 336 } else if (cp[0] == '\0')
337 337 continue;
338 338 else
339 339 goto out;
340 340 /* Any trailing garbage causes an error */
341 341 if (cp[chk] != '\0')
342 342 goto out;
343 343 }
344 344 free(str);
345 345 str = NULL;
346 346
347 347 /*
348 348 * Range check all values: each must satisfy 1 <= value <= UINT16_MAX
349 349 */
350 350 if (n < 1 || s < 1 || c < 1 || t < 1 ||
351 351 n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX ||
352 352 t > UINT16_MAX)
353 353 return (-1);
354 354
355 355 /* If only the cpu count was specified, use it as the socket count */
356 356 if (!scts)
357 357 s = n;
358 358 /*
359 359 * Compute sockets * cores * threads, avoiding overflow.
360 360 * The range check above ensures these are 16-bit values.
361 361 * If n was specified, check it against the computed ncpus.
362 362 */
363 363 ncpus = (uint64_t)s * c * t;
364 364 if (ncpus > UINT16_MAX || (ns && n != ncpus))
365 365 return (-1);
366 366
367 367 guest_ncpus = ncpus;
368 368 sockets = s;
369 369 cores = c;
370 370 threads = t;
371 371 return(0);
372 372
373 373 out:
374 374 free(str);
375 375 return (-1);
376 376 }
377 377
378 378 #ifndef WITHOUT_CAPSICUM
379 379 /*
380 380 * 11-stable capsicum helpers
381 381 */
382 382 static void
383 383 bhyve_caph_cache_catpages(void)
384 384 {
385 385
386 386 (void)catopen("libc", NL_CAT_LOCALE);
387 387 }
388 388
389 389 static int
390 390 bhyve_caph_limit_stdoe(void)
391 391 {
392 392 cap_rights_t rights;
393 393 unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ };
394 394 int i, fds[] = { STDOUT_FILENO, STDERR_FILENO };
395 395
396 396 cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL);
397 397 cap_rights_set(&rights, CAP_WRITE);
398 398
399 399 for (i = 0; i < nitems(fds); i++) {
400 400 if (cap_rights_limit(fds[i], &rights) < 0 && errno != ENOSYS)
401 401 return (-1);
402 402
403 403 if (cap_ioctls_limit(fds[i], cmds, nitems(cmds)) < 0 && errno != ENOSYS)
404 404 return (-1);
405 405
406 406 if (cap_fcntls_limit(fds[i], CAP_FCNTL_GETFL) < 0 && errno != ENOSYS)
407 407 return (-1);
408 408 }
409 409
410 410 return (0);
411 411 }
412 412
413 413 #endif
414 414
415 415 #ifdef __FreeBSD__
416 416 static int
417 417 pincpu_parse(const char *opt)
418 418 {
419 419 int vcpu, pcpu;
420 420
421 421 if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
422 422 fprintf(stderr, "invalid format: %s\n", opt);
423 423 return (-1);
424 424 }
425 425
426 426 if (vcpu < 0 || vcpu >= VM_MAXCPU) {
427 427 fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
428 428 vcpu, VM_MAXCPU - 1);
429 429 return (-1);
430 430 }
431 431
432 432 if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
433 433 fprintf(stderr, "hostcpu '%d' outside valid range from "
434 434 "0 to %d\n", pcpu, CPU_SETSIZE - 1);
435 435 return (-1);
436 436 }
437 437
438 438 if (vcpumap[vcpu] == NULL) {
439 439 if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) {
440 440 perror("malloc");
441 441 return (-1);
442 442 }
443 443 CPU_ZERO(vcpumap[vcpu]);
444 444 }
445 445 CPU_SET(pcpu, vcpumap[vcpu]);
446 446 return (0);
447 447 }
448 448
449 449 void
450 450 vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
451 451 int errcode)
452 452 {
453 453 struct vmctx *ctx;
454 454 int error, restart_instruction;
455 455
456 456 ctx = arg;
457 457 restart_instruction = 1;
458 458
459 459 error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
460 460 restart_instruction);
461 461 assert(error == 0);
462 462 }
463 463 #endif /* __FreeBSD__ */
464 464
465 465 void *
466 466 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
467 467 {
468 468
469 469 return (vm_map_gpa(ctx, gaddr, len));
470 470 }
471 471
472 472 int
473 473 fbsdrun_vmexit_on_pause(void)
474 474 {
475 475
476 476 return (guest_vmexit_on_pause);
477 477 }
478 478
479 479 int
480 480 fbsdrun_vmexit_on_hlt(void)
481 481 {
482 482
483 483 return (guest_vmexit_on_hlt);
484 484 }
485 485
486 486 int
487 487 fbsdrun_virtio_msix(void)
488 488 {
489 489
490 490 return (virtio_msix);
491 491 }
492 492
493 493 static void *
494 494 fbsdrun_start_thread(void *param)
495 495 {
496 496 char tname[MAXCOMLEN + 1];
497 497 struct mt_vmm_info *mtp;
498 498 int vcpu;
499 499
500 500 mtp = param;
501 501 vcpu = mtp->mt_vcpu;
502 502
503 503 snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
504 504 pthread_set_name_np(mtp->mt_thr, tname);
505 505
506 506 if (gdb_port != 0)
507 507 gdb_cpu_add(vcpu);
508 508
509 509 vm_loop(mtp->mt_ctx, vcpu, mtp->mt_startrip);
510 510
[webrev fold marker: 510 unchanged lines above]
511 511 /* not reached */
512 512 exit(1);
513 513 return (NULL);
514 514 }
515 515
516 516 #ifdef __FreeBSD__
517 517 void
518 518 fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
519 519 #else
520 520 void
521 -fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip,
522 - bool suspend)
521 +fbsdrun_addcpu(struct vmctx *ctx, int newcpu, uint64_t rip, bool suspend)
523 522 #endif
524 523 {
525 524 int error;
526 525
526 +#ifdef __FreeBSD__
527 527 assert(fromcpu == BSP);
528 +#endif
528 529
529 530 /*
530 531 * The 'newcpu' must be activated in the context of 'fromcpu'. If
531 532 * vm_activate_cpu() is delayed until newcpu's pthread starts running
532 533 * then vmm.ko is out-of-sync with bhyve and this can create a race
533 534 * with vm_suspend().
534 535 */
535 536 error = vm_activate_cpu(ctx, newcpu);
536 537 if (error != 0)
537 538 err(EX_OSERR, "could not activate CPU %d", newcpu);
538 539
539 540 CPU_SET_ATOMIC(newcpu, &cpumask);
540 541
541 542 #ifndef __FreeBSD__
542 543 if (suspend)
543 544 (void) vm_suspend_cpu(ctx, newcpu);
544 545 #endif
545 546
546 547 /*
547 548 * Set up the vmexit struct to allow execution to start
548 549 * at the given RIP
549 550 */
550 551 mt_vmm_info[newcpu].mt_ctx = ctx;
551 552 mt_vmm_info[newcpu].mt_vcpu = newcpu;
552 553 mt_vmm_info[newcpu].mt_startrip = rip;
553 554
554 555 error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
555 556 fbsdrun_start_thread, &mt_vmm_info[newcpu]);
556 557 assert(error == 0);
557 558 }
558 559
559 560 static int
560 561 fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
561 562 {
562 563
563 564 if (!CPU_ISSET(vcpu, &cpumask)) {
564 565 fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
565 566 exit(4);
566 567 }
567 568
568 569 CPU_CLR_ATOMIC(vcpu, &cpumask);
569 570 return (CPU_EMPTY(&cpumask));
[webrev fold marker: 32 unchanged lines above]
570 571 }
571 572
572 573 static void
573 574 vmentry_mmio_read(int vcpu, uint64_t gpa, uint8_t bytes, uint64_t data)
574 575 {
575 576 struct vm_entry *entry = &vmentry[vcpu];
576 577 struct vm_mmio *mmio = &entry->u.mmio;
577 578
578 579 assert(entry->cmd == VEC_DEFAULT);
579 580
580 - entry->cmd = VEC_COMPLETE_MMIO;
581 + entry->cmd = VEC_FULFILL_MMIO;
581 582 mmio->bytes = bytes;
582 583 mmio->read = 1;
583 584 mmio->gpa = gpa;
584 585 mmio->data = data;
585 586 }
586 587
587 588 static void
588 589 vmentry_mmio_write(int vcpu, uint64_t gpa, uint8_t bytes)
589 590 {
590 591 struct vm_entry *entry = &vmentry[vcpu];
591 592 struct vm_mmio *mmio = &entry->u.mmio;
592 593
593 594 assert(entry->cmd == VEC_DEFAULT);
594 595
595 - entry->cmd = VEC_COMPLETE_MMIO;
596 + entry->cmd = VEC_FULFILL_MMIO;
596 597 mmio->bytes = bytes;
597 598 mmio->read = 0;
598 599 mmio->gpa = gpa;
599 600 mmio->data = 0;
600 601 }
601 602
602 603 static void
603 604 vmentry_inout_read(int vcpu, uint16_t port, uint8_t bytes, uint32_t data)
604 605 {
605 606 struct vm_entry *entry = &vmentry[vcpu];
606 607 struct vm_inout *inout = &entry->u.inout;
607 608
608 609 assert(entry->cmd == VEC_DEFAULT);
609 610
610 - entry->cmd = VEC_COMPLETE_INOUT;
611 + entry->cmd = VEC_FULFILL_INOUT;
611 612 inout->bytes = bytes;
612 613 inout->flags = INOUT_IN;
613 614 inout->port = port;
614 615 inout->eax = data;
615 616 }
616 617
617 618 static void
618 619 vmentry_inout_write(int vcpu, uint16_t port, uint8_t bytes)
619 620 {
620 621 struct vm_entry *entry = &vmentry[vcpu];
621 622 struct vm_inout *inout = &entry->u.inout;
622 623
623 624 assert(entry->cmd == VEC_DEFAULT);
624 625
625 - entry->cmd = VEC_COMPLETE_INOUT;
626 + entry->cmd = VEC_FULFILL_INOUT;
626 627 inout->bytes = bytes;
627 628 inout->flags = 0;
628 629 inout->port = port;
629 630 inout->eax = 0;
630 631 }
631 632
632 633 static int
633 634 vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
634 635 uint32_t eax)
635 636 {
636 637 #if BHYVE_DEBUG
637 638 /*
638 639 * put guest-driven debug here
639 640 */
640 641 #endif
641 642 return (VMEXIT_CONTINUE);
642 643 }
643 644
644 645 static int
645 646 vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
646 647 {
647 648 int error;
648 649 int vcpu;
649 650 struct vm_inout inout;
650 651 bool in;
651 652 uint8_t bytes;
652 653
653 654 stats.vmexit_inout++;
654 655
655 656 vcpu = *pvcpu;
656 657 inout = vme->u.inout;
657 658 in = (inout.flags & INOUT_IN) != 0;
658 659 bytes = inout.bytes;
659 660
660 661 /* Extra-special case of host notifications */
661 662 if (!in && inout.port == GUEST_NIO_PORT) {
662 663 error = vmexit_handle_notify(ctx, vme, pvcpu, inout.eax);
663 664 vmentry_inout_write(vcpu, inout.port, bytes);
664 665 return (error);
665 666 }
666 667
667 668 error = emulate_inout(ctx, vcpu, &inout, strictio != 0);
668 669 if (error) {
669 670 fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
670 671 in ? "in" : "out",
671 672 bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
672 673 inout.port, vmexit->rip);
673 674 return (VMEXIT_ABORT);
674 675 } else {
675 676 /*
676 677 * Communicate the status of the inout operation back to the
677 678 * in-kernel instruction emulation.
678 679 */
679 680 if (in) {
680 681 vmentry_inout_read(vcpu, inout.port, bytes, inout.eax);
681 682 } else {
682 683 vmentry_inout_write(vcpu, inout.port, bytes);
683 684 }
684 685 return (VMEXIT_CONTINUE);
685 686 }
686 687 }
687 688
688 689 static int
689 690 vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
690 691 {
691 692 uint64_t val;
692 693 uint32_t eax, edx;
693 694 int error;
694 695
695 696 val = 0;
696 697 error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
697 698 if (error != 0) {
698 699 fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
699 700 vme->u.msr.code, *pvcpu);
700 701 if (strictmsr) {
701 702 vm_inject_gp(ctx, *pvcpu);
702 703 return (VMEXIT_CONTINUE);
703 704 }
704 705 }
705 706
706 707 eax = val;
707 708 error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
708 709 assert(error == 0);
709 710
710 711 edx = val >> 32;
711 712 error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
712 713 assert(error == 0);
713 714
714 715 return (VMEXIT_CONTINUE);
715 716 }
716 717
717 718 static int
718 719 vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
719 720 {
720 721 int error;
721 722
722 723 error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
723 724 if (error != 0) {
[webrev fold marker: 88 unchanged lines above]
724 725 fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
725 726 vme->u.msr.code, vme->u.msr.wval, *pvcpu);
726 727 if (strictmsr) {
727 728 vm_inject_gp(ctx, *pvcpu);
728 729 return (VMEXIT_CONTINUE);
729 730 }
730 731 }
731 732 return (VMEXIT_CONTINUE);
732 733 }
733 734
735 +#ifdef __FreeBSD__
734 736 static int
735 737 vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
736 738 {
737 739
738 740 (void)spinup_ap(ctx, *pvcpu,
739 741 vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
740 742
741 743 return (VMEXIT_CONTINUE);
742 744 }
745 +#else
746 +static int
747 +vmexit_run_state(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
748 +{
749 + /*
750 + * Run-state transitions (INIT, SIPI, etc) are handled in-kernel, so an
751 + * exit to userspace with that code is not expected.
752 + */
753 + fprintf(stderr, "unexpected run-state VM exit");
754 + return (VMEXIT_ABORT);
755 +}
756 +#endif /* __FreeBSD__ */
743 757
744 758 #ifdef __FreeBSD__
745 759 #define DEBUG_EPT_MISCONFIG
746 760 #else
747 761 /* EPT misconfig debugging not possible now that raw VMCS access is gone */
748 762 #endif
749 763
750 764 #ifdef DEBUG_EPT_MISCONFIG
751 765 #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
752 766
753 767 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
754 768 static int ept_misconfig_ptenum;
755 769 #endif
756 770
757 771 static const char *
758 772 vmexit_vmx_desc(uint32_t exit_reason)
759 773 {
760 774
761 775 if (exit_reason >= nitems(vmx_exit_reason_desc) ||
762 776 vmx_exit_reason_desc[exit_reason] == NULL)
763 777 return ("Unknown");
764 778 return (vmx_exit_reason_desc[exit_reason]);
765 779 }
766 780
767 781 static int
768 782 vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
769 783 {
770 784
771 785 fprintf(stderr, "vm exit[%d]\n", *pvcpu);
772 786 fprintf(stderr, "\treason\t\tVMX\n");
773 787 fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
774 788 fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
775 789 fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
776 790 fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason,
777 791 vmexit_vmx_desc(vmexit->u.vmx.exit_reason));
778 792 fprintf(stderr, "\tqualification\t0x%016lx\n",
779 793 vmexit->u.vmx.exit_qualification);
780 794 fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
781 795 fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
782 796 #ifdef DEBUG_EPT_MISCONFIG
783 797 if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
784 798 vm_get_register(ctx, *pvcpu,
785 799 VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
786 800 &ept_misconfig_gpa);
787 801 vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
788 802 &ept_misconfig_ptenum);
789 803 fprintf(stderr, "\tEPT misconfiguration:\n");
790 804 fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
791 805 fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
792 806 ept_misconfig_ptenum, ept_misconfig_pte[0],
793 807 ept_misconfig_pte[1], ept_misconfig_pte[2],
794 808 ept_misconfig_pte[3]);
795 809 }
796 810 #endif /* DEBUG_EPT_MISCONFIG */
797 811 return (VMEXIT_ABORT);
798 812 }
799 813
800 814 static int
801 815 vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
802 816 {
803 817
804 818 fprintf(stderr, "vm exit[%d]\n", *pvcpu);
805 819 fprintf(stderr, "\treason\t\tSVM\n");
806 820 fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
807 821 fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
808 822 fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
809 823 fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
810 824 fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
811 825 return (VMEXIT_ABORT);
812 826 }
813 827
814 828 static int
815 829 vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
816 830 {
817 831
818 832 assert(vmexit->inst_length == 0);
819 833
820 834 stats.vmexit_bogus++;
821 835
822 836 return (VMEXIT_CONTINUE);
823 837 }
824 838
825 839 static int
826 840 vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
827 841 {
828 842
829 843 assert(vmexit->inst_length == 0);
830 844
831 845 stats.vmexit_reqidle++;
832 846
833 847 return (VMEXIT_CONTINUE);
834 848 }
835 849
836 850 static int
837 851 vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
838 852 {
839 853
840 854 stats.vmexit_hlt++;
841 855
842 856 /*
843 857 * Just continue execution with the next instruction. We use
844 858 * the HLT VM exit as a way to be friendly with the host
845 859 * scheduler.
846 860 */
847 861 return (VMEXIT_CONTINUE);
848 862 }
849 863
850 864 static int
851 865 vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
852 866 {
853 867
854 868 stats.vmexit_pause++;
855 869
856 870 return (VMEXIT_CONTINUE);
857 871 }
858 872
859 873 static int
860 874 vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
861 875 {
862 876
863 877 assert(vmexit->inst_length == 0);
864 878
865 879 stats.vmexit_mtrap++;
866 880
867 881 if (gdb_port == 0) {
868 882 fprintf(stderr, "vm_loop: unexpected VMEXIT_MTRAP\n");
869 883 exit(4);
870 884 }
871 885 gdb_cpu_mtrap(*pvcpu);
872 886 return (VMEXIT_CONTINUE);
873 887 }
874 888
875 889 static int
876 890 vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
877 891 {
878 892 uint8_t i, valid;
879 893
880 894 fprintf(stderr, "Failed to emulate instruction sequence ");
881 895
882 896 valid = vmexit->u.inst_emul.num_valid;
883 897 if (valid != 0) {
884 898 assert(valid <= sizeof (vmexit->u.inst_emul.inst));
885 899 fprintf(stderr, "[");
886 900 for (i = 0; i < valid; i++) {
887 901 if (i == 0) {
888 902 fprintf(stderr, "%02x",
889 903 vmexit->u.inst_emul.inst[i]);
890 904 } else {
891 905 fprintf(stderr, ", %02x",
892 906 vmexit->u.inst_emul.inst[i]);
893 907 }
894 908 }
895 909 fprintf(stderr, "] ");
896 910 }
897 911 fprintf(stderr, "@ %rip = %x\n", vmexit->rip);
898 912
899 913 return (VMEXIT_ABORT);
900 914 }
901 915
902 916 static int
903 917 vmexit_mmio(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
904 918 {
905 919 int vcpu, err;
906 920 struct vm_mmio mmio;
907 921 bool is_read;
908 922
909 923 stats.vmexit_mmio++;
910 924
911 925 vcpu = *pvcpu;
912 926 mmio = vmexit->u.mmio;
913 927 is_read = (mmio.read != 0);
914 928
915 929 err = emulate_mem(ctx, vcpu, &mmio);
916 930
917 931 if (err == ESRCH) {
918 932 fprintf(stderr, "Unhandled memory access to 0x%lx\n", mmio.gpa);
919 933 stats.mmio_unhandled++;
920 934
921 935 /*
922 936 * Access to non-existent physical addresses is not likely to
923 937 * result in fatal errors on hardware machines, but rather reads
924 938 * of all-ones or discarded-but-acknowledged writes.
925 939 */
926 940 mmio.data = ~0UL;
927 941 err = 0;
928 942 }
929 943
930 944 if (err == 0) {
931 945 if (is_read) {
932 946 vmentry_mmio_read(vcpu, mmio.gpa, mmio.bytes,
933 947 mmio.data);
934 948 } else {
935 949 vmentry_mmio_write(vcpu, mmio.gpa, mmio.bytes);
936 950 }
937 951 return (VMEXIT_CONTINUE);
938 952 }
939 953
940 954 fprintf(stderr, "Unhandled mmio error to 0x%lx: %d\n", mmio.gpa, err);
941 955 return (VMEXIT_ABORT);
942 956 }
943 957
944 958 static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
945 959 static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
946 960
947 961 static int
948 962 vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
949 963 {
950 964 enum vm_suspend_how how;
951 965
952 966 how = vmexit->u.suspended.how;
953 967
954 968 fbsdrun_deletecpu(ctx, *pvcpu);
955 969
956 970 if (*pvcpu != BSP) {
957 971 pthread_mutex_lock(&resetcpu_mtx);
958 972 pthread_cond_signal(&resetcpu_cond);
959 973 pthread_mutex_unlock(&resetcpu_mtx);
960 974 pthread_exit(NULL);
961 975 }
962 976
963 977 pthread_mutex_lock(&resetcpu_mtx);
964 978 while (!CPU_EMPTY(&cpumask)) {
965 979 pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
966 980 }
967 981 pthread_mutex_unlock(&resetcpu_mtx);
968 982
969 983 switch (how) {
970 984 case VM_SUSPEND_RESET:
971 985 exit(0);
972 986 case VM_SUSPEND_POWEROFF:
973 987 exit(1);
974 988 case VM_SUSPEND_HALT:
975 989 exit(2);
976 990 case VM_SUSPEND_TRIPLEFAULT:
977 991 exit(3);
978 992 default:
979 993 fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
980 994 exit(100);
981 995 }
982 996 return (0); /* NOTREACHED */
983 997 }
984 998
/*
 * Handler for VM_EXITCODE_DEBUG.
 *
 * If no gdb port is configured (gdb_port == 0) the exit is unexpected: a
 * diagnostic is printed and the process exits with status 4.  Otherwise the
 * vCPU is handed to gdb_cpu_suspend() and VMEXIT_CONTINUE is returned.
 */
985 999 static int
986 1000 vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
987 1001 {
988 1002
989 1003 if (gdb_port == 0) {
990 1004 fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n");
991 1005 exit(4);
992 1006 }
993 1007 gdb_cpu_suspend(*pvcpu);
994 1008 return (VMEXIT_CONTINUE);
995 1009 }
996 1010
/*
 * Handler for VM_EXITCODE_BPT.
 *
 * If no gdb port is configured (gdb_port == 0) a diagnostic is printed and
 * the process exits with status 4.  Otherwise the exit is passed to
 * gdb_cpu_breakpoint() and VMEXIT_CONTINUE is returned.
 *
 * NOTE(review): the diagnostic below says "VMEXIT_DEBUG" even though this
 * handler is registered for VM_EXITCODE_BPT -- looks like a copy/paste from
 * vmexit_debug(); confirm whether the text should name the breakpoint exit.
 */
997 1011 static int
998 1012 vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
999 1013 {
1000 1014
1001 1015 if (gdb_port == 0) {
1002 1016 fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n");
1003 1017 exit(4);
1004 1018 }
1005 1019 gdb_cpu_breakpoint(*pvcpu, vmexit);
1006 1020 return (VMEXIT_CONTINUE);
1007 1021 }
1008 1022
/*
 * Exit-code dispatch table used by vm_loop(): handler[exitcode] is the
 * function invoked for that VM exit.  Codes without an initializer here are
 * NULL, which vm_loop() reports as an unexpected exit code.  The
 * VM_EXITCODE_HLT and VM_EXITCODE_PAUSE slots are filled in at run time by
 * fbsdrun_set_capabilities().
 *
 * The AP start-up entry differs by platform: VM_EXITCODE_SPINUP_AP under
 * __FreeBSD__, VM_EXITCODE_RUN_STATE otherwise.
 */
1009 1023 static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
|
↓ open down ↓ |
257 lines elided |
↑ open up ↑ |
1010 1024 [VM_EXITCODE_INOUT] = vmexit_inout,
1011 1025 [VM_EXITCODE_MMIO] = vmexit_mmio,
1012 1026 [VM_EXITCODE_VMX] = vmexit_vmx,
1013 1027 [VM_EXITCODE_SVM] = vmexit_svm,
1014 1028 [VM_EXITCODE_BOGUS] = vmexit_bogus,
1015 1029 [VM_EXITCODE_REQIDLE] = vmexit_reqidle,
1016 1030 [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
1017 1031 [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
1018 1032 [VM_EXITCODE_MTRAP] = vmexit_mtrap,
1019 1033 [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
1034 +#ifdef __FreeBSD__
1020 1035 [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
1036 +#else
1037 + [VM_EXITCODE_RUN_STATE] = vmexit_run_state,
1038 +#endif
1021 1039 [VM_EXITCODE_SUSPENDED] = vmexit_suspend,
1022 1040 [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
1023 1041 [VM_EXITCODE_DEBUG] = vmexit_debug,
1024 1042 [VM_EXITCODE_BPT] = vmexit_breakpoint,
1025 1043 };
1026 1044
/*
 * Run loop for a single vCPU.
 *
 * Sets the guest RIP of `vcpu` to `startrip`, then repeatedly enters the
 * guest with vm_run() and dispatches each exit through handler[].
 *
 *   - An exit code that is out of range or has no handler prints a
 *     diagnostic and exits the process with status 4.
 *   - A handler returning VMEXIT_CONTINUE re-enters the guest;
 *     VMEXIT_ABORT calls abort(); any other value exits with status 4.
 *   - A non-zero return from vm_run() ends the loop; the error and errno
 *     are printed and the function returns.
 *
 * Under __FreeBSD__, the thread is first pinned according to vcpumap[vcpu]
 * when an entry is present.
 *
 * NOTE(review): the return value of vm_active_cpus() is stored in `error`
 * but not checked before active_cpus is consulted by the assert.
 */
1027 1045 static void
1028 1046 vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
1029 1047 {
1030 1048 int error, rc;
1031 1049 enum vm_exitcode exitcode;
1032 1050 cpuset_t active_cpus;
1033 1051 struct vm_exit *vexit;
1034 1052 struct vm_entry *ventry;
1035 1053
1036 1054 #ifdef __FreeBSD__
1037 1055 if (vcpumap[vcpu] != NULL) {
1038 1056 error = pthread_setaffinity_np(pthread_self(),
1039 1057 sizeof(cpuset_t), vcpumap[vcpu]);
1040 1058 assert(error == 0);
1041 1059 }
1042 1060 #endif
1043 1061 error = vm_active_cpus(ctx, &active_cpus);
1044 1062 assert(CPU_ISSET(vcpu, &active_cpus));
1045 1063
1046 1064 error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
1047 1065 assert(error == 0);
1048 1066
/* Per-vCPU entry/exit state lives in the file-scope vmentry[]/vmexit[] arrays. */
1049 1067 ventry = &vmentry[vcpu];
1050 1068 vexit = &vmexit[vcpu];
1051 1069
1052 1070 while (1) {
1053 1071 error = vm_run(ctx, vcpu, ventry, vexit);
1054 1072 if (error != 0)
1055 1073 break;
1056 1074
1057 1075 if (ventry->cmd != VEC_DEFAULT) {
1058 1076 /*
1059 1077 * Discard any lingering entry state after it has been
1060 1078 * submitted via vm_run().
1061 1079 */
1062 1080 bzero(ventry, sizeof (*ventry));
1063 1081 }
1064 1082
1065 1083 exitcode = vexit->exitcode;
1066 1084 if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
1067 1085 fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
1068 1086 exitcode);
1069 1087 exit(4);
1070 1088 }
1071 1089
/* The handler receives &vcpu and so may change which vCPU this loop runs. */
1072 1090 rc = (*handler[exitcode])(ctx, vexit, &vcpu);
1073 1091
1074 1092 switch (rc) {
1075 1093 case VMEXIT_CONTINUE:
1076 1094 break;
1077 1095 case VMEXIT_ABORT:
1078 1096 abort();
1079 1097 default:
1080 1098 exit(4);
1081 1099 }
1082 1100 }
1083 1101 fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
1084 1102 }
1085 1103
/*
 * Return the maximum number of vCPUs the guest may use.
 *
 * Under __FreeBSD__ this is VM_MAXCPU only when querying
 * VM_CAP_UNRESTRICTED_GUEST on the BSP succeeds, and 1 otherwise.  On other
 * builds it is always VM_MAXCPU and `ctx` is not consulted.
 */
1086 1104 static int
1087 1105 num_vcpus_allowed(struct vmctx *ctx)
1088 1106 {
1089 1107 #ifdef __FreeBSD__
1090 1108 int tmp, error;
1091 1109
1092 1110 error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
1093 1111
1094 1112 /*
1095 1113 * The guest is allowed to spinup more than one processor only if the
1096 1114 * UNRESTRICTED_GUEST capability is available.
1097 1115 */
1098 1116 if (error == 0)
1099 1117 return (VM_MAXCPU);
1100 1118 else
1101 1119 return (1);
1102 1120 #else
1103 1121 /* Unrestricted Guest is always enabled on illumos */
1104 1122 return (VM_MAXCPU);
1105 1123 #endif /* __FreeBSD__ */
1106 1124 }
1107 1125
/*
 * Apply the configured per-vCPU capabilities to `cpu`.
 *
 *   - If HLT exiting was requested, verify VM_CAP_HALT_EXIT can be queried
 *     (exit 4 otherwise) and enable it; likewise for PAUSE exiting with
 *     VM_CAP_PAUSE_EXIT.  When `cpu` is the BSP the matching handler[]
 *     slots are installed, so the table is populated once.
 *   - Set the x2APIC state to enabled or disabled according to x2apic_mode;
 *     failure prints a diagnostic and exits with status 4.
 *   - Under __FreeBSD__, additionally request VM_CAP_ENABLE_INVPCID.
 *
 * The return values of the vm_set_capability() calls are not examined.
 */
1108 1126 void
1109 1127 fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
1110 1128 {
1111 1129 int err, tmp;
1112 1130
1113 1131 if (fbsdrun_vmexit_on_hlt()) {
1114 1132 err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp);
1115 1133 if (err < 0) {
1116 1134 fprintf(stderr, "VM exit on HLT not supported\n");
1117 1135 exit(4);
1118 1136 }
1119 1137 vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
1120 1138 if (cpu == BSP)
1121 1139 handler[VM_EXITCODE_HLT] = vmexit_hlt;
1122 1140 }
1123 1141
1124 1142 if (fbsdrun_vmexit_on_pause()) {
1125 1143 /*
1126 1144 * pause exit support required for this mode
1127 1145 */
1128 1146 err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp);
1129 1147 if (err < 0) {
1130 1148 fprintf(stderr,
1131 1149 "SMP mux requested, no pause support\n");
1132 1150 exit(4);
1133 1151 }
1134 1152 vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
1135 1153 if (cpu == BSP)
1136 1154 handler[VM_EXITCODE_PAUSE] = vmexit_pause;
1137 1155 }
1138 1156
1139 1157 if (x2apic_mode)
1140 1158 err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED);
1141 1159 else
1142 1160 err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED);
1143 1161
1144 1162 if (err) {
1145 1163 fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
1146 1164 exit(4);
1147 1165 }
1148 1166
1149 1167 #ifdef __FreeBSD__
1150 1168 vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
1151 1169 #endif
1152 1170 }
1153 1171
/*
 * Create (or attach to) the virtual machine named `vmname` and return an
 * open context for it.  All failures are fatal to the process.
 *
 *   - vm_create() failing with EEXIST is tolerated: with a bootrom
 *     configured the existing VM is re-initialized via vm_reinit() after it
 *     is opened; without one it is used as already set up.
 *   - Any other vm_create() error, or a freshly created VM with no bootrom
 *     configured, prints a diagnostic and exits with status 4.
 *   - Unless WITHOUT_CAPSICUM is defined, the VM device descriptor is
 *     limited to the ioctl/mmap rights and to the ioctl list reported by
 *     vm_get_ioctls().
 *   - Finally the CPU topology (sockets, cores, threads, maxcpus) is
 *     applied with vm_set_topology().
 */
1154 1172 static struct vmctx *
1155 1173 do_open(const char *vmname)
1156 1174 {
1157 1175 struct vmctx *ctx;
1158 1176 int error;
1159 1177 bool reinit, romboot;
1160 1178 #ifndef WITHOUT_CAPSICUM
1161 1179 cap_rights_t rights;
1162 1180 const cap_ioctl_t *cmds;
1163 1181 size_t ncmds;
1164 1182 #endif
1165 1183
1166 1184 reinit = romboot = false;
1167 1185
1168 1186 if (lpc_bootrom())
1169 1187 romboot = true;
1170 1188
1171 1189 error = vm_create(vmname);
1172 1190 if (error) {
1173 1191 if (errno == EEXIST) {
1174 1192 if (romboot) {
1175 1193 reinit = true;
1176 1194 } else {
1177 1195 /*
1178 1196 * The virtual machine has been setup by the
1179 1197 * userspace bootloader.
1180 1198 */
1181 1199 }
1182 1200 } else {
1183 1201 perror("vm_create");
1184 1202 exit(4);
1185 1203 }
1186 1204 } else {
1187 1205 if (!romboot) {
1188 1206 /*
1189 1207 * If the virtual machine was just created then a
1190 1208 * bootrom must be configured to boot it.
1191 1209 */
1192 1210 fprintf(stderr, "virtual machine cannot be booted\n");
1193 1211 exit(4);
1194 1212 }
1195 1213 }
1196 1214
1197 1215 ctx = vm_open(vmname);
1198 1216 if (ctx == NULL) {
1199 1217 perror("vm_open");
1200 1218 exit(4);
1201 1219 }
1202 1220
1203 1221 #ifndef WITHOUT_CAPSICUM
1204 1222 cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
1205 1223 if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1)
1206 1224 errx(EX_OSERR, "Unable to apply rights for sandbox");
/* First call obtains the count in ncmds, second call the command list itself. */
1207 1225 vm_get_ioctls(&ncmds);
1208 1226 cmds = vm_get_ioctls(NULL);
1209 1227 if (cmds == NULL)
1210 1228 errx(EX_OSERR, "out of memory");
1211 1229 if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1)
1212 1230 errx(EX_OSERR, "Unable to apply rights for sandbox");
1213 1231 free((cap_ioctl_t *)cmds);
1214 1232 #endif
1215 1233
1216 1234 if (reinit) {
1217 1235 error = vm_reinit(ctx);
1218 1236 if (error) {
1219 1237 perror("vm_reinit");
1220 1238 exit(4);
1221 1239 }
1222 1240 }
1223 1241 error = vm_set_topology(ctx, sockets, cores, threads, maxcpus);
1224 1242 if (error)
1225 1243 errx(EX_OSERR, "vm_set_topology");
1226 1244 return (ctx);
1227 1245 }
1228 1246
1229 1247 #ifndef __FreeBSD__
1230 1248
/* Marker file paths: the first is renamed to the second by mark_provisioned(). */
1231 1249 #define FILE_PROVISIONING "/var/svc/provisioning"
1232 1250 #define FILE_PROVISION_SUCCESS "/var/svc/provision_success"
1233 1251
/*
 * If FILE_PROVISIONING exists (lstat() succeeds), rename it to
 * FILE_PROVISION_SUCCESS.  A missing marker is silently ignored; a failed
 * rename is reported on stderr but is not fatal.
 */
1234 1252 static void
1235 1253 mark_provisioned(void)
1236 1254 {
1237 1255 struct stat stbuf;
1238 1256
1239 1257 if (lstat(FILE_PROVISIONING, &stbuf) != 0)
1240 1258 return;
1241 1259
1242 1260 if (rename(FILE_PROVISIONING, FILE_PROVISION_SUCCESS) != 0) {
1243 1261 (void) fprintf(stderr, "Cannot rename %s to %s: %s\n",
1244 1262 FILE_PROVISIONING, FILE_PROVISION_SUCCESS,
1245 1263 strerror(errno));
1246 1264 }
1247 1265 }
1248 1266
1249 1267 #endif
1250 1268
/*
 * Program entry point.
 *
 * Parses the command-line options, opens/creates the VM with do_open(),
 * sets up guest memory and the device emulations, builds the guest tables
 * (MP table unless -Y, SMBIOS, ACPI when -A), starts the boot processor and
 * finally enters mevent_dispatch().
 *
 * Exactly one non-option argument, the VM name, is required; otherwise
 * usage(1) is invoked.  Most setup failures print a diagnostic and exit
 * with status 4 (option errors use errx(EX_USAGE, ...)); several later
 * steps are guarded only by assert().  If mevent_dispatch() ever returns,
 * the process exits with status 4.
 *
 * NOTE(review): memsize is a size_t but is printed with "%llu" in the
 * memory-retry message -- confirm the format is correct on all targets.
 * NOTE(review): the "init_msr error %d" diagnostic has no trailing newline,
 * unlike the other messages in this function.
 */
1251 1269 int
1252 1270 main(int argc, char *argv[])
1253 1271 {
1254 1272 int c, error, dbg_port, err, bvmcons;
1255 1273 int max_vcpus, mptgen, memflags;
1256 1274 int rtc_localtime;
1257 1275 bool gdb_stop;
1258 1276 #ifndef __FreeBSD__
1259 1277 bool suspend = false;
1260 1278 #endif
1261 1279 struct vmctx *ctx;
1262 1280 uint64_t rip;
1263 1281 size_t memsize;
1264 1282 char *optstr;
1265 1283
/* Defaults that the option parsing below may override. */
1266 1284 bvmcons = 0;
1267 1285 progname = basename(argv[0]);
1268 1286 dbg_port = 0;
1269 1287 gdb_stop = false;
1270 1288 guest_ncpus = 1;
1271 1289 sockets = cores = threads = 1;
1272 1290 maxcpus = 0;
1273 1291 memsize = 256 * MB;
1274 1292 mptgen = 1;
1275 1293 rtc_localtime = 1;
1276 1294 memflags = 0;
1277 1295
/* The two option strings differ only in -p (FreeBSD) versus -d (otherwise). */
1278 1296 #ifdef __FreeBSD__
1279 1297 optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:B:U:";
1280 1298 #else
1281 1299 optstr = "abdehuwxACHIPSWYg:G:c:s:m:l:B:U:";
1282 1300 #endif
1283 1301 while ((c = getopt(argc, argv, optstr)) != -1) {
1284 1302 switch (c) {
1285 1303 case 'a':
1286 1304 x2apic_mode = 0;
1287 1305 break;
1288 1306 case 'A':
1289 1307 acpi = 1;
1290 1308 break;
1291 1309 case 'b':
1292 1310 bvmcons = 1;
1293 1311 break;
1294 1312 case 'B':
1295 1313 if (smbios_parse(optarg) != 0) {
1296 1314 errx(EX_USAGE, "invalid SMBIOS "
1297 1315 "configuration '%s'", optarg);
1298 1316 }
1299 1317 break;
1300 1318 #ifndef __FreeBSD__
1301 1319 case 'd':
1302 1320 suspend = true;
1303 1321 break;
1304 1322 #else
1305 1323 case 'p':
1306 1324 if (pincpu_parse(optarg) != 0) {
1307 1325 errx(EX_USAGE, "invalid vcpu pinning "
1308 1326 "configuration '%s'", optarg);
1309 1327 }
1310 1328 break;
1311 1329 #endif
1312 1330 case 'c':
1313 1331 if (topology_parse(optarg) != 0) {
1314 1332 errx(EX_USAGE, "invalid cpu topology "
1315 1333 "'%s'", optarg);
1316 1334 }
1317 1335 break;
1318 1336 case 'C':
1319 1337 memflags |= VM_MEM_F_INCORE;
1320 1338 break;
1321 1339 case 'g':
1322 1340 dbg_port = atoi(optarg);
1323 1341 break;
1324 1342 case 'G':
/* A leading 'w' sets gdb_stop; the remainder is parsed as the port number. */
1325 1343 if (optarg[0] == 'w') {
1326 1344 gdb_stop = true;
1327 1345 optarg++;
1328 1346 }
1329 1347 gdb_port = atoi(optarg);
1330 1348 break;
1331 1349 case 'l':
1332 1350 if (strncmp(optarg, "help", strlen(optarg)) == 0) {
1333 1351 lpc_print_supported_devices();
1334 1352 exit(0);
1335 1353 } else if (lpc_device_parse(optarg) != 0) {
1336 1354 errx(EX_USAGE, "invalid lpc device "
1337 1355 "configuration '%s'", optarg);
1338 1356 }
1339 1357 break;
/* Every branch of the 's' case either exits or breaks; nothing falls into 'S'. */
1340 1358 case 's':
1341 1359 if (strncmp(optarg, "help", strlen(optarg)) == 0) {
1342 1360 pci_print_supported_devices();
1343 1361 exit(0);
1344 1362 } else if (pci_parse_slot(optarg) != 0)
1345 1363 exit(4);
1346 1364 else
1347 1365 break;
1348 1366 case 'S':
1349 1367 memflags |= VM_MEM_F_WIRED;
1350 1368 break;
1351 1369 case 'm':
1352 1370 error = vm_parse_memsize(optarg, &memsize);
1353 1371 if (error)
1354 1372 errx(EX_USAGE, "invalid memsize '%s'", optarg);
1355 1373 break;
1356 1374 case 'H':
1357 1375 guest_vmexit_on_hlt = 1;
1358 1376 break;
1359 1377 case 'I':
1360 1378 /*
1361 1379 * The "-I" option was used to add an ioapic to the
1362 1380 * virtual machine.
1363 1381 *
1364 1382 * An ioapic is now provided unconditionally for each
1365 1383 * virtual machine and this option is now deprecated.
1366 1384 */
1367 1385 break;
1368 1386 case 'P':
1369 1387 guest_vmexit_on_pause = 1;
1370 1388 break;
1371 1389 case 'e':
1372 1390 strictio = 1;
1373 1391 break;
1374 1392 case 'u':
1375 1393 rtc_localtime = 0;
1376 1394 break;
1377 1395 case 'U':
1378 1396 guest_uuid_str = optarg;
1379 1397 break;
1380 1398 case 'w':
1381 1399 strictmsr = 0;
1382 1400 break;
1383 1401 case 'W':
1384 1402 virtio_msix = 0;
1385 1403 break;
1386 1404 case 'x':
1387 1405 x2apic_mode = 1;
1388 1406 break;
1389 1407 case 'Y':
1390 1408 mptgen = 0;
1391 1409 break;
1392 1410 case 'h':
1393 1411 usage(0);
1394 1412 default:
1395 1413 usage(1);
1396 1414 }
1397 1415 }
1398 1416 argc -= optind;
1399 1417 argv += optind;
1400 1418
1401 1419 if (argc != 1)
1402 1420 usage(1);
1403 1421
1404 1422 vmname = argv[0];
1405 1423 ctx = do_open(vmname);
1406 1424
1407 1425 max_vcpus = num_vcpus_allowed(ctx);
1408 1426 if (guest_ncpus > max_vcpus) {
1409 1427 fprintf(stderr, "%d vCPUs requested but only %d available\n",
1410 1428 guest_ncpus, max_vcpus);
1411 1429 exit(4);
1412 1430 }
1413 1431
1414 1432 fbsdrun_set_capabilities(ctx, BSP);
1415 1433
1416 1434 vm_set_memflags(ctx, memflags);
1417 1435 #ifdef __FreeBSD__
1418 1436 err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
1419 1437 #else
/* Retry once per second while the failure is ENOMEM; any other result ends the loop. */
1420 1438 do {
1421 1439 errno = 0;
1422 1440 err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
1423 1441 error = errno;
1424 1442 if (err != 0 && error == ENOMEM) {
1425 1443 (void) fprintf(stderr, "Unable to allocate memory "
1426 1444 "(%llu), retrying in 1 second\n", memsize);
1427 1445 sleep(1);
1428 1446 }
1429 1447 } while (error == ENOMEM);
1430 1448 #endif
1431 1449 if (err) {
1432 1450 fprintf(stderr, "Unable to setup memory (%d)\n", errno);
1433 1451 exit(4);
1434 1452 }
1435 1453
1436 1454 error = init_msr();
1437 1455 if (error) {
1438 1456 fprintf(stderr, "init_msr error %d", error);
1439 1457 exit(4);
1440 1458 }
1441 1459
1442 1460 init_mem();
1443 1461 init_inout();
1444 1462 #ifdef __FreeBSD__
1445 1463 kernemu_dev_init();
1446 1464 #endif
1447 1465 init_bootrom(ctx);
1448 1466 atkbdc_init(ctx);
1449 1467 pci_irq_init(ctx);
1450 1468 ioapic_init(ctx);
1451 1469
1452 1470 rtc_init(ctx, rtc_localtime);
1453 1471 sci_init(ctx);
1454 1472 #ifndef __FreeBSD__
1455 1473 pmtmr_init(ctx);
1456 1474 #endif
1457 1475
1458 1476 /*
1459 1477 * Exit if a device emulation finds an error in its initialization
1460 1478 */
1461 1479 if (init_pci(ctx) != 0) {
1462 1480 perror("device emulation initialization error");
1463 1481 exit(4);
1464 1482 }
1465 1483
1466 1484 /*
1467 1485 * Initialize after PCI, to allow a bootrom file to reserve the high
1468 1486 * region.
1469 1487 */
1470 1488 if (acpi)
1471 1489 vmgenc_init(ctx);
1472 1490
1473 1491 if (dbg_port != 0)
1474 1492 init_dbgport(dbg_port);
1475 1493
1476 1494 #ifdef __FreeBSD__
1477 1495 if (gdb_port != 0)
1478 1496 init_gdb(ctx, gdb_port, gdb_stop);
1479 1497 #else
1480 1498 if (gdb_port < 0) {
1481 1499 /*
1482 1500 * Set up the internal gdb state needed for basic debugging, but
1483 1501 * skip the step of listening on a port for the GDB server.
1484 1502 */
1485 1503 init_mdb(ctx, gdb_stop);
1486 1504 } else if (gdb_port != 0) {
1487 1505 init_gdb(ctx, gdb_port, gdb_stop);
1488 1506 }
1489 1507 #endif
1490 1508
1491 1509 if (bvmcons)
1492 1510 init_bvmcons();
1493 1511
1494 1512 vga_init(1);
1495 1513
1496 1514 if (lpc_bootrom()) {
1497 1515 #ifdef __FreeBSD__
1498 1516 if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
1499 1517 fprintf(stderr, "ROM boot failed: unrestricted guest "
1500 1518 "capability not available\n");
1501 1519 exit(4);
1502 1520 }
1503 1521 #else
1504 1522 /* Unrestricted Guest is always enabled on illumos */
1505 1523 #endif
1506 1524 error = vcpu_reset(ctx, BSP);
1507 1525 assert(error == 0);
1508 1526 }
1509 1527
/* The BSP's current RIP is read back and later passed to fbsdrun_addcpu(). */
1510 1528 error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
1511 1529 assert(error == 0);
1512 1530
1513 1531 /*
1514 1532 * build the guest tables, MP etc.
1515 1533 */
1516 1534 if (mptgen) {
1517 1535 error = mptable_build(ctx, guest_ncpus);
1518 1536 if (error) {
1519 1537 perror("error to build the guest tables");
1520 1538 exit(4);
1521 1539 }
1522 1540 }
1523 1541
1524 1542 error = smbios_build(ctx);
1525 1543 assert(error == 0);
1526 1544
1527 1545 if (acpi) {
1528 1546 error = acpi_build(ctx, guest_ncpus);
1529 1547 assert(error == 0);
1530 1548 }
1531 1549
1532 1550 if (lpc_bootrom())
1533 1551 fwctl_init();
1534 1552
1535 1553 /*
1536 1554 * Change the proc title to include the VM name.
1537 1555 */
1538 1556 setproctitle("%s", vmname);
1539 1557
|
↓ open down ↓ |
509 lines elided |
↑ open up ↑ |
1540 1558 #ifndef WITHOUT_CAPSICUM
1541 1559 caph_cache_catpages();
1542 1560
1543 1561 if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
1544 1562 errx(EX_OSERR, "Unable to apply rights for sandbox");
1545 1563
1546 1564 if (caph_enter() == -1)
1547 1565 errx(EX_OSERR, "cap_enter() failed");
1548 1566 #endif
1549 1567
1568 +#ifdef __FreeBSD__
1550 1569 /*
1551 1570 * Add CPU 0
1552 1571 */
1553 -#ifdef __FreeBSD__
1554 1572 fbsdrun_addcpu(ctx, BSP, BSP, rip);
1555 1573 #else
1556 - fbsdrun_addcpu(ctx, BSP, BSP, rip, suspend);
1574 + /* Set BSP to run (unlike the APs which wait for INIT) */
1575 + error = vm_set_run_state(ctx, BSP, VRS_RUN, 0);
1576 + assert(error == 0);
1577 + fbsdrun_addcpu(ctx, BSP, rip, suspend);
1557 1578
1579 + /* Add subsequent CPUs, which will wait until INIT/SIPI-ed */
1580 + for (uint_t i = 1; i < guest_ncpus; i++) {
1581 + spinup_halted_ap(ctx, i);
1582 + }
1558 1583 mark_provisioned();
1559 1584 #endif
1560 1585
1561 1586 /*
1562 1587 * Head off to the main event dispatch loop
1563 1588 */
1564 1589 mevent_dispatch();
1565 1590
1566 1591 exit(4);
1567 1592 }
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX