1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD$
29 */
30 /*
31 * This file and its contents are supplied under the terms of the
32 * Common Development and Distribution License ("CDDL"), version 1.0.
33 * You may only use this file in accordance with the terms of version
34 * 1.0 of the CDDL.
35 *
36 * A full copy of the text of the CDDL should have accompanied this
37 * source. A copy of the CDDL is also available via the Internet at
38 * http://www.illumos.org/license/CDDL.
39 *
40 * Copyright 2015 Pluribus Networks Inc.
41 * Copyright 2018 Joyent, Inc.
42 * Copyright 2020 Oxide Computer Company
43 */
44
45 #include <sys/cdefs.h>
46 __FBSDID("$FreeBSD$");
47
48 #include <sys/types.h>
49 #ifndef WITHOUT_CAPSICUM
50 #include <sys/capsicum.h>
51 #endif
52 #include <sys/mman.h>
53 #include <sys/time.h>
54 #include <sys/cpuset.h>
55
56 #ifdef __FreeBSD__
57 #include <amd64/vmm/intel/vmcs.h>
58 #else
59 #include <intel/vmcs.h>
60 #endif
61
62 #include <machine/atomic.h>
63 #include <machine/segments.h>
64
65 #ifndef WITHOUT_CAPSICUM
66 #include <capsicum_helpers.h>
67 #endif
68 #include <stdio.h>
69 #include <stdlib.h>
70 #include <string.h>
71 #include <err.h>
72 #include <errno.h>
73 #include <libgen.h>
74 #include <unistd.h>
75 #include <assert.h>
76 #include <pthread.h>
77 #include <pthread_np.h>
78 #include <sysexits.h>
79 #include <stdbool.h>
80 #include <stdint.h>
81
82 #include <machine/vmm.h>
83 #ifndef WITHOUT_CAPSICUM
84 #include <machine/vmm_dev.h>
85 #endif
86 #include <vmmapi.h>
87
88 #ifndef __FreeBSD__
89 #include <sys/stat.h>
90 #endif
91
92 #include "bhyverun.h"
93 #include "acpi.h"
94 #include "atkbdc.h"
95 #include "console.h"
96 #include "bootrom.h"
97 #include "config.h"
98 #include "inout.h"
99 #include "debug.h"
100 #include "fwctl.h"
101 #include "gdb.h"
102 #include "ioapic.h"
103 #include "kernemu_dev.h"
104 #include "mem.h"
105 #include "mevent.h"
106 #include "mptbl.h"
107 #include "pci_emul.h"
108 #include "pci_irq.h"
109 #include "pci_lpc.h"
110 #include "smbiostbl.h"
111 #include "xmsr.h"
112 #include "spinup_ap.h"
113 #include "rfb.h"
114 #include "rtc.h"
115 #include "vga.h"
116 #include "vmgenc.h"
117 #ifndef __FreeBSD__
118 #include "privileges.h"
119 #endif
120
121 #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
122
123 #define MB (1024UL * 1024)
124 #define GB (1024UL * MB)
125
/*
 * Human-readable descriptions of VMX exit reasons, indexed by the
 * EXIT_REASON_* value.  Slots without a designated initializer are NULL;
 * vmexit_vmx_desc() maps those, and any out-of-range index, to "Unknown".
 */
static const char * const vmx_exit_reason_desc[] = {
	[EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
	[EXIT_REASON_EXT_INTR] = "External interrupt",
	[EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
	[EXIT_REASON_INIT] = "INIT signal",
	[EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
	[EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
	[EXIT_REASON_SMI] = "Other SMI",
	[EXIT_REASON_INTR_WINDOW] = "Interrupt window",
	[EXIT_REASON_NMI_WINDOW] = "NMI window",
	[EXIT_REASON_TASK_SWITCH] = "Task switch",
	[EXIT_REASON_CPUID] = "CPUID",
	[EXIT_REASON_GETSEC] = "GETSEC",
	[EXIT_REASON_HLT] = "HLT",
	[EXIT_REASON_INVD] = "INVD",
	[EXIT_REASON_INVLPG] = "INVLPG",
	[EXIT_REASON_RDPMC] = "RDPMC",
	[EXIT_REASON_RDTSC] = "RDTSC",
	[EXIT_REASON_RSM] = "RSM",
	[EXIT_REASON_VMCALL] = "VMCALL",
	[EXIT_REASON_VMCLEAR] = "VMCLEAR",
	[EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
	[EXIT_REASON_VMPTRLD] = "VMPTRLD",
	[EXIT_REASON_VMPTRST] = "VMPTRST",
	[EXIT_REASON_VMREAD] = "VMREAD",
	[EXIT_REASON_VMRESUME] = "VMRESUME",
	[EXIT_REASON_VMWRITE] = "VMWRITE",
	[EXIT_REASON_VMXOFF] = "VMXOFF",
	[EXIT_REASON_VMXON] = "VMXON",
	[EXIT_REASON_CR_ACCESS] = "Control-register accesses",
	[EXIT_REASON_DR_ACCESS] = "MOV DR",
	[EXIT_REASON_INOUT] = "I/O instruction",
	[EXIT_REASON_RDMSR] = "RDMSR",
	[EXIT_REASON_WRMSR] = "WRMSR",
	[EXIT_REASON_INVAL_VMCS] =
	    "VM-entry failure due to invalid guest state",
	[EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
	[EXIT_REASON_MWAIT] = "MWAIT",
	[EXIT_REASON_MTF] = "Monitor trap flag",
	[EXIT_REASON_MONITOR] = "MONITOR",
	[EXIT_REASON_PAUSE] = "PAUSE",
	[EXIT_REASON_MCE_DURING_ENTRY] =
	    "VM-entry failure due to machine-check event",
	[EXIT_REASON_TPR] = "TPR below threshold",
	[EXIT_REASON_APIC_ACCESS] = "APIC access",
	[EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
	[EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
	[EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
	[EXIT_REASON_EPT_FAULT] = "EPT violation",
	[EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
	[EXIT_REASON_INVEPT] = "INVEPT",
	[EXIT_REASON_RDTSCP] = "RDTSCP",
	[EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
	[EXIT_REASON_INVVPID] = "INVVPID",
	[EXIT_REASON_WBINVD] = "WBINVD",
	[EXIT_REASON_XSETBV] = "XSETBV",
	[EXIT_REASON_APIC_WRITE] = "APIC write",
	[EXIT_REASON_RDRAND] = "RDRAND",
	[EXIT_REASON_INVPCID] = "INVPCID",
	[EXIT_REASON_VMFUNC] = "VMFUNC",
	[EXIT_REASON_ENCLS] = "ENCLS",
	[EXIT_REASON_RDSEED] = "RDSEED",
	[EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
	[EXIT_REASON_XSAVES] = "XSAVES",
	[EXIT_REASON_XRSTORS] = "XRSTORS"
};
192
/*
 * Signature shared by every VM-exit handler in handler[].  vm_loop() treats
 * a return of VMEXIT_CONTINUE as "re-enter the guest" and VMEXIT_ABORT as
 * "abort()"; any other value makes it exit(4).
 */
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
/* Task-switch handler; defined in another file. */
extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);

/* Number of guest vCPUs; computed by calc_topolopgy(). */
int guest_ncpus;
/*
 * Guest CPU topology.  cores, sockets and threads are set by
 * calc_topolopgy(); maxcpus is not assigned in the code visible here.
 */
uint16_t cores, maxcpus, sockets, threads;

/* NOTE(review): not referenced in the visible part of this file. */
int raw_stdio = 0;

/* Program name printed by usage(). */
static char *progname;
/* vCPU id of the boot processor (presumably "bootstrap processor"). */
static const int BSP = 0;

/*
 * Set of vCPUs currently running: bits are set in fbsdrun_addcpu() and
 * cleared in fbsdrun_deletecpu().
 */
static cpuset_t cpumask;

static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);

/* Per-vCPU exit and entry records handed to vm_run(); indexed by vcpu id. */
static struct vm_exit vmexit[VM_MAXCPU];
static struct vm_entry vmentry[VM_MAXCPU];
210
/*
 * Process-wide event counters.  The vmexit_* members are incremented by the
 * exit handler of the same name; mmio_unhandled counts MMIO accesses that
 * emulate_mem() rejected with ESRCH.  The cpu_switch_* members are not
 * updated in the code visible here.  The increments are plain (non-atomic)
 * "++" operations.
 */
struct bhyvestats {
	uint64_t vmexit_bogus;
	uint64_t vmexit_reqidle;
	uint64_t vmexit_hlt;
	uint64_t vmexit_pause;
	uint64_t vmexit_mtrap;
	uint64_t vmexit_mmio;
	uint64_t vmexit_inout;
	uint64_t cpu_switch_rotate;
	uint64_t cpu_switch_direct;
	uint64_t mmio_unhandled;
} stats;
223
/*
 * Per-vCPU thread start-up record, indexed by vcpu id.  fbsdrun_addcpu()
 * fills one in and passes its address to the new thread, where
 * fbsdrun_start_thread() reads it back.
 */
struct mt_vmm_info {
	pthread_t mt_thr;		/* thread running this vCPU */
	struct vmctx *mt_ctx;		/* VM context handed to vm_loop() */
	int mt_vcpu;			/* vcpu id */
	uint64_t mt_startrip;		/* initial guest RIP */
} mt_vmm_info[VM_MAXCPU];
230
#ifdef __FreeBSD__
/*
 * Optional host-CPU affinity set for each vCPU.  Entries are allocated by
 * build_vcpumaps() for vCPUs that have a "vcpu.N.cpuset" config value and
 * applied by vm_loop(); a NULL entry means no pinning was requested.
 */
static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
#endif
234
/*
 * Print the command-line synopsis and option summary to stderr, then
 * terminate the process with exit status 'code'.  Does not return.
 *
 * The option list differs between FreeBSD (-p pinning) and other builds
 * (-d suspend at boot).  Each "%*s" conversion prints an empty string padded
 * to strlen(progname) so that continuation lines line up with the first.
 */
static void
usage(int code)
{

	fprintf(stderr,
#ifdef __FreeBSD__
	    "Usage: %s [-aehuwxACDHPSWY]\n"
#else
	    "Usage: %s [-adehuwxACDHPSWY]\n"
#endif
	    " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
	    " %*s [-k <file>] [-l <lpc>] [-m mem] [-o <var>=<value>]\n"
#ifdef __FreeBSD__
	    " %*s [-p vcpu:hostcpu] [-s <pci>] [-U uuid] [<vm>]\n"
#else
	    " %*s [-s <pci>] [-U uuid] [<vm>]\n"
#endif
	    " -a: local apic is in xAPIC mode (deprecated)\n"
	    " -A: create ACPI tables\n"
	    " -c: number of cpus and/or topology specification\n"
	    " -C: include guest memory in core file\n"
#ifndef __FreeBSD__
	    " -d: suspend cpu at boot\n"
#endif
	    " -D: destroy on power-off\n"
	    " -e: exit on unhandled I/O access\n"
	    " -h: help\n"
	    " -H: vmexit from the guest on hlt\n"
	    " -k: key=value flat config file\n"
	    " -l: LPC device configuration\n"
	    " -m: memory size\n"
	    " -o: set config 'var' to 'value'\n"
#ifdef __FreeBSD__
	    " -p: pin 'vcpu' to 'hostcpu'\n"
#endif
	    " -P: vmexit from the guest on pause\n"
	    " -s: <slot,driver,configinfo> PCI slot config\n"
	    " -S: guest memory cannot be swapped\n"
	    " -u: RTC keeps UTC time\n"
	    " -U: uuid\n"
	    " -w: ignore unimplemented MSRs\n"
	    " -W: force virtio to use single-vector MSI\n"
	    " -x: local apic is in x2APIC mode\n"
	    " -Y: disable MPtable generation\n",
	    progname, (int)strlen(progname), "", (int)strlen(progname), "",
	    (int)strlen(progname), "");

	exit(code);
}
284
/*
 * Parse a CPU topology specification and record each component in the
 * configuration store via set_config_value().
 *
 * 'opt' is a comma-separated list of tokens.  A token of the form "cpus=N",
 * "sockets=N", "cores=N" or "threads=N" sets the config value of the same
 * name; a token containing no '=' is taken as the "cpus" value.  Any other
 * "key=value" token is rejected.  Values are stored as strings and are not
 * validated here.
 *
 * The acceptance of a null specification ('-c ""') is by design to match the
 * manual page syntax specification; it results in a topology of 1 vCPU.
 *
 * XXX This parser is known to have the following issue:
 *	1. It accepts null key=value tokens ",," as setting "cpus" to an
 *	   empty string.
 *
 * Returns 0 on success, or -1 when an unrecognized key is found (values set
 * by earlier tokens are left in place).  Exits via errx() if the working
 * copy of 'opt' cannot be allocated.
 */
static int
topology_parse(const char *opt)
{
	char *cp, *str, *tofree;
	int rv;

	if (*opt == '\0') {
		set_config_value("sockets", "1");
		set_config_value("cores", "1");
		set_config_value("threads", "1");
		set_config_value("cpus", "1");
		return (0);
	}

	/*
	 * strsep() advances 'str' through the buffer and leaves it NULL once
	 * the input is exhausted, so keep the original allocation in 'tofree'
	 * in order to release it correctly on every path.
	 */
	tofree = str = strdup(opt);
	if (str == NULL)
		errx(4, "Failed to allocate memory");

	rv = 0;
	while ((cp = strsep(&str, ",")) != NULL) {
		if (strncmp(cp, "cpus=", strlen("cpus=")) == 0)
			set_config_value("cpus", cp + strlen("cpus="));
		else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0)
			set_config_value("sockets", cp + strlen("sockets="));
		else if (strncmp(cp, "cores=", strlen("cores=")) == 0)
			set_config_value("cores", cp + strlen("cores="));
		else if (strncmp(cp, "threads=", strlen("threads=")) == 0)
			set_config_value("threads", cp + strlen("threads="));
#ifdef notyet  /* Do not expose this until vmm.ko implements it */
		else if (strncmp(cp, "maxcpus=", strlen("maxcpus=")) == 0)
			set_config_value("maxcpus", cp + strlen("maxcpus="));
#endif
		else if (strchr(cp, '=') != NULL) {
			/* Unknown "key=value" token. */
			rv = -1;
			break;
		} else
			set_config_value("cpus", cp);
	}

	/*
	 * Safe to release: pincpu_parse() likewise frees its buffer right
	 * after set_config_value(), so the store does not keep the caller's
	 * pointer.
	 */
	free(tofree);
	return (rv);
}
335
/*
 * Convert the configuration string 'value' to an integer in the inclusive
 * range [minval, maxval].  The base is auto-detected by strtol() (decimal,
 * 0x-prefixed hex, 0-prefixed octal).
 *
 * 'key' is used only to label the diagnostic.  If the string is empty, has
 * trailing characters, overflows a long, or falls outside the range, the
 * process exits via errx() with status 4.
 */
static int
parse_int_value(const char *key, const char *value, int minval, int maxval)
{
	char *end;
	long parsed;
	bool malformed, out_of_range;

	errno = 0;
	parsed = strtol(value, &end, 0);

	malformed = (errno != 0 || end == value || *end != '\0');
	out_of_range = (parsed < minval || parsed > maxval);
	if (malformed || out_of_range)
		errx(4, "Invalid value for %s: '%s'", key, value);

	return ((int)parsed);
}
349
350 /*
351 * Set the sockets, cores, threads, and guest_cpus variables based on
352 * the configured topology.
353 *
354 * The limits of UINT16_MAX are due to the types passed to
355 * vm_set_topology(). vmm.ko may enforce tighter limits.
356 */
357 static void
358 calc_topolopgy(void)
359 {
360 const char *value;
361 bool explicit_cpus;
362 uint64_t ncpus;
363
364 value = get_config_value("cpus");
365 if (value != NULL) {
366 guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX);
367 explicit_cpus = true;
368 } else {
369 guest_ncpus = 1;
370 explicit_cpus = false;
371 }
372 value = get_config_value("cores");
373 if (value != NULL)
374 cores = parse_int_value("cores", value, 1, UINT16_MAX);
375 else
376 cores = 1;
377 value = get_config_value("threads");
378 if (value != NULL)
379 threads = parse_int_value("threads", value, 1, UINT16_MAX);
380 else
381 threads = 1;
382 value = get_config_value("sockets");
383 if (value != NULL)
384 sockets = parse_int_value("sockets", value, 1, UINT16_MAX);
385 else
386 sockets = guest_ncpus;
387
388 /*
389 * Compute sockets * cores * threads avoiding overflow. The
390 * range check above insures these are 16 bit values.
391 */
392 ncpus = (uint64_t)sockets * cores * threads;
393 if (ncpus > UINT16_MAX)
394 errx(4, "Computed number of vCPUs too high: %ju",
395 (uintmax_t)ncpus);
396
397 if (explicit_cpus) {
398 if (guest_ncpus != ncpus)
399 errx(4, "Topology (%d sockets, %d cores, %d threads) "
400 "does not match %d vCPUs", sockets, cores, threads,
401 guest_ncpus);
402 } else
403 guest_ncpus = ncpus;
404 }
405
406 #ifndef WITHOUT_CAPSICUM
407 /*
408 * 11-stable capsicum helpers
409 */
/*
 * Open the "libc" message catalog and discard the handle.  The call is made
 * purely for its side effect; presumably this warms the catalog before the
 * process restricts itself -- TODO confirm against the caller.
 */
static void
bhyve_caph_cache_catpages(void)
{
	nl_catd catalog;

	catalog = catopen("libc", NL_CAT_LOCALE);
	(void)catalog;
}
416
417 static int
418 bhyve_caph_limit_stdoe(void)
419 {
420 cap_rights_t rights;
421 unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ };
422 int i, fds[] = { STDOUT_FILENO, STDERR_FILENO };
423
424 cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL);
425 cap_rights_set(&rights, CAP_WRITE);
426
427 for (i = 0; i < nitems(fds); i++) {
428 if (cap_rights_limit(fds[i], &rights) < 0 && errno != ENOSYS)
429 return (-1);
430
431 if (cap_ioctls_limit(fds[i], cmds, nitems(cmds)) < 0 && errno != ENOSYS)
432 return (-1);
433
434 if (cap_fcntls_limit(fds[i], CAP_FCNTL_GETFL) < 0 && errno != ENOSYS)
435 return (-1);
436 }
437
438 return (0);
439 }
440
441 #endif
442
443 #ifdef __FreeBSD__
444 static int
445 pincpu_parse(const char *opt)
446 {
447 int vcpu, pcpu;
448
449 if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
450 fprintf(stderr, "invalid format: %s\n", opt);
451 return (-1);
452 }
453
454 if (vcpu < 0 || vcpu >= VM_MAXCPU) {
455 fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
456 vcpu, VM_MAXCPU - 1);
457 return (-1);
458 }
459
460 if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
461 fprintf(stderr, "hostcpu '%d' outside valid range from "
462 "0 to %d\n", pcpu, CPU_SETSIZE - 1);
463 return (-1);
464 }
465
466 snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
467 value = get_config_value(key);
468
469 if (asprintf(&newval, "%s%s%d", value != NULL ? value : "",
470 value != NULL ? "," : "", pcpu) == -1) {
471 perror("failed to build new cpuset string");
472 return (-1);
473 }
474
475 set_config_value(key, newval);
476 free(newval);
477 return (0);
478 }
479
480 static void
481 parse_cpuset(int vcpu, const char *list, cpuset_t *set)
482 {
483 char *cp, *token;
484 int pcpu, start;
485
486 CPU_ZERO(set);
487 start = -1;
488 token = __DECONST(char *, list);
489 for (;;) {
490 pcpu = strtoul(token, &cp, 0);
491 if (cp == token)
492 errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
493 if (pcpu < 0 || pcpu >= CPU_SETSIZE)
494 errx(4, "hostcpu '%d' outside valid range from 0 to %d",
495 pcpu, CPU_SETSIZE - 1);
496 switch (*cp) {
497 case ',':
498 case '\0':
499 if (start >= 0) {
500 if (start > pcpu)
501 errx(4, "Invalid hostcpu range %d-%d",
502 start, pcpu);
503 while (start < pcpu) {
504 CPU_SET(start, vcpumap[vcpu]);
505 start++;
506 }
507 start = -1;
508 }
509 CPU_SET(pcpu, vcpumap[vcpu]);
510 break;
511 case '-':
512 if (start >= 0)
513 errx(4, "invalid cpuset for vcpu %d: '%s'",
514 vcpu, list);
515 start = pcpu;
516 break;
517 default:
518 errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
519 }
520 if (*cp == '\0')
521 break;
522 token = cp + 1;
523 }
524 }
525
526 static void
527 build_vcpumaps(void)
528 {
529 char key[16];
530 const char *value;
531 int vcpu;
532
533 for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
534 snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
535 value = get_config_value(key);
536 if (value == NULL)
537 continue;
538 vcpumap[vcpu] = malloc(sizeof(cpuset_t));
539 if (vcpumap[vcpu] == NULL)
540 err(4, "Failed to allocate cpuset for vcpu %d", vcpu);
541 parse_cpuset(vcpu, value, vcpumap[vcpu]);
542 }
543 }
544
/*
 * Inject exception 'vector' into 'vcpu' through vm_inject_exception(),
 * always asking for the faulting instruction to be restarted.
 *
 * 'arg' is the VM context (struct vmctx *) passed as an opaque pointer.
 * 'errcode' is forwarded together with 'errcode_valid'.  Injection failure
 * trips an assertion.
 */
void
vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
    int errcode)
{
	struct vmctx *vm = arg;
	const int restart = 1;
	int rc;

	rc = vm_inject_exception(vm, vcpu, vector, errcode_valid, errcode,
	    restart);
	assert(rc == 0);
}
559 #endif /* __FreeBSD__ */
560
/*
 * Map the guest-physical range [gaddr, gaddr + len) to a host pointer by
 * delegating to vm_map_gpa(); the result of that call is returned unchanged.
 */
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
	void *hostaddr;

	hostaddr = vm_map_gpa(ctx, gaddr, len);
	return (hostaddr);
}
567
/*
 * Return the "virtio_msix" configuration flag, which defaults to true when
 * it has not been set.
 */
int
fbsdrun_virtio_msix(void)
{
	int use_msix;

	use_msix = get_config_bool_default("virtio_msix", true);
	return (use_msix);
}
574
575 static void *
576 fbsdrun_start_thread(void *param)
577 {
578 char tname[MAXCOMLEN + 1];
579 struct mt_vmm_info *mtp;
580 int vcpu;
581
582 mtp = param;
583 vcpu = mtp->mt_vcpu;
584
585 snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
586 pthread_set_name_np(mtp->mt_thr, tname);
587
588 gdb_cpu_add(vcpu);
589
590 vm_loop(mtp->mt_ctx, vcpu, mtp->mt_startrip);
591
592 /* not reached */
593 exit(1);
594 return (NULL);
595 }
596
597 #ifdef __FreeBSD__
598 void
599 fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
600 #else
601 void
602 fbsdrun_addcpu(struct vmctx *ctx, int newcpu, uint64_t rip, bool suspend)
603 #endif
604 {
605 int error;
606
607 #ifdef __FreeBSD__
608 assert(fromcpu == BSP);
609 #endif
610
611 /*
612 * The 'newcpu' must be activated in the context of 'fromcpu'. If
613 * vm_activate_cpu() is delayed until newcpu's pthread starts running
614 * then vmm.ko is out-of-sync with bhyve and this can create a race
615 * with vm_suspend().
616 */
617 error = vm_activate_cpu(ctx, newcpu);
618 if (error != 0)
619 err(EX_OSERR, "could not activate CPU %d", newcpu);
620
621 CPU_SET_ATOMIC(newcpu, &cpumask);
622
623 #ifndef __FreeBSD__
624 if (suspend)
625 (void) vm_suspend_cpu(ctx, newcpu);
626 #endif
627
628 /*
629 * Set up the vmexit struct to allow execution to start
630 * at the given RIP
631 */
632 mt_vmm_info[newcpu].mt_ctx = ctx;
633 mt_vmm_info[newcpu].mt_vcpu = newcpu;
634 mt_vmm_info[newcpu].mt_startrip = rip;
635
636 error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
637 fbsdrun_start_thread, &mt_vmm_info[newcpu]);
638 assert(error == 0);
639 }
640
641 static int
642 fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
643 {
644
645 if (!CPU_ISSET(vcpu, &cpumask)) {
646 fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
647 exit(4);
648 }
649
650 CPU_CLR_ATOMIC(vcpu, &cpumask);
651 return (CPU_EMPTY(&cpumask));
652 }
653
654 static void
655 vmentry_mmio_read(int vcpu, uint64_t gpa, uint8_t bytes, uint64_t data)
656 {
657 struct vm_entry *entry = &vmentry[vcpu];
658 struct vm_mmio *mmio = &entry->u.mmio;
659
660 assert(entry->cmd == VEC_DEFAULT);
661
662 entry->cmd = VEC_FULFILL_MMIO;
663 mmio->bytes = bytes;
664 mmio->read = 1;
665 mmio->gpa = gpa;
666 mmio->data = data;
667 }
668
669 static void
670 vmentry_mmio_write(int vcpu, uint64_t gpa, uint8_t bytes)
671 {
672 struct vm_entry *entry = &vmentry[vcpu];
673 struct vm_mmio *mmio = &entry->u.mmio;
674
675 assert(entry->cmd == VEC_DEFAULT);
676
677 entry->cmd = VEC_FULFILL_MMIO;
678 mmio->bytes = bytes;
679 mmio->read = 0;
680 mmio->gpa = gpa;
681 mmio->data = 0;
682 }
683
684 static void
685 vmentry_inout_read(int vcpu, uint16_t port, uint8_t bytes, uint32_t data)
686 {
687 struct vm_entry *entry = &vmentry[vcpu];
688 struct vm_inout *inout = &entry->u.inout;
689
690 assert(entry->cmd == VEC_DEFAULT);
691
692 entry->cmd = VEC_FULFILL_INOUT;
693 inout->bytes = bytes;
694 inout->flags = INOUT_IN;
695 inout->port = port;
696 inout->eax = data;
697 }
698
699 static void
700 vmentry_inout_write(int vcpu, uint16_t port, uint8_t bytes)
701 {
702 struct vm_entry *entry = &vmentry[vcpu];
703 struct vm_inout *inout = &entry->u.inout;
704
705 assert(entry->cmd == VEC_DEFAULT);
706
707 entry->cmd = VEC_FULFILL_INOUT;
708 inout->bytes = bytes;
709 inout->flags = 0;
710 inout->port = port;
711 inout->eax = 0;
712 }
713
714 static int
715 vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
716 uint32_t eax)
717 {
718 #if BHYVE_DEBUG
719 /*
720 * put guest-driven debug here
721 */
722 #endif
723 return (VMEXIT_CONTINUE);
724 }
725
726 static int
727 vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
728 {
729 int error;
730 int vcpu;
731 struct vm_inout inout;
732 bool in;
733 uint8_t bytes;
734
735 stats.vmexit_inout++;
736
737 vcpu = *pvcpu;
738 inout = vme->u.inout;
739 in = (inout.flags & INOUT_IN) != 0;
740 bytes = inout.bytes;
741
742 /* Extra-special case of host notifications */
743 if (!in && inout.port == GUEST_NIO_PORT) {
744 error = vmexit_handle_notify(ctx, vme, pvcpu, inout.eax);
745 vmentry_inout_write(vcpu, inout.port, bytes);
746 return (error);
747 }
748
749 error = emulate_inout(ctx, vcpu, &inout);
750 if (error) {
751 fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
752 in ? "in" : "out",
753 bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
754 inout.port, vmexit->rip);
755 return (VMEXIT_ABORT);
756 } else {
757 /*
758 * Communicate the status of the inout operation back to the
759 * in-kernel instruction emulation.
760 */
761 if (in) {
762 vmentry_inout_read(vcpu, inout.port, bytes, inout.eax);
763 } else {
764 vmentry_inout_write(vcpu, inout.port, bytes);
765 }
766 return (VMEXIT_CONTINUE);
767 }
768 }
769
770 static int
771 vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
772 {
773 uint64_t val;
774 uint32_t eax, edx;
775 int error;
776
777 val = 0;
778 error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
779 if (error != 0) {
780 fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
781 vme->u.msr.code, *pvcpu);
782 if (get_config_bool("x86.strictmsr")) {
783 vm_inject_gp(ctx, *pvcpu);
784 return (VMEXIT_CONTINUE);
785 }
786 }
787
788 eax = val;
789 error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
790 assert(error == 0);
791
792 edx = val >> 32;
793 error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
794 assert(error == 0);
795
796 return (VMEXIT_CONTINUE);
797 }
798
799 static int
800 vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
801 {
802 int error;
803
804 error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
805 if (error != 0) {
806 fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
807 vme->u.msr.code, vme->u.msr.wval, *pvcpu);
808 if (get_config_bool("x86.strictmsr")) {
809 vm_inject_gp(ctx, *pvcpu);
810 return (VMEXIT_CONTINUE);
811 }
812 }
813 return (VMEXIT_CONTINUE);
814 }
815
816 #ifdef __FreeBSD__
817 static int
818 vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
819 {
820
821 (void)spinup_ap(ctx, *pvcpu,
822 vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
823
824 return (VMEXIT_CONTINUE);
825 }
826 #else
827 static int
828 vmexit_run_state(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
829 {
830 /*
831 * Run-state transitions (INIT, SIPI, etc) are handled in-kernel, so an
832 * exit to userspace with that code is not expected.
833 */
834 fprintf(stderr, "unexpected run-state VM exit");
835 return (VMEXIT_ABORT);
836 }
837 #endif /* __FreeBSD__ */
838
/*
 * Extra diagnostics for EPT misconfiguration exits; enabled on FreeBSD
 * builds only.
 */
#ifdef __FreeBSD__
#define DEBUG_EPT_MISCONFIG
#else
/* EPT misconfig debugging not possible now that raw VMCS access is gone */
#endif

#ifdef DEBUG_EPT_MISCONFIG
/* VMCS field encoding; vmexit_vmx() passes it through VMCS_IDENT(). */
#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400

/*
 * Scratch storage filled in by vmexit_vmx() when it reports an EPT
 * misconfiguration: the guest-physical address, the PTE values returned by
 * vm_get_gpa_pmap(), and the count it reported.
 */
static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
static int ept_misconfig_ptenum;
#endif
851
852 static const char *
853 vmexit_vmx_desc(uint32_t exit_reason)
854 {
855
856 if (exit_reason >= nitems(vmx_exit_reason_desc) ||
857 vmx_exit_reason_desc[exit_reason] == NULL)
858 return ("Unknown");
859 return (vmx_exit_reason_desc[exit_reason]);
860 }
861
862 static int
863 vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
864 {
865
866 fprintf(stderr, "vm exit[%d]\n", *pvcpu);
867 fprintf(stderr, "\treason\t\tVMX\n");
868 fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
869 fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
870 fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
871 fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason,
872 vmexit_vmx_desc(vmexit->u.vmx.exit_reason));
873 fprintf(stderr, "\tqualification\t0x%016lx\n",
874 vmexit->u.vmx.exit_qualification);
875 fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
876 fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
877 #ifdef DEBUG_EPT_MISCONFIG
878 if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
879 vm_get_register(ctx, *pvcpu,
880 VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
881 &ept_misconfig_gpa);
882 vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
883 &ept_misconfig_ptenum);
884 fprintf(stderr, "\tEPT misconfiguration:\n");
885 fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
886 fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
887 ept_misconfig_ptenum, ept_misconfig_pte[0],
888 ept_misconfig_pte[1], ept_misconfig_pte[2],
889 ept_misconfig_pte[3]);
890 }
891 #endif /* DEBUG_EPT_MISCONFIG */
892 return (VMEXIT_ABORT);
893 }
894
895 static int
896 vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
897 {
898
899 fprintf(stderr, "vm exit[%d]\n", *pvcpu);
900 fprintf(stderr, "\treason\t\tSVM\n");
901 fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
902 fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
903 fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
904 fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
905 fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
906 return (VMEXIT_ABORT);
907 }
908
909 static int
910 vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
911 {
912
913 assert(vmexit->inst_length == 0);
914
915 stats.vmexit_bogus++;
916
917 return (VMEXIT_CONTINUE);
918 }
919
920 static int
921 vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
922 {
923
924 assert(vmexit->inst_length == 0);
925
926 stats.vmexit_reqidle++;
927
928 return (VMEXIT_CONTINUE);
929 }
930
931 static int
932 vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
933 {
934
935 stats.vmexit_hlt++;
936
937 /*
938 * Just continue execution with the next instruction. We use
939 * the HLT VM exit as a way to be friendly with the host
940 * scheduler.
941 */
942 return (VMEXIT_CONTINUE);
943 }
944
945 static int
946 vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
947 {
948
949 stats.vmexit_pause++;
950
951 return (VMEXIT_CONTINUE);
952 }
953
954 static int
955 vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
956 {
957
958 assert(vmexit->inst_length == 0);
959
960 stats.vmexit_mtrap++;
961
962 gdb_cpu_mtrap(*pvcpu);
963
964 return (VMEXIT_CONTINUE);
965 }
966
967 static int
968 vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
969 {
970 uint8_t i, valid;
971
972 fprintf(stderr, "Failed to emulate instruction sequence ");
973
974 valid = vmexit->u.inst_emul.num_valid;
975 if (valid != 0) {
976 assert(valid <= sizeof (vmexit->u.inst_emul.inst));
977 fprintf(stderr, "[");
978 for (i = 0; i < valid; i++) {
979 if (i == 0) {
980 fprintf(stderr, "%02x",
981 vmexit->u.inst_emul.inst[i]);
982 } else {
983 fprintf(stderr, ", %02x",
984 vmexit->u.inst_emul.inst[i]);
985 }
986 }
987 fprintf(stderr, "] ");
988 }
989 fprintf(stderr, "@ %rip = %x\n", vmexit->rip);
990
991 return (VMEXIT_ABORT);
992 }
993
994 static int
995 vmexit_mmio(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
996 {
997 int vcpu, err;
998 struct vm_mmio mmio;
999 bool is_read;
1000
1001 stats.vmexit_mmio++;
1002
1003 vcpu = *pvcpu;
1004 mmio = vmexit->u.mmio;
1005 is_read = (mmio.read != 0);
1006
1007 err = emulate_mem(ctx, vcpu, &mmio);
1008
1009 if (err == ESRCH) {
1010 fprintf(stderr, "Unhandled memory access to 0x%lx\n", mmio.gpa);
1011 stats.mmio_unhandled++;
1012
1013 /*
1014 * Access to non-existent physical addresses is not likely to
1015 * result in fatal errors on hardware machines, but rather reads
1016 * of all-ones or discarded-but-acknowledged writes.
1017 */
1018 mmio.data = ~0UL;
1019 err = 0;
1020 }
1021
1022 if (err == 0) {
1023 if (is_read) {
1024 vmentry_mmio_read(vcpu, mmio.gpa, mmio.bytes,
1025 mmio.data);
1026 } else {
1027 vmentry_mmio_write(vcpu, mmio.gpa, mmio.bytes);
1028 }
1029 return (VMEXIT_CONTINUE);
1030 }
1031
1032 fprintf(stderr, "Unhandled mmio error to 0x%lx: %d\n", mmio.gpa, err);
1033 return (VMEXIT_ABORT);
1034 }
1035
1036 static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
1037 static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
1038
1039 static int
1040 vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
1041 {
1042 enum vm_suspend_how how;
1043
1044 how = vmexit->u.suspended.how;
1045
1046 fbsdrun_deletecpu(ctx, *pvcpu);
1047
1048 if (*pvcpu != BSP) {
1049 pthread_mutex_lock(&resetcpu_mtx);
1050 pthread_cond_signal(&resetcpu_cond);
1051 pthread_mutex_unlock(&resetcpu_mtx);
1052 pthread_exit(NULL);
1053 }
1054
1055 pthread_mutex_lock(&resetcpu_mtx);
1056 while (!CPU_EMPTY(&cpumask)) {
1057 pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
1058 }
1059 pthread_mutex_unlock(&resetcpu_mtx);
1060
1061 switch (how) {
1062 case VM_SUSPEND_RESET:
1063 exit(0);
1064 case VM_SUSPEND_POWEROFF:
1065 if (get_config_bool_default("destroy_on_poweroff", false))
1066 vm_destroy(ctx);
1067 exit(1);
1068 case VM_SUSPEND_HALT:
1069 exit(2);
1070 case VM_SUSPEND_TRIPLEFAULT:
1071 exit(3);
1072 default:
1073 fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
1074 exit(100);
1075 }
1076 return (0); /* NOTREACHED */
1077 }
1078
1079 static int
1080 vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
1081 {
1082
1083 gdb_cpu_suspend(*pvcpu);
1084 return (VMEXIT_CONTINUE);
1085 }
1086
1087 static int
1088 vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
1089 {
1090
1091 gdb_cpu_breakpoint(*pvcpu, vmexit);
1092 return (VMEXIT_CONTINUE);
1093 }
1094
/*
 * Dispatch table used by vm_loop(), indexed by VM_EXITCODE_* value.  A NULL
 * slot means the exit code is unexpected and makes vm_loop() exit.  The
 * table is not const: fbsdrun_set_capabilities() installs the HLT and PAUSE
 * handlers at run time when the corresponding options are enabled.
 */
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
	[VM_EXITCODE_INOUT] = vmexit_inout,
	[VM_EXITCODE_MMIO] = vmexit_mmio,
	[VM_EXITCODE_VMX] = vmexit_vmx,
	[VM_EXITCODE_SVM] = vmexit_svm,
	[VM_EXITCODE_BOGUS] = vmexit_bogus,
	[VM_EXITCODE_REQIDLE] = vmexit_reqidle,
	[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
	[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
	[VM_EXITCODE_MTRAP] = vmexit_mtrap,
	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
#ifdef __FreeBSD__
	[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
#else
	[VM_EXITCODE_RUN_STATE] = vmexit_run_state,
#endif
	[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
	[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
	[VM_EXITCODE_DEBUG] = vmexit_debug,
	[VM_EXITCODE_BPT] = vmexit_breakpoint,
};
1116
1117 static void
1118 vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
1119 {
1120 int error, rc;
1121 enum vm_exitcode exitcode;
1122 cpuset_t active_cpus;
1123 struct vm_exit *vexit;
1124 struct vm_entry *ventry;
1125
1126 #ifdef __FreeBSD__
1127 if (vcpumap[vcpu] != NULL) {
1128 error = pthread_setaffinity_np(pthread_self(),
1129 sizeof(cpuset_t), vcpumap[vcpu]);
1130 assert(error == 0);
1131 }
1132 #endif
1133 error = vm_active_cpus(ctx, &active_cpus);
1134 assert(CPU_ISSET(vcpu, &active_cpus));
1135
1136 error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
1137 assert(error == 0);
1138
1139 ventry = &vmentry[vcpu];
1140 vexit = &vmexit[vcpu];
1141
1142 while (1) {
1143 error = vm_run(ctx, vcpu, ventry, vexit);
1144 if (error != 0)
1145 break;
1146
1147 if (ventry->cmd != VEC_DEFAULT) {
1148 /*
1149 * Discard any lingering entry state after it has been
1150 * submitted via vm_run().
1151 */
1152 bzero(ventry, sizeof (*ventry));
1153 }
1154
1155 exitcode = vexit->exitcode;
1156 if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
1157 fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
1158 exitcode);
1159 exit(4);
1160 }
1161
1162 rc = (*handler[exitcode])(ctx, vexit, &vcpu);
1163
1164 switch (rc) {
1165 case VMEXIT_CONTINUE:
1166 break;
1167 case VMEXIT_ABORT:
1168 abort();
1169 default:
1170 exit(4);
1171 }
1172 }
1173 fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
1174 }
1175
1176 static int
1177 num_vcpus_allowed(struct vmctx *ctx)
1178 {
1179 #ifdef __FreeBSD__
1180 int tmp, error;
1181
1182 error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
1183
1184 /*
1185 * The guest is allowed to spinup more than one processor only if the
1186 * UNRESTRICTED_GUEST capability is available.
1187 */
1188 if (error == 0)
1189 return (VM_MAXCPU);
1190 else
1191 return (1);
1192 #else
1193 /* Unrestricted Guest is always enabled on illumos */
1194 return (VM_MAXCPU);
1195 #endif /* __FreeBSD__ */
1196 }
1197
1198 void
1199 fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
1200 {
1201 int err, tmp;
1202
1203 if (get_config_bool_default("x86.vmexit_on_hlt", false)) {
1204 err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp);
1205 if (err < 0) {
1206 fprintf(stderr, "VM exit on HLT not supported\n");
1207 exit(4);
1208 }
1209 vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
1210 if (cpu == BSP)
1211 handler[VM_EXITCODE_HLT] = vmexit_hlt;
1212 }
1213
1214 if (get_config_bool_default("x86.vmexit_on_pause", false)) {
1215 /*
1216 * pause exit support required for this mode
1217 */
1218 err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp);
1219 if (err < 0) {
1220 fprintf(stderr,
1221 "SMP mux requested, no pause support\n");
1222 exit(4);
1223 }
1224 vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
1225 if (cpu == BSP)
1226 handler[VM_EXITCODE_PAUSE] = vmexit_pause;
1227 }
1228
1229 if (get_config_bool_default("x86.x2apic", false))
1230 err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED);
1231 else
1232 err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED);
1233
1234 if (err) {
1235 fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
1236 exit(4);
1237 }
1238
1239 #ifdef __FreeBSD__
1240 vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
1241 #endif
1242 }
1243
1244 static struct vmctx *
1245 do_open(const char *vmname)
1246 {
1247 struct vmctx *ctx;
1248 int error;
1249 bool reinit, romboot;
1250 #ifndef WITHOUT_CAPSICUM
1251 cap_rights_t rights;
1252 const cap_ioctl_t *cmds;
1253 size_t ncmds;
1254 #endif
1255
1256 reinit = romboot = false;
1257
1258 if (lpc_bootrom())
1259 romboot = true;
1260
1261 error = vm_create(vmname);
1262 if (error) {
1263 if (errno == EEXIST) {
1264 if (romboot) {
1265 reinit = true;
1266 } else {
1267 /*
1268 * The virtual machine has been setup by the
1269 * userspace bootloader.
1270 */
1271 }
1272 } else {
1273 perror("vm_create");
1274 exit(4);
1275 }
1276 } else {
1277 if (!romboot) {
1278 /*
1279 * If the virtual machine was just created then a
1280 * bootrom must be configured to boot it.
1281 */
1282 fprintf(stderr, "virtual machine cannot be booted\n");
1283 exit(4);
1284 }
1285 }
1286
1287 ctx = vm_open(vmname);
1288 if (ctx == NULL) {
1289 perror("vm_open");
1290 exit(4);
1291 }
1292
1293 #ifndef WITHOUT_CAPSICUM
1294 cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
1295 if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1)
1296 errx(EX_OSERR, "Unable to apply rights for sandbox");
1297 vm_get_ioctls(&ncmds);
1298 cmds = vm_get_ioctls(NULL);
1299 if (cmds == NULL)
1300 errx(EX_OSERR, "out of memory");
1301 if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1)
1302 errx(EX_OSERR, "Unable to apply rights for sandbox");
1303 free((cap_ioctl_t *)cmds);
1304 #endif
1305
1306 if (reinit) {
1307 error = vm_reinit(ctx);
1308 if (error) {
1309 perror("vm_reinit");
1310 exit(4);
1311 }
1312 }
1313 error = vm_set_topology(ctx, sockets, cores, threads, maxcpus);
1314 if (error)
1315 errx(EX_OSERR, "vm_set_topology");
1316 return (ctx);
1317 }
1318
#ifndef	__FreeBSD__

#define	FILE_PROVISIONING	"/var/svc/provisioning"
#define	FILE_PROVISION_SUCCESS	"/var/svc/provision_success"

/*
 * If FILE_PROVISIONING exists, rename it to FILE_PROVISION_SUCCESS.
 *
 * Nothing happens when the file is absent (lstat() fails).  A failed
 * rename is reported on stderr but is otherwise not treated as an error.
 */
static void
mark_provisioned(void)
{
	struct stat sb;

	if (lstat(FILE_PROVISIONING, &sb) == 0 &&
	    rename(FILE_PROVISIONING, FILE_PROVISION_SUCCESS) != 0) {
		(void) fprintf(stderr, "Cannot rename %s to %s: %s\n",
		    FILE_PROVISIONING, FILE_PROVISION_SUCCESS,
		    strerror(errno));
	}
}

#endif
1340
/*
 * Parse a single "path=value" option and store it in the configuration.
 *
 * Returns false, storing nothing, when 'option' contains no '=' or when
 * nothing follows the first '='.  Otherwise the text before the first '='
 * is used as the path, the text after it as the value, and true is
 * returned.  The process exits via err() if memory for the path cannot be
 * allocated.
 */
static bool
parse_config_option(const char *option)
{
	const char *value;
	char *path;

	value = strchr(option, '=');
	if (value == NULL || value[1] == '\0')
		return (false);

	/* A NUL-terminated copy of the key is needed for the lookup. */
	path = strndup(option, value - option);
	if (path == NULL)
		err(4, "Failed to allocate memory");
	set_config_value(path, value + 1);

	/*
	 * The temporary copy is only needed for the duration of the call;
	 * release it instead of leaking one allocation per option.
	 * NOTE(review): relies on set_config_value() copying what it stores,
	 * as it must already do for the value — confirm against config.c.
	 */
	free(path);
	return (true);
}
1356
/*
 * Load configuration settings from the text file at 'path'.
 *
 * Each line is handed to parse_config_option() after its newline has been
 * removed.  Lines whose first character is '#' or a newline are ignored.
 * The process exits with status 4 if the file cannot be opened or if a
 * line is rejected; the message for a rejected line includes its 1-based
 * line number.
 */
static void
parse_simple_config_file(const char *path)
{
	FILE *cfg;
	char *buf, *nl;
	size_t bufcap;
	unsigned int lineno;

	cfg = fopen(path, "r");
	if (cfg == NULL)
		err(4, "Failed to open configuration file %s", path);

	buf = NULL;
	bufcap = 0;
	lineno = 0;
	while (getline(&buf, &bufcap, cfg) > 0) {
		/* Skipped lines still count toward the line number. */
		lineno++;

		if (*buf == '#' || *buf == '\n')
			continue;

		nl = strchr(buf, '\n');
		if (nl != NULL)
			*nl = '\0';

		if (!parse_config_option(buf))
			errx(4, "%s line %u: invalid config option '%s'", path,
			    lineno, buf);
	}

	free(buf);
	fclose(cfg);
}
1384
/*
 * Seed the configuration with built-in defaults.  main() calls this before
 * it processes the command line, so options and config files may replace
 * any of these values.
 */
static void
set_defaults(void)
{
	/* ACPI tables are off unless requested. */
	set_config_bool("acpi_tables", false);
	/* Default guest memory size. */
	set_config_value("memory.size", "256M");
	/* Strict MSR handling is on unless turned off. */
	set_config_bool("x86.strictmsr", true);
}
1393
1394 int
1395 main(int argc, char *argv[])
1396 {
1397 int c, error, err;
1398 int max_vcpus, memflags;
1399 struct vmctx *ctx;
1400 uint64_t rip;
1401 size_t memsize;
1402 const char *value, *vmname;
1403 char *optstr;
1404
1405 init_config();
1406 set_defaults();
1407 progname = basename(argv[0]);
1408
1409 #ifdef __FreeBSD__
1410 optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:";
1411 #else
1412 /* +d, +B, -p */
1413 optstr = "adehuwxACDHIPSWYk:o:G:c:s:m:l:B:U:";
1414 #endif
1415 while ((c = getopt(argc, argv, optstr)) != -1) {
1416 switch (c) {
1417 case 'a':
1418 set_config_bool("x86.x2apic", false);
1419 break;
1420 case 'A':
1421 set_config_bool("acpi_tables", true);
1422 break;
1423 case 'D':
1424 set_config_bool("destroy_on_poweroff", true);
1425 break;
1426 #ifndef __FreeBSD__
1427 case 'B':
1428 if (smbios_parse(optarg) != 0) {
1429 errx(EX_USAGE, "invalid SMBIOS "
1430 "configuration '%s'", optarg);
1431 }
1432 break;
1433 case 'd':
1434 set_config_bool("suspend_at_boot", true);
1435 break;
1436 #endif
1437 #ifdef __FreeBSD__
1438 case 'p':
1439 if (pincpu_parse(optarg) != 0) {
1440 errx(EX_USAGE, "invalid vcpu pinning "
1441 "configuration '%s'", optarg);
1442 }
1443 break;
1444 #endif
1445 case 'c':
1446 if (topology_parse(optarg) != 0) {
1447 errx(EX_USAGE, "invalid cpu topology "
1448 "'%s'", optarg);
1449 }
1450 break;
1451 case 'C':
1452 set_config_bool("memory.guest_in_core", true);
1453 break;
1454 case 'G':
1455 if (optarg[0] == 'w') {
1456 set_config_bool("gdb.wait", true);
1457 optarg++;
1458 }
1459 set_config_value("gdb.port", optarg);
1460 break;
1461 case 'k':
1462 parse_simple_config_file(optarg);
1463 break;
1464 case 'l':
1465 if (strncmp(optarg, "help", strlen(optarg)) == 0) {
1466 lpc_print_supported_devices();
1467 exit(0);
1468 } else if (lpc_device_parse(optarg) != 0) {
1469 errx(EX_USAGE, "invalid lpc device "
1470 "configuration '%s'", optarg);
1471 }
1472 break;
1473 case 's':
1474 if (strncmp(optarg, "help", strlen(optarg)) == 0) {
1475 pci_print_supported_devices();
1476 exit(0);
1477 } else if (pci_parse_slot(optarg) != 0)
1478 exit(4);
1479 else
1480 break;
1481 case 'S':
1482 set_config_bool("memory.wired", true);
1483 break;
1484 case 'm':
1485 set_config_value("memory.size", optarg);
1486 break;
1487 case 'o':
1488 if (!parse_config_option(optarg))
1489 errx(EX_USAGE, "invalid configuration option '%s'", optarg);
1490 break;
1491 case 'H':
1492 set_config_bool("x86.vmexit_on_hlt", true);
1493 break;
1494 case 'I':
1495 /*
1496 * The "-I" option was used to add an ioapic to the
1497 * virtual machine.
1498 *
1499 * An ioapic is now provided unconditionally for each
1500 * virtual machine and this option is now deprecated.
1501 */
1502 break;
1503 case 'P':
1504 set_config_bool("x86.vmexit_on_pause", true);
1505 break;
1506 case 'e':
1507 set_config_bool("x86.strictio", true);
1508 break;
1509 case 'u':
1510 set_config_bool("rtc.use_localtime", false);
1511 break;
1512 case 'U':
1513 set_config_value("uuid", optarg);
1514 break;
1515 case 'w':
1516 set_config_bool("x86.strictmsr", false);
1517 break;
1518 case 'W':
1519 set_config_bool("virtio_msix", false);
1520 break;
1521 case 'x':
1522 set_config_bool("x86.x2apic", true);
1523 break;
1524 case 'Y':
1525 set_config_bool("x86.mptable", false);
1526 break;
1527 case 'h':
1528 usage(0);
1529 default:
1530 usage(1);
1531 }
1532 }
1533 argc -= optind;
1534 argv += optind;
1535
1536 if (argc > 1)
1537 usage(1);
1538
1539 if (argc == 1)
1540 set_config_value("name", argv[0]);
1541
1542 vmname = get_config_value("name");
1543 if (vmname == NULL)
1544 usage(1);
1545
1546 if (get_config_bool_default("config.dump", false)) {
1547 dump_config();
1548 exit(1);
1549 }
1550
1551 #ifndef __FreeBSD__
1552 illumos_priv_init();
1553 #endif
1554
1555 calc_topolopgy();
1556 #ifdef __FreeBSD__
1557 build_vcpumaps();
1558 #endif
1559
1560 value = get_config_value("memory.size");
1561 error = vm_parse_memsize(value, &memsize);
1562 if (error)
1563 errx(EX_USAGE, "invalid memsize '%s'", value);
1564
1565 ctx = do_open(vmname);
1566
1567 max_vcpus = num_vcpus_allowed(ctx);
1568 if (guest_ncpus > max_vcpus) {
1569 fprintf(stderr, "%d vCPUs requested but only %d available\n",
1570 guest_ncpus, max_vcpus);
1571 exit(4);
1572 }
1573
1574 fbsdrun_set_capabilities(ctx, BSP);
1575
1576 memflags = 0;
1577 if (get_config_bool_default("memory.wired", false))
1578 memflags |= VM_MEM_F_WIRED;
1579 if (get_config_bool_default("memory.guest_in_core", false))
1580 memflags |= VM_MEM_F_INCORE;
1581 vm_set_memflags(ctx, memflags);
1582 #ifdef __FreeBSD__
1583 err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
1584 #else
1585 err = vm_arc_resv(ctx, memsize);
1586 if (err != 0) {
1587 (void) fprintf(stderr, "Could not shrink ARC: %s\n",
1588 strerror(err));
1589 exit(4);
1590 }
1591
1592 do {
1593 errno = 0;
1594 err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
1595 error = errno;
1596 if (err != 0 && error == ENOMEM) {
1597 (void) fprintf(stderr, "Unable to allocate memory "
1598 "(%llu), retrying in 1 second\n", memsize);
1599 sleep(1);
1600 }
1601 } while (error == ENOMEM);
1602 #endif
1603 if (err) {
1604 fprintf(stderr, "Unable to set up memory (%d)\n", errno);
1605 exit(4);
1606 }
1607
1608 error = init_msr();
1609 if (error) {
1610 fprintf(stderr, "init_msr error %d", error);
1611 exit(4);
1612 }
1613
1614 init_mem();
1615 init_inout();
1616 #ifdef __FreeBSD__
1617 kernemu_dev_init();
1618 #endif
1619 init_bootrom(ctx);
1620 atkbdc_init(ctx);
1621 pci_irq_init(ctx);
1622 ioapic_init(ctx);
1623
1624 rtc_init(ctx);
1625 sci_init(ctx);
1626 #ifndef __FreeBSD__
1627 pmtmr_init(ctx);
1628 #endif
1629
1630 /*
1631 * Exit if a device emulation finds an error in its initilization
1632 */
1633 if (init_pci(ctx) != 0) {
1634 perror("device emulation initialization error");
1635 exit(4);
1636 }
1637
1638 /*
1639 * Initialize after PCI, to allow a bootrom file to reserve the high
1640 * region.
1641 */
1642 if (get_config_bool("acpi_tables"))
1643 vmgenc_init(ctx);
1644
1645 value = get_config_value("gdb.port");
1646 #ifdef __FreeBSD__
1647 if (value != NULL)
1648 init_gdb(ctx, atoi(value), get_config_bool_default("gdb.wait",
1649 false));
1650 #else
1651 if (value != NULL) {
1652 int port = atoi(value);
1653
1654 if (port < 0) {
1655 init_mdb(ctx,
1656 get_config_bool_default("gdb.wait", false));
1657 } else {
1658 init_gdb(ctx, port,
1659 get_config_bool_default("gdb.wait", false));
1660 }
1661 }
1662 #endif
1663
1664 vga_init(1);
1665
1666 if (lpc_bootrom()) {
1667 #ifdef __FreeBSD__
1668 if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
1669 fprintf(stderr, "ROM boot failed: unrestricted guest "
1670 "capability not available\n");
1671 exit(4);
1672 }
1673 #else
1674 /* Unrestricted Guest is always enabled on illumos */
1675 #endif
1676 error = vcpu_reset(ctx, BSP);
1677 assert(error == 0);
1678 }
1679
1680 error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
1681 assert(error == 0);
1682
1683 /*
1684 * build the guest tables, MP etc.
1685 */
1686 if (get_config_bool_default("x86.mptable", true)) {
1687 error = mptable_build(ctx, guest_ncpus);
1688 if (error) {
1689 perror("error to build the guest tables");
1690 exit(4);
1691 }
1692 }
1693
1694 #ifndef __FreeBSD__
1695 smbios_apply();
1696 #endif
1697 error = smbios_build(ctx);
1698 assert(error == 0);
1699
1700 if (get_config_bool("acpi_tables")) {
1701 error = acpi_build(ctx, guest_ncpus);
1702 assert(error == 0);
1703 }
1704
1705 if (lpc_bootrom())
1706 fwctl_init();
1707
1708 /*
1709 * Change the proc title to include the VM name.
1710 */
1711 setproctitle("%s", vmname);
1712
1713 #ifndef WITHOUT_CAPSICUM
1714 caph_cache_catpages();
1715
1716 if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
1717 errx(EX_OSERR, "Unable to apply rights for sandbox");
1718
1719 if (caph_enter() == -1)
1720 errx(EX_OSERR, "cap_enter() failed");
1721 #endif
1722
1723 /* XXX SmartOS: Upstream drops privs here, but we can't yet. See below... */
1724
1725 #ifdef __FreeBSD__
1726 /*
1727 * Add CPU 0
1728 */
1729 fbsdrun_addcpu(ctx, BSP, BSP, rip);
1730 #else
1731 /* Set BSP to run (unlike the APs which wait for INIT) */
1732 error = vm_set_run_state(ctx, BSP, VRS_RUN, 0);
1733 assert(error == 0);
1734 fbsdrun_addcpu(ctx, BSP, rip,
1735 get_config_bool_default("suspend_at_boot", false));
1736
1737 /* Add subsequent CPUs, which will wait until INIT/SIPI-ed */
1738 for (uint_t i = 1; i < guest_ncpus; i++) {
1739 spinup_halted_ap(ctx, i);
1740 }
1741 mark_provisioned();
1742 /*
1743 * XXX SmartOS: The mark_provisioned() call above required file-access
1744 * privileges that are dropped by the generic call. We must widen the
1745 * full-privilege window a bit. A better solution might be to have
1746 * a way to keep file-access a bit longer, and only have THAT privilege
1747 * to drop here.
1748 */
1749 illumos_priv_lock();
1750 #endif
1751
1752 /*
1753 * Head off to the main event dispatch loop
1754 */
1755 mevent_dispatch();
1756
1757 exit(4);
1758 }