1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2011 NetApp, Inc.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  *
  28  * $FreeBSD$
  29  */
  30 /*
  31  * This file and its contents are supplied under the terms of the
  32  * Common Development and Distribution License ("CDDL"), version 1.0.
  33  * You may only use this file in accordance with the terms of version
  34  * 1.0 of the CDDL.
  35  *
  36  * A full copy of the text of the CDDL should have accompanied this
  37  * source.  A copy of the CDDL is also available via the Internet at
  38  * http://www.illumos.org/license/CDDL.
  39  *
  40  * Copyright 2015 Pluribus Networks Inc.
  41  * Copyright 2018 Joyent, Inc.
  42  * Copyright 2020 Oxide Computer Company
  43  */
  44 
  45 #include <sys/cdefs.h>
  46 __FBSDID("$FreeBSD$");
  47 
  48 #include <sys/types.h>
  49 #ifndef WITHOUT_CAPSICUM
  50 #include <sys/capsicum.h>
  51 #endif
  52 #include <sys/mman.h>
  53 #include <sys/time.h>
  54 #include <sys/cpuset.h>
  55 
  56 #ifdef __FreeBSD__
  57 #include <amd64/vmm/intel/vmcs.h>
  58 #else
  59 #include <intel/vmcs.h>
  60 #endif
  61 
  62 #include <machine/atomic.h>
  63 #include <machine/segments.h>
  64 
  65 #ifndef WITHOUT_CAPSICUM
  66 #include <capsicum_helpers.h>
  67 #endif
  68 #include <stdio.h>
  69 #include <stdlib.h>
  70 #include <string.h>
  71 #include <err.h>
  72 #include <errno.h>
  73 #include <libgen.h>
  74 #include <unistd.h>
  75 #include <assert.h>
  76 #include <pthread.h>
  77 #include <pthread_np.h>
  78 #include <sysexits.h>
  79 #include <stdbool.h>
  80 #include <stdint.h>
  81 
  82 #include <machine/vmm.h>
  83 #ifndef WITHOUT_CAPSICUM
  84 #include <machine/vmm_dev.h>
  85 #endif
  86 #include <vmmapi.h>
  87 
  88 #ifndef __FreeBSD__
  89 #include <sys/stat.h>
  90 #endif
  91 
  92 #include "bhyverun.h"
  93 #include "acpi.h"
  94 #include "atkbdc.h"
  95 #include "console.h"
  96 #include "bootrom.h"
  97 #include "config.h"
  98 #include "inout.h"
  99 #include "debug.h"
 100 #include "fwctl.h"
 101 #include "gdb.h"
 102 #include "ioapic.h"
 103 #include "kernemu_dev.h"
 104 #include "mem.h"
 105 #include "mevent.h"
 106 #include "mptbl.h"
 107 #include "pci_emul.h"
 108 #include "pci_irq.h"
 109 #include "pci_lpc.h"
 110 #include "smbiostbl.h"
 111 #include "xmsr.h"
 112 #include "spinup_ap.h"
 113 #include "rfb.h"
 114 #include "rtc.h"
 115 #include "vga.h"
 116 #include "vmgenc.h"
 117 #ifndef __FreeBSD__
 118 #include "privileges.h"
 119 #endif
 120 
 121 #define GUEST_NIO_PORT          0x488   /* guest upcalls via i/o port */
 122 
 123 #define MB              (1024UL * 1024)
 124 #define GB              (1024UL * MB)
 125 
 126 static const char * const vmx_exit_reason_desc[] = {
 127         [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
 128         [EXIT_REASON_EXT_INTR] = "External interrupt",
 129         [EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
 130         [EXIT_REASON_INIT] = "INIT signal",
 131         [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
 132         [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
 133         [EXIT_REASON_SMI] = "Other SMI",
 134         [EXIT_REASON_INTR_WINDOW] = "Interrupt window",
 135         [EXIT_REASON_NMI_WINDOW] = "NMI window",
 136         [EXIT_REASON_TASK_SWITCH] = "Task switch",
 137         [EXIT_REASON_CPUID] = "CPUID",
 138         [EXIT_REASON_GETSEC] = "GETSEC",
 139         [EXIT_REASON_HLT] = "HLT",
 140         [EXIT_REASON_INVD] = "INVD",
 141         [EXIT_REASON_INVLPG] = "INVLPG",
 142         [EXIT_REASON_RDPMC] = "RDPMC",
 143         [EXIT_REASON_RDTSC] = "RDTSC",
 144         [EXIT_REASON_RSM] = "RSM",
 145         [EXIT_REASON_VMCALL] = "VMCALL",
 146         [EXIT_REASON_VMCLEAR] = "VMCLEAR",
 147         [EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
 148         [EXIT_REASON_VMPTRLD] = "VMPTRLD",
 149         [EXIT_REASON_VMPTRST] = "VMPTRST",
 150         [EXIT_REASON_VMREAD] = "VMREAD",
 151         [EXIT_REASON_VMRESUME] = "VMRESUME",
 152         [EXIT_REASON_VMWRITE] = "VMWRITE",
 153         [EXIT_REASON_VMXOFF] = "VMXOFF",
 154         [EXIT_REASON_VMXON] = "VMXON",
 155         [EXIT_REASON_CR_ACCESS] = "Control-register accesses",
 156         [EXIT_REASON_DR_ACCESS] = "MOV DR",
 157         [EXIT_REASON_INOUT] = "I/O instruction",
 158         [EXIT_REASON_RDMSR] = "RDMSR",
 159         [EXIT_REASON_WRMSR] = "WRMSR",
 160         [EXIT_REASON_INVAL_VMCS] =
 161             "VM-entry failure due to invalid guest state",
 162         [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
 163         [EXIT_REASON_MWAIT] = "MWAIT",
 164         [EXIT_REASON_MTF] = "Monitor trap flag",
 165         [EXIT_REASON_MONITOR] = "MONITOR",
 166         [EXIT_REASON_PAUSE] = "PAUSE",
 167         [EXIT_REASON_MCE_DURING_ENTRY] =
 168             "VM-entry failure due to machine-check event",
 169         [EXIT_REASON_TPR] = "TPR below threshold",
 170         [EXIT_REASON_APIC_ACCESS] = "APIC access",
 171         [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
 172         [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
 173         [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
 174         [EXIT_REASON_EPT_FAULT] = "EPT violation",
 175         [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
 176         [EXIT_REASON_INVEPT] = "INVEPT",
 177         [EXIT_REASON_RDTSCP] = "RDTSCP",
 178         [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
 179         [EXIT_REASON_INVVPID] = "INVVPID",
 180         [EXIT_REASON_WBINVD] = "WBINVD",
 181         [EXIT_REASON_XSETBV] = "XSETBV",
 182         [EXIT_REASON_APIC_WRITE] = "APIC write",
 183         [EXIT_REASON_RDRAND] = "RDRAND",
 184         [EXIT_REASON_INVPCID] = "INVPCID",
 185         [EXIT_REASON_VMFUNC] = "VMFUNC",
 186         [EXIT_REASON_ENCLS] = "ENCLS",
 187         [EXIT_REASON_RDSEED] = "RDSEED",
 188         [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
 189         [EXIT_REASON_XSAVES] = "XSAVES",
 190         [EXIT_REASON_XRSTORS] = "XRSTORS"
 191 };
 192 
 193 typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
 194 extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
 195 
 196 int guest_ncpus;
 197 uint16_t cores, maxcpus, sockets, threads;
 198 
 199 int raw_stdio = 0;
 200 
 201 static char *progname;
 202 static const int BSP = 0;
 203 
 204 static cpuset_t cpumask;
 205 
 206 static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
 207 
 208 static struct vm_exit vmexit[VM_MAXCPU];
 209 static struct vm_entry vmentry[VM_MAXCPU];
 210 
 211 struct bhyvestats {
 212         uint64_t        vmexit_bogus;
 213         uint64_t        vmexit_reqidle;
 214         uint64_t        vmexit_hlt;
 215         uint64_t        vmexit_pause;
 216         uint64_t        vmexit_mtrap;
 217         uint64_t        vmexit_mmio;
 218         uint64_t        vmexit_inout;
 219         uint64_t        cpu_switch_rotate;
 220         uint64_t        cpu_switch_direct;
 221         uint64_t        mmio_unhandled;
 222 } stats;
 223 
 224 struct mt_vmm_info {
 225         pthread_t       mt_thr;
 226         struct vmctx    *mt_ctx;
 227         int             mt_vcpu;
 228         uint64_t        mt_startrip;
 229 } mt_vmm_info[VM_MAXCPU];
 230 
 231 #ifdef  __FreeBSD__
 232 static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
 233 #endif
 234 
 235 static void
 236 usage(int code)
 237 {
 238 
 239         fprintf(stderr,
 240 #ifdef  __FreeBSD__
 241                 "Usage: %s [-aehuwxACDHPSWY]\n"
 242 #else
 243                 "Usage: %s [-adehuwxACDHPSWY]\n"
 244 #endif
 245                 "       %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
 246                 "       %*s [-k <file>] [-l <lpc>] [-m mem] [-o <var>=<value>]\n"
 247 #ifdef  __FreeBSD__
 248                 "       %*s [-p vcpu:hostcpu] [-s <pci>] [-U uuid] [<vm>]\n"
 249 #else
 250                 "       %*s [-s <pci>] [-U uuid] [<vm>]\n"
 251 #endif
 252                 "       -a: local apic is in xAPIC mode (deprecated)\n"
 253                 "       -A: create ACPI tables\n"
 254                 "       -c: number of cpus and/or topology specification\n"
 255                 "       -C: include guest memory in core file\n"
 256 #ifndef __FreeBSD__
 257                 "       -d: suspend cpu at boot\n"
 258 #endif
 259                 "       -D: destroy on power-off\n"
 260                 "       -e: exit on unhandled I/O access\n"
 261                 "       -h: help\n"
 262                 "       -H: vmexit from the guest on hlt\n"
 263                 "       -k: key=value flat config file\n"
 264                 "       -l: LPC device configuration\n"
 265                 "       -m: memory size\n"
 266                 "       -o: set config 'var' to 'value'\n"
 267 #ifdef  __FreeBSD__
 268                 "       -p: pin 'vcpu' to 'hostcpu'\n"
 269 #endif
 270                 "       -P: vmexit from the guest on pause\n"
 271                 "       -s: <slot,driver,configinfo> PCI slot config\n"
 272                 "       -S: guest memory cannot be swapped\n"
 273                 "       -u: RTC keeps UTC time\n"
 274                 "       -U: uuid\n"
 275                 "       -w: ignore unimplemented MSRs\n"
 276                 "       -W: force virtio to use single-vector MSI\n"
 277                 "       -x: local apic is in x2APIC mode\n"
 278                 "       -Y: disable MPtable generation\n",
 279                 progname, (int)strlen(progname), "", (int)strlen(progname), "",
 280                 (int)strlen(progname), "");
 281 
 282         exit(code);
 283 }
 284 
 285 /*
 286  * XXX This parser is known to have the following issues:
 287  * 1.  It accepts null key=value tokens ",," as setting "cpus" to an
 288  *     empty string.
 289  *
 290  * The acceptance of a null specification ('-c ""') is by design to match the
 291  * manual page syntax specification, this results in a topology of 1 vCPU.
 292  */
 293 static int
 294 topology_parse(const char *opt)
 295 {
 296         char *cp, *str;
 297 
 298         if (*opt == '\0') {
 299                 set_config_value("sockets", "1");
 300                 set_config_value("cores", "1");
 301                 set_config_value("threads", "1");
 302                 set_config_value("cpus", "1");
 303                 return (0);
 304         }
 305 
 306         str = strdup(opt);
 307         if (str == NULL)
 308                 errx(4, "Failed to allocate memory");
 309 
 310         while ((cp = strsep(&str, ",")) != NULL) {
 311                 if (strncmp(cp, "cpus=", strlen("cpus=")) == 0)
 312                         set_config_value("cpus", cp + strlen("cpus="));
 313                 else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0)
 314                         set_config_value("sockets", cp + strlen("sockets="));
 315                 else if (strncmp(cp, "cores=", strlen("cores=")) == 0)
 316                         set_config_value("cores", cp + strlen("cores="));
 317                 else if (strncmp(cp, "threads=", strlen("threads=")) == 0)
 318                         set_config_value("threads", cp + strlen("threads="));
 319 #ifdef notyet  /* Do not expose this until vmm.ko implements it */
 320                 else if (strncmp(cp, "maxcpus=", strlen("maxcpus=")) == 0)
 321                         set_config_value("maxcpus", cp + strlen("maxcpus="));
 322 #endif
 323                 else if (strchr(cp, '=') != NULL)
 324                         goto out;
 325                 else
 326                         set_config_value("cpus", cp);
 327         }
 328         free(str);
 329         return (0);
 330 
 331 out:
 332         free(str);
 333         return (-1);
 334 }
 335 
 336 static int
 337 parse_int_value(const char *key, const char *value, int minval, int maxval)
 338 {
 339         char *cp;
 340         long lval;
 341 
 342         errno = 0;
 343         lval = strtol(value, &cp, 0);
 344         if (errno != 0 || *cp != '\0' || cp == value || lval < minval ||
 345             lval > maxval)
 346                 errx(4, "Invalid value for %s: '%s'", key, value);
 347         return (lval);
 348 }
 349 
 350 /*
 351  * Set the sockets, cores, threads, and guest_cpus variables based on
 352  * the configured topology.
 353  *
 354  * The limits of UINT16_MAX are due to the types passed to
 355  * vm_set_topology().  vmm.ko may enforce tighter limits.
 356  */
 357 static void
 358 calc_topolopgy(void)
 359 {
 360         const char *value;
 361         bool explicit_cpus;
 362         uint64_t ncpus;
 363 
 364         value = get_config_value("cpus");
 365         if (value != NULL) {
 366                 guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX);
 367                 explicit_cpus = true;
 368         } else {
 369                 guest_ncpus = 1;
 370                 explicit_cpus = false;
 371         }
 372         value = get_config_value("cores");
 373         if (value != NULL)
 374                 cores = parse_int_value("cores", value, 1, UINT16_MAX);
 375         else
 376                 cores = 1;
 377         value = get_config_value("threads");
 378         if (value != NULL)
 379                 threads = parse_int_value("threads", value, 1, UINT16_MAX);
 380         else
 381                 threads = 1;
 382         value = get_config_value("sockets");
 383         if (value != NULL)
 384                 sockets = parse_int_value("sockets", value, 1, UINT16_MAX);
 385         else
 386                 sockets = guest_ncpus;
 387 
 388         /*
 389          * Compute sockets * cores * threads avoiding overflow.  The
 390          * range check above insures these are 16 bit values.
 391          */
 392         ncpus = (uint64_t)sockets * cores * threads;
 393         if (ncpus > UINT16_MAX)
 394                 errx(4, "Computed number of vCPUs too high: %ju",
 395                     (uintmax_t)ncpus);
 396 
 397         if (explicit_cpus) {
 398                 if (guest_ncpus != ncpus)
 399                         errx(4, "Topology (%d sockets, %d cores, %d threads) "
 400                             "does not match %d vCPUs", sockets, cores, threads,
 401                             guest_ncpus);
 402         } else
 403                 guest_ncpus = ncpus;
 404 }
 405 
 406 #ifndef WITHOUT_CAPSICUM
 407 /*
 408  * 11-stable capsicum helpers
 409  */
 410 static void
 411 bhyve_caph_cache_catpages(void)
 412 {
 413 
 414         (void)catopen("libc", NL_CAT_LOCALE);
 415 }
 416 
 417 static int
 418 bhyve_caph_limit_stdoe(void)
 419 {
 420         cap_rights_t rights;
 421         unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ };
 422         int i, fds[] = { STDOUT_FILENO, STDERR_FILENO };
 423 
 424         cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL);
 425         cap_rights_set(&rights, CAP_WRITE);
 426 
 427         for (i = 0; i < nitems(fds); i++) {
 428                 if (cap_rights_limit(fds[i], &rights) < 0 && errno != ENOSYS)
 429                         return (-1);
 430 
 431                 if (cap_ioctls_limit(fds[i], cmds, nitems(cmds)) < 0 && errno != ENOSYS)
 432                         return (-1);
 433 
 434                 if (cap_fcntls_limit(fds[i], CAP_FCNTL_GETFL) < 0 && errno != ENOSYS)
 435                         return (-1);
 436         }
 437 
 438         return (0);
 439 }
 440 
 441 #endif
 442 
 443 #ifdef  __FreeBSD__
 444 static int
 445 pincpu_parse(const char *opt)
 446 {
 447         int vcpu, pcpu;
 448 
 449         if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
 450                 fprintf(stderr, "invalid format: %s\n", opt);
 451                 return (-1);
 452         }
 453 
 454         if (vcpu < 0 || vcpu >= VM_MAXCPU) {
 455                 fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
 456                     vcpu, VM_MAXCPU - 1);
 457                 return (-1);
 458         }
 459 
 460         if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
 461                 fprintf(stderr, "hostcpu '%d' outside valid range from "
 462                     "0 to %d\n", pcpu, CPU_SETSIZE - 1);
 463                 return (-1);
 464         }
 465 
 466         snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
 467         value = get_config_value(key);
 468 
 469         if (asprintf(&newval, "%s%s%d", value != NULL ? value : "",
 470             value != NULL ? "," : "", pcpu) == -1) {
 471                 perror("failed to build new cpuset string");
 472                 return (-1);
 473         }
 474 
 475         set_config_value(key, newval);
 476         free(newval);
 477         return (0);
 478 }
 479 
 480 static void
 481 parse_cpuset(int vcpu, const char *list, cpuset_t *set)
 482 {
 483         char *cp, *token;
 484         int pcpu, start;
 485 
 486         CPU_ZERO(set);
 487         start = -1;
 488         token = __DECONST(char *, list);
 489         for (;;) {
 490                 pcpu = strtoul(token, &cp, 0);
 491                 if (cp == token)
 492                         errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
 493                 if (pcpu < 0 || pcpu >= CPU_SETSIZE)
 494                         errx(4, "hostcpu '%d' outside valid range from 0 to %d",
 495                             pcpu, CPU_SETSIZE - 1);
 496                 switch (*cp) {
 497                 case ',':
 498                 case '\0':
 499                         if (start >= 0) {
 500                                 if (start > pcpu)
 501                                         errx(4, "Invalid hostcpu range %d-%d",
 502                                             start, pcpu);
 503                                 while (start < pcpu) {
 504                                         CPU_SET(start, vcpumap[vcpu]);
 505                                         start++;
 506                                 }
 507                                 start = -1;
 508                         }
 509                         CPU_SET(pcpu, vcpumap[vcpu]);
 510                         break;
 511                 case '-':
 512                         if (start >= 0)
 513                                 errx(4, "invalid cpuset for vcpu %d: '%s'",
 514                                     vcpu, list);
 515                         start = pcpu;
 516                         break;
 517                 default:
 518                         errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
 519                 }
 520                 if (*cp == '\0')
 521                         break;
 522                 token = cp + 1;
 523         }
 524 }
 525 
 526 static void
 527 build_vcpumaps(void)
 528 {
 529         char key[16];
 530         const char *value;
 531         int vcpu;
 532 
 533         for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
 534                 snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
 535                 value = get_config_value(key);
 536                 if (value == NULL)
 537                         continue;
 538                 vcpumap[vcpu] = malloc(sizeof(cpuset_t));
 539                 if (vcpumap[vcpu] == NULL)
 540                         err(4, "Failed to allocate cpuset for vcpu %d", vcpu);
 541                 parse_cpuset(vcpu, value, vcpumap[vcpu]);
 542         }
 543 }
 544 
 545 void
 546 vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
 547     int errcode)
 548 {
 549         struct vmctx *ctx;
 550         int error, restart_instruction;
 551 
 552         ctx = arg;
 553         restart_instruction = 1;
 554 
 555         error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
 556             restart_instruction);
 557         assert(error == 0);
 558 }
 559 #endif /* __FreeBSD__ */
 560 
 561 void *
 562 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
 563 {
 564 
 565         return (vm_map_gpa(ctx, gaddr, len));
 566 }
 567 
 568 int
 569 fbsdrun_virtio_msix(void)
 570 {
 571 
 572         return (get_config_bool_default("virtio_msix", true));
 573 }
 574 
 575 static void *
 576 fbsdrun_start_thread(void *param)
 577 {
 578         char tname[MAXCOMLEN + 1];
 579         struct mt_vmm_info *mtp;
 580         int vcpu;
 581 
 582         mtp = param;
 583         vcpu = mtp->mt_vcpu;
 584 
 585         snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
 586         pthread_set_name_np(mtp->mt_thr, tname);
 587 
 588         gdb_cpu_add(vcpu);
 589 
 590         vm_loop(mtp->mt_ctx, vcpu, mtp->mt_startrip);
 591 
 592         /* not reached */
 593         exit(1);
 594         return (NULL);
 595 }
 596 
 597 #ifdef __FreeBSD__
 598 void
 599 fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
 600 #else
 601 void
 602 fbsdrun_addcpu(struct vmctx *ctx, int newcpu, uint64_t rip, bool suspend)
 603 #endif
 604 {
 605         int error;
 606 
 607 #ifdef __FreeBSD__
 608         assert(fromcpu == BSP);
 609 #endif
 610 
 611         /*
 612          * The 'newcpu' must be activated in the context of 'fromcpu'. If
 613          * vm_activate_cpu() is delayed until newcpu's pthread starts running
 614          * then vmm.ko is out-of-sync with bhyve and this can create a race
 615          * with vm_suspend().
 616          */
 617         error = vm_activate_cpu(ctx, newcpu);
 618         if (error != 0)
 619                 err(EX_OSERR, "could not activate CPU %d", newcpu);
 620 
 621         CPU_SET_ATOMIC(newcpu, &cpumask);
 622 
 623 #ifndef __FreeBSD__
 624         if (suspend)
 625                 (void) vm_suspend_cpu(ctx, newcpu);
 626 #endif
 627 
 628         /*
 629          * Set up the vmexit struct to allow execution to start
 630          * at the given RIP
 631          */
 632         mt_vmm_info[newcpu].mt_ctx = ctx;
 633         mt_vmm_info[newcpu].mt_vcpu = newcpu;
 634         mt_vmm_info[newcpu].mt_startrip = rip;
 635 
 636         error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
 637             fbsdrun_start_thread, &mt_vmm_info[newcpu]);
 638         assert(error == 0);
 639 }
 640 
 641 static int
 642 fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
 643 {
 644 
 645         if (!CPU_ISSET(vcpu, &cpumask)) {
 646                 fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
 647                 exit(4);
 648         }
 649 
 650         CPU_CLR_ATOMIC(vcpu, &cpumask);
 651         return (CPU_EMPTY(&cpumask));
 652 }
 653 
 654 static void
 655 vmentry_mmio_read(int vcpu, uint64_t gpa, uint8_t bytes, uint64_t data)
 656 {
 657         struct vm_entry *entry = &vmentry[vcpu];
 658         struct vm_mmio *mmio = &entry->u.mmio;
 659 
 660         assert(entry->cmd == VEC_DEFAULT);
 661 
 662         entry->cmd = VEC_FULFILL_MMIO;
 663         mmio->bytes = bytes;
 664         mmio->read = 1;
 665         mmio->gpa = gpa;
 666         mmio->data = data;
 667 }
 668 
 669 static void
 670 vmentry_mmio_write(int vcpu, uint64_t gpa, uint8_t bytes)
 671 {
 672         struct vm_entry *entry = &vmentry[vcpu];
 673         struct vm_mmio *mmio = &entry->u.mmio;
 674 
 675         assert(entry->cmd == VEC_DEFAULT);
 676 
 677         entry->cmd = VEC_FULFILL_MMIO;
 678         mmio->bytes = bytes;
 679         mmio->read = 0;
 680         mmio->gpa = gpa;
 681         mmio->data = 0;
 682 }
 683 
 684 static void
 685 vmentry_inout_read(int vcpu, uint16_t port, uint8_t bytes, uint32_t data)
 686 {
 687         struct vm_entry *entry = &vmentry[vcpu];
 688         struct vm_inout *inout = &entry->u.inout;
 689 
 690         assert(entry->cmd == VEC_DEFAULT);
 691 
 692         entry->cmd = VEC_FULFILL_INOUT;
 693         inout->bytes = bytes;
 694         inout->flags = INOUT_IN;
 695         inout->port = port;
 696         inout->eax = data;
 697 }
 698 
 699 static void
 700 vmentry_inout_write(int vcpu, uint16_t port, uint8_t bytes)
 701 {
 702         struct vm_entry *entry = &vmentry[vcpu];
 703         struct vm_inout *inout = &entry->u.inout;
 704 
 705         assert(entry->cmd == VEC_DEFAULT);
 706 
 707         entry->cmd = VEC_FULFILL_INOUT;
 708         inout->bytes = bytes;
 709         inout->flags = 0;
 710         inout->port = port;
 711         inout->eax = 0;
 712 }
 713 
 714 static int
 715 vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
 716                      uint32_t eax)
 717 {
 718 #if BHYVE_DEBUG
 719         /*
 720          * put guest-driven debug here
 721          */
 722 #endif
 723         return (VMEXIT_CONTINUE);
 724 }
 725 
 726 static int
 727 vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 728 {
 729         int error;
 730         int vcpu;
 731         struct vm_inout inout;
 732         bool in;
 733         uint8_t bytes;
 734 
 735         stats.vmexit_inout++;
 736 
 737         vcpu = *pvcpu;
 738         inout = vme->u.inout;
 739         in = (inout.flags & INOUT_IN) != 0;
 740         bytes = inout.bytes;
 741 
 742         /* Extra-special case of host notifications */
 743         if (!in && inout.port == GUEST_NIO_PORT) {
 744                 error = vmexit_handle_notify(ctx, vme, pvcpu, inout.eax);
 745                 vmentry_inout_write(vcpu, inout.port, bytes);
 746                 return (error);
 747         }
 748 
 749         error = emulate_inout(ctx, vcpu, &inout);
 750         if (error) {
 751                 fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
 752                     in ? "in" : "out",
 753                     bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
 754                     inout.port, vmexit->rip);
 755                 return (VMEXIT_ABORT);
 756         } else {
 757                 /*
 758                  * Communicate the status of the inout operation back to the
 759                  * in-kernel instruction emulation.
 760                  */
 761                 if (in) {
 762                         vmentry_inout_read(vcpu, inout.port, bytes, inout.eax);
 763                 } else {
 764                         vmentry_inout_write(vcpu, inout.port, bytes);
 765                 }
 766                 return (VMEXIT_CONTINUE);
 767         }
 768 }
 769 
 770 static int
 771 vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 772 {
 773         uint64_t val;
 774         uint32_t eax, edx;
 775         int error;
 776 
 777         val = 0;
 778         error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
 779         if (error != 0) {
 780                 fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
 781                     vme->u.msr.code, *pvcpu);
 782                 if (get_config_bool("x86.strictmsr")) {
 783                         vm_inject_gp(ctx, *pvcpu);
 784                         return (VMEXIT_CONTINUE);
 785                 }
 786         }
 787 
 788         eax = val;
 789         error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
 790         assert(error == 0);
 791 
 792         edx = val >> 32;
 793         error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
 794         assert(error == 0);
 795 
 796         return (VMEXIT_CONTINUE);
 797 }
 798 
 799 static int
 800 vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 801 {
 802         int error;
 803 
 804         error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
 805         if (error != 0) {
 806                 fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
 807                     vme->u.msr.code, vme->u.msr.wval, *pvcpu);
 808                 if (get_config_bool("x86.strictmsr")) {
 809                         vm_inject_gp(ctx, *pvcpu);
 810                         return (VMEXIT_CONTINUE);
 811                 }
 812         }
 813         return (VMEXIT_CONTINUE);
 814 }
 815 
 816 #ifdef __FreeBSD__
 817 static int
 818 vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 819 {
 820 
 821         (void)spinup_ap(ctx, *pvcpu,
 822                     vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
 823 
 824         return (VMEXIT_CONTINUE);
 825 }
 826 #else
 827 static int
 828 vmexit_run_state(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 829 {
 830         /*
 831          * Run-state transitions (INIT, SIPI, etc) are handled in-kernel, so an
 832          * exit to userspace with that code is not expected.
 833          */
 834         fprintf(stderr, "unexpected run-state VM exit");
 835         return (VMEXIT_ABORT);
 836 }
 837 #endif /* __FreeBSD__ */
 838 
 839 #ifdef __FreeBSD__
 840 #define DEBUG_EPT_MISCONFIG
 841 #else
 842 /* EPT misconfig debugging not possible now that raw VMCS access is gone */
 843 #endif
 844 
 845 #ifdef DEBUG_EPT_MISCONFIG
 846 #define VMCS_GUEST_PHYSICAL_ADDRESS     0x00002400
 847 
 848 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
 849 static int ept_misconfig_ptenum;
 850 #endif
 851 
 852 static const char *
 853 vmexit_vmx_desc(uint32_t exit_reason)
 854 {
 855 
 856         if (exit_reason >= nitems(vmx_exit_reason_desc) ||
 857             vmx_exit_reason_desc[exit_reason] == NULL)
 858                 return ("Unknown");
 859         return (vmx_exit_reason_desc[exit_reason]);
 860 }
 861 
 862 static int
 863 vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 864 {
 865 
 866         fprintf(stderr, "vm exit[%d]\n", *pvcpu);
 867         fprintf(stderr, "\treason\t\tVMX\n");
 868         fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
 869         fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
 870         fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
 871         fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason,
 872             vmexit_vmx_desc(vmexit->u.vmx.exit_reason));
 873         fprintf(stderr, "\tqualification\t0x%016lx\n",
 874             vmexit->u.vmx.exit_qualification);
 875         fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
 876         fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
 877 #ifdef DEBUG_EPT_MISCONFIG
 878         if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
 879                 vm_get_register(ctx, *pvcpu,
 880                     VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
 881                     &ept_misconfig_gpa);
 882                 vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
 883                     &ept_misconfig_ptenum);
 884                 fprintf(stderr, "\tEPT misconfiguration:\n");
 885                 fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
 886                 fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
 887                     ept_misconfig_ptenum, ept_misconfig_pte[0],
 888                     ept_misconfig_pte[1], ept_misconfig_pte[2],
 889                     ept_misconfig_pte[3]);
 890         }
 891 #endif  /* DEBUG_EPT_MISCONFIG */
 892         return (VMEXIT_ABORT);
 893 }
 894 
 895 static int
 896 vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 897 {
 898 
 899         fprintf(stderr, "vm exit[%d]\n", *pvcpu);
 900         fprintf(stderr, "\treason\t\tSVM\n");
 901         fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
 902         fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
 903         fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
 904         fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
 905         fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
 906         return (VMEXIT_ABORT);
 907 }
 908 
 909 static int
 910 vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 911 {
 912 
 913         assert(vmexit->inst_length == 0);
 914 
 915         stats.vmexit_bogus++;
 916 
 917         return (VMEXIT_CONTINUE);
 918 }
 919 
 920 static int
 921 vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 922 {
 923 
 924         assert(vmexit->inst_length == 0);
 925 
 926         stats.vmexit_reqidle++;
 927 
 928         return (VMEXIT_CONTINUE);
 929 }
 930 
 931 static int
 932 vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 933 {
 934 
 935         stats.vmexit_hlt++;
 936 
 937         /*
 938          * Just continue execution with the next instruction. We use
 939          * the HLT VM exit as a way to be friendly with the host
 940          * scheduler.
 941          */
 942         return (VMEXIT_CONTINUE);
 943 }
 944 
 945 static int
 946 vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 947 {
 948 
 949         stats.vmexit_pause++;
 950 
 951         return (VMEXIT_CONTINUE);
 952 }
 953 
 954 static int
 955 vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 956 {
 957 
 958         assert(vmexit->inst_length == 0);
 959 
 960         stats.vmexit_mtrap++;
 961 
 962         gdb_cpu_mtrap(*pvcpu);
 963 
 964         return (VMEXIT_CONTINUE);
 965 }
 966 
 967 static int
 968 vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 969 {
 970         uint8_t i, valid;
 971 
 972         fprintf(stderr, "Failed to emulate instruction sequence ");
 973 
 974         valid = vmexit->u.inst_emul.num_valid;
 975         if (valid != 0) {
 976                 assert(valid <= sizeof (vmexit->u.inst_emul.inst));
 977                 fprintf(stderr, "[");
 978                 for (i = 0; i < valid; i++) {
 979                         if (i == 0) {
 980                                 fprintf(stderr, "%02x",
 981                                     vmexit->u.inst_emul.inst[i]);
 982                         } else {
 983                                 fprintf(stderr, ", %02x",
 984                                     vmexit->u.inst_emul.inst[i]);
 985                         }
 986                 }
 987                 fprintf(stderr, "] ");
 988         }
 989         fprintf(stderr, "@ %rip = %x\n", vmexit->rip);
 990 
 991         return (VMEXIT_ABORT);
 992 }
 993 
 994 static int
 995 vmexit_mmio(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 996 {
 997         int vcpu, err;
 998         struct vm_mmio mmio;
 999         bool is_read;
1000 
1001         stats.vmexit_mmio++;
1002 
1003         vcpu = *pvcpu;
1004         mmio = vmexit->u.mmio;
1005         is_read = (mmio.read != 0);
1006 
1007         err = emulate_mem(ctx, vcpu, &mmio);
1008 
1009         if (err == ESRCH) {
1010                 fprintf(stderr, "Unhandled memory access to 0x%lx\n", mmio.gpa);
1011                 stats.mmio_unhandled++;
1012 
1013                 /*
1014                  * Access to non-existent physical addresses is not likely to
1015                  * result in fatal errors on hardware machines, but rather reads
1016                  * of all-ones or discarded-but-acknowledged writes.
1017                  */
1018                 mmio.data = ~0UL;
1019                 err = 0;
1020         }
1021 
1022         if (err == 0) {
1023                 if (is_read) {
1024                         vmentry_mmio_read(vcpu, mmio.gpa, mmio.bytes,
1025                             mmio.data);
1026                 } else {
1027                         vmentry_mmio_write(vcpu, mmio.gpa, mmio.bytes);
1028                 }
1029                 return (VMEXIT_CONTINUE);
1030         }
1031 
1032         fprintf(stderr, "Unhandled mmio error to 0x%lx: %d\n", mmio.gpa, err);
1033         return (VMEXIT_ABORT);
1034 }
1035 
1036 static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
1037 static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
1038 
1039 static int
1040 vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
1041 {
1042         enum vm_suspend_how how;
1043 
1044         how = vmexit->u.suspended.how;
1045 
1046         fbsdrun_deletecpu(ctx, *pvcpu);
1047 
1048         if (*pvcpu != BSP) {
1049                 pthread_mutex_lock(&resetcpu_mtx);
1050                 pthread_cond_signal(&resetcpu_cond);
1051                 pthread_mutex_unlock(&resetcpu_mtx);
1052                 pthread_exit(NULL);
1053         }
1054 
1055         pthread_mutex_lock(&resetcpu_mtx);
1056         while (!CPU_EMPTY(&cpumask)) {
1057                 pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
1058         }
1059         pthread_mutex_unlock(&resetcpu_mtx);
1060 
1061         switch (how) {
1062         case VM_SUSPEND_RESET:
1063                 exit(0);
1064         case VM_SUSPEND_POWEROFF:
1065                 if (get_config_bool_default("destroy_on_poweroff", false))
1066                         vm_destroy(ctx);
1067                 exit(1);
1068         case VM_SUSPEND_HALT:
1069                 exit(2);
1070         case VM_SUSPEND_TRIPLEFAULT:
1071                 exit(3);
1072         default:
1073                 fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
1074                 exit(100);
1075         }
1076         return (0);     /* NOTREACHED */
1077 }
1078 
1079 static int
1080 vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
1081 {
1082 
1083         gdb_cpu_suspend(*pvcpu);
1084         return (VMEXIT_CONTINUE);
1085 }
1086 
1087 static int
1088 vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
1089 {
1090 
1091         gdb_cpu_breakpoint(*pvcpu, vmexit);
1092         return (VMEXIT_CONTINUE);
1093 }
1094 
/*
 * Dispatch table indexed by VM_EXITCODE_* and consulted by vm_loop() for
 * every VM exit.  Slots without an initializer are NULL; vm_loop() treats a
 * NULL slot as an unexpected exit code and exits.  The VM_EXITCODE_HLT and
 * VM_EXITCODE_PAUSE slots are filled in at runtime by
 * fbsdrun_set_capabilities() when the matching config options are enabled.
 */
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
        [VM_EXITCODE_INOUT]  = vmexit_inout,
        [VM_EXITCODE_MMIO]  = vmexit_mmio,
        [VM_EXITCODE_VMX]    = vmexit_vmx,
        [VM_EXITCODE_SVM]    = vmexit_svm,
        [VM_EXITCODE_BOGUS]  = vmexit_bogus,
        [VM_EXITCODE_REQIDLE] = vmexit_reqidle,
        [VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
        [VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
        [VM_EXITCODE_MTRAP]  = vmexit_mtrap,
        [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
        /* The AP start-up exit code differs between the two platforms. */
#ifdef __FreeBSD__
        [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
#else
        [VM_EXITCODE_RUN_STATE] = vmexit_run_state,
#endif
        [VM_EXITCODE_SUSPENDED] = vmexit_suspend,
        [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
        [VM_EXITCODE_DEBUG] = vmexit_debug,
        [VM_EXITCODE_BPT] = vmexit_breakpoint,
};
1116 
1117 static void
1118 vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
1119 {
1120         int error, rc;
1121         enum vm_exitcode exitcode;
1122         cpuset_t active_cpus;
1123         struct vm_exit *vexit;
1124         struct vm_entry *ventry;
1125 
1126 #ifdef  __FreeBSD__
1127         if (vcpumap[vcpu] != NULL) {
1128                 error = pthread_setaffinity_np(pthread_self(),
1129                     sizeof(cpuset_t), vcpumap[vcpu]);
1130                 assert(error == 0);
1131         }
1132 #endif
1133         error = vm_active_cpus(ctx, &active_cpus);
1134         assert(CPU_ISSET(vcpu, &active_cpus));
1135 
1136         error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
1137         assert(error == 0);
1138 
1139         ventry = &vmentry[vcpu];
1140         vexit = &vmexit[vcpu];
1141 
1142         while (1) {
1143                 error = vm_run(ctx, vcpu, ventry, vexit);
1144                 if (error != 0)
1145                         break;
1146 
1147                 if (ventry->cmd != VEC_DEFAULT) {
1148                         /*
1149                          * Discard any lingering entry state after it has been
1150                          * submitted via vm_run().
1151                          */
1152                         bzero(ventry, sizeof (*ventry));
1153                 }
1154 
1155                 exitcode = vexit->exitcode;
1156                 if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
1157                         fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
1158                             exitcode);
1159                         exit(4);
1160                 }
1161 
1162                 rc = (*handler[exitcode])(ctx, vexit, &vcpu);
1163 
1164                 switch (rc) {
1165                 case VMEXIT_CONTINUE:
1166                         break;
1167                 case VMEXIT_ABORT:
1168                         abort();
1169                 default:
1170                         exit(4);
1171                 }
1172         }
1173         fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
1174 }
1175 
1176 static int
1177 num_vcpus_allowed(struct vmctx *ctx)
1178 {
1179 #ifdef __FreeBSD__
1180         int tmp, error;
1181 
1182         error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
1183 
1184         /*
1185          * The guest is allowed to spinup more than one processor only if the
1186          * UNRESTRICTED_GUEST capability is available.
1187          */
1188         if (error == 0)
1189                 return (VM_MAXCPU);
1190         else
1191                 return (1);
1192 #else
1193         /* Unrestricted Guest is always enabled on illumos */
1194         return (VM_MAXCPU);
1195 #endif /* __FreeBSD__ */
1196 }
1197 
1198 void
1199 fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
1200 {
1201         int err, tmp;
1202 
1203         if (get_config_bool_default("x86.vmexit_on_hlt", false)) {
1204                 err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp);
1205                 if (err < 0) {
1206                         fprintf(stderr, "VM exit on HLT not supported\n");
1207                         exit(4);
1208                 }
1209                 vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
1210                 if (cpu == BSP)
1211                         handler[VM_EXITCODE_HLT] = vmexit_hlt;
1212         }
1213 
1214         if (get_config_bool_default("x86.vmexit_on_pause", false)) {
1215                 /*
1216                  * pause exit support required for this mode
1217                  */
1218                 err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp);
1219                 if (err < 0) {
1220                         fprintf(stderr,
1221                             "SMP mux requested, no pause support\n");
1222                         exit(4);
1223                 }
1224                 vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
1225                 if (cpu == BSP)
1226                         handler[VM_EXITCODE_PAUSE] = vmexit_pause;
1227         }
1228 
1229         if (get_config_bool_default("x86.x2apic", false))
1230                 err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED);
1231         else
1232                 err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED);
1233 
1234         if (err) {
1235                 fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
1236                 exit(4);
1237         }
1238 
1239 #ifdef  __FreeBSD__
1240         vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
1241 #endif
1242 }
1243 
1244 static struct vmctx *
1245 do_open(const char *vmname)
1246 {
1247         struct vmctx *ctx;
1248         int error;
1249         bool reinit, romboot;
1250 #ifndef WITHOUT_CAPSICUM
1251         cap_rights_t rights;
1252         const cap_ioctl_t *cmds;        
1253         size_t ncmds;
1254 #endif
1255 
1256         reinit = romboot = false;
1257 
1258         if (lpc_bootrom())
1259                 romboot = true;
1260 
1261         error = vm_create(vmname);
1262         if (error) {
1263                 if (errno == EEXIST) {
1264                         if (romboot) {
1265                                 reinit = true;
1266                         } else {
1267                                 /*
1268                                  * The virtual machine has been setup by the
1269                                  * userspace bootloader.
1270                                  */
1271                         }
1272                 } else {
1273                         perror("vm_create");
1274                         exit(4);
1275                 }
1276         } else {
1277                 if (!romboot) {
1278                         /*
1279                          * If the virtual machine was just created then a
1280                          * bootrom must be configured to boot it.
1281                          */
1282                         fprintf(stderr, "virtual machine cannot be booted\n");
1283                         exit(4);
1284                 }
1285         }
1286 
1287         ctx = vm_open(vmname);
1288         if (ctx == NULL) {
1289                 perror("vm_open");
1290                 exit(4);
1291         }
1292 
1293 #ifndef WITHOUT_CAPSICUM
1294         cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
1295         if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1) 
1296                 errx(EX_OSERR, "Unable to apply rights for sandbox");
1297         vm_get_ioctls(&ncmds);
1298         cmds = vm_get_ioctls(NULL);
1299         if (cmds == NULL)
1300                 errx(EX_OSERR, "out of memory");
1301         if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1)
1302                 errx(EX_OSERR, "Unable to apply rights for sandbox");
1303         free((cap_ioctl_t *)cmds);
1304 #endif
1305  
1306         if (reinit) {
1307                 error = vm_reinit(ctx);
1308                 if (error) {
1309                         perror("vm_reinit");
1310                         exit(4);
1311                 }
1312         }
1313         error = vm_set_topology(ctx, sockets, cores, threads, maxcpus);
1314         if (error)
1315                 errx(EX_OSERR, "vm_set_topology");
1316         return (ctx);
1317 }
1318 
1319 #ifndef __FreeBSD__
1320 
1321 #define FILE_PROVISIONING       "/var/svc/provisioning"
1322 #define FILE_PROVISION_SUCCESS  "/var/svc/provision_success"
1323 
1324 static void
1325 mark_provisioned(void)
1326 {
1327         struct stat stbuf;
1328 
1329         if (lstat(FILE_PROVISIONING, &stbuf) != 0)
1330                 return;
1331 
1332         if (rename(FILE_PROVISIONING, FILE_PROVISION_SUCCESS) != 0) {
1333                 (void) fprintf(stderr, "Cannot rename %s to %s: %s\n",
1334                     FILE_PROVISIONING, FILE_PROVISION_SUCCESS,
1335                     strerror(errno));
1336         }
1337 }
1338 
1339 #endif
1340 
/*
 * Parse a single "path=value" option and store it with set_config_value().
 *
 * Returns false when option has no '=' or nothing follows the '=';
 * returns true after the value has been stored.  An allocation failure is
 * fatal (exit status 4).
 *
 * NOTE(review): the strndup()ed path is not freed here; confirm whether
 * set_config_value() keeps a reference to it before adding a free().
 */
static bool
parse_config_option(const char *option)
{
        const char *eq;
        char *name;

        eq = strchr(option, '=');
        if (eq == NULL)
                return (false);
        if (eq[1] == '\0')
                return (false);

        /* Everything before the '=' is the configuration path. */
        name = strndup(option, (size_t)(eq - option));
        if (name == NULL)
                err(4, "Failed to allocate memory");

        set_config_value(name, eq + 1);
        return (true);
}
1356 
/*
 * Read a configuration file consisting of one "path=value" option per line.
 *
 * Lines whose first character is '#' and lines consisting only of a newline
 * are skipped.  The trailing newline is stripped before each remaining line
 * is handed to parse_config_option().  A file that cannot be opened or a
 * line that cannot be parsed is fatal (exit status 4); the latter is
 * reported with its 1-based line number.
 */
static void
parse_simple_config_file(const char *path)
{
        FILE *fp;
        char *line, *nl;
        size_t linecap;
        unsigned int lineno;

        fp = fopen(path, "r");
        if (fp == NULL)
                err(4, "Failed to open configuration file %s", path);

        line = NULL;
        linecap = 0;
        lineno = 0;
        while (getline(&line, &linecap, fp) > 0) {
                lineno++;

                if (line[0] == '#' || line[0] == '\n')
                        continue;

                nl = strchr(line, '\n');
                if (nl != NULL)
                        *nl = '\0';

                if (!parse_config_option(line))
                        errx(4, "%s line %u: invalid config option '%s'", path,
                            lineno, line);
        }

        free(line);
        fclose(fp);
}
1384 
/*
 * Seed the configuration with built-in defaults: ACPI tables off, 256M of
 * guest memory, and strict MSR handling on.  main() calls this before it
 * parses the command line.
 */
static void
set_defaults(void)
{
        set_config_bool("acpi_tables", false);
        set_config_value("memory.size", "256M");
        set_config_bool("x86.strictmsr", true);
}
1393 
1394 int
1395 main(int argc, char *argv[])
1396 {
1397         int c, error, err;
1398         int max_vcpus, memflags;
1399         struct vmctx *ctx;
1400         uint64_t rip;
1401         size_t memsize;
1402         const char *value, *vmname;
1403         char *optstr;
1404 
1405         init_config();
1406         set_defaults();
1407         progname = basename(argv[0]);
1408 
1409 #ifdef  __FreeBSD__
1410         optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:";
1411 #else
1412         /* +d, +B, -p */
1413         optstr = "adehuwxACDHIPSWYk:o:G:c:s:m:l:B:U:";
1414 #endif
1415         while ((c = getopt(argc, argv, optstr)) != -1) {
1416                 switch (c) {
1417                 case 'a':
1418                         set_config_bool("x86.x2apic", false);
1419                         break;
1420                 case 'A':
1421                         set_config_bool("acpi_tables", true);
1422                         break;
1423                 case 'D':
1424                         set_config_bool("destroy_on_poweroff", true);
1425                         break;
1426 #ifndef __FreeBSD__
1427                 case 'B':
1428                         if (smbios_parse(optarg) != 0) {
1429                                 errx(EX_USAGE, "invalid SMBIOS "
1430                                     "configuration '%s'", optarg);
1431                         }
1432                         break;
1433                 case 'd':
1434                         set_config_bool("suspend_at_boot", true);
1435                         break;
1436 #endif
1437 #ifdef  __FreeBSD__
1438                 case 'p':
1439                         if (pincpu_parse(optarg) != 0) {
1440                                 errx(EX_USAGE, "invalid vcpu pinning "
1441                                     "configuration '%s'", optarg);
1442                         }
1443                         break;
1444 #endif
1445                 case 'c':
1446                         if (topology_parse(optarg) != 0) {
1447                             errx(EX_USAGE, "invalid cpu topology "
1448                                 "'%s'", optarg);
1449                         }
1450                         break;
1451                 case 'C':
1452                         set_config_bool("memory.guest_in_core", true);
1453                         break;
1454                 case 'G':
1455                         if (optarg[0] == 'w') {
1456                                 set_config_bool("gdb.wait", true);
1457                                 optarg++;
1458                         }
1459                         set_config_value("gdb.port", optarg);
1460                         break;
1461                 case 'k':
1462                         parse_simple_config_file(optarg);
1463                         break;
1464                 case 'l':
1465                         if (strncmp(optarg, "help", strlen(optarg)) == 0) {
1466                                 lpc_print_supported_devices();
1467                                 exit(0);
1468                         } else if (lpc_device_parse(optarg) != 0) {
1469                                 errx(EX_USAGE, "invalid lpc device "
1470                                     "configuration '%s'", optarg);
1471                         }
1472                         break;
1473                 case 's':
1474                         if (strncmp(optarg, "help", strlen(optarg)) == 0) {
1475                                 pci_print_supported_devices();
1476                                 exit(0);
1477                         } else if (pci_parse_slot(optarg) != 0)
1478                                 exit(4);
1479                         else
1480                                 break;
1481                 case 'S':
1482                         set_config_bool("memory.wired", true);
1483                         break;
1484                 case 'm':
1485                         set_config_value("memory.size", optarg);
1486                         break;
1487                 case 'o':
1488                         if (!parse_config_option(optarg))
1489                                 errx(EX_USAGE, "invalid configuration option '%s'", optarg);
1490                         break;
1491                 case 'H':
1492                         set_config_bool("x86.vmexit_on_hlt", true);
1493                         break;
1494                 case 'I':
1495                         /*
1496                          * The "-I" option was used to add an ioapic to the
1497                          * virtual machine.
1498                          *
1499                          * An ioapic is now provided unconditionally for each
1500                          * virtual machine and this option is now deprecated.
1501                          */
1502                         break;
1503                 case 'P':
1504                         set_config_bool("x86.vmexit_on_pause", true);
1505                         break;
1506                 case 'e':
1507                         set_config_bool("x86.strictio", true);
1508                         break;
1509                 case 'u':
1510                         set_config_bool("rtc.use_localtime", false);
1511                         break;
1512                 case 'U':
1513                         set_config_value("uuid", optarg);
1514                         break;
1515                 case 'w':
1516                         set_config_bool("x86.strictmsr", false);
1517                         break;
1518                 case 'W':
1519                         set_config_bool("virtio_msix", false);
1520                         break;
1521                 case 'x':
1522                         set_config_bool("x86.x2apic", true);
1523                         break;
1524                 case 'Y':
1525                         set_config_bool("x86.mptable", false);
1526                         break;
1527                 case 'h':
1528                         usage(0);                       
1529                 default:
1530                         usage(1);
1531                 }
1532         }
1533         argc -= optind;
1534         argv += optind;
1535 
1536         if (argc > 1)
1537                 usage(1);
1538 
1539         if (argc == 1)
1540                 set_config_value("name", argv[0]);
1541 
1542         vmname = get_config_value("name");
1543         if (vmname == NULL)
1544                 usage(1);
1545 
1546         if (get_config_bool_default("config.dump", false)) {
1547                 dump_config();
1548                 exit(1);
1549         }
1550 
1551 #ifndef __FreeBSD__
1552         illumos_priv_init();
1553 #endif
1554 
1555         calc_topolopgy();
1556 #ifdef __FreeBSD__
1557         build_vcpumaps();
1558 #endif
1559 
1560         value = get_config_value("memory.size");
1561         error = vm_parse_memsize(value, &memsize);
1562         if (error)
1563                 errx(EX_USAGE, "invalid memsize '%s'", value);
1564 
1565         ctx = do_open(vmname);
1566 
1567         max_vcpus = num_vcpus_allowed(ctx);
1568         if (guest_ncpus > max_vcpus) {
1569                 fprintf(stderr, "%d vCPUs requested but only %d available\n",
1570                         guest_ncpus, max_vcpus);
1571                 exit(4);
1572         }
1573 
1574         fbsdrun_set_capabilities(ctx, BSP);
1575 
1576         memflags = 0;
1577         if (get_config_bool_default("memory.wired", false))
1578                 memflags |= VM_MEM_F_WIRED;
1579         if (get_config_bool_default("memory.guest_in_core", false))
1580                 memflags |= VM_MEM_F_INCORE;
1581         vm_set_memflags(ctx, memflags);
1582 #ifdef  __FreeBSD__
1583         err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
1584 #else
1585         err = vm_arc_resv(ctx, memsize);
1586         if (err != 0) {
1587                 (void) fprintf(stderr, "Could not shrink ARC: %s\n",
1588                     strerror(err));
1589                 exit(4);
1590         }
1591 
1592         do {
1593                 errno = 0;
1594                 err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
1595                 error = errno;
1596                 if (err != 0 && error == ENOMEM) {
1597                         (void) fprintf(stderr, "Unable to allocate memory "
1598                             "(%llu), retrying in 1 second\n", memsize);
1599                         sleep(1);
1600                 }
1601         } while (error == ENOMEM);
1602 #endif
1603         if (err) {
1604                 fprintf(stderr, "Unable to set up memory (%d)\n", errno);
1605                 exit(4);
1606         }
1607 
1608         error = init_msr();
1609         if (error) {
1610                 fprintf(stderr, "init_msr error %d", error);
1611                 exit(4);
1612         }
1613 
1614         init_mem();
1615         init_inout();
1616 #ifdef  __FreeBSD__
1617         kernemu_dev_init();
1618 #endif
1619         init_bootrom(ctx);
1620         atkbdc_init(ctx);
1621         pci_irq_init(ctx);
1622         ioapic_init(ctx);
1623 
1624         rtc_init(ctx);
1625         sci_init(ctx);
1626 #ifndef __FreeBSD__
1627         pmtmr_init(ctx);
1628 #endif
1629 
1630         /*
1631          * Exit if a device emulation finds an error in its initilization
1632          */
1633         if (init_pci(ctx) != 0) {
1634                 perror("device emulation initialization error");
1635                 exit(4);
1636         }
1637 
1638         /*
1639          * Initialize after PCI, to allow a bootrom file to reserve the high
1640          * region.
1641          */
1642         if (get_config_bool("acpi_tables"))
1643                 vmgenc_init(ctx);
1644 
1645         value = get_config_value("gdb.port");
1646 #ifdef __FreeBSD__
1647         if (value != NULL)
1648                 init_gdb(ctx, atoi(value), get_config_bool_default("gdb.wait",
1649                     false));
1650 #else
1651         if (value != NULL) {
1652                 int port = atoi(value);
1653 
1654                 if (port < 0) {
1655                         init_mdb(ctx,
1656                             get_config_bool_default("gdb.wait", false));
1657                 } else {
1658                         init_gdb(ctx, port,
1659                             get_config_bool_default("gdb.wait", false));
1660                 }
1661         }
1662 #endif
1663 
1664         vga_init(1);
1665 
1666         if (lpc_bootrom()) {
1667 #ifdef __FreeBSD__
1668                 if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
1669                         fprintf(stderr, "ROM boot failed: unrestricted guest "
1670                             "capability not available\n");
1671                         exit(4);
1672                 }
1673 #else
1674                 /* Unrestricted Guest is always enabled on illumos */
1675 #endif
1676                 error = vcpu_reset(ctx, BSP);
1677                 assert(error == 0);
1678         }
1679 
1680         error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
1681         assert(error == 0);
1682 
1683         /*
1684          * build the guest tables, MP etc.
1685          */
1686         if (get_config_bool_default("x86.mptable", true)) {
1687                 error = mptable_build(ctx, guest_ncpus);
1688                 if (error) {
1689                         perror("error to build the guest tables");
1690                         exit(4);
1691                 }
1692         }
1693 
1694 #ifndef __FreeBSD__
1695         smbios_apply();
1696 #endif
1697         error = smbios_build(ctx);
1698         assert(error == 0);
1699 
1700         if (get_config_bool("acpi_tables")) {
1701                 error = acpi_build(ctx, guest_ncpus);
1702                 assert(error == 0);
1703         }
1704 
1705         if (lpc_bootrom())
1706                 fwctl_init();
1707 
1708         /*
1709          * Change the proc title to include the VM name.
1710          */
1711         setproctitle("%s", vmname);
1712 
1713 #ifndef WITHOUT_CAPSICUM
1714         caph_cache_catpages();
1715 
1716         if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
1717                 errx(EX_OSERR, "Unable to apply rights for sandbox");
1718 
1719         if (caph_enter() == -1)
1720                 errx(EX_OSERR, "cap_enter() failed");
1721 #endif
1722 
1723 /* XXX SmartOS:  Upstream drops privs here, but we can't yet.  See below... */
1724 
1725 #ifdef __FreeBSD__
1726         /*
1727          * Add CPU 0
1728          */
1729         fbsdrun_addcpu(ctx, BSP, BSP, rip);
1730 #else
1731         /* Set BSP to run (unlike the APs which wait for INIT) */
1732         error = vm_set_run_state(ctx, BSP, VRS_RUN, 0);
1733         assert(error == 0);
1734         fbsdrun_addcpu(ctx, BSP, rip,
1735             get_config_bool_default("suspend_at_boot", false));
1736 
1737         /* Add subsequent CPUs, which will wait until INIT/SIPI-ed */
1738         for (uint_t i = 1; i < guest_ncpus; i++) {
1739                 spinup_halted_ap(ctx, i);
1740         }
1741         mark_provisioned();
1742         /*
1743          * XXX SmartOS:  The mark_provisioned() call above required file-access
1744          * privileges that are dropped by the generic call.  We must widen the
1745          * full-privilege window a bit.  A better solution might be to have
1746          * a way to keep file-access a bit longer, and only have THAT privilege
1747          * to drop here.
1748          */
1749         illumos_priv_lock();
1750 #endif
1751 
1752         /*
1753          * Head off to the main event dispatch loop
1754          */
1755         mevent_dispatch();
1756 
1757         exit(4);
1758 }