Print this page
    
Revert "OS-8005 bhyve memory pressure needs to target ARC better (#354)"
This reverts commit a6033573eedd94118d2b9e65f45deca0bf4b42f7.
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/cmd/bhyve/bhyverun.c
          +++ new/usr/src/cmd/bhyve/bhyverun.c
   1    1  /*-
   2    2   * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3    3   *
   4    4   * Copyright (c) 2011 NetApp, Inc.
   5    5   * All rights reserved.
   6    6   *
   7    7   * Redistribution and use in source and binary forms, with or without
   8    8   * modification, are permitted provided that the following conditions
   9    9   * are met:
  10   10   * 1. Redistributions of source code must retain the above copyright
  11   11   *    notice, this list of conditions and the following disclaimer.
  12   12   * 2. Redistributions in binary form must reproduce the above copyright
  13   13   *    notice, this list of conditions and the following disclaimer in the
  14   14   *    documentation and/or other materials provided with the distribution.
  15   15   *
  16   16   * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  17   17   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18   18   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19   19   * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  20   20   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21   21   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22   22   * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23   23   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24   24   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25   25   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26   26   * SUCH DAMAGE.
  27   27   *
  28   28   * $FreeBSD$
  29   29   */
  30   30  /*
  31   31   * This file and its contents are supplied under the terms of the
  32   32   * Common Development and Distribution License ("CDDL"), version 1.0.
  33   33   * You may only use this file in accordance with the terms of version
  34   34   * 1.0 of the CDDL.
  35   35   *
  36   36   * A full copy of the text of the CDDL should have accompanied this
  37   37   * source.  A copy of the CDDL is also available via the Internet at
  38   38   * http://www.illumos.org/license/CDDL.
  39   39   *
  40   40   * Copyright 2015 Pluribus Networks Inc.
  41   41   * Copyright 2018 Joyent, Inc.
  42   42   * Copyright 2020 Oxide Computer Company
  43   43   */
  44   44  
  45   45  #include <sys/cdefs.h>
  46   46  __FBSDID("$FreeBSD$");
  47   47  
  48   48  #include <sys/types.h>
  49   49  #ifndef WITHOUT_CAPSICUM
  50   50  #include <sys/capsicum.h>
  51   51  #endif
  52   52  #include <sys/mman.h>
  53   53  #include <sys/time.h>
  54   54  #include <sys/cpuset.h>
  55   55  
  56   56  #ifdef __FreeBSD__
  57   57  #include <amd64/vmm/intel/vmcs.h>
  58   58  #else
  59   59  #include <intel/vmcs.h>
  60   60  #endif
  61   61  
  62   62  #include <machine/atomic.h>
  63   63  #include <machine/segments.h>
  64   64  
  65   65  #ifndef WITHOUT_CAPSICUM
  66   66  #include <capsicum_helpers.h>
  67   67  #endif
  68   68  #include <stdio.h>
  69   69  #include <stdlib.h>
  70   70  #include <string.h>
  71   71  #include <err.h>
  72   72  #include <errno.h>
  73   73  #include <libgen.h>
  74   74  #include <unistd.h>
  75   75  #include <assert.h>
  76   76  #include <pthread.h>
  77   77  #include <pthread_np.h>
  78   78  #include <sysexits.h>
  79   79  #include <stdbool.h>
  80   80  #include <stdint.h>
  81   81  
  82   82  #include <machine/vmm.h>
  83   83  #ifndef WITHOUT_CAPSICUM
  84   84  #include <machine/vmm_dev.h>
  85   85  #endif
  86   86  #include <vmmapi.h>
  87   87  
  88   88  #ifndef __FreeBSD__
  89   89  #include <sys/stat.h>
  90   90  #endif
  91   91  
  92   92  #include "bhyverun.h"
  93   93  #include "acpi.h"
  94   94  #include "atkbdc.h"
  95   95  #include "console.h"
  96   96  #include "bootrom.h"
  97   97  #include "config.h"
  98   98  #include "inout.h"
  99   99  #include "debug.h"
 100  100  #include "fwctl.h"
 101  101  #include "gdb.h"
 102  102  #include "ioapic.h"
 103  103  #include "kernemu_dev.h"
 104  104  #include "mem.h"
 105  105  #include "mevent.h"
 106  106  #include "mptbl.h"
 107  107  #include "pci_emul.h"
 108  108  #include "pci_irq.h"
 109  109  #include "pci_lpc.h"
 110  110  #include "smbiostbl.h"
 111  111  #include "xmsr.h"
 112  112  #include "spinup_ap.h"
 113  113  #include "rfb.h"
 114  114  #include "rtc.h"
 115  115  #include "vga.h"
 116  116  #include "vmgenc.h"
 117  117  #ifndef __FreeBSD__
 118  118  #include "privileges.h"
 119  119  #endif
 120  120  
#define GUEST_NIO_PORT          0x488   /* guest upcalls via i/o port */

/* Byte-count multipliers: MB is 2^20 bytes, GB is 2^30 bytes. */
#define MB              (1024UL * 1024)
#define GB              (1024UL * MB)
 125  125  
/*
 * Descriptive strings for VMX exit reasons, indexed by the EXIT_REASON_*
 * constants.  Any slot without a designated initializer below is NULL.
 */
static const char * const vmx_exit_reason_desc[] = {
        [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
        [EXIT_REASON_EXT_INTR] = "External interrupt",
        [EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
        [EXIT_REASON_INIT] = "INIT signal",
        [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
        [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
        [EXIT_REASON_SMI] = "Other SMI",
        [EXIT_REASON_INTR_WINDOW] = "Interrupt window",
        [EXIT_REASON_NMI_WINDOW] = "NMI window",
        [EXIT_REASON_TASK_SWITCH] = "Task switch",
        [EXIT_REASON_CPUID] = "CPUID",
        [EXIT_REASON_GETSEC] = "GETSEC",
        [EXIT_REASON_HLT] = "HLT",
        [EXIT_REASON_INVD] = "INVD",
        [EXIT_REASON_INVLPG] = "INVLPG",
        [EXIT_REASON_RDPMC] = "RDPMC",
        [EXIT_REASON_RDTSC] = "RDTSC",
        [EXIT_REASON_RSM] = "RSM",
        [EXIT_REASON_VMCALL] = "VMCALL",
        [EXIT_REASON_VMCLEAR] = "VMCLEAR",
        [EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
        [EXIT_REASON_VMPTRLD] = "VMPTRLD",
        [EXIT_REASON_VMPTRST] = "VMPTRST",
        [EXIT_REASON_VMREAD] = "VMREAD",
        [EXIT_REASON_VMRESUME] = "VMRESUME",
        [EXIT_REASON_VMWRITE] = "VMWRITE",
        [EXIT_REASON_VMXOFF] = "VMXOFF",
        [EXIT_REASON_VMXON] = "VMXON",
        [EXIT_REASON_CR_ACCESS] = "Control-register accesses",
        [EXIT_REASON_DR_ACCESS] = "MOV DR",
        [EXIT_REASON_INOUT] = "I/O instruction",
        [EXIT_REASON_RDMSR] = "RDMSR",
        [EXIT_REASON_WRMSR] = "WRMSR",
        [EXIT_REASON_INVAL_VMCS] =
            "VM-entry failure due to invalid guest state",
        [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
        [EXIT_REASON_MWAIT] = "MWAIT",
        [EXIT_REASON_MTF] = "Monitor trap flag",
        [EXIT_REASON_MONITOR] = "MONITOR",
        [EXIT_REASON_PAUSE] = "PAUSE",
        [EXIT_REASON_MCE_DURING_ENTRY] =
            "VM-entry failure due to machine-check event",
        [EXIT_REASON_TPR] = "TPR below threshold",
        [EXIT_REASON_APIC_ACCESS] = "APIC access",
        [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
        [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
        [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
        [EXIT_REASON_EPT_FAULT] = "EPT violation",
        [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
        [EXIT_REASON_INVEPT] = "INVEPT",
        [EXIT_REASON_RDTSCP] = "RDTSCP",
        [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
        [EXIT_REASON_INVVPID] = "INVVPID",
        [EXIT_REASON_WBINVD] = "WBINVD",
        [EXIT_REASON_XSETBV] = "XSETBV",
        [EXIT_REASON_APIC_WRITE] = "APIC write",
        [EXIT_REASON_RDRAND] = "RDRAND",
        [EXIT_REASON_INVPCID] = "INVPCID",
        [EXIT_REASON_VMFUNC] = "VMFUNC",
        [EXIT_REASON_ENCLS] = "ENCLS",
        [EXIT_REASON_RDSEED] = "RDSEED",
        [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
        [EXIT_REASON_XSAVES] = "XSAVES",
        [EXIT_REASON_XRSTORS] = "XRSTORS"
};
 192  192  
/*
 * Signature of a VM-exit handler: receives the VM context, the exit
 * description and a pointer to the vCPU index.
 */
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);

/*
 * Guest CPU count and topology.  guest_ncpus, cores, sockets and threads
 * are filled in by calc_topolopgy().
 */
int guest_ncpus;
uint16_t cores, maxcpus, sockets, threads;

int raw_stdio = 0;

/* Program name printed by usage(). */
static char *progname;
/* vCPU index that fbsdrun_addcpu() requires as 'fromcpu' (FreeBSD build). */
static const int BSP = 0;

/*
 * Set of vCPUs added through fbsdrun_addcpu() and not yet removed by
 * fbsdrun_deletecpu().
 */
static cpuset_t cpumask;

static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);

/* Exit and entry state arrays, one slot per possible vCPU. */
static struct vm_exit vmexit[VM_MAXCPU];
static struct vm_entry vmentry[VM_MAXCPU];
 210  210  
/*
 * Event counters kept in the global 'stats'.  Within the visible code
 * vmexit_inout() increments vmexit_inout; the remaining fields are
 * presumably bumped by the correspondingly named exit handlers -- confirm
 * against the rest of the file.
 */
struct bhyvestats {
        uint64_t        vmexit_bogus;
        uint64_t        vmexit_reqidle;
        uint64_t        vmexit_hlt;
        uint64_t        vmexit_pause;
        uint64_t        vmexit_mtrap;
        uint64_t        vmexit_mmio;
        uint64_t        vmexit_inout;
        uint64_t        cpu_switch_rotate;
        uint64_t        cpu_switch_direct;
        uint64_t        mmio_unhandled;
} stats;
 223  223  
/*
 * Per-vCPU thread start-up record.  fbsdrun_addcpu() fills in the slot for
 * a new vCPU and hands it to pthread_create(); fbsdrun_start_thread() reads
 * it back to name the thread and enter vm_loop().
 */
struct mt_vmm_info {
        pthread_t       mt_thr;         /* thread created for this vCPU */
        struct vmctx    *mt_ctx;        /* VM context passed to vm_loop() */
        int             mt_vcpu;        /* vCPU index */
        uint64_t        mt_startrip;    /* RIP passed to vm_loop() */
} mt_vmm_info[VM_MAXCPU];
 230  230  
#ifdef  __FreeBSD__
/*
 * Per-vCPU host CPU sets allocated by build_vcpumaps(); an entry stays NULL
 * when no "vcpu.N.cpuset" value is configured for that vCPU.
 */
static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
#endif
 234  234  
 235  235  static void
 236  236  usage(int code)
 237  237  {
 238  238  
 239  239          fprintf(stderr,
 240  240  #ifdef  __FreeBSD__
 241  241                  "Usage: %s [-aehuwxACDHPSWY]\n"
 242  242  #else
 243  243                  "Usage: %s [-adehuwxACDHPSWY]\n"
 244  244  #endif
 245  245                  "       %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
 246  246                  "       %*s [-k <file>] [-l <lpc>] [-m mem] [-o <var>=<value>]\n"
 247  247  #ifdef  __FreeBSD__
 248  248                  "       %*s [-p vcpu:hostcpu] [-s <pci>] [-U uuid] [<vm>]\n"
 249  249  #else
 250  250                  "       %*s [-s <pci>] [-U uuid] [<vm>]\n"
 251  251  #endif
 252  252                  "       -a: local apic is in xAPIC mode (deprecated)\n"
 253  253                  "       -A: create ACPI tables\n"
 254  254                  "       -c: number of cpus and/or topology specification\n"
 255  255                  "       -C: include guest memory in core file\n"
 256  256  #ifndef __FreeBSD__
 257  257                  "       -d: suspend cpu at boot\n"
 258  258  #endif
 259  259                  "       -D: destroy on power-off\n"
 260  260                  "       -e: exit on unhandled I/O access\n"
 261  261                  "       -h: help\n"
 262  262                  "       -H: vmexit from the guest on hlt\n"
 263  263                  "       -k: key=value flat config file\n"
 264  264                  "       -l: LPC device configuration\n"
 265  265                  "       -m: memory size\n"
 266  266                  "       -o: set config 'var' to 'value'\n"
 267  267  #ifdef  __FreeBSD__
 268  268                  "       -p: pin 'vcpu' to 'hostcpu'\n"
 269  269  #endif
 270  270                  "       -P: vmexit from the guest on pause\n"
 271  271                  "       -s: <slot,driver,configinfo> PCI slot config\n"
 272  272                  "       -S: guest memory cannot be swapped\n"
 273  273                  "       -u: RTC keeps UTC time\n"
 274  274                  "       -U: uuid\n"
 275  275                  "       -w: ignore unimplemented MSRs\n"
 276  276                  "       -W: force virtio to use single-vector MSI\n"
 277  277                  "       -x: local apic is in x2APIC mode\n"
 278  278                  "       -Y: disable MPtable generation\n",
 279  279                  progname, (int)strlen(progname), "", (int)strlen(progname), "",
 280  280                  (int)strlen(progname), "");
 281  281  
 282  282          exit(code);
 283  283  }
 284  284  
 285  285  /*
 286  286   * XXX This parser is known to have the following issues:
 287  287   * 1.  It accepts null key=value tokens ",," as setting "cpus" to an
 288  288   *     empty string.
 289  289   *
 290  290   * The acceptance of a null specification ('-c ""') is by design to match the
 291  291   * manual page syntax specification, this results in a topology of 1 vCPU.
 292  292   */
 293  293  static int
 294  294  topology_parse(const char *opt)
 295  295  {
 296  296          char *cp, *str;
 297  297  
 298  298          if (*opt == '\0') {
 299  299                  set_config_value("sockets", "1");
 300  300                  set_config_value("cores", "1");
 301  301                  set_config_value("threads", "1");
 302  302                  set_config_value("cpus", "1");
 303  303                  return (0);
 304  304          }
 305  305  
 306  306          str = strdup(opt);
 307  307          if (str == NULL)
 308  308                  errx(4, "Failed to allocate memory");
 309  309  
 310  310          while ((cp = strsep(&str, ",")) != NULL) {
 311  311                  if (strncmp(cp, "cpus=", strlen("cpus=")) == 0)
 312  312                          set_config_value("cpus", cp + strlen("cpus="));
 313  313                  else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0)
 314  314                          set_config_value("sockets", cp + strlen("sockets="));
 315  315                  else if (strncmp(cp, "cores=", strlen("cores=")) == 0)
 316  316                          set_config_value("cores", cp + strlen("cores="));
 317  317                  else if (strncmp(cp, "threads=", strlen("threads=")) == 0)
 318  318                          set_config_value("threads", cp + strlen("threads="));
 319  319  #ifdef notyet  /* Do not expose this until vmm.ko implements it */
 320  320                  else if (strncmp(cp, "maxcpus=", strlen("maxcpus=")) == 0)
 321  321                          set_config_value("maxcpus", cp + strlen("maxcpus="));
 322  322  #endif
 323  323                  else if (strchr(cp, '=') != NULL)
 324  324                          goto out;
 325  325                  else
 326  326                          set_config_value("cpus", cp);
 327  327          }
 328  328          free(str);
 329  329          return (0);
 330  330  
 331  331  out:
 332  332          free(str);
 333  333          return (-1);
 334  334  }
 335  335  
 336  336  static int
 337  337  parse_int_value(const char *key, const char *value, int minval, int maxval)
 338  338  {
 339  339          char *cp;
 340  340          long lval;
 341  341  
 342  342          errno = 0;
 343  343          lval = strtol(value, &cp, 0);
 344  344          if (errno != 0 || *cp != '\0' || cp == value || lval < minval ||
 345  345              lval > maxval)
 346  346                  errx(4, "Invalid value for %s: '%s'", key, value);
 347  347          return (lval);
 348  348  }
 349  349  
 350  350  /*
 351  351   * Set the sockets, cores, threads, and guest_cpus variables based on
 352  352   * the configured topology.
 353  353   *
 354  354   * The limits of UINT16_MAX are due to the types passed to
 355  355   * vm_set_topology().  vmm.ko may enforce tighter limits.
 356  356   */
 357  357  static void
 358  358  calc_topolopgy(void)
 359  359  {
 360  360          const char *value;
 361  361          bool explicit_cpus;
 362  362          uint64_t ncpus;
 363  363  
 364  364          value = get_config_value("cpus");
 365  365          if (value != NULL) {
 366  366                  guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX);
 367  367                  explicit_cpus = true;
 368  368          } else {
 369  369                  guest_ncpus = 1;
 370  370                  explicit_cpus = false;
 371  371          }
 372  372          value = get_config_value("cores");
 373  373          if (value != NULL)
 374  374                  cores = parse_int_value("cores", value, 1, UINT16_MAX);
 375  375          else
 376  376                  cores = 1;
 377  377          value = get_config_value("threads");
 378  378          if (value != NULL)
 379  379                  threads = parse_int_value("threads", value, 1, UINT16_MAX);
 380  380          else
 381  381                  threads = 1;
 382  382          value = get_config_value("sockets");
 383  383          if (value != NULL)
 384  384                  sockets = parse_int_value("sockets", value, 1, UINT16_MAX);
 385  385          else
 386  386                  sockets = guest_ncpus;
 387  387  
 388  388          /*
 389  389           * Compute sockets * cores * threads avoiding overflow.  The
 390  390           * range check above insures these are 16 bit values.
 391  391           */
 392  392          ncpus = (uint64_t)sockets * cores * threads;
 393  393          if (ncpus > UINT16_MAX)
 394  394                  errx(4, "Computed number of vCPUs too high: %ju",
 395  395                      (uintmax_t)ncpus);
 396  396  
 397  397          if (explicit_cpus) {
 398  398                  if (guest_ncpus != ncpus)
 399  399                          errx(4, "Topology (%d sockets, %d cores, %d threads) "
 400  400                              "does not match %d vCPUs", sockets, cores, threads,
 401  401                              guest_ncpus);
 402  402          } else
 403  403                  guest_ncpus = ncpus;
 404  404  }
 405  405  
 406  406  #ifndef WITHOUT_CAPSICUM
 407  407  /*
 408  408   * 11-stable capsicum helpers
 409  409   */
/*
 * Open the "libc" message catalog with NL_CAT_LOCALE and discard the
 * returned descriptor.  Presumably done so the catalog is already loaded
 * before capability mode is entered -- confirm against the caller.
 */
static void
bhyve_caph_cache_catpages(void)
{

        (void)catopen("libc", NL_CAT_LOCALE);
}
 416  416  
 417  417  static int
 418  418  bhyve_caph_limit_stdoe(void)
 419  419  {
 420  420          cap_rights_t rights;
 421  421          unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ };
 422  422          int i, fds[] = { STDOUT_FILENO, STDERR_FILENO };
 423  423  
 424  424          cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL);
 425  425          cap_rights_set(&rights, CAP_WRITE);
 426  426  
 427  427          for (i = 0; i < nitems(fds); i++) {
 428  428                  if (cap_rights_limit(fds[i], &rights) < 0 && errno != ENOSYS)
 429  429                          return (-1);
 430  430  
 431  431                  if (cap_ioctls_limit(fds[i], cmds, nitems(cmds)) < 0 && errno != ENOSYS)
 432  432                          return (-1);
 433  433  
 434  434                  if (cap_fcntls_limit(fds[i], CAP_FCNTL_GETFL) < 0 && errno != ENOSYS)
 435  435                          return (-1);
 436  436          }
 437  437  
 438  438          return (0);
 439  439  }
 440  440  
 441  441  #endif
 442  442  
 443  443  #ifdef  __FreeBSD__
 444  444  static int
 445  445  pincpu_parse(const char *opt)
 446  446  {
 447  447          int vcpu, pcpu;
 448  448  
 449  449          if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
 450  450                  fprintf(stderr, "invalid format: %s\n", opt);
 451  451                  return (-1);
 452  452          }
 453  453  
 454  454          if (vcpu < 0 || vcpu >= VM_MAXCPU) {
 455  455                  fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
 456  456                      vcpu, VM_MAXCPU - 1);
 457  457                  return (-1);
 458  458          }
 459  459  
 460  460          if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
 461  461                  fprintf(stderr, "hostcpu '%d' outside valid range from "
 462  462                      "0 to %d\n", pcpu, CPU_SETSIZE - 1);
 463  463                  return (-1);
 464  464          }
 465  465  
 466  466          snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
 467  467          value = get_config_value(key);
 468  468  
 469  469          if (asprintf(&newval, "%s%s%d", value != NULL ? value : "",
 470  470              value != NULL ? "," : "", pcpu) == -1) {
 471  471                  perror("failed to build new cpuset string");
 472  472                  return (-1);
 473  473          }
 474  474  
 475  475          set_config_value(key, newval);
 476  476          free(newval);
 477  477          return (0);
 478  478  }
 479  479  
 480  480  static void
 481  481  parse_cpuset(int vcpu, const char *list, cpuset_t *set)
 482  482  {
 483  483          char *cp, *token;
 484  484          int pcpu, start;
 485  485  
 486  486          CPU_ZERO(set);
 487  487          start = -1;
 488  488          token = __DECONST(char *, list);
 489  489          for (;;) {
 490  490                  pcpu = strtoul(token, &cp, 0);
 491  491                  if (cp == token)
 492  492                          errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
 493  493                  if (pcpu < 0 || pcpu >= CPU_SETSIZE)
 494  494                          errx(4, "hostcpu '%d' outside valid range from 0 to %d",
 495  495                              pcpu, CPU_SETSIZE - 1);
 496  496                  switch (*cp) {
 497  497                  case ',':
 498  498                  case '\0':
 499  499                          if (start >= 0) {
 500  500                                  if (start > pcpu)
 501  501                                          errx(4, "Invalid hostcpu range %d-%d",
 502  502                                              start, pcpu);
 503  503                                  while (start < pcpu) {
 504  504                                          CPU_SET(start, vcpumap[vcpu]);
 505  505                                          start++;
 506  506                                  }
 507  507                                  start = -1;
 508  508                          }
 509  509                          CPU_SET(pcpu, vcpumap[vcpu]);
 510  510                          break;
 511  511                  case '-':
 512  512                          if (start >= 0)
 513  513                                  errx(4, "invalid cpuset for vcpu %d: '%s'",
 514  514                                      vcpu, list);
 515  515                          start = pcpu;
 516  516                          break;
 517  517                  default:
 518  518                          errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
 519  519                  }
 520  520                  if (*cp == '\0')
 521  521                          break;
 522  522                  token = cp + 1;
 523  523          }
 524  524  }
 525  525  
 526  526  static void
 527  527  build_vcpumaps(void)
 528  528  {
 529  529          char key[16];
 530  530          const char *value;
 531  531          int vcpu;
 532  532  
 533  533          for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
 534  534                  snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
 535  535                  value = get_config_value(key);
 536  536                  if (value == NULL)
 537  537                          continue;
 538  538                  vcpumap[vcpu] = malloc(sizeof(cpuset_t));
 539  539                  if (vcpumap[vcpu] == NULL)
 540  540                          err(4, "Failed to allocate cpuset for vcpu %d", vcpu);
 541  541                  parse_cpuset(vcpu, value, vcpumap[vcpu]);
 542  542          }
 543  543  }
 544  544  
 545  545  void
 546  546  vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
 547  547      int errcode)
 548  548  {
 549  549          struct vmctx *ctx;
 550  550          int error, restart_instruction;
 551  551  
 552  552          ctx = arg;
 553  553          restart_instruction = 1;
 554  554  
 555  555          error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
 556  556              restart_instruction);
 557  557          assert(error == 0);
 558  558  }
 559  559  #endif /* __FreeBSD__ */
 560  560  
 561  561  void *
 562  562  paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
 563  563  {
 564  564  
 565  565          return (vm_map_gpa(ctx, gaddr, len));
 566  566  }
 567  567  
 568  568  int
 569  569  fbsdrun_virtio_msix(void)
 570  570  {
 571  571  
 572  572          return (get_config_bool_default("virtio_msix", true));
 573  573  }
 574  574  
 575  575  static void *
 576  576  fbsdrun_start_thread(void *param)
 577  577  {
 578  578          char tname[MAXCOMLEN + 1];
 579  579          struct mt_vmm_info *mtp;
 580  580          int vcpu;
 581  581  
 582  582          mtp = param;
 583  583          vcpu = mtp->mt_vcpu;
 584  584  
 585  585          snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
 586  586          pthread_set_name_np(mtp->mt_thr, tname);
 587  587  
 588  588          gdb_cpu_add(vcpu);
 589  589  
 590  590          vm_loop(mtp->mt_ctx, vcpu, mtp->mt_startrip);
 591  591  
 592  592          /* not reached */
 593  593          exit(1);
 594  594          return (NULL);
 595  595  }
 596  596  
 597  597  #ifdef __FreeBSD__
 598  598  void
 599  599  fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
 600  600  #else
 601  601  void
 602  602  fbsdrun_addcpu(struct vmctx *ctx, int newcpu, uint64_t rip, bool suspend)
 603  603  #endif
 604  604  {
 605  605          int error;
 606  606  
 607  607  #ifdef __FreeBSD__
 608  608          assert(fromcpu == BSP);
 609  609  #endif
 610  610  
 611  611          /*
 612  612           * The 'newcpu' must be activated in the context of 'fromcpu'. If
 613  613           * vm_activate_cpu() is delayed until newcpu's pthread starts running
 614  614           * then vmm.ko is out-of-sync with bhyve and this can create a race
 615  615           * with vm_suspend().
 616  616           */
 617  617          error = vm_activate_cpu(ctx, newcpu);
 618  618          if (error != 0)
 619  619                  err(EX_OSERR, "could not activate CPU %d", newcpu);
 620  620  
 621  621          CPU_SET_ATOMIC(newcpu, &cpumask);
 622  622  
 623  623  #ifndef __FreeBSD__
 624  624          if (suspend)
 625  625                  (void) vm_suspend_cpu(ctx, newcpu);
 626  626  #endif
 627  627  
 628  628          /*
 629  629           * Set up the vmexit struct to allow execution to start
 630  630           * at the given RIP
 631  631           */
 632  632          mt_vmm_info[newcpu].mt_ctx = ctx;
 633  633          mt_vmm_info[newcpu].mt_vcpu = newcpu;
 634  634          mt_vmm_info[newcpu].mt_startrip = rip;
 635  635  
 636  636          error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
 637  637              fbsdrun_start_thread, &mt_vmm_info[newcpu]);
 638  638          assert(error == 0);
 639  639  }
 640  640  
 641  641  static int
 642  642  fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
 643  643  {
 644  644  
 645  645          if (!CPU_ISSET(vcpu, &cpumask)) {
 646  646                  fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
 647  647                  exit(4);
 648  648          }
 649  649  
 650  650          CPU_CLR_ATOMIC(vcpu, &cpumask);
 651  651          return (CPU_EMPTY(&cpumask));
 652  652  }
 653  653  
 654  654  static void
 655  655  vmentry_mmio_read(int vcpu, uint64_t gpa, uint8_t bytes, uint64_t data)
 656  656  {
 657  657          struct vm_entry *entry = &vmentry[vcpu];
 658  658          struct vm_mmio *mmio = &entry->u.mmio;
 659  659  
 660  660          assert(entry->cmd == VEC_DEFAULT);
 661  661  
 662  662          entry->cmd = VEC_FULFILL_MMIO;
 663  663          mmio->bytes = bytes;
 664  664          mmio->read = 1;
 665  665          mmio->gpa = gpa;
 666  666          mmio->data = data;
 667  667  }
 668  668  
 669  669  static void
 670  670  vmentry_mmio_write(int vcpu, uint64_t gpa, uint8_t bytes)
 671  671  {
 672  672          struct vm_entry *entry = &vmentry[vcpu];
 673  673          struct vm_mmio *mmio = &entry->u.mmio;
 674  674  
 675  675          assert(entry->cmd == VEC_DEFAULT);
 676  676  
 677  677          entry->cmd = VEC_FULFILL_MMIO;
 678  678          mmio->bytes = bytes;
 679  679          mmio->read = 0;
 680  680          mmio->gpa = gpa;
 681  681          mmio->data = 0;
 682  682  }
 683  683  
 684  684  static void
 685  685  vmentry_inout_read(int vcpu, uint16_t port, uint8_t bytes, uint32_t data)
 686  686  {
 687  687          struct vm_entry *entry = &vmentry[vcpu];
 688  688          struct vm_inout *inout = &entry->u.inout;
 689  689  
 690  690          assert(entry->cmd == VEC_DEFAULT);
 691  691  
 692  692          entry->cmd = VEC_FULFILL_INOUT;
 693  693          inout->bytes = bytes;
 694  694          inout->flags = INOUT_IN;
 695  695          inout->port = port;
 696  696          inout->eax = data;
 697  697  }
 698  698  
 699  699  static void
 700  700  vmentry_inout_write(int vcpu, uint16_t port, uint8_t bytes)
 701  701  {
 702  702          struct vm_entry *entry = &vmentry[vcpu];
 703  703          struct vm_inout *inout = &entry->u.inout;
 704  704  
 705  705          assert(entry->cmd == VEC_DEFAULT);
 706  706  
 707  707          entry->cmd = VEC_FULFILL_INOUT;
 708  708          inout->bytes = bytes;
 709  709          inout->flags = 0;
 710  710          inout->port = port;
 711  711          inout->eax = 0;
 712  712  }
 713  713  
/*
 * Hook invoked by vmexit_inout() when the guest writes to GUEST_NIO_PORT;
 * 'eax' is the value written.  Currently a placeholder that ignores its
 * arguments and always returns VMEXIT_CONTINUE.
 */
static int
vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
                     uint32_t eax)
{
#if BHYVE_DEBUG
        /*
         * put guest-driven debug here
         */
#endif
        return (VMEXIT_CONTINUE);
}
 725  725  
 726  726  static int
 727  727  vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 728  728  {
 729  729          int error;
 730  730          int vcpu;
 731  731          struct vm_inout inout;
 732  732          bool in;
 733  733          uint8_t bytes;
 734  734  
 735  735          stats.vmexit_inout++;
 736  736  
 737  737          vcpu = *pvcpu;
 738  738          inout = vme->u.inout;
 739  739          in = (inout.flags & INOUT_IN) != 0;
 740  740          bytes = inout.bytes;
 741  741  
 742  742          /* Extra-special case of host notifications */
 743  743          if (!in && inout.port == GUEST_NIO_PORT) {
 744  744                  error = vmexit_handle_notify(ctx, vme, pvcpu, inout.eax);
 745  745                  vmentry_inout_write(vcpu, inout.port, bytes);
 746  746                  return (error);
 747  747          }
 748  748  
 749  749          error = emulate_inout(ctx, vcpu, &inout);
 750  750          if (error) {
 751  751                  fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
 752  752                      in ? "in" : "out",
 753  753                      bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
 754  754                      inout.port, vmexit->rip);
 755  755                  return (VMEXIT_ABORT);
 756  756          } else {
 757  757                  /*
 758  758                   * Communicate the status of the inout operation back to the
 759  759                   * in-kernel instruction emulation.
 760  760                   */
 761  761                  if (in) {
 762  762                          vmentry_inout_read(vcpu, inout.port, bytes, inout.eax);
 763  763                  } else {
 764  764                          vmentry_inout_write(vcpu, inout.port, bytes);
 765  765                  }
 766  766                  return (VMEXIT_CONTINUE);
 767  767          }
 768  768  }
 769  769  
 770  770  static int
 771  771  vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 772  772  {
 773  773          uint64_t val;
 774  774          uint32_t eax, edx;
 775  775          int error;
 776  776  
 777  777          val = 0;
 778  778          error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
 779  779          if (error != 0) {
 780  780                  fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
 781  781                      vme->u.msr.code, *pvcpu);
 782  782                  if (get_config_bool("x86.strictmsr")) {
 783  783                          vm_inject_gp(ctx, *pvcpu);
 784  784                          return (VMEXIT_CONTINUE);
 785  785                  }
 786  786          }
 787  787  
 788  788          eax = val;
 789  789          error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
 790  790          assert(error == 0);
 791  791  
 792  792          edx = val >> 32;
 793  793          error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
 794  794          assert(error == 0);
 795  795  
 796  796          return (VMEXIT_CONTINUE);
 797  797  }
 798  798  
 799  799  static int
 800  800  vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 801  801  {
 802  802          int error;
 803  803  
 804  804          error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
 805  805          if (error != 0) {
 806  806                  fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
 807  807                      vme->u.msr.code, vme->u.msr.wval, *pvcpu);
 808  808                  if (get_config_bool("x86.strictmsr")) {
 809  809                          vm_inject_gp(ctx, *pvcpu);
 810  810                          return (VMEXIT_CONTINUE);
 811  811                  }
 812  812          }
 813  813          return (VMEXIT_CONTINUE);
 814  814  }
 815  815  
 816  816  #ifdef __FreeBSD__
 817  817  static int
 818  818  vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 819  819  {
 820  820  
 821  821          (void)spinup_ap(ctx, *pvcpu,
 822  822                      vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
 823  823  
 824  824          return (VMEXIT_CONTINUE);
 825  825  }
 826  826  #else
 827  827  static int
 828  828  vmexit_run_state(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 829  829  {
 830  830          /*
 831  831           * Run-state transitions (INIT, SIPI, etc) are handled in-kernel, so an
 832  832           * exit to userspace with that code is not expected.
 833  833           */
 834  834          fprintf(stderr, "unexpected run-state VM exit");
 835  835          return (VMEXIT_ABORT);
 836  836  }
 837  837  #endif /* __FreeBSD__ */
 838  838  
 839  839  #ifdef __FreeBSD__
 840  840  #define DEBUG_EPT_MISCONFIG
 841  841  #else
 842  842  /* EPT misconfig debugging not possible now that raw VMCS access is gone */
 843  843  #endif
 844  844  
 845  845  #ifdef DEBUG_EPT_MISCONFIG
 846  846  #define VMCS_GUEST_PHYSICAL_ADDRESS     0x00002400
 847  847  
 848  848  static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
 849  849  static int ept_misconfig_ptenum;
 850  850  #endif
 851  851  
 852  852  static const char *
 853  853  vmexit_vmx_desc(uint32_t exit_reason)
 854  854  {
 855  855  
 856  856          if (exit_reason >= nitems(vmx_exit_reason_desc) ||
 857  857              vmx_exit_reason_desc[exit_reason] == NULL)
 858  858                  return ("Unknown");
 859  859          return (vmx_exit_reason_desc[exit_reason]);
 860  860  }
 861  861  
 862  862  static int
 863  863  vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 864  864  {
 865  865  
 866  866          fprintf(stderr, "vm exit[%d]\n", *pvcpu);
 867  867          fprintf(stderr, "\treason\t\tVMX\n");
 868  868          fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
 869  869          fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
 870  870          fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
 871  871          fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason,
 872  872              vmexit_vmx_desc(vmexit->u.vmx.exit_reason));
 873  873          fprintf(stderr, "\tqualification\t0x%016lx\n",
 874  874              vmexit->u.vmx.exit_qualification);
 875  875          fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
 876  876          fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
 877  877  #ifdef DEBUG_EPT_MISCONFIG
 878  878          if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
 879  879                  vm_get_register(ctx, *pvcpu,
 880  880                      VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
 881  881                      &ept_misconfig_gpa);
 882  882                  vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
 883  883                      &ept_misconfig_ptenum);
 884  884                  fprintf(stderr, "\tEPT misconfiguration:\n");
 885  885                  fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
 886  886                  fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
 887  887                      ept_misconfig_ptenum, ept_misconfig_pte[0],
 888  888                      ept_misconfig_pte[1], ept_misconfig_pte[2],
 889  889                      ept_misconfig_pte[3]);
 890  890          }
 891  891  #endif  /* DEBUG_EPT_MISCONFIG */
 892  892          return (VMEXIT_ABORT);
 893  893  }
 894  894  
 895  895  static int
 896  896  vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 897  897  {
 898  898  
 899  899          fprintf(stderr, "vm exit[%d]\n", *pvcpu);
 900  900          fprintf(stderr, "\treason\t\tSVM\n");
 901  901          fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
 902  902          fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
 903  903          fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
 904  904          fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
 905  905          fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
 906  906          return (VMEXIT_ABORT);
 907  907  }
 908  908  
 909  909  static int
 910  910  vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 911  911  {
 912  912  
 913  913          assert(vmexit->inst_length == 0);
 914  914  
 915  915          stats.vmexit_bogus++;
 916  916  
 917  917          return (VMEXIT_CONTINUE);
 918  918  }
 919  919  
 920  920  static int
 921  921  vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 922  922  {
 923  923  
 924  924          assert(vmexit->inst_length == 0);
 925  925  
 926  926          stats.vmexit_reqidle++;
 927  927  
 928  928          return (VMEXIT_CONTINUE);
 929  929  }
 930  930  
 931  931  static int
 932  932  vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 933  933  {
 934  934  
 935  935          stats.vmexit_hlt++;
 936  936  
 937  937          /*
 938  938           * Just continue execution with the next instruction. We use
 939  939           * the HLT VM exit as a way to be friendly with the host
 940  940           * scheduler.
 941  941           */
 942  942          return (VMEXIT_CONTINUE);
 943  943  }
 944  944  
 945  945  static int
 946  946  vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 947  947  {
 948  948  
 949  949          stats.vmexit_pause++;
 950  950  
 951  951          return (VMEXIT_CONTINUE);
 952  952  }
 953  953  
 954  954  static int
 955  955  vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 956  956  {
 957  957  
 958  958          assert(vmexit->inst_length == 0);
 959  959  
 960  960          stats.vmexit_mtrap++;
 961  961  
 962  962          gdb_cpu_mtrap(*pvcpu);
 963  963  
 964  964          return (VMEXIT_CONTINUE);
 965  965  }
 966  966  
 967  967  static int
 968  968  vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 969  969  {
 970  970          uint8_t i, valid;
 971  971  
 972  972          fprintf(stderr, "Failed to emulate instruction sequence ");
 973  973  
 974  974          valid = vmexit->u.inst_emul.num_valid;
 975  975          if (valid != 0) {
 976  976                  assert(valid <= sizeof (vmexit->u.inst_emul.inst));
 977  977                  fprintf(stderr, "[");
 978  978                  for (i = 0; i < valid; i++) {
 979  979                          if (i == 0) {
 980  980                                  fprintf(stderr, "%02x",
 981  981                                      vmexit->u.inst_emul.inst[i]);
 982  982                          } else {
 983  983                                  fprintf(stderr, ", %02x",
 984  984                                      vmexit->u.inst_emul.inst[i]);
 985  985                          }
 986  986                  }
 987  987                  fprintf(stderr, "] ");
 988  988          }
 989  989          fprintf(stderr, "@ %rip = %x\n", vmexit->rip);
 990  990  
 991  991          return (VMEXIT_ABORT);
 992  992  }
 993  993  
 994  994  static int
 995  995  vmexit_mmio(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 996  996  {
 997  997          int vcpu, err;
 998  998          struct vm_mmio mmio;
 999  999          bool is_read;
1000 1000  
1001 1001          stats.vmexit_mmio++;
1002 1002  
1003 1003          vcpu = *pvcpu;
1004 1004          mmio = vmexit->u.mmio;
1005 1005          is_read = (mmio.read != 0);
1006 1006  
1007 1007          err = emulate_mem(ctx, vcpu, &mmio);
1008 1008  
1009 1009          if (err == ESRCH) {
1010 1010                  fprintf(stderr, "Unhandled memory access to 0x%lx\n", mmio.gpa);
1011 1011                  stats.mmio_unhandled++;
1012 1012  
1013 1013                  /*
1014 1014                   * Access to non-existent physical addresses is not likely to
1015 1015                   * result in fatal errors on hardware machines, but rather reads
1016 1016                   * of all-ones or discarded-but-acknowledged writes.
1017 1017                   */
1018 1018                  mmio.data = ~0UL;
1019 1019                  err = 0;
1020 1020          }
1021 1021  
1022 1022          if (err == 0) {
1023 1023                  if (is_read) {
1024 1024                          vmentry_mmio_read(vcpu, mmio.gpa, mmio.bytes,
1025 1025                              mmio.data);
1026 1026                  } else {
1027 1027                          vmentry_mmio_write(vcpu, mmio.gpa, mmio.bytes);
1028 1028                  }
1029 1029                  return (VMEXIT_CONTINUE);
1030 1030          }
1031 1031  
1032 1032          fprintf(stderr, "Unhandled mmio error to 0x%lx: %d\n", mmio.gpa, err);
1033 1033          return (VMEXIT_ABORT);
1034 1034  }
1035 1035  
1036 1036  static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
1037 1037  static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
1038 1038  
1039 1039  static int
1040 1040  vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
1041 1041  {
1042 1042          enum vm_suspend_how how;
1043 1043  
1044 1044          how = vmexit->u.suspended.how;
1045 1045  
1046 1046          fbsdrun_deletecpu(ctx, *pvcpu);
1047 1047  
1048 1048          if (*pvcpu != BSP) {
1049 1049                  pthread_mutex_lock(&resetcpu_mtx);
1050 1050                  pthread_cond_signal(&resetcpu_cond);
1051 1051                  pthread_mutex_unlock(&resetcpu_mtx);
1052 1052                  pthread_exit(NULL);
1053 1053          }
1054 1054  
1055 1055          pthread_mutex_lock(&resetcpu_mtx);
1056 1056          while (!CPU_EMPTY(&cpumask)) {
1057 1057                  pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
1058 1058          }
1059 1059          pthread_mutex_unlock(&resetcpu_mtx);
1060 1060  
1061 1061          switch (how) {
1062 1062          case VM_SUSPEND_RESET:
1063 1063                  exit(0);
1064 1064          case VM_SUSPEND_POWEROFF:
1065 1065                  if (get_config_bool_default("destroy_on_poweroff", false))
1066 1066                          vm_destroy(ctx);
1067 1067                  exit(1);
1068 1068          case VM_SUSPEND_HALT:
1069 1069                  exit(2);
1070 1070          case VM_SUSPEND_TRIPLEFAULT:
1071 1071                  exit(3);
1072 1072          default:
1073 1073                  fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
1074 1074                  exit(100);
1075 1075          }
1076 1076          return (0);     /* NOTREACHED */
1077 1077  }
1078 1078  
1079 1079  static int
1080 1080  vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
1081 1081  {
1082 1082  
1083 1083          gdb_cpu_suspend(*pvcpu);
1084 1084          return (VMEXIT_CONTINUE);
1085 1085  }
1086 1086  
1087 1087  static int
1088 1088  vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
1089 1089  {
1090 1090  
1091 1091          gdb_cpu_breakpoint(*pvcpu, vmexit);
1092 1092          return (VMEXIT_CONTINUE);
1093 1093  }
1094 1094  
1095 1095  static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
1096 1096          [VM_EXITCODE_INOUT]  = vmexit_inout,
1097 1097          [VM_EXITCODE_MMIO]  = vmexit_mmio,
1098 1098          [VM_EXITCODE_VMX]    = vmexit_vmx,
1099 1099          [VM_EXITCODE_SVM]    = vmexit_svm,
1100 1100          [VM_EXITCODE_BOGUS]  = vmexit_bogus,
1101 1101          [VM_EXITCODE_REQIDLE] = vmexit_reqidle,
1102 1102          [VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
1103 1103          [VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
1104 1104          [VM_EXITCODE_MTRAP]  = vmexit_mtrap,
1105 1105          [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
1106 1106  #ifdef __FreeBSD__
1107 1107          [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
1108 1108  #else
1109 1109          [VM_EXITCODE_RUN_STATE] = vmexit_run_state,
1110 1110  #endif
1111 1111          [VM_EXITCODE_SUSPENDED] = vmexit_suspend,
1112 1112          [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
1113 1113          [VM_EXITCODE_DEBUG] = vmexit_debug,
1114 1114          [VM_EXITCODE_BPT] = vmexit_breakpoint,
1115 1115  };
1116 1116  
1117 1117  static void
1118 1118  vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
1119 1119  {
1120 1120          int error, rc;
1121 1121          enum vm_exitcode exitcode;
1122 1122          cpuset_t active_cpus;
1123 1123          struct vm_exit *vexit;
1124 1124          struct vm_entry *ventry;
1125 1125  
1126 1126  #ifdef  __FreeBSD__
1127 1127          if (vcpumap[vcpu] != NULL) {
1128 1128                  error = pthread_setaffinity_np(pthread_self(),
1129 1129                      sizeof(cpuset_t), vcpumap[vcpu]);
1130 1130                  assert(error == 0);
1131 1131          }
1132 1132  #endif
1133 1133          error = vm_active_cpus(ctx, &active_cpus);
1134 1134          assert(CPU_ISSET(vcpu, &active_cpus));
1135 1135  
1136 1136          error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
1137 1137          assert(error == 0);
1138 1138  
1139 1139          ventry = &vmentry[vcpu];
1140 1140          vexit = &vmexit[vcpu];
1141 1141  
1142 1142          while (1) {
1143 1143                  error = vm_run(ctx, vcpu, ventry, vexit);
1144 1144                  if (error != 0)
1145 1145                          break;
1146 1146  
1147 1147                  if (ventry->cmd != VEC_DEFAULT) {
1148 1148                          /*
1149 1149                           * Discard any lingering entry state after it has been
1150 1150                           * submitted via vm_run().
1151 1151                           */
1152 1152                          bzero(ventry, sizeof (*ventry));
1153 1153                  }
1154 1154  
1155 1155                  exitcode = vexit->exitcode;
1156 1156                  if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
1157 1157                          fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
1158 1158                              exitcode);
1159 1159                          exit(4);
1160 1160                  }
1161 1161  
1162 1162                  rc = (*handler[exitcode])(ctx, vexit, &vcpu);
1163 1163  
1164 1164                  switch (rc) {
1165 1165                  case VMEXIT_CONTINUE:
1166 1166                          break;
1167 1167                  case VMEXIT_ABORT:
1168 1168                          abort();
1169 1169                  default:
1170 1170                          exit(4);
1171 1171                  }
1172 1172          }
1173 1173          fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
1174 1174  }
1175 1175  
1176 1176  static int
1177 1177  num_vcpus_allowed(struct vmctx *ctx)
1178 1178  {
1179 1179  #ifdef __FreeBSD__
1180 1180          int tmp, error;
1181 1181  
1182 1182          error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
1183 1183  
1184 1184          /*
1185 1185           * The guest is allowed to spinup more than one processor only if the
1186 1186           * UNRESTRICTED_GUEST capability is available.
1187 1187           */
1188 1188          if (error == 0)
1189 1189                  return (VM_MAXCPU);
1190 1190          else
1191 1191                  return (1);
1192 1192  #else
1193 1193          /* Unrestricted Guest is always enabled on illumos */
1194 1194          return (VM_MAXCPU);
1195 1195  #endif /* __FreeBSD__ */
1196 1196  }
1197 1197  
1198 1198  void
1199 1199  fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
1200 1200  {
1201 1201          int err, tmp;
1202 1202  
1203 1203          if (get_config_bool_default("x86.vmexit_on_hlt", false)) {
1204 1204                  err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp);
1205 1205                  if (err < 0) {
1206 1206                          fprintf(stderr, "VM exit on HLT not supported\n");
1207 1207                          exit(4);
1208 1208                  }
1209 1209                  vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
1210 1210                  if (cpu == BSP)
1211 1211                          handler[VM_EXITCODE_HLT] = vmexit_hlt;
1212 1212          }
1213 1213  
1214 1214          if (get_config_bool_default("x86.vmexit_on_pause", false)) {
1215 1215                  /*
1216 1216                   * pause exit support required for this mode
1217 1217                   */
1218 1218                  err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp);
1219 1219                  if (err < 0) {
1220 1220                          fprintf(stderr,
1221 1221                              "SMP mux requested, no pause support\n");
1222 1222                          exit(4);
1223 1223                  }
1224 1224                  vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
1225 1225                  if (cpu == BSP)
1226 1226                          handler[VM_EXITCODE_PAUSE] = vmexit_pause;
1227 1227          }
1228 1228  
1229 1229          if (get_config_bool_default("x86.x2apic", false))
1230 1230                  err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED);
1231 1231          else
1232 1232                  err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED);
1233 1233  
1234 1234          if (err) {
1235 1235                  fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
1236 1236                  exit(4);
1237 1237          }
1238 1238  
1239 1239  #ifdef  __FreeBSD__
1240 1240          vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
1241 1241  #endif
1242 1242  }
1243 1243  
1244 1244  static struct vmctx *
1245 1245  do_open(const char *vmname)
1246 1246  {
1247 1247          struct vmctx *ctx;
1248 1248          int error;
1249 1249          bool reinit, romboot;
1250 1250  #ifndef WITHOUT_CAPSICUM
1251 1251          cap_rights_t rights;
1252 1252          const cap_ioctl_t *cmds;        
1253 1253          size_t ncmds;
1254 1254  #endif
1255 1255  
1256 1256          reinit = romboot = false;
1257 1257  
1258 1258          if (lpc_bootrom())
1259 1259                  romboot = true;
1260 1260  
1261 1261          error = vm_create(vmname);
1262 1262          if (error) {
1263 1263                  if (errno == EEXIST) {
1264 1264                          if (romboot) {
1265 1265                                  reinit = true;
1266 1266                          } else {
1267 1267                                  /*
1268 1268                                   * The virtual machine has been setup by the
1269 1269                                   * userspace bootloader.
1270 1270                                   */
1271 1271                          }
1272 1272                  } else {
1273 1273                          perror("vm_create");
1274 1274                          exit(4);
1275 1275                  }
1276 1276          } else {
1277 1277                  if (!romboot) {
1278 1278                          /*
1279 1279                           * If the virtual machine was just created then a
1280 1280                           * bootrom must be configured to boot it.
1281 1281                           */
1282 1282                          fprintf(stderr, "virtual machine cannot be booted\n");
1283 1283                          exit(4);
1284 1284                  }
1285 1285          }
1286 1286  
1287 1287          ctx = vm_open(vmname);
1288 1288          if (ctx == NULL) {
1289 1289                  perror("vm_open");
1290 1290                  exit(4);
1291 1291          }
1292 1292  
1293 1293  #ifndef WITHOUT_CAPSICUM
1294 1294          cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
1295 1295          if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1) 
1296 1296                  errx(EX_OSERR, "Unable to apply rights for sandbox");
1297 1297          vm_get_ioctls(&ncmds);
1298 1298          cmds = vm_get_ioctls(NULL);
1299 1299          if (cmds == NULL)
1300 1300                  errx(EX_OSERR, "out of memory");
1301 1301          if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1)
1302 1302                  errx(EX_OSERR, "Unable to apply rights for sandbox");
1303 1303          free((cap_ioctl_t *)cmds);
1304 1304  #endif
1305 1305   
1306 1306          if (reinit) {
1307 1307                  error = vm_reinit(ctx);
1308 1308                  if (error) {
1309 1309                          perror("vm_reinit");
1310 1310                          exit(4);
1311 1311                  }
1312 1312          }
1313 1313          error = vm_set_topology(ctx, sockets, cores, threads, maxcpus);
1314 1314          if (error)
1315 1315                  errx(EX_OSERR, "vm_set_topology");
1316 1316          return (ctx);
1317 1317  }
1318 1318  
#ifndef __FreeBSD__

#define	FILE_PROVISIONING	"/var/svc/provisioning"
#define	FILE_PROVISION_SUCCESS	"/var/svc/provision_success"

/*
 * If FILE_PROVISIONING exists, rename it to FILE_PROVISION_SUCCESS.  A
 * missing (or otherwise un-lstat-able) file is silently ignored; a failed
 * rename is reported on stderr but is not fatal.
 */
static void
mark_provisioned(void)
{
	struct stat stbuf;

	if (lstat(FILE_PROVISIONING, &stbuf) == 0 &&
	    rename(FILE_PROVISIONING, FILE_PROVISION_SUCCESS) != 0) {
		(void) fprintf(stderr, "Cannot rename %s to %s: %s\n",
		    FILE_PROVISIONING, FILE_PROVISION_SUCCESS,
		    strerror(errno));
	}
}

#endif
1340 1340  
/*
 * Parse a single "path=value" configuration option and store it with
 * set_config_value().
 *
 * Returns false when 'option' contains no '=' or when nothing follows the
 * first '='; returns true once the value has been stored.  Exits the process
 * with status 4 if memory for the path cannot be allocated.
 */
static bool
parse_config_option(const char *option)
{
	const char *value;
	char *path;

	value = strchr(option, '=');
	if (value == NULL || value[1] == '\0')
		return (false);

	/* Everything before the first '=' is the config path. */
	path = strndup(option, value - option);
	if (path == NULL)
		err(4, "Failed to allocate memory");
	set_config_value(path, value + 1);

	/*
	 * NOTE(review): assumes set_config_value() does not retain 'path'
	 * (other callers here pass string literals and optarg) - confirm.
	 * Release the temporary copy made above.
	 */
	free(path);
	return (true);
}
1356 1356  
/*
 * Read configuration options, one "path=value" per line, from the file at
 * 'path' and apply each through parse_config_option().
 *
 * Lines whose first character is '#' and lines consisting solely of a
 * newline are skipped.  Failure to open the file, or a line that
 * parse_config_option() rejects, terminates the process with status 4; the
 * latter message includes the 1-based line number.
 */
static void
parse_simple_config_file(const char *path)
{
	FILE *fp;
	char *line = NULL;
	char *nl;
	size_t linecap = 0;
	unsigned int lineno = 0;

	fp = fopen(path, "r");
	if (fp == NULL)
		err(4, "Failed to open configuration file %s", path);

	while (getline(&line, &linecap, fp) > 0) {
		lineno++;

		/* Comments and empty lines carry no option. */
		if (*line == '#' || *line == '\n')
			continue;

		/* Drop the trailing newline before parsing. */
		nl = strchr(line, '\n');
		if (nl != NULL)
			*nl = '\0';

		if (!parse_config_option(line)) {
			errx(4, "%s line %u: invalid config option '%s'", path,
			    lineno, line);
		}
	}

	free(line);
	fclose(fp);
}
1384 1384  
/*
 * Seed the configuration with built-in defaults: ACPI tables off, a guest
 * memory size of "256M", and strict MSR handling on.  main() calls this
 * before parsing command-line options.
 */
static void
set_defaults(void)
{
	set_config_bool("acpi_tables", false);
	set_config_value("memory.size", "256M");
	set_config_bool("x86.strictmsr", true);
}
1393 1393  
1394 1394  int
1395 1395  main(int argc, char *argv[])
1396 1396  {
1397 1397          int c, error, err;
1398 1398          int max_vcpus, memflags;
1399 1399          struct vmctx *ctx;
1400 1400          uint64_t rip;
1401 1401          size_t memsize;
1402 1402          const char *value, *vmname;
1403 1403          char *optstr;
1404 1404  
1405 1405          init_config();
1406 1406          set_defaults();
1407 1407          progname = basename(argv[0]);
1408 1408  
1409 1409  #ifdef  __FreeBSD__
1410 1410          optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:";
1411 1411  #else
1412 1412          /* +d, +B, -p */
1413 1413          optstr = "adehuwxACDHIPSWYk:o:G:c:s:m:l:B:U:";
1414 1414  #endif
1415 1415          while ((c = getopt(argc, argv, optstr)) != -1) {
1416 1416                  switch (c) {
1417 1417                  case 'a':
1418 1418                          set_config_bool("x86.x2apic", false);
1419 1419                          break;
1420 1420                  case 'A':
1421 1421                          set_config_bool("acpi_tables", true);
1422 1422                          break;
1423 1423                  case 'D':
1424 1424                          set_config_bool("destroy_on_poweroff", true);
1425 1425                          break;
1426 1426  #ifndef __FreeBSD__
1427 1427                  case 'B':
1428 1428                          if (smbios_parse(optarg) != 0) {
1429 1429                                  errx(EX_USAGE, "invalid SMBIOS "
1430 1430                                      "configuration '%s'", optarg);
1431 1431                          }
1432 1432                          break;
1433 1433                  case 'd':
1434 1434                          set_config_bool("suspend_at_boot", true);
1435 1435                          break;
1436 1436  #endif
1437 1437  #ifdef  __FreeBSD__
1438 1438                  case 'p':
1439 1439                          if (pincpu_parse(optarg) != 0) {
1440 1440                                  errx(EX_USAGE, "invalid vcpu pinning "
1441 1441                                      "configuration '%s'", optarg);
1442 1442                          }
1443 1443                          break;
1444 1444  #endif
1445 1445                  case 'c':
1446 1446                          if (topology_parse(optarg) != 0) {
1447 1447                              errx(EX_USAGE, "invalid cpu topology "
1448 1448                                  "'%s'", optarg);
1449 1449                          }
1450 1450                          break;
1451 1451                  case 'C':
1452 1452                          set_config_bool("memory.guest_in_core", true);
1453 1453                          break;
1454 1454                  case 'G':
1455 1455                          if (optarg[0] == 'w') {
1456 1456                                  set_config_bool("gdb.wait", true);
1457 1457                                  optarg++;
1458 1458                          }
1459 1459                          set_config_value("gdb.port", optarg);
1460 1460                          break;
1461 1461                  case 'k':
1462 1462                          parse_simple_config_file(optarg);
1463 1463                          break;
1464 1464                  case 'l':
1465 1465                          if (strncmp(optarg, "help", strlen(optarg)) == 0) {
1466 1466                                  lpc_print_supported_devices();
1467 1467                                  exit(0);
1468 1468                          } else if (lpc_device_parse(optarg) != 0) {
1469 1469                                  errx(EX_USAGE, "invalid lpc device "
1470 1470                                      "configuration '%s'", optarg);
1471 1471                          }
1472 1472                          break;
1473 1473                  case 's':
1474 1474                          if (strncmp(optarg, "help", strlen(optarg)) == 0) {
1475 1475                                  pci_print_supported_devices();
1476 1476                                  exit(0);
1477 1477                          } else if (pci_parse_slot(optarg) != 0)
1478 1478                                  exit(4);
1479 1479                          else
1480 1480                                  break;
1481 1481                  case 'S':
1482 1482                          set_config_bool("memory.wired", true);
1483 1483                          break;
1484 1484                  case 'm':
1485 1485                          set_config_value("memory.size", optarg);
1486 1486                          break;
1487 1487                  case 'o':
1488 1488                          if (!parse_config_option(optarg))
1489 1489                                  errx(EX_USAGE, "invalid configuration option '%s'", optarg);
1490 1490                          break;
1491 1491                  case 'H':
1492 1492                          set_config_bool("x86.vmexit_on_hlt", true);
1493 1493                          break;
1494 1494                  case 'I':
1495 1495                          /*
1496 1496                           * The "-I" option was used to add an ioapic to the
1497 1497                           * virtual machine.
1498 1498                           *
1499 1499                           * An ioapic is now provided unconditionally for each
1500 1500                           * virtual machine and this option is now deprecated.
1501 1501                           */
1502 1502                          break;
1503 1503                  case 'P':
1504 1504                          set_config_bool("x86.vmexit_on_pause", true);
1505 1505                          break;
1506 1506                  case 'e':
1507 1507                          set_config_bool("x86.strictio", true);
1508 1508                          break;
1509 1509                  case 'u':
1510 1510                          set_config_bool("rtc.use_localtime", false);
1511 1511                          break;
1512 1512                  case 'U':
1513 1513                          set_config_value("uuid", optarg);
1514 1514                          break;
1515 1515                  case 'w':
1516 1516                          set_config_bool("x86.strictmsr", false);
1517 1517                          break;
1518 1518                  case 'W':
1519 1519                          set_config_bool("virtio_msix", false);
1520 1520                          break;
1521 1521                  case 'x':
1522 1522                          set_config_bool("x86.x2apic", true);
1523 1523                          break;
1524 1524                  case 'Y':
1525 1525                          set_config_bool("x86.mptable", false);
1526 1526                          break;
1527 1527                  case 'h':
1528 1528                          usage(0);                       
1529 1529                  default:
1530 1530                          usage(1);
1531 1531                  }
1532 1532          }
1533 1533          argc -= optind;
1534 1534          argv += optind;
1535 1535  
1536 1536          if (argc > 1)
1537 1537                  usage(1);
1538 1538  
1539 1539          if (argc == 1)
1540 1540                  set_config_value("name", argv[0]);
1541 1541  
1542 1542          vmname = get_config_value("name");
1543 1543          if (vmname == NULL)
1544 1544                  usage(1);
1545 1545  
1546 1546          if (get_config_bool_default("config.dump", false)) {
1547 1547                  dump_config();
1548 1548                  exit(1);
1549 1549          }
1550 1550  
1551 1551  #ifndef __FreeBSD__
1552 1552          illumos_priv_init();
1553 1553  #endif
1554 1554  
1555 1555          calc_topolopgy();
1556 1556  #ifdef __FreeBSD__
1557 1557          build_vcpumaps();
1558 1558  #endif
1559 1559  
1560 1560          value = get_config_value("memory.size");
1561 1561          error = vm_parse_memsize(value, &memsize);
1562 1562          if (error)
1563 1563                  errx(EX_USAGE, "invalid memsize '%s'", value);
1564 1564  
1565 1565          ctx = do_open(vmname);
1566 1566  
1567 1567          max_vcpus = num_vcpus_allowed(ctx);
1568 1568          if (guest_ncpus > max_vcpus) {
1569 1569                  fprintf(stderr, "%d vCPUs requested but only %d available\n",
1570 1570                          guest_ncpus, max_vcpus);
1571 1571                  exit(4);
1572 1572          }
1573 1573  
1574 1574          fbsdrun_set_capabilities(ctx, BSP);
  
    | 
      ↓ open down ↓ | 
    1574 lines elided | 
    
      ↑ open up ↑ | 
  
1575 1575  
1576 1576          memflags = 0;
1577 1577          if (get_config_bool_default("memory.wired", false))
1578 1578                  memflags |= VM_MEM_F_WIRED;
1579 1579          if (get_config_bool_default("memory.guest_in_core", false))
1580 1580                  memflags |= VM_MEM_F_INCORE;
1581 1581          vm_set_memflags(ctx, memflags);
1582 1582  #ifdef  __FreeBSD__
1583 1583          err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
1584 1584  #else
1585      -        err = vm_arc_resv(ctx, memsize);
1586      -        if (err != 0) {
1587      -                (void) fprintf(stderr, "Could not shrink ARC: %s\n",
1588      -                    strerror(err));
1589      -                exit(4);
1590      -        }
1591      -
1592 1585          do {
1593 1586                  errno = 0;
1594 1587                  err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
1595 1588                  error = errno;
1596 1589                  if (err != 0 && error == ENOMEM) {
1597 1590                          (void) fprintf(stderr, "Unable to allocate memory "
1598 1591                              "(%llu), retrying in 1 second\n", memsize);
1599 1592                          sleep(1);
1600 1593                  }
1601 1594          } while (error == ENOMEM);
1602 1595  #endif
1603 1596          if (err) {
1604 1597                  fprintf(stderr, "Unable to set up memory (%d)\n", errno);
1605 1598                  exit(4);
1606 1599          }
1607 1600  
1608 1601          error = init_msr();
1609 1602          if (error) {
1610 1603                  fprintf(stderr, "init_msr error %d", error);
1611 1604                  exit(4);
1612 1605          }
1613 1606  
1614 1607          init_mem();
1615 1608          init_inout();
1616 1609  #ifdef  __FreeBSD__
1617 1610          kernemu_dev_init();
1618 1611  #endif
1619 1612          init_bootrom(ctx);
1620 1613          atkbdc_init(ctx);
1621 1614          pci_irq_init(ctx);
1622 1615          ioapic_init(ctx);
1623 1616  
1624 1617          rtc_init(ctx);
1625 1618          sci_init(ctx);
1626 1619  #ifndef __FreeBSD__
1627 1620          pmtmr_init(ctx);
1628 1621  #endif
1629 1622  
1630 1623          /*
1631 1624           * Exit if a device emulation finds an error in its initialization
1632 1625           */
1633 1626          if (init_pci(ctx) != 0) {
1634 1627                  perror("device emulation initialization error");
1635 1628                  exit(4);
1636 1629          }
1637 1630  
1638 1631          /*
1639 1632           * Initialize after PCI, to allow a bootrom file to reserve the high
1640 1633           * region.
1641 1634           */
1642 1635          if (get_config_bool("acpi_tables"))
1643 1636                  vmgenc_init(ctx);
1644 1637  
1645 1638          value = get_config_value("gdb.port");
1646 1639  #ifdef __FreeBSD__
1647 1640          if (value != NULL)
1648 1641                  init_gdb(ctx, atoi(value), get_config_bool_default("gdb.wait",
1649 1642                      false));
1650 1643  #else
1651 1644          if (value != NULL) {
1652 1645                  int port = atoi(value);
1653 1646  
1654 1647                  if (port < 0) {
1655 1648                          init_mdb(ctx,
1656 1649                              get_config_bool_default("gdb.wait", false));
1657 1650                  } else {
1658 1651                          init_gdb(ctx, port,
1659 1652                              get_config_bool_default("gdb.wait", false));
1660 1653                  }
1661 1654          }
1662 1655  #endif
1663 1656  
1664 1657          vga_init(1);
1665 1658  
1666 1659          if (lpc_bootrom()) {
1667 1660  #ifdef __FreeBSD__
1668 1661                  if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
1669 1662                          fprintf(stderr, "ROM boot failed: unrestricted guest "
1670 1663                              "capability not available\n");
1671 1664                          exit(4);
1672 1665                  }
1673 1666  #else
1674 1667                  /* Unrestricted Guest is always enabled on illumos */
1675 1668  #endif
1676 1669                  error = vcpu_reset(ctx, BSP);
1677 1670                  assert(error == 0);
1678 1671          }
1679 1672  
1680 1673          error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
1681 1674          assert(error == 0);
1682 1675  
1683 1676          /*
1684 1677           * build the guest tables, MP etc.
1685 1678           */
1686 1679          if (get_config_bool_default("x86.mptable", true)) {
1687 1680                  error = mptable_build(ctx, guest_ncpus);
1688 1681                  if (error) {
1689 1682                          perror("error to build the guest tables");
1690 1683                          exit(4);
1691 1684                  }
1692 1685          }
1693 1686  
1694 1687  #ifndef __FreeBSD__
1695 1688          smbios_apply();
1696 1689  #endif
1697 1690          error = smbios_build(ctx);
1698 1691          assert(error == 0);
1699 1692  
1700 1693          if (get_config_bool("acpi_tables")) {
1701 1694                  error = acpi_build(ctx, guest_ncpus);
1702 1695                  assert(error == 0);
1703 1696          }
1704 1697  
1705 1698          if (lpc_bootrom())
1706 1699                  fwctl_init();
1707 1700  
1708 1701          /*
1709 1702           * Change the proc title to include the VM name.
1710 1703           */
1711 1704          setproctitle("%s", vmname);
1712 1705  
1713 1706  #ifndef WITHOUT_CAPSICUM
1714 1707          caph_cache_catpages();
1715 1708  
1716 1709          if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
1717 1710                  errx(EX_OSERR, "Unable to apply rights for sandbox");
1718 1711  
1719 1712          if (caph_enter() == -1)
1720 1713                  errx(EX_OSERR, "cap_enter() failed");
1721 1714  #endif
1722 1715  
1723 1716  /* XXX SmartOS:  Upstream drops privs here, but we can't yet.  See below... */
1724 1717  
1725 1718  #ifdef __FreeBSD__
1726 1719          /*
1727 1720           * Add CPU 0
1728 1721           */
1729 1722          fbsdrun_addcpu(ctx, BSP, BSP, rip);
1730 1723  #else
1731 1724          /* Set BSP to run (unlike the APs which wait for INIT) */
1732 1725          error = vm_set_run_state(ctx, BSP, VRS_RUN, 0);
1733 1726          assert(error == 0);
1734 1727          fbsdrun_addcpu(ctx, BSP, rip,
1735 1728              get_config_bool_default("suspend_at_boot", false));
1736 1729  
1737 1730          /* Add subsequent CPUs, which will wait until INIT/SIPI-ed */
1738 1731          for (uint_t i = 1; i < guest_ncpus; i++) {
1739 1732                  spinup_halted_ap(ctx, i);
1740 1733          }
1741 1734          mark_provisioned();
1742 1735          /*
1743 1736           * XXX SmartOS:  The mark_provisioned() call above required file-access
1744 1737           * privileges that are dropped by the generic call.  We must widen the
1745 1738           * full-privilege window a bit.  A better solution might be to have
1746 1739           * a way to keep file-access a bit longer, and only have THAT privilege
1747 1740           * to drop here.
1748 1741           */
1749 1742          illumos_priv_lock();
1750 1743  #endif
1751 1744  
1752 1745          /*
1753 1746           * Head off to the main event dispatch loop
1754 1747           */
1755 1748          mevent_dispatch();
1756 1749  
1757 1750          exit(4);
1758 1751  }
  
    | 
      ↓ open down ↓ | 
    157 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX