15254 %ymm registers not restored after signal handler
15367 x86 getfpregs() summons corrupting %xmm ghosts
15333 want x86 /proc xregs support (libc_db, libproc, mdb, etc.)
15336 want libc functions for extended ucontext_t
15334 want ps_lwphandle-specific reg routines
15328 FPU_CW_INIT mistreats reserved bit
15335 i86pc fpu_subr.c isn't really platform-specific
15332 setcontext(2) isn't actually noreturn
15331 need <sys/stdalign.h>
Change-Id: I7060aa86042dfb989f77fc3323c065ea2eafa9ad
Conflicts:
    usr/src/uts/common/fs/proc/prcontrol.c
    usr/src/uts/intel/os/archdep.c
    usr/src/uts/intel/sys/ucontext.h
    usr/src/uts/intel/syscall/getcontext.c
    
      
--- old/usr/src/uts/intel/os/cpuid.c
+++ new/usr/src/uts/intel/os/cpuid.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  24   24   * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  25   25   * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
  26   26   * Copyright 2020 Joyent, Inc.
  27   27   * Copyright 2023 Oxide Computer Company
  28   28   * Copyright 2022 MNX Cloud, Inc.
  29   29   */
  30   30  /*
  31   31   * Copyright (c) 2010, Intel Corporation.
  32   32   * All rights reserved.
  33   33   */
  34   34  /*
  35   35   * Portions Copyright 2009 Advanced Micro Devices, Inc.
  36   36   */
  37   37  
  38   38  /*
  39   39   * CPU Identification logic
  40   40   *
  41   41   * The purpose of this file and its companion, cpuid_subr.c, is to help deal
  42   42   * with the identification of CPUs, their features, and their topologies. More
  43   43   * specifically, this file helps drive the following:
  44   44   *
  45   45   * 1. Enumeration of features of the processor which are used by the kernel to
  46   46   *    determine what features to enable or disable. These may be instruction set
  47   47   *    enhancements or features that we use.
  48   48   *
  49   49   * 2. Enumeration of instruction set architecture (ISA) additions that userland
  50   50   *    will be told about through the auxiliary vector.
  51   51   *
  52   52   * 3. Understanding the physical topology of the CPU such as the number of
  53   53   *    caches, how many cores it has, whether or not it supports symmetric
  54   54   *    multi-processing (SMT), etc.
  55   55   *
  56   56   * ------------------------
  57   57   * CPUID History and Basics
  58   58   * ------------------------
  59   59   *
  60   60   * The cpuid instruction was added by Intel roughly around the time that the
  61   61   * original Pentium was introduced. The purpose of cpuid was to tell in a
  62   62   * programmatic fashion information about the CPU that previously was guessed
  63   63   * at. For example, an important part of cpuid is that we can know what
  64   64   * extensions to the ISA exist. If you use an invalid opcode you would get a
  65   65   * #UD, so this method allows a program (whether a user program or the kernel)
  66   66   * to determine what exists without crashing or getting a SIGILL. Of course,
  67   67   * this was also during the era of the clones and the AMD Am5x86. The vendor
  68   68   * name shows up first in cpuid for a reason.
  69   69   *
  70   70   * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
  71   71   * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
  72   72   * its own meaning. The different leaves are broken down into different regions:
  73   73   *
  74   74   *      [ 0, 7fffffff ]                 This region is called the 'basic'
  75   75   *                                      region. This region is generally defined
  76   76   *                                      by Intel, though some of the original
  77   77   *                                      portions have different meanings based
  78   78   *                                      on the manufacturer. These days, Intel
  79   79   *                                      adds most new features to this region.
  80   80   *                                      AMD adds non-Intel compatible
  81   81   *                                      information in the third, extended
  82   82   *                                      region. Intel uses this for everything
  83   83   *                                      including ISA extensions, CPU
  84   84   *                                      features, cache information, topology,
  85   85   *                                      and more.
  86   86   *
  87   87   *                                      There is a hole carved out of this
  88   88   *                                      region which is reserved for
  89   89   *                                      hypervisors.
  90   90   *
  91   91   *      [ 40000000, 4fffffff ]          This region, which is found in the
  92   92   *                                      middle of the previous region, is
  93   93   *                                      explicitly promised to never be used by
  94   94   *                                      CPUs. Instead, it is used by hypervisors
  95   95   *                                      to communicate information about
  96   96   *                                      themselves to the operating system. The
  97   97   *                                      values and details are unique for each
  98   98   *                                      hypervisor.
  99   99   *
 100  100   *      [ 80000000, ffffffff ]          This region is called the 'extended'
 101  101   *                                      region. Some of the low leaves mirror
 102  102   *                                      parts of the basic leaves. This region
 103  103   *                                      has generally been used by AMD for
 104  104   *                                      various extensions. For example, AMD-
 105  105   *                                      specific information about caches,
 106  106   *                                      features, and topology are found in this
 107  107   *                                      region.
 108  108   *
 109  109   * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
 110  110   * and %edx, and then issue the cpuid instruction. At the first leaf in each of
 111  111   * the ranges, one of the primary things returned is the maximum valid leaf in
 112  112   * that range. This allows for discovery of what range of CPUID is valid.
 113  113   *
 114  114   * The CPUs have potentially surprising behavior when using an invalid leaf or
 115  115   * unimplemented leaf. If the requested leaf is within the valid basic or
 116  116   * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
 117  117   * set to zero. However, if you specify a leaf that is outside of a valid range,
 118  118   * then instead it will be filled with the last valid _basic_ leaf. For example,
 119  119   * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
 120  120   * an invalid extended leaf will return the information for leaf 3.
 121  121   *
 122  122   * Some leaves are broken down into sub-leaves. This means that the value
 123  123   * depends on both the leaf asked for in %eax and a secondary register. For
 124  124   * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
 125  125   * additional information. Or when getting topology information in leaf 0xb, the
 126  126   * initial value in %ecx changes which level of the topology that you are
 127  127   * getting information about.
 128  128   *
 129  129   * cpuid values are always kept to 32 bits regardless of whether or not the
 130  130   * program is in 64-bit mode. When executing in 64-bit mode, the upper
  131  131   * 32 bits of the register are always set to zero so that the values are the
 132  132   * same regardless of execution mode.
 133  133   *
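
To make the mechanics above concrete, here is a minimal userland sketch (not
part of this file; GCC-style inline assembly assumed) that requests a leaf and
sub-leaf and discovers the maximum valid basic and extended leaves:

    #include <stdint.h>
    #include <stdio.h>

    /* Issue cpuid for a leaf/sub-leaf pair and capture all four registers. */
    static void
    cpuid(uint32_t leaf, uint32_t subleaf, uint32_t *eax, uint32_t *ebx,
        uint32_t *ecx, uint32_t *edx)
    {
            __asm__ __volatile__("cpuid"
                : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                : "a" (leaf), "c" (subleaf));
    }

    int
    main(void)
    {
            uint32_t eax, ebx, ecx, edx;

            /* Leaf 0: %eax reports the maximum valid basic leaf. */
            cpuid(0, 0, &eax, &ebx, &ecx, &edx);
            printf("max basic leaf: 0x%x\n", eax);

            /* Leaf 0x80000000: %eax reports the maximum valid extended leaf. */
            cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
            printf("max extended leaf: 0x%x\n", eax);

            /* Leaf 7, sub-leaf 0: structured extended feature flags in %ebx. */
            cpuid(7, 0, &eax, &ebx, &ecx, &edx);
            printf("leaf 7 ebx features: 0x%x\n", ebx);
            return (0);
    }
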
 134  134   * ----------------------
 135  135   * Identifying Processors
 136  136   * ----------------------
 137  137   *
 138  138   * We can identify a processor in two steps. The first step looks at cpuid leaf
 139  139   * 0. Leaf 0 contains the processor's vendor information. This is done by
  140  140   * putting a 12 character string in %ebx, %edx, and %ecx (in that order). On
  141  141   * AMD it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
 142  142   *
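
As a hedged illustration of that first step, the following sketch reuses the
cpuid() helper shown earlier (plus <string.h> for memcpy()); it is not code
from this file:

    static void
    print_vendor(void)
    {
            char vendor[13];
            uint32_t eax, ebx, ecx, edx;

            cpuid(0, 0, &eax, &ebx, &ecx, &edx);
            memcpy(&vendor[0], &ebx, 4);    /* "Genu" / "Auth" */
            memcpy(&vendor[4], &edx, 4);    /* "ineI" / "enti" */
            memcpy(&vendor[8], &ecx, 4);    /* "ntel" / "cAMD" */
            vendor[12] = '\0';
            printf("vendor: %s\n", vendor);
    }
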
 143  143   * From there, a processor is identified by a combination of three different
 144  144   * values:
 145  145   *
 146  146   *  1. Family
 147  147   *  2. Model
 148  148   *  3. Stepping
 149  149   *
 150  150   * Each vendor uses the family and model to uniquely identify a processor. The
 151  151   * way that family and model are changed depends on the vendor. For example,
  152  152   * Intel has been using family 0x6 for almost all of their processors since the
 153  153   * Pentium Pro/Pentium II era, often called the P6. The model is used to
 154  154   * identify the exact processor. Different models are often used for the client
 155  155   * (consumer) and server parts. Even though each processor often has major
 156  156   * architectural differences, they still are considered the same family by
 157  157   * Intel.
 158  158   *
 159  159   * On the other hand, each major AMD architecture generally has its own family.
  160  160   * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it
 161  161   * the model number is used to help identify specific processors.  As AMD's
 162  162   * product lines have expanded, they have started putting a mixed bag of
 163  163   * processors into the same family, with each processor under a single
 164  164   * identifying banner (e.g., Milan, Cezanne) using a range of model numbers.  We
 165  165   * refer to each such collection as a processor family, distinct from cpuid
 166  166   * family.  Importantly, each processor family has a BIOS and Kernel Developer's
 167  167   * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
 168  168   * defines the processor family's non-architectural features.  In general, we'll
 169  169   * use "family" here to mean the family number reported by the cpuid instruction
 170  170   * and distinguish the processor family from it where appropriate.
 171  171   *
 172  172   * The stepping is used to refer to a revision of a specific microprocessor. The
 173  173   * term comes from equipment used to produce masks that are used to create
 174  174   * integrated circuits.
 175  175   *
 176  176   * The information is present in leaf 1, %eax. In technical documentation you
 177  177   * will see the terms extended model and extended family. The original family,
  178  178   * model, and stepping fields are each 4 bits wide. When the base family is 0xf,
  179  179   * the extended family field (previously reserved bits) is added to it to form
  180  180   * the full family; when the base family is 0x6 (Intel only) or 0xf, the extended
  181  181   * model field is prepended (shifted left 4 bits) to the base model.
 182  182   *
 183  183   * When we process this information, we store the full family, model, and
 184  184   * stepping in the struct cpuid_info members cpi_family, cpi_model, and
 185  185   * cpi_step, respectively. Whenever you are performing comparisons with the
 186  186   * family, model, and stepping, you should use these members and not the raw
 187  187   * values from cpuid. If you must use the raw values from cpuid directly, you
 188  188   * must make sure that you add the extended model and family to the base model
 189  189   * and family.
 190  190   *
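
A minimal sketch of that computation, reusing the illustrative cpuid() helper
from earlier (bit positions are the standard leaf 1 %eax layout from the
vendor manuals, not values taken from this file):

    static void
    print_fms(void)
    {
            uint32_t eax, ebx, ecx, edx;
            uint32_t base_family, family, model, stepping;

            cpuid(1, 0, &eax, &ebx, &ecx, &edx);
            stepping = eax & 0xf;
            model = (eax >> 4) & 0xf;
            base_family = (eax >> 8) & 0xf;
            family = base_family;
            if (base_family == 0xf)
                    family += (eax >> 20) & 0xff;           /* extended family */
            if (base_family == 0xf || base_family == 0x6)   /* 0x6: Intel only */
                    model |= ((eax >> 16) & 0xf) << 4;      /* extended model */
            printf("family 0x%x model 0x%x stepping 0x%x\n",
                family, model, stepping);
    }
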
 191  191   * In general, we do not use information about the family, model, and stepping
 192  192   * to determine whether or not a feature is present; that is generally driven by
 193  193   * specific leaves. However, when something we care about on the processor is
  194  194   * not considered 'architectural', meaning that it is specific to a set of
 195  195   * processors and not promised in the architecture model to be consistent from
 196  196   * generation to generation, then we will fall back on this information. The
  197  197   * most common cases where this comes up are when we have to work around errata in
 198  198   * the processor, are dealing with processor-specific features such as CPU
 199  199   * performance counters, or we want to provide additional information for things
 200  200   * such as fault management.
 201  201   *
 202  202   * While processors also do have a brand string, which is the name that people
 203  203   * are familiar with when buying the processor, they are not meant for
 204  204   * programmatic consumption. That is what the family, model, and stepping are
 205  205   * for.
 206  206   *
 207  207   * We use the x86_chiprev_t to encode a combination of vendor, processor family,
 208  208   * and stepping(s) that refer to a single or very closely related set of silicon
 209  209   * implementations; while there are sometimes more specific ways to learn of the
 210  210   * presence or absence of a particular erratum or workaround, one may generally
 211  211   * assume that all processors of the same chiprev have the same errata and we
 212  212   * have chosen to represent them this way precisely because that is how AMD
 213  213   * groups them in their revision guides (errata documentation).  The processor
 214  214   * family (x86_processor_family_t) may be extracted from the chiprev if that
 215  215   * level of detail is not needed.  Processor families are considered unordered
 216  216   * but revisions within a family may be compared for either an exact match or at
 217  217   * least as recent as a reference revision.  See the chiprev_xxx() functions
 218  218   * below.
 219  219   *
 220  220   * Similarly, each processor family implements a particular microarchitecture,
 221  221   * which itself may have multiple revisions.  In general, non-architectural
 222  222   * features are specific to a processor family, but some may exist across
 223  223   * families containing cores that implement the same microarchitectural revision
 224  224   * (and, such cores share common bugs, too).  We provide utility routines
 225  225   * analogous to those for extracting and comparing chiprevs for
 226  226   * microarchitectures as well; see the uarch_xxx() functions.
 227  227   *
 228  228   * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
 229  229   * present used and available only for AMD and AMD-like processors.
 230  230   *
 231  231   * ------------
 232  232   * CPUID Passes
 233  233   * ------------
 234  234   *
 235  235   * As part of performing feature detection, we break this into several different
 236  236   * passes. There used to be a pass 0 that was done from assembly in locore.s to
 237  237   * support processors that have a missing or broken cpuid instruction (notably
 238  238   * certain Cyrix processors) but those were all 32-bit processors which are no
 239  239   * longer supported. Passes are no longer numbered explicitly to make it easier
 240  240   * to break them up or move them around as needed; however, they still have a
 241  241   * well-defined execution ordering enforced by the definition of cpuid_pass_t in
 242  242   * x86_archext.h. The external interface to execute a cpuid pass or determine
 243  243   * whether a pass has been completed consists of cpuid_execpass() and
 244  244   * cpuid_checkpass() respectively.  The passes now, in that execution order,
 245  245   * are as follows:
 246  246   *
 247  247   *      PRELUDE         This pass does not have any dependencies on system
 248  248   *                      setup; in particular, unlike all subsequent passes it is
 249  249   *                      guaranteed not to require PCI config space access.  It
 250  250   *                      sets the flag indicating that the processor we are
 251  251   *                      running on supports the cpuid instruction, which all
 252  252   *                      64-bit processors do.  This would also be the place to
 253  253   *                      add any other basic state that is required later on and
 254  254   *                      can be learned without dependencies.
 255  255   *
 256  256   *      IDENT           Determine which vendor manufactured the CPU, the family,
 257  257   *                      model, and stepping information, and compute basic
 258  258   *                      identifying tags from those values.  This is done first
 259  259   *                      so that machine-dependent code can control the features
 260  260   *                      the cpuid instruction will report during subsequent
 261  261   *                      passes if needed, and so that any intervening
 262  262   *                      machine-dependent code that needs basic identity will
 263  263   *                      have it available.  This includes synthesised
 264  264   *                      identifiers such as chiprev and uarchrev as well as the
 265  265   *                      values obtained directly from cpuid.  Prior to executing
  266  266   *                      this pass, machine-dependent boot code is responsible for
 267  267   *                      ensuring that the PCI configuration space access
 268  268   *                      functions have been set up and, if necessary, that
 269  269   *                      determine_platform() has been called.
 270  270   *
 271  271   *      BASIC           This is the primary pass and is responsible for doing a
 272  272   *                      large number of different things:
 273  273   *
 274  274   *                      1. Gathering a large number of feature flags to
  275  275   *                      determine which features the CPU supports and which
 276  276   *                      indicate things that we need to do other work in the OS
 277  277   *                      to enable. Features detected this way are added to the
 278  278   *                      x86_featureset which can be queried to
 279  279   *                      determine what we should do. This includes processing
 280  280   *                      all of the basic and extended CPU features that we care
 281  281   *                      about.
 282  282   *
 283  283   *                      2. Determining the CPU's topology. This includes
 284  284   *                      information about how many cores and threads are present
 285  285   *                      in the package. It also is responsible for figuring out
 286  286   *                      which logical CPUs are potentially part of the same core
 287  287   *                      and what other resources they might share. For more
 288  288   *                      information see the 'Topology' section.
 289  289   *
 290  290   *                      3. Determining the set of CPU security-specific features
 291  291   *                      that we need to worry about and determine the
 292  292   *                      appropriate set of workarounds.
 293  293   *
  294  294   *                      This pass on the boot CPU occurs before KMDB is started.
 295  295   *
 296  296   *      EXTENDED        The second pass is done after startup(). Here, we check
 297  297   *                      other miscellaneous features. Most of this is gathering
 298  298   *                      additional basic and extended features that we'll use in
 299  299   *                      later passes or for debugging support.
 300  300   *
 301  301   *      DYNAMIC         The third pass occurs after the kernel memory allocator
 302  302   *                      has been fully initialized. This gathers information
 303  303   *                      where we might need dynamic memory available for our
 304  304   *                      uses. This includes several varying width leaves that
 305  305   *                      have cache information and the processor's brand string.
 306  306   *
 307  307   *      RESOLVE         The fourth and final normal pass is performed after the
 308  308   *                      kernel has brought most everything online. This is
 309  309   *                      invoked from post_startup(). In this pass, we go through
 310  310   *                      the set of features that we have enabled and turn that
 311  311   *                      into the hardware auxiliary vector features that
 312  312   *                      userland receives. This is used by userland, primarily
 313  313   *                      by the run-time link-editor (RTLD), though userland
 314  314   *                      software could also refer to it directly.
 315  315   *
 316  316   * The function that performs a pass is currently assumed to be infallible, and
  317  317   * all existing implementations are.  This simplifies callers by allowing
 318  318   * cpuid_execpass() to return void. Similarly, implementers do not need to check
 319  319   * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
 320  320   * Both of these assumptions can be relaxed if needed by future developments.
 321  321   * Tracking of completed states is handled by cpuid_execpass(). It is programmer
 322  322   * error to attempt to execute a pass before all previous passes have been
 323  323   * completed on the specified CPU, or to request cpuid information before the
 324  324   * pass that captures it has been executed.  These conditions can be tested
 325  325   * using cpuid_checkpass().
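
A rough sketch of how a consumer is expected to honor that ordering; the pass
enumerator and the cpuid_checkpass() signature below are assumed from the
description above and from x86_archext.h rather than quoted from this change:

    /*
     * Hypothetical consumer of data captured by the DYNAMIC pass: assert
     * that the pass has already run on this CPU before using its results.
     */
    static void
    example_consumer(cpu_t *cpu)
    {
            ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
            /* ... safe to consume the data captured by that pass ... */
    }
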
 326  326   *
 327  327   * The Microcode Pass
 328  328   *
 329  329   * After a microcode update, we do a selective rescan of the cpuid leaves to
 330  330   * determine what features have changed. Microcode updates can provide more
 331  331   * details about security related features to deal with issues like Spectre and
 332  332   * L1TF. On occasion, vendors have violated their contract and removed bits.
 333  333   * However, we don't try to detect that because that puts us in a situation that
  334  334   * we really can't deal with. As such, the only things we rescan today are
  335  335   * security-related features. See cpuid_pass_ucode().  This pass may be run in a
  336  336   * different sequence on APs and therefore is not part of the sequential order;
  337  337   * it is invoked directly instead of by cpuid_execpass() and its completion
 338  338   * status cannot be checked by cpuid_checkpass().  This could be integrated with
 339  339   * a more complex dependency mechanism if warranted by future developments.
 340  340   *
 341  341   * All of the passes are run on all CPUs. However, for the most part we only
 342  342   * care about what the boot CPU says about this information and use the other
 343  343   * CPUs as a rough guide to sanity check that we have the same feature set.
 344  344   *
 345  345   * We do not support running multiple logical CPUs with disjoint, let alone
 346  346   * different, feature sets.
 347  347   *
 348  348   * ------------------
 349  349   * Processor Topology
 350  350   * ------------------
 351  351   *
 352  352   * One of the important things that we need to do is to understand the topology
 353  353   * of the underlying processor. When we say topology in this case, we're trying
 354  354   * to understand the relationship between the logical CPUs that the operating
 355  355   * system sees and the underlying physical layout. Different logical CPUs may
 356  356   * share different resources which can have important consequences for the
 357  357   * performance of the system. For example, they may share caches, execution
 358  358   * units, and more.
 359  359   *
 360  360   * The topology of the processor changes from generation to generation and
 361  361   * vendor to vendor.  Along with that, different vendors use different
 362  362   * terminology, and the operating system itself uses occasionally overlapping
 363  363   * terminology. It's important to understand what this topology looks like so
 364  364   * one can understand the different things that we try to calculate and
 365  365   * determine.
 366  366   *
 367  367   * To get started, let's talk about a little bit of terminology that we've used
 368  368   * so far, is used throughout this file, and is fairly generic across multiple
 369  369   * vendors:
 370  370   *
 371  371   * CPU
 372  372   *      A central processing unit (CPU) refers to a logical and/or virtual
 373  373   *      entity that the operating system can execute instructions on. The
 374  374   *      underlying resources for this CPU may be shared between multiple
 375  375   *      entities; however, to the operating system it is a discrete unit.
 376  376   *
 377  377   * PROCESSOR and PACKAGE
 378  378   *
 379  379   *      Generally, when we use the term 'processor' on its own, we are referring
 380  380   *      to the physical entity that one buys and plugs into a board. However,
 381  381   *      because processor has been overloaded and one might see it used to mean
 382  382   *      multiple different levels, we will instead use the term 'package' for
 383  383   *      the rest of this file. The term package comes from the electrical
 384  384   *      engineering side and refers to the physical entity that encloses the
 385  385   *      electronics inside. Strictly speaking the package can contain more than
 386  386   *      just the CPU, for example, on many processors it may also have what's
 387  387   *      called an 'integrated graphical processing unit (GPU)'. Because the
 388  388   *      package can encapsulate multiple units, it is the largest physical unit
 389  389   *      that we refer to.
 390  390   *
 391  391   * SOCKET
 392  392   *
  393  393   *      A socket refers to a unit on a system board (generally the motherboard)
 394  394   *      that can receive a package. A single package, or processor, is plugged
 395  395   *      into a single socket. A system may have multiple sockets. Often times,
 396  396   *      the term socket is used interchangeably with package and refers to the
  397  397   *      electrical component that has been plugged in, not the receptacle itself.
 398  398   *
 399  399   * CORE
 400  400   *
 401  401   *      A core refers to the physical instantiation of a CPU, generally, with a
 402  402   *      full set of hardware resources available to it. A package may contain
 403  403   *      multiple cores inside of it or it may just have a single one. A
 404  404   *      processor with more than one core is often referred to as 'multi-core'.
 405  405   *      In illumos, we will use the feature X86FSET_CMP to refer to a system
 406  406   *      that has 'multi-core' processors.
 407  407   *
 408  408   *      A core may expose a single logical CPU to the operating system, or it
 409  409   *      may expose multiple CPUs, which we call threads, defined below.
 410  410   *
 411  411   *      Some resources may still be shared by cores in the same package. For
 412  412   *      example, many processors will share the level 3 cache between cores.
 413  413   *      Some AMD generations share hardware resources between cores. For more
 414  414   *      information on that see the section 'AMD Topology'.
 415  415   *
 416  416   * THREAD and STRAND
 417  417   *
  418  418   *      In this file, generally a thread refers to a hardware resource and not
 419  419   *      the operating system's logical abstraction. A thread is always exposed
 420  420   *      as an independent logical CPU to the operating system. A thread belongs
 421  421   *      to a specific core. A core may have more than one thread. When that is
 422  422   *      the case, the threads that are part of the same core are often referred
 423  423   *      to as 'siblings'.
 424  424   *
 425  425   *      When multiple threads exist, this is generally referred to as
 426  426   *      simultaneous multi-threading (SMT). When Intel introduced this in their
 427  427   *      processors they called it hyper-threading (HT). When multiple threads
 428  428   *      are active in a core, they split the resources of the core. For example,
 429  429   *      two threads may share the same set of hardware execution units.
 430  430   *
 431  431   *      The operating system often uses the term 'strand' to refer to a thread.
 432  432   *      This helps disambiguate it from the software concept.
 433  433   *
 434  434   * CHIP
 435  435   *
 436  436   *      Unfortunately, the term 'chip' is dramatically overloaded. At its most
 437  437   *      base meaning, it is used to refer to a single integrated circuit, which
 438  438   *      may or may not be the only thing in the package. In illumos, when you
 439  439   *      see the term 'chip' it is almost always referring to the same thing as
 440  440   *      the 'package'. However, many vendors may use chip to refer to one of
 441  441   *      many integrated circuits that have been placed in the package. As an
 442  442   *      example, see the subsequent definition.
 443  443   *
 444  444   *      To try and keep things consistent, we will only use chip when referring
 445  445   *      to the entire integrated circuit package, with the exception of the
 446  446   *      definition of multi-chip module (because it is in the name) and use the
 447  447   *      term 'die' when we want the more general, potential sub-component
 448  448   *      definition.
 449  449   *
 450  450   * DIE
 451  451   *
 452  452   *      A die refers to an integrated circuit. Inside of the package there may
 453  453   *      be a single die or multiple dies. This is sometimes called a 'chip' in
 454  454   *      vendor's parlance, but in this file, we use the term die to refer to a
 455  455   *      subcomponent.
 456  456   *
 457  457   * MULTI-CHIP MODULE
 458  458   *
 459  459   *      A multi-chip module (MCM) refers to putting multiple distinct chips that
 460  460   *      are connected together in the same package. When a multi-chip design is
 461  461   *      used, generally each chip is manufactured independently and then joined
 462  462   *      together in the package. For example, on AMD's Zen microarchitecture
 463  463   *      (family 0x17), the package contains several dies (the second meaning of
 464  464   *      chip from above) that are connected together.
 465  465   *
 466  466   * CACHE
 467  467   *
 468  468   *      A cache is a part of the processor that maintains copies of recently
 469  469   *      accessed memory. Caches are split into levels and then into types.
 470  470   *      Commonly there are one to three levels, called level one, two, and
 471  471   *      three. The lower the level, the smaller it is, the closer it is to the
 472  472   *      execution units of the CPU, and the faster it is to access. The layout
 473  473   *      and design of the cache come in many different flavors, consult other
 474  474   *      resources for a discussion of those.
 475  475   *
 476  476   *      Caches are generally split into two types, the instruction and data
  477  477   *      cache. The caches contain what their names suggest: the instruction
 478  478   *      cache has executable program text, while the data cache has all other
 479  479   *      memory that the processor accesses. As of this writing, data is kept
 480  480   *      coherent between all of the caches on x86, so if one modifies program
 481  481   *      text before it is executed, that will be in the data cache, and the
 482  482   *      instruction cache will be synchronized with that change when the
 483  483   *      processor actually executes those instructions. This coherency also
 484  484   *      covers the fact that data could show up in multiple caches.
 485  485   *
 486  486   *      Generally, the lowest level caches are specific to a core. However, the
  487  487   *      last level cache is shared between some number of cores. The number of
 488  488   *      CPUs sharing this last level cache is important. This has implications
 489  489   *      for the choices that the scheduler makes, as accessing memory that might
 490  490   *      be in a remote cache after thread migration can be quite expensive.
 491  491   *
 492  492   *      Sometimes, the word cache is abbreviated with a '$', because in US
 493  493   *      English the word cache is pronounced the same as cash. So L1D$ refers to
 494  494   *      the L1 data cache, and L2$ would be the L2 cache. This will not be used
 495  495   *      in the rest of this theory statement for clarity.
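
For reference, a hedged sketch of walking the cache hierarchy with Intel's
deterministic cache parameters leaf (leaf 4; AMD offers an analogous leaf,
0x8000001D), reusing the illustrative cpuid() helper from earlier:

    static void
    print_caches(void)
    {
            uint32_t eax, ebx, ecx, edx, i;

            for (i = 0; ; i++) {
                    cpuid(4, i, &eax, &ebx, &ecx, &edx);
                    if ((eax & 0x1f) == 0)  /* cache type 0: no more caches */
                            break;
                    printf("L%u %s cache: %u bytes\n", (eax >> 5) & 0x7,
                        (eax & 0x1f) == 1 ? "data" :
                        (eax & 0x1f) == 2 ? "instruction" : "unified",
                        (((ebx >> 22) & 0x3ff) + 1) *   /* ways */
                        (((ebx >> 12) & 0x3ff) + 1) *   /* partitions */
                        ((ebx & 0xfff) + 1) *           /* line size */
                        (ecx + 1));                     /* sets */
            }
    }
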
 496  496   *
 497  497   * MEMORY CONTROLLER
 498  498   *
 499  499   *      The memory controller is a component that provides access to DRAM. Each
 500  500   *      memory controller can access a set number of DRAM channels. Each channel
 501  501   *      can have a number of DIMMs (sticks of memory) associated with it. A
 502  502   *      given package may have more than one memory controller. The association
 503  503   *      of the memory controller to a group of cores is important as it is
 504  504   *      cheaper to access memory on the controller that you are associated with.
 505  505   *
 506  506   * NUMA
 507  507   *
  508  508   *      NUMA, or non-uniform memory access, describes a way that systems are
 509  509   *      built. On x86, any processor core can address all of the memory in the
  510  510   *      system. However, when using multiple sockets or possibly within a
 511  511   *      multi-chip module, some of that memory is physically closer and some of
 512  512   *      it is further. Memory that is further away is more expensive to access.
 513  513   *      Consider the following image of multiple sockets with memory:
 514  514   *
 515  515   *      +--------+                                                +--------+
 516  516   *      | DIMM A |         +----------+      +----------+         | DIMM D |
 517  517   *      +--------+-+       |          |      |          |       +-+------+-+
 518  518   *        | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
 519  519   *        +--------+-+     |          |      |          |     +-+------+-+
 520  520   *          | DIMM C |     +----------+      +----------+     | DIMM F |
 521  521   *          +--------+                                        +--------+
 522  522   *
 523  523   *      In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
 524  524   *      closer to DIMMs D-F. This means that it is cheaper for socket 0 to
 525  525   *      access DIMMs A-C and more expensive to access D-F as it has to go
 526  526   *      through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
 527  527   *      D-F are cheaper than A-C. While the socket form is the most common, when
 528  528   *      using multi-chip modules, this can also sometimes occur. For another
 529  529   *      example of this that's more involved, see the AMD topology section.
 530  530   *
 531  531   *
 532  532   * Intel Topology
 533  533   * --------------
 534  534   *
  535  535   * Most Intel processors since Nehalem (as of this writing the current gen
 536  536   * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
 537  537   * the package is a single monolithic die. MCMs currently aren't used. Most
 538  538   * parts have three levels of caches, with the L3 cache being shared between
 539  539   * all of the cores on the package. The L1/L2 cache is generally specific to
 540  540   * an individual core. The following image shows at a simplified level what
 541  541   * this looks like. The memory controller is commonly part of something called
  542  542   * the 'Uncore', functionality that used to be in separate physical chips outside
  543  543   * the package but is now part of the same chip.
 544  544   *
 545  545   *  +-----------------------------------------------------------------------+
 546  546   *  | Package                                                               |
 547  547   *  |  +-------------------+  +-------------------+  +-------------------+  |
 548  548   *  |  | Core              |  | Core              |  | Core              |  |
 549  549   *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
 550  550   *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
 551  551   *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
 552  552   *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
 553  553   *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
 554  554   *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
 555  555   *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
 556  556   *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
 557  557   *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
 558  558   *  |  +-------------------+  +-------------------+  +-------------------+  |
 559  559   *  | +-------------------------------------------------------------------+ |
 560  560   *  | |                         Shared L3 Cache                           | |
 561  561   *  | +-------------------------------------------------------------------+ |
 562  562   *  | +-------------------------------------------------------------------+ |
 563  563   *  | |                        Memory Controller                          | |
 564  564   *  | +-------------------------------------------------------------------+ |
 565  565   *  +-----------------------------------------------------------------------+
 566  566   *
 567  567   * A side effect of this current architecture is that what we care about from a
 568  568   * scheduling and topology perspective, is simplified. In general we care about
 569  569   * understanding which logical CPUs are part of the same core and socket.
 570  570   *
 571  571   * To determine the relationship between threads and cores, Intel initially used
 572  572   * the identifier in the advanced programmable interrupt controller (APIC). They
 573  573   * also added cpuid leaf 4 to give additional information about the number of
 574  574   * threads and CPUs in the processor. With the addition of x2apic (which
  575  575   * increased the number of addressable logical CPUs from 8 bits to 32 bits), an
 576  576   * additional cpuid topology leaf 0xB was added.
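
A hedged sketch of walking leaf 0xB with the illustrative cpuid() helper from
earlier (field positions per the Intel SDM; this is not code from this file):

    static void
    print_x2apic_topo(void)
    {
            uint32_t eax, ebx, ecx, edx, level;

            for (level = 0; ; level++) {
                    cpuid(0xb, level, &eax, &ebx, &ecx, &edx);
                    if (((ecx >> 8) & 0xff) == 0)   /* type 0: no more levels */
                            break;
                    printf("level %u: type %u (1=SMT, 2=core), shift %u, "
                        "x2apic id 0x%x\n", level, (ecx >> 8) & 0xff,
                        eax & 0x1f, edx);
            }
    }
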
 577  577   *
 578  578   * AMD Topology
 579  579   * ------------
 580  580   *
 581  581   * When discussing AMD topology, we want to break this into three distinct
 582  582   * generations of topology. There's the basic topology that has been used in
 583  583   * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
 584  584   * with family 0x15 (Bulldozer), and there's the topology that was introduced
 585  585   * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
  586  586   * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
 587  587   * additional terminology that's worth talking about.
 588  588   *
 589  589   * Until the introduction of family 0x17 (Zen), AMD did not implement something
 590  590   * that they considered SMT. Whether or not the AMD processors have SMT
 591  591   * influences many things including scheduling and reliability, availability,
 592  592   * and serviceability (RAS) features.
 593  593   *
 594  594   * NODE
 595  595   *
 596  596   *      AMD uses the term node to refer to a die that contains a number of cores
 597  597   *      and I/O resources. Depending on the processor family and model, more
 598  598   *      than one node can be present in the package. When there is more than one
 599  599   *      node this indicates a multi-chip module. Usually each node has its own
  600  600   *      node, this indicates a multi-chip module. Usually each node has its own
 601  601   *      different from the corresponding Intel Nehalem-Skylake+ processors. As a
 602  602   *      result, we track this relationship in the operating system.
 603  603   *
 604  604   *      In processors with an L3 cache, the L3 cache is generally shared across
 605  605   *      the entire node, though the way this is carved up varies from generation
 606  606   *      to generation.
 607  607   *
 608  608   * BULLDOZER
 609  609   *
 610  610   *      Starting with the Bulldozer family (0x15) and continuing until the
 611  611   *      introduction of the Zen microarchitecture, AMD introduced the idea of a
 612  612   *      compute unit. In a compute unit, two traditional cores share a number of
 613  613   *      hardware resources. Critically, they share the FPU, L1 instruction
 614  614   *      cache, and the L2 cache. Several compute units were then combined inside
 615  615   *      of a single node.  Because the integer execution units, L1 data cache,
 616  616   *      and some other resources were not shared between the cores, AMD never
 617  617   *      considered this to be SMT.
 618  618   *
 619  619   * ZEN
 620  620   *
  621  621   *      The Zen family (0x17) uses a multi-chip module (MCM) design; the module
 622  622   *      is called Zeppelin. These modules are similar to the idea of nodes used
 623  623   *      previously. Each of these nodes has two DRAM channels which all of the
 624  624   *      cores in the node can access uniformly. These nodes are linked together
 625  625   *      in the package, creating a NUMA environment.
 626  626   *
 627  627   *      The Zeppelin die itself contains two different 'core complexes'. Each
 628  628   *      core complex consists of four cores which each have two threads, for a
 629  629   *      total of 8 logical CPUs per complex. Unlike other generations,
 630  630   *      where all the logical CPUs in a given node share the L3 cache, here each
 631  631   *      core complex has its own shared L3 cache.
 632  632   *
 633  633   *      A further thing that we need to consider is that in some configurations,
 634  634   *      particularly with the Threadripper line of processors, not every die
 635  635   *      actually has its memory controllers wired up to actual memory channels.
 636  636   *      This means that some cores have memory attached to them and others
 637  637   *      don't.
 638  638   *
 639  639   *      To put Zen in perspective, consider the following images:
 640  640   *
 641  641   *      +--------------------------------------------------------+
 642  642   *      | Core Complex                                           |
 643  643   *      | +-------------------+    +-------------------+  +---+  |
 644  644   *      | | Core       +----+ |    | Core       +----+ |  |   |  |
 645  645   *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
 646  646   *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
 647  647   *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
 648  648   *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
 649  649   *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
 650  650   *      | +-------------------+    +-------------------+  | C |  |
 651  651   *      | +-------------------+    +-------------------+  | a |  |
 652  652   *      | | Core       +----+ |    | Core       +----+ |  | c |  |
 653  653   *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
 654  654   *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
 655  655   *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
 656  656   *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
 657  657   *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
 658  658   *      | +-------------------+    +-------------------+  +---+  |
 659  659   *      |                                                        |
 660  660   *      +--------------------------------------------------------+
 661  661   *
 662  662   *  This first image represents a single Zen core complex that consists of four
 663  663   *  cores.
 664  664   *
 665  665   *
 666  666   *      +--------------------------------------------------------+
 667  667   *      | Zeppelin Die                                           |
 668  668   *      |  +--------------------------------------------------+  |
 669  669   *      |  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
 670  670   *      |  +--------------------------------------------------+  |
 671  671   *      |                           HH                           |
 672  672   *      |          +-----------+    HH    +-----------+          |
 673  673   *      |          |           |    HH    |           |          |
 674  674   *      |          |    Core   |==========|    Core   |          |
 675  675   *      |          |  Complex  |==========|  Complex  |          |
 676  676   *      |          |           |    HH    |           |          |
 677  677   *      |          +-----------+    HH    +-----------+          |
 678  678   *      |                           HH                           |
 679  679   *      |  +--------------------------------------------------+  |
 680  680   *      |  |                Memory Controller                 |  |
 681  681   *      |  +--------------------------------------------------+  |
 682  682   *      |                                                        |
 683  683   *      +--------------------------------------------------------+
 684  684   *
 685  685   *  This image represents a single Zeppelin Die. Note how both cores are
 686  686   *  connected to the same memory controller and I/O units. While each core
 687  687   *  complex has its own L3 cache as seen in the first image, they both have
 688  688   *  uniform access to memory.
 689  689   *
 690  690   *
 691  691   *                      PP                     PP
 692  692   *                      PP                     PP
 693  693   *           +----------PP---------------------PP---------+
 694  694   *           |          PP                     PP         |
 695  695   *           |    +-----------+          +-----------+    |
 696  696   *           |    |           |          |           |    |
 697  697   *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
 698  698   *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 699  699   *           |    |           |          |           |    |
 700  700   *           |    +-----------+ooo    ...+-----------+    |
 701  701   *           |          HH      ooo  ...       HH         |
 702  702   *           |          HH        oo..         HH         |
 703  703   *           |          HH        ..oo         HH         |
 704  704   *           |          HH      ...  ooo       HH         |
 705  705   *           |    +-----------+...    ooo+-----------+    |
 706  706   *           |    |           |          |           |    |
 707  707   *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
 708  708   *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
 709  709   *           |    |           |          |           |    |
 710  710   *           |    +-----------+          +-----------+    |
 711  711   *           |          PP                     PP         |
 712  712   *           +----------PP---------------------PP---------+
 713  713   *                      PP                     PP
 714  714   *                      PP                     PP
 715  715   *
 716  716   *  This image represents a single Zen package. In this example, it has four
 717  717   *  Zeppelin dies, though some configurations only have a single one. In this
 718  718   *  example, each die is directly connected to the next. Also, each die is
 719  719   *  represented as being connected to memory by the 'M' character and connected
 720  720   *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
 721  721   *  die is made up of two core complexes, we have multiple different NUMA
 722  722   *  domains that we care about for these systems.
 723  723   *
 724  724   * ZEN 2
 725  725   *
 726  726   *      Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
  727  727   *      each Zeppelin die contained its own I/O and memory controllers, in Zen 2
  728  728   *      that logic lives on a separate I/O die. The core complex looks similar, but
 729  729   *      now the die actually looks much simpler:
 730  730   *
 731  731   *      +--------------------------------------------------------+
 732  732   *      | Zen 2 Core Complex Die    HH                           |
 733  733   *      |                           HH                           |
 734  734   *      |          +-----------+    HH    +-----------+          |
 735  735   *      |          |           |    HH    |           |          |
 736  736   *      |          |    Core   |==========|    Core   |          |
 737  737   *      |          |  Complex  |==========|  Complex  |          |
 738  738   *      |          |           |    HH    |           |          |
 739  739   *      |          +-----------+    HH    +-----------+          |
 740  740   *      |                           HH                           |
 741  741   *      |                           HH                           |
 742  742   *      +--------------------------------------------------------+
 743  743   *
 744  744   *      From here, when we add the central I/O die, this changes things a bit.
 745  745   *      Each die is connected to the I/O die, rather than trying to interconnect
 746  746   *      them directly. The following image takes the same Zen 1 image that we
 747  747   *      had earlier and shows what it looks like with the I/O die instead:
 748  748   *
 749  749   *                                 PP    PP
 750  750   *                                 PP    PP
 751  751   *           +---------------------PP----PP---------------------+
 752  752   *           |                     PP    PP                     |
 753  753   *           |  +-----------+      PP    PP      +-----------+  |
 754  754   *           |  |           |      PP    PP      |           |  |
 755  755   *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
 756  756   *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
 757  757   *           |  |         |o|oooo|          |oooo|o|         |  |
 758  758   *           |  +-----------+    |          |    +-----------+  |
 759  759   *           |                   |   I/O    |                   |
 760  760   *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
 761  761   *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
 762  762   *           |                   |          |                   |
 763  763   *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
 764  764   *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
 765  765   *           |                   |          |                   |
 766  766   *           |  +-----------+    |          |    +-----------+  |
 767  767   *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
 768  768   *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
 769  769   *           |  |    Die    |      PP    PP      |    Die    |  |
 770  770   *           |  |           |      PP    PP      |           |  |
 771  771   *           |  +-----------+      PP    PP      +-----------+  |
 772  772   *           |                     PP    PP                     |
 773  773   *           +---------------------PP----PP---------------------+
 774  774   *                                 PP    PP
 775  775   *                                 PP    PP
 776  776   *
 777  777   *      The above has four core complex dies installed, though the Zen 2 EPYC
  778  778   *      and Threadripper parts allow for up to eight, while the Ryzen parts
 779  779   *      generally only have one to two. The more notable difference here is how
 780  780   *      everything communicates. Note that memory and PCIe come out of the
 781  781   *      central die. This changes the way that one die accesses a resource. It
  782  782   *      basically always has to go to the I/O die, whereas in Zen 1 it may have
 783  783   *      satisfied it locally. In general, this ends up being a better strategy
 784  784   *      for most things, though it is possible to still treat everything in four
 785  785   *      distinct NUMA domains with each Zen 2 die slightly closer to some memory
 786  786   *      and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
 787  787   *      now there is only one 'node' present.
 788  788   *
 789  789   * ZEN 3
 790  790   *
 791  791   *      From an architectural perspective, Zen 3 is a much smaller change from
 792  792   *      Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
 793  793   *      its microarchitectural changes. The biggest thing for us is how the die
 794  794   *      changes. In Zen 1 and Zen 2, each core complex still had its own L3
 795  795   *      cache. However, in Zen 3, the L3 is now shared between the entire core
 796  796   *      complex die and is no longer partitioned between each core complex. This
 797  797   *      means that all cores on the die can share the same L3 cache. Otherwise,
 798  798   *      the general layout of the overall package with various core complexes
 799  799   *      and an I/O die stays the same. Here's what the Core Complex Die looks
 800  800   *      like in a bit more detail:
 801  801   *
 802  802   *               +-------------------------------------------------+
 803  803   *               | Zen 3 Core Complex Die                          |
 804  804   *               | +-------------------+    +-------------------+  |
 805  805   *               | | Core       +----+ |    | Core       +----+ |  |
 806  806   *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
 807  807   *               | | | Thread | +----+ |    | | Thread | +----+ |  |
 808  808   *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
 809  809   *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
 810  810   *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
 811  811   *               | +-------------------+    +-------------------+  |
 812  812   *               | +-------------------+    +-------------------+  |
 813  813   *               | | Core       +----+ |    | Core       +----+ |  |
 814  814   *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
 815  815   *               | | | Thread | +----+ |    | | Thread | +----+ |  |
 816  816   *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
 817  817   *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
 818  818   *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
 819  819   *               | +-------------------+    +-------------------+  |
 820  820   *               |                                                 |
 821  821   *               | +--------------------------------------------+  |
 822  822   *               | |                 L3 Cache                   |  |
 823  823   *               | +--------------------------------------------+  |
 824  824   *               |                                                 |
 825  825   *               | +-------------------+    +-------------------+  |
 826  826   *               | | Core       +----+ |    | Core       +----+ |  |
 827  827   *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
 828  828   *               | | | Thread | +----+ |    | | Thread | +----+ |  |
 829  829   *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
 830  830   *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
 831  831   *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
 832  832   *               | +-------------------+    +-------------------+  |
 833  833   *               | +-------------------+    +-------------------+  |
 834  834   *               | | Core       +----+ |    | Core       +----+ |  |
 835  835   *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
 836  836   *               | | | Thread | +----+ |    | | Thread | +----+ |  |
 837  837   *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
 838  838   *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
 839  839   *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
 840  840   *               | +-------------------+    +-------------------+  |
 841  841   *               +-------------------------------------------------+
 842  842   *
 843  843   *      While it is not pictured, there are connections from the die to the
 844  844   *      broader data fabric and additional functional blocks to support that
 845  845   *      communication and coherency.
 846  846   *
 847  847   * CPUID LEAVES
 848  848   *
 849  849   * There are a few different CPUID leaves that we can use to try and understand
 850  850   * the actual state of the world. As part of the introduction of family 0xf, AMD
 851  851   * added CPUID leaf 0x80000008. This leaf tells us the number of logical
 852  852   * processors that are in the system. Because families before Zen didn't have
 853  853   * SMT, this was always the number of cores that were in the system. However, it
 854  854   * should always be thought of as the number of logical threads to be consistent
 855  855   * between generations. In addition, we get the size of the APIC ID that is
 856  856   * used to represent the number of logical processors. This is important for
 857  857   * deriving topology information.
 858  858   *
 859  859   * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
 860  860   * bit between Bulldozer and later families, but it is quite useful in
 861  861   * determining the topology information. Because this information has changed
 862  862   * across family generations, it's worth calling out what these mean
 863  863   * explicitly. The registers have the following meanings:
 864  864   *
 865  865   *      %eax    The APIC ID. The entire register is defined to have a 32-bit
 866  866   *              APIC ID, even though on systems without x2apic support, it will
 867  867   *              be limited to 8 bits.
 868  868   *
 869  869   *      %ebx    On Bulldozer-era systems this contains information about the
 870  870   *              number of cores that are in a compute unit (cores that share
 871  871   *              resources). It also contains a per-package compute unit ID that
 872  872   *              identifies which compute unit the logical CPU is a part of.
 873  873   *
 874  874   *              On Zen-era systems this instead contains the number of threads
 875  875   *              per core and the ID of the core that the logical CPU is a part
 876  876   *              of. Note, this ID is unique only to the package, it is not
 877  877   *              globally unique across the entire system.
 878  878   *
 879  879   *      %ecx    This contains the number of nodes that exist in the package. It
 880  880   *              also contains an ID that identifies which node the logical CPU
 881  881   *              is a part of.
 882  882   *
 883  883   * Finally, we also use cpuid leaf 0x8000001D to determine information about the
 884  884   * cache layout to determine which logical CPUs are sharing which caches.
 885  885   *
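            * As a concrete, hedged illustration (a sketch rather than the exact code
            * used later in this file), leaf 0x8000001E could be decoded for a Zen-era
            * AMD part using the same struct cpuid_regs/__cpuid_insn interfaces and the
            * BITX() macro that appear elsewhere in this file; the bit positions follow
            * AMD's published documentation:
            *
            *      struct cpuid_regs cp = { 0 };
            *
            *      cp.cp_eax = 0x8000001E;
            *      (void) __cpuid_insn(&cp);
            *
            *      uint32_t apicid = cp.cp_eax;              // full 32-bit APIC ID
            *      uint_t nthr = BITX(cp.cp_ebx, 15, 8) + 1; // threads per core
            *      uint_t coreid = BITX(cp.cp_ebx, 7, 0);    // unique per package only
            *      uint_t nnodes = BITX(cp.cp_ecx, 10, 8) + 1; // nodes per package
            *      uint_t nodeid = BITX(cp.cp_ecx, 7, 0);
            *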
 886  886   * illumos Topology
 887  887   * ----------------
 888  888   *
 889  889   * Based on the above we synthesize the information into several different
 890  890   * variables that we store in the 'struct cpuid_info'. We'll go into the details
 891  891   * of what each member is supposed to represent and their uniqueness. In
 892  892   * general, there are two levels of uniqueness that we care about. We care about
 893  893   * an ID that is globally unique. That means that it will be unique across all
 894  894   * entities in the system. For example, the default logical CPU ID is globally
 895  895   * unique. On the other hand, there is some information that we only care about
 896  896   * being unique within the context of a single package / socket. Here are the
 897  897   * variables that we keep track of and their meaning.
 898  898   *
 899  899   * Several of the values that serve as an identifier, with the exception of
 900  900   * cpi_apicid, are allowed to be synthetic.
 901  901   *
 902  902   *
 903  903   * cpi_apicid
 904  904   *
 905  905   *      This is the value of the CPU's APIC id. This should be the full 32-bit
 906  906   *      ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
 907  907   *      APIC ID. This value is globally unique between all logical CPUs across
 908  908   *      all packages. This is usually required by the APIC.
 909  909   *
 910  910   * cpi_chipid
 911  911   *
 912  912   *      This value indicates the ID of the package that the logical CPU is a
 913  913   *      part of. This value is allowed to be synthetic. It is usually derived by
 914  914   *      taking the CPU's APIC ID and determining how many bits are used to
 915  915   *      represent CPU cores in the package. All logical CPUs that are part of
 916  916   *      the same package must have the same value.
 917  917   *
 918  918   * cpi_coreid
 919  919   *
 920  920   *      This represents the ID of a CPU core. Two logical CPUs should only have
 921  921   *      the same cpi_coreid value if they are part of the same core. These
 922  922   *      values may be synthetic. On systems that support SMT, this value is
 923  923   *      usually derived from the APIC ID, otherwise it is often synthetic and
 924  924   *      just set to the value of the cpu_id in the cpu_t.
 925  925   *
 926  926   * cpi_pkgcoreid
 927  927   *
 928  928   *      This is similar to the cpi_coreid in that logical CPUs that are part of
 929  929   *      the same core should have the same ID. The main difference is that these
 930  930   *      values are only required to be unique to a given socket.
 931  931   *
 932  932   * cpi_clogid
 933  933   *
 934  934   *      This represents the logical ID of a logical CPU. This value should be
 935  935   *      unique within a given socket for each logical CPU. This is allowed to be
 936  936   *      synthetic, though it is usually based on the CPU's APIC ID. The
 937  937   *      broader system expects that logical CPUs that are part of the same
 938  938   *      core have contiguous numbers. For example, if there were two threads per
 939  939   *      core, then the two IDs divided by two should be the same, the first ID
 940  940   *      modulo two should be zero, and the second should be one. IDs 4 and 5
 941  941   *      indicate two logical CPUs that are part of the same core. But IDs 5 and
 942  942   *      6 represent two logical CPUs that are part of different cores.
 943  943   *
 944  944   *      While it is common for the cpi_coreid and the cpi_clogid to be derived
 945  945   *      from the same source, strictly speaking, they don't have to be and the
 946  946   *      two values should be considered logically independent. One should not
 947  947   *      try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
 948  948   *      some kind of relationship. While this is tempting, we've seen cases on
 949  949   *      AMD family 0xf where the system's cpu id is not related to its APIC ID.
 950  950   *
 951  951   * cpi_ncpu_per_chip
 952  952   *
 953  953   *      This value indicates the total number of logical CPUs that exist in the
 954  954   *      physical package. Critically, this is not the number of logical CPUs
 955  955   *      that exist for just the single core.
 956  956   *
 957  957   *      This value should be the same for all logical CPUs in the same package.
 958  958   *
 959  959   * cpi_ncore_per_chip
 960  960   *
 961  961   *      This value indicates the total number of physical CPU cores that exist
 962  962   *      in the package. The system compares this value with cpi_ncpu_per_chip to
 963  963   *      determine if simultaneous multi-threading (SMT) is enabled. When
 964  964   *      cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
 965  965   *      the X86FSET_HTT feature is not set. If this value is greater than one,
 966  966   *      then we consider the processor to have the feature X86FSET_CMP, to
 967  967   *      indicate support for more than one core (see the sketch after this list).
 968  968   *
 969  969   *      This value should be the same for all logical CPUs in the same package.
 970  970   *
 971  971   * cpi_procnodes_per_pkg
 972  972   *
 973  973   *      This value indicates the number of 'nodes' that exist in the package.
 974  974   *      When a processor is actually a multi-chip module, this represents the
 975  975   *      number of such modules that exist in the package. Currently, on Intel
 976  976   *      based systems this member is always set to 1.
 977  977   *
 978  978   *      This value should be the same for all logical CPUs in the same package.
 979  979   *
 980  980   * cpi_procnodeid
 981  981   *
 982  982   *      This value indicates the ID of the node that the logical CPU is a part
 983  983   *      of. All logical CPUs that are in the same node must have the same value
 984  984   *      here. This value must be unique across all of the packages in the
 985  985   *      system.  On Intel based systems, this is currently set to the value in
 986  986   *      cpi_chipid because there is only one node.
 987  987   *
 988  988   * cpi_cores_per_compunit
 989  989   *
 990  990   *      This value indicates the number of cores that are part of a compute
 991  991   *      unit. See the AMD topology section for this. This member only has real
 992  992   *      meaning currently for AMD Bulldozer family processors. For all other
 993  993   *      processors, this should currently be set to 1.
 994  994   *
 995  995   * cpi_compunitid
 996  996   *
 997  997   *      This indicates the compute unit that the logical CPU belongs to. For
 998  998   *      processors without AMD Bulldozer-style compute units this should be set
 999  999   *      to the value of cpi_coreid.
1000 1000   *
1001 1001   * cpi_ncpu_shr_last_cache
1002 1002   *
1003 1003   *      This indicates the number of logical CPUs that are sharing the same last
1004 1004   *      level cache. This value should be the same for all CPUs that are sharing
1005 1005   *      that cache. The last cache refers to the cache that is closest to memory
1006 1006   *      and furthest away from the CPU.
1007 1007   *
1008 1008   * cpi_last_lvl_cacheid
1009 1009   *
1010 1010   *      This indicates the ID of the last cache that the logical CPU uses. This
1011 1011   *      cache is often shared between multiple logical CPUs and is the cache
1012 1012   *      that is closest to memory and furthest away from the CPU. This value
1013 1013   *      should be the same for a group of logical CPUs only if they actually
1014 1014   *      share the same last level cache. IDs should not overlap between
1015 1015   *      packages.
1016 1016   *
1017 1017   * cpi_ncore_bits
1018 1018   *
1019 1019   *      This indicates the number of bits that are required to represent all of
1020 1020   *      the cores in the system. As cores are derived based on their APIC IDs,
1021 1021   *      we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1022 1022   *      this value to be larger than the actual number of IDs that are present
1023 1023   *      in the system. This is used to size tables by the CMI framework. It is
1024 1024   *      only filled in for Intel and AMD CPUs.
1025 1025   *
1026 1026   * cpi_nthread_bits
1027 1027   *
1028 1028   *      This indicates the number of bits required to represent all of the IDs
1029 1029   *      that cover the logical CPUs that exist on a given core. It's OK for this
1030 1030   *      value to be larger than the actual number of IDs that are present in the
1031 1031   *      system.  This is used to size tables by the CMI framework. It is
1032 1032   *      only filled in for Intel and AMD CPUs.
1033 1033   *
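            * As referenced in the cpi_ncore_per_chip entry above, here is a condensed,
            * hedged sketch of how these members relate to the SMT/CMP feature bits and
            * to sibling threads; it is illustrative only and not the exact logic of
            * cpuid_pass_basic():
            *
            *      if (cpi->cpi_ncpu_per_chip > cpi->cpi_ncore_per_chip)
            *              add_x86_feature(featureset, X86FSET_HTT);
            *      if (cpi->cpi_ncore_per_chip > 1)
            *              add_x86_feature(featureset, X86FSET_CMP);
            *
            *      // With two threads per core, sibling logical CPUs share
            *      // cpi_clogid >> 1 and differ only in cpi_clogid & 1.
            *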
1034 1034   * -----------
1035 1035   * Hypervisors
1036 1036   * -----------
1037 1037   *
1038 1038   * If trying to manage the differences between vendors wasn't bad enough, it can
1039 1039   * get worse thanks to our friend hardware virtualization. Hypervisors are given
1040 1040   * the ability to interpose on all cpuid instructions and change them to suit
1041 1041   * their purposes. In general, this is necessary as the hypervisor wants to be
1042 1042   * able to present a more uniform set of features or not necessarily give the
1043 1043   * guest operating system kernel knowledge of all features so it can be
1044 1044   * more easily migrated between systems.
1045 1045   *
1046 1046   * When it comes to trying to determine topology information, this can be a
1047 1047   * double edged sword. When a hypervisor doesn't actually implement a cpuid
1048 1048   * leaf, it'll often return all zeros. Because of that, you'll often see various
1049 1049   * checks scattered about fields being non-zero before we assume we can use
1050 1050   * them.
1051 1051   *
1052 1052   * When it comes to topology information, the hypervisor is often incentivized
1053 1053   * to lie to you about topology. This is because it doesn't always actually
1054 1054   * guarantee that topology at all. The topology path we take in the system
1055 1055   * depends on how the CPU advertises itself. If it advertises itself as an Intel
1056 1056   * or AMD CPU, then we basically do our normal path. However, when they don't
1057 1057   * advertise an actual vendor, that usually turns into enumerating multiple
1058 1058   * one-core CPUs that are often on different sockets. The actual behavior
1059 1059   * depends greatly on what the hypervisor actually exposes to us.
1060 1060   *
1061 1061   * --------------------
1062 1062   * Exposing Information
1063 1063   * --------------------
1064 1064   *
1065 1065   * We expose CPUID information in three different forms in the system.
1066 1066   *
1067 1067   * The first is through the x86_featureset variable. This is used in conjunction
1068 1068   * with the is_x86_feature() function. This is queried by x86-specific functions
1069 1069   * to determine which features are or aren't present in the system and to make
1070 1070   * decisions based upon them. For example, users of this include everything from
1071 1071   * parts of the system dedicated to reliability, availability, and
1072 1072   * serviceability (RAS), to making decisions about how to handle security
1073 1073   * mitigations, to various x86-specific drivers. General purpose or
1074 1074   * architecture independent drivers should never be calling this function.
1075 1075   *
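            * For instance, an x86-specific kernel consumer would typically do something
            * like the following (a minimal sketch; X86FSET_AVX is one of the feature
            * bits defined in x86_archext.h):
            *
            *      if (is_x86_feature(x86_featureset, X86FSET_AVX)) {
            *              // AVX-dependent code paths may be used
            *      }
            *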
1076 1076   * The second means is through the auxiliary vector. The auxiliary vector is a
1077 1077   * series of tagged data that the kernel passes down to a user program when it
1078 1078   * begins executing. This information is used to indicate to programs what
1079 1079   * instruction set extensions are present. For example, information about the
1080 1080   * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1081 1081   * since user programs cannot make use of it. However, things like the AVX
1082 1082   * instruction sets are. Programs use this information to make run-time
1083 1083   * decisions about what features they should use. As an example, the run-time
1084 1084   * link-editor (rtld) can relocate different functions depending on the hardware
1085 1085   * support available.
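            *
            * From userland, this boils down to checking the hardware capability bits; a
            * minimal sketch using getisax(3C) and the AV_386_* flags from
            * <sys/auxv_386.h> might look like this (hedged, not taken from any
            * particular program):
            *
            *      uint32_t hw[2] = { 0, 0 };
            *
            *      (void) getisax(hw, 2);
            *      if (hw[0] & AV_386_AVX) {
            *              // the AVX instruction set may be used
            *      }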
1086 1086   *
1087 1087   * The final form is through a series of accessor functions that all have the
1088 1088   * form cpuid_get*. This is used by a number of different subsystems in the
1089 1089   * kernel to determine more detailed information about what we're running on,
1090 1090   * topology information, etc. Some of these subsystems include processor groups
1091 1091   * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1092 1092   * microcode, and performance monitoring. These functions all ASSERT that the
1093 1093   * CPU they're being called on has reached a certain cpuid pass. If the passes
1094 1094   * are rearranged, then this needs to be adjusted.
1095 1095   *
1096 1096   * -----------------------------------------------
1097 1097   * Speculative Execution CPU Side Channel Security
1098 1098   * -----------------------------------------------
1099 1099   *
1100 1100   * With the advent of the Spectre and Meltdown attacks which exploit speculative
1101 1101   * execution in the CPU to create side channels there have been a number of
1102 1102   * different attacks and corresponding issues that the operating system needs to
1103 1103   * mitigate against. The following is a common, though not exhaustive, set of
1104 1104   * issues that we know about and for which we have done some work, or still
1105 1105   * need to do more work, in the system to mitigate:
1106 1106   *
1107 1107   *   - Spectre v1
1108 1108   *   - swapgs (Spectre v1 variant)
1109 1109   *   - Spectre v2
1110 1110   *   - Meltdown (Spectre v3)
1111 1111   *   - Rogue Register Read (Spectre v3a)
1112 1112   *   - Speculative Store Bypass (Spectre v4)
1113 1113   *   - ret2spec, SpectreRSB
1114 1114   *   - L1 Terminal Fault (L1TF)
1115 1115   *   - Microarchitectural Data Sampling (MDS)
1116 1116   *
1117 1117   * Each of these requires different sets of mitigations and has different attack
1118 1118   * surfaces. For the most part, this discussion is about protecting the kernel
1119 1119   * from non-kernel executing environments such as user processes and hardware
1120 1120   * virtual machines. Unfortunately, there are a number of user vs. user
1121 1121   * scenarios that exist with these. The rest of this section will describe the
1122 1122   * overall approach that the system has taken to address these as well as their
1123 1123   * shortcomings. Unfortunately, not all of the above have been handled today.
1124 1124   *
1125 1125   * SPECTRE v2, ret2spec, SpectreRSB
1126 1126   *
1127 1127   * The second variant of the spectre attack focuses on performing branch target
1128 1128   * injection. This generally impacts indirect call instructions in the system.
1129 1129   * There are four different ways to mitigate this issue that are commonly
1130 1130   * described today:
1131 1131   *
1132 1132   *  1. Using Indirect Branch Restricted Speculation (IBRS).
1133 1133   *  2. Using Retpolines and RSB Stuffing
1134 1134   *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1135 1135   *  4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1136 1136   *
1137 1137   * IBRS uses a feature added to microcode to restrict speculation, among other
1138 1138   * things. This form of mitigation has not been used as it has been generally
1139 1139   * seen as too expensive and requires reactivation upon various transitions in
1140 1140   * the system.
1141 1141   *
1142 1142   * As a less impactful alternative to IBRS, retpolines were developed by
1143 1143   * Google. These basically require one to replace indirect calls with a specific
1144 1144   * trampoline that will cause speculation to fail and break the attack.
1145 1145   * Retpolines require compiler support. We always build with retpolines in the
1146 1146   * external thunk mode. This means that a traditional indirect call is replaced
1147 1147   * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1148 1148   * of this is that all indirect function calls are performed through a register.
1149 1149   *
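            * To make that concrete, an ordinary indirect call such as the following,
            * when compiled with the external-thunk retpoline option (e.g. GCC's
            * -mindirect-branch=thunk-extern), is emitted as a direct call to the thunk
            * for whichever register holds the target (a hedged illustration, not
            * generated output):
            *
            *      void (*fn)(void) = handler;     // hypothetical function pointer
            *      fn();                           // emitted as, roughly,
            *                                      // "call __x86_indirect_thunk_rax"
            *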
1150 1150   * We have to use a common external location of the thunk and not inline it into
1151 1151   * the callsite so that we have a single place to patch these functions.
1152 1152   * As it turns out, we currently have two different forms of retpolines that
1153 1153   * exist in the system:
1154 1154   *
1155 1155   *  1. A full retpoline
1156 1156   *  2. A no-op version
1157 1157   *
1158 1158   * The first one is used in the general case. Historically, there was an
1159 1159   * AMD-specific optimized retpoline variant that was based around using a
1160 1160   * serializing lfence instruction; however, in March 2022 it was announced that
1161 1161   * this was actually still vulnerable to Spectre v2 and therefore we no longer
1162 1162   * use it and it is no longer available in the system.
1163 1163   *
1164 1164   * The third form described above is the most curious. It turns out that the way
1165 1165   * that retpolines are implemented is that they rely on how speculation is
1166 1166   * performed on a 'ret' instruction. Intel has continued to optimize this
1167 1167   * process (which is partly why we need to have return stack buffer stuffing,
1168 1168   * but more on that in a bit) and in processors starting with Cascade Lake
1169 1169   * on the server side, it's dangerous to rely on retpolines. Instead, a new
1170 1170   * mechanism has been introduced called Enhanced IBRS (eIBRS).
1171 1171   *
1172 1172   * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1173 1173   * physical core. However, if this is the case, we don't want to use retpolines
1174 1174   * any more. Therefore if eIBRS is present, we end up turning each retpoline
1175 1175   * function (called a thunk) into a jmp instruction. This means that we're still
1176 1176   * paying the cost of an extra jump to the external thunk, but it gives us
1177 1177   * flexibility and the ability to have a single kernel image that works across a
1178 1178   * wide variety of systems and hardware features.
1179 1179   *
1180 1180   * Unfortunately, this alone is insufficient. First, Skylake systems have
1181 1181   * additional speculation for the Return Stack Buffer (RSB) which is used to
1182 1182   * return from call instructions which retpolines take advantage of. However,
1183 1183   * this problem is not just limited to Skylake and is actually more pernicious.
1184 1184   * The SpectreRSB paper introduces several more problems that can arise with
1185 1185   * dealing with this. The RSB can be poisoned just like the indirect branch
1186 1186   * predictor. This means that one needs to clear the RSB when transitioning
1187 1187   * between two different privilege domains. Some examples include:
1188 1188   *
1189 1189   *  - Switching between two different user processes
1190 1190   *  - Going between user land and the kernel
1191 1191   *  - Returning to the kernel from a hardware virtual machine
1192 1192   *
1193 1193   * Mitigating this involves combining a couple of different things. The first is
1194 1194   * SMEP (supervisor mode execution protection) which was introduced in Ivy
1195 1195   * Bridge. When an RSB entry refers to a user address and we're executing in the
1196 1196   * kernel, speculation through it will be stopped when SMEP is enabled. This
1197 1197   * protects against a number of the different cases that we would normally be
1198 1198   * worried about such as when we enter the kernel from user land.
1199 1199   *
1200 1200   * To protect against additional manipulation of the RSB from other contexts
1201 1201   * such as a non-root VMX context attacking the kernel we first look to
1202 1202   * enhanced IBRS. When eIBRS is present and enabled, then there should be
1203 1203   * nothing else that we need to do to protect the kernel at this time.
1204 1204   *
1205 1205   * Unfortunately, eIBRS or not, we need to manually overwrite the contents of
1206 1206   * the return stack buffer. We do this through the x86_rsb_stuff() function.
1207 1207   * Currently this is employed on context switch and vmx_exit. The
1208 1208   * x86_rsb_stuff() function is disabled only when mitigations in general are.
1209 1209   *
1210 1210   * If SMEP is not present, then we would have to stuff the RSB every time we
1211 1211   * transitioned from user mode to the kernel, which isn't very practical right
1212 1212   * now.
1213 1213   *
1214 1214   * To fully protect user to user and vmx to vmx attacks from these classes of
1215 1215   * issues, we would also need to allow them to opt into performing an Indirect
1216 1216   * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1217 1217   *
1218 1218   * The fourth form of mitigation here is specific to AMD and is called Automated
1219 1219   * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1220 1220   * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1221 1221   * (extended feature enable register) MSR. This bit basically says that IBRS
1222 1222   * acts as though it is always active when executing at CPL0 and when executing
1223 1223   * in the 'host' context when SEV-SNP is enabled.
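            *
            * Conceptually, enabling this is just an MSR read-modify-write; the sketch
            * below is illustrative only and the bit position (21, the Automatic IBRS
            * enable) is taken from AMD's documentation rather than from a constant
            * defined here:
            *
            *      uint64_t efer = rdmsr(MSR_AMD_EFER);
            *      efer |= 1ULL << 21;             // Automatic IBRS enable
            *      wrmsr(MSR_AMD_EFER, efer);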
1224 1224   *
1225 1225   * When this is active, AMD states that the RSB is cleared on VMEXIT and
1226 1226   * therefore stuffing it there is unnecessary. While this covers RSB attacks from SVM
1227 1227   * to the kernel, we must still consider the remaining cases that exist, just
1228 1228   * like above. While traditionally AMD employed a 32 entry RSB allowing the
1229 1229   * traditional technique to work, this is not true on all CPUs. While a write to
1230 1230   * IBRS would clear the RSB if the processor supports more than 32 entries (but
1231 1231   * not otherwise), AMD states that as long as at least a single 4 KiB unmapped
1232 1232   * guard page is present between user and kernel address spaces and SMEP is
1233 1233   * enabled, then there is no need to clear the RSB at all.
1234 1234   *
1235 1235   * By default, the system will enable RSB stuffing and the required variant of
1236 1236   * retpolines and store that information in the x86_spectrev2_mitigation value.
1237 1237   * This will be evaluated after a microcode update as well, though it is
1238 1238   * expected that microcode updates will not take away features. This may mean
1239 1239   * that a late loaded microcode may not end up in the optimal configuration
1240 1240   * (though this should be rare).
1241 1241   *
1242 1242   * Currently we do not build kmdb with retpolines or perform any additional side
1243 1243   * channel security mitigations for it. One complication with kmdb is that it
1244 1244   * requires its own retpoline thunks and it would need to adjust itself based on
1245 1245   * what the kernel does. The threat model of kmdb is more limited and therefore
1246 1246   * it may make more sense to investigate using prediction barriers as the whole
1247 1247   * system is only executing a single instruction at a time while in kmdb.
1248 1248   *
1249 1249   * SPECTRE v1, v4
1250 1250   *
1251 1251   * The v1 and v4 variants of spectre are not currently mitigated in the
1252 1252   * system and require other classes of changes to occur in the code.
1253 1253   *
1254 1254   * SPECTRE v1 (SWAPGS VARIANT)
1255 1255   *
1256 1256   * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1257 1257   * can generally affect any branch-dependent code. The swapgs issue is one
1258 1258   * variant of this. If we are coming in from userspace, we can have code like
1259 1259   * this:
1260 1260   *
1261 1261   *      cmpw    $KCS_SEL, REGOFF_CS(%rsp)
1262 1262   *      je      1f
1263 1263   *      movq    $0, REGOFF_SAVFP(%rsp)
1264 1264   *      swapgs
1265 1265   *      1:
1266 1266   *      movq    %gs:CPU_THREAD, %rax
1267 1267   *
1268 1268   * If an attacker can cause a mis-speculation of the branch here, we could skip
1269 1269   * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1270 1270   * load. If subsequent code can act as the usual Spectre cache gadget, this
1271 1271   * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1272 1272   * any use of the %gs override.
1273 1273   *
1274 1274   * The other case is also an issue: if we're coming into a trap from kernel
1275 1275   * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1276 1276   * using it. AMD systems are not vulnerable to this version, as a swapgs is
1277 1277   * serializing with respect to subsequent uses. But as AMD /does/ need the other
1278 1278   * case, and the fix is the same in both cases (an lfence at the branch target
1279 1279   * 1: in this example), we'll just do it unconditionally.
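            *
            * With the mitigation in place, the shape of the entry sequence above
            * becomes (a sketch of the fix, not the literal kernel text):
            *
            *      cmpw    $KCS_SEL, REGOFF_CS(%rsp)
            *      je      1f
            *      movq    $0, REGOFF_SAVFP(%rsp)
            *      swapgs
            *      1:
            *      lfence
            *      movq    %gs:CPU_THREAD, %rax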
1280 1280   *
1281 1281   * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1282 1282   * harder for user-space to actually set a useful %gsbase value: although it's
1283 1283   * not clear, it might still be feasible via lwp_setprivate(), though, so we
1284 1284   * mitigate anyway.
1285 1285   *
1286 1286   * MELTDOWN
1287 1287   *
1288 1288   * Meltdown, or Spectre v3, allowed a user process to read any data mapped in
1289 1289   * its address space regardless of whether or not the page tables in question
1290 1290   * allowed the user to read it. The solution to Meltdown
1291 1291   * is kernel page table isolation. In this world, there are two page tables that
1292 1292   * are used for a process, one in user land and one in the kernel. To implement
1293 1293   * this we use per-CPU page tables and switch between the user and kernel
1294 1294   * variants when entering and exiting the kernel.  For more information about
1295 1295   * this process and how the trampolines work, please see the big theory
1296 1296   * statements and additional comments in:
1297 1297   *
1298 1298   *  - uts/i86pc/ml/kpti_trampolines.s
1299 1299   *  - uts/i86pc/vm/hat_i86.c
1300 1300   *
1301 1301   * While Meltdown only impacted Intel systems, and there are now Intel systems
1302 1302   * with Meltdown (also called Rogue Data Cache Load) fixed, we always have
1303 1303   * kernel page table isolation enabled. While this may at first seem weird, an
1304 1304   * important thing to remember is that you can't speculatively read an address
1305 1305   * if it's never in your page table at all. Having user processes without kernel
1306 1306   * pages present provides us with an important layer of defense in the kernel
1307 1307   * against any other side channel attacks that exist and have yet to be
1308 1308   * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1309 1309   * default, no matter the x86 system.
1310 1310   *
1311 1311   * L1 TERMINAL FAULT
1312 1312   *
1313 1313   * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1314 1314   * execution uses page table entries. Effectively, it is two different problems.
1315 1315   * The first is that it ignores the not present bit in the page table entries
1316 1316   * when performing speculative execution. This means that something can
1317 1317   * speculatively read the listed physical address if it's present in the L1
1318 1318   * cache under certain conditions (see Intel's documentation for the full set of
1319 1319   * conditions). Secondly, this can be used to bypass hardware virtualization
1320 1320   * extended page tables (EPT) that are part of Intel's hardware virtual machine
1321 1321   * instructions.
1322 1322   *
1323 1323   * For the non-hardware virtualized case, this is relatively easy to deal with.
1324 1324   * We must make sure that all unmapped pages have an address of zero. This means
1325 1325   * that they could read the first 4k of physical memory; however, we never use
1326 1326   * that first page in the operating system and always skip putting it in our
1327 1327   * memory map, even if firmware tells us we can use it in our memory map. While
1328 1328   * other systems try to put extra metadata in the address and reserved bits,
1329 1329   * which led to this being problematic in those cases, we do not.
1330 1330   *
1331 1331   * For hardware virtual machines things are more complicated. Because they can
1332 1332   * construct their own page tables, it isn't hard for them to perform this
1333 1333   * attack against any physical address. The one wrinkle is that this physical
1334 1334   * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1335 1335   * to flush the L1 data cache. We wrap this up in the function
1336 1336   * spec_uarch_flush(). This function is also used in the mitigation of
1337 1337   * microarchitectural data sampling (MDS) discussed later on. Kernel based
1338 1338   * hypervisors such as KVM or bhyve are responsible for performing this before
1339 1339   * entering the guest.
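            *
            * At its core the flush is a single MSR write; the following is a hedged
            * sketch (the MSR and bit names are the conventional ones for
            * IA32_FLUSH_CMD and may not match the constants used in this codebase):
            *
            *      wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);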
1340 1340   *
1341 1341   * Because this attack takes place in the L1 cache, there's another wrinkle
1342 1342   * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1343 1343   * designs. This means that when a thread enters a hardware virtualized context
1344 1344   * and flushes the L1 data cache, the other thread on the processor may then go
1345 1345   * ahead and put new data in it that can be potentially attacked. While one
1346 1346   * solution is to disable SMT on the system, another option that is available is
1347 1347   * to use a feature for hardware virtualization called 'SMT exclusion'. This
1348 1348   * goes through and makes sure that if a HVM is being scheduled on one thread,
1349 1349   * then the thing on the other thread is from the same hardware virtual machine.
1350 1350   * If an interrupt comes in or the guest exits to the broader system, then the
1351 1351   * other SMT thread will be kicked out.
1352 1352   *
1353 1353   * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1354 1354   * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1355 1355   * perform L1TF related mitigations.
1356 1356   *
1357 1357   * MICROARCHITECTURAL DATA SAMPLING
1358 1358   *
1359 1359   * Microarchitectural data sampling (MDS) is a combination of four discrete
1360 1360   * vulnerabilities that are similar issues affecting various parts of the CPU's
1361 1361   * microarchitectural implementation around load, store, and fill buffers.
1362 1362   * Specifically it is made up of the following subcomponents:
1363 1363   *
1364 1364   *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1365 1365   *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1366 1366   *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1367 1367   *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1368 1368   *
1369 1369   * To begin addressing these, Intel has introduced another feature in microcode
1370 1370   * called MD_CLEAR. This changes the verw instruction to operate in a different
1371 1371   * way. This allows us to execute the verw instruction in a particular way to
1372 1372   * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1373 1373   * updated when this microcode is present to flush this state.
1374 1374   *
1375 1375   * Primarily we need to flush this state whenever we transition from the kernel
1376 1376   * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1377 1377   * little bit different. Here the structures are statically sized when a logical
1378 1378   * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1379 1379   * flush the microarchitectural state before the CPU goes idle by calling hlt,
1380 1380   * mwait, or another ACPI method. To perform these flushes, we call
1381 1381   * x86_md_clear() at all of these transition points.
1382 1382   *
1383 1383   * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1384 1384   * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1385 1385   * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1386 1386   * a no-op.
1387 1387   *
1388 1388   * Unfortunately, with this issue hyperthreading rears its ugly head. In
1389 1389   * particular, everything we've discussed above is only valid for a single
1390 1390   * thread executing on a core. In the case where you have hyper-threading
1391 1391   * present, this attack can be performed between threads. The theoretical fix
1392 1392   * for this is to ensure that both threads are always in the same security
1393 1393   * domain. This means that they are executing in the same ring and mutually
1394 1394   * trust each other. Practically speaking, this would mean that a system call
1395 1395   * would have to issue an inter-processor interrupt (IPI) to the other thread.
1396 1396   * Rather than implement this, we recommend that one disables hyper-threading
1397 1397   * through the use of psradm -aS.
1398 1398   *
1399 1399   * TSX ASYNCHRONOUS ABORT
1400 1400   *
1401 1401   * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1402 1402   * behaves like MDS, but leverages Intel's transactional instructions as another
1403 1403   * vector. Effectively, when a transaction hits one of these cases (unmapped
1404 1404   * page, various cache snoop activity, etc.) then the same data can be exposed
1405 1405   * as in the case of MDS. This means that you can attack your twin.
1406 1406   *
1407 1407   * Intel has described that there are two different ways that we can mitigate
1408 1408   * this problem on affected processors:
1409 1409   *
1410 1410   *   1) We can use the same techniques used to deal with MDS. Flushing the
1411 1411   *      microarchitectural buffers and disabling hyperthreading will mitigate
1412 1412   *      this in the same way.
1413 1413   *
1414 1414   *   2) Using microcode to disable TSX.
1415 1415   *
1416 1416   * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1417 1417   * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1418 1418   * That's OK as we're already doing all such mitigations. On the other hand,
1419 1419   * processors with MDS_NO are all supposed to receive microcode updates that
1420 1420   * enumerate support for disabling TSX. In general, we'd rather use this method
1421 1421   * when available as it doesn't require disabling hyperthreading to be
1422 1422   * effective. Currently we are basically relying on microcode for processors
1423 1423   * that enumerate MDS_NO.
1424 1424   *
1425 1425   * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1426 1426   * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1427 1427   * different powers. The first allows us to cause all transactions to
1428 1428   * immediately abort. The second gives us a means of disabling TSX completely,
1429 1429   * which includes removing it from cpuid. If we have support for this in
1430 1430   * microcode during the first cpuid pass, then we'll disable TSX completely such
1431 1431   * that user land never has a chance to observe the bit. However, if we are late
1432 1432   * loading the microcode, then we must use the functionality to cause
1433 1433   * transactions to automatically abort. This is necessary for user land's sake.
1434 1434   * Once a program sees a cpuid bit, it must not be taken away.
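            *
            * In terms of the MSR interface, both powers live in IA32_TSX_CTRL; the
            * following sketch is illustrative and the constant names are assumptions
            * rather than a quote of cpuid_apply_tsx():
            *
            *      uint64_t val = rdmsr(MSR_IA32_TSX_CTRL);
            *      val |= IA32_TSX_CTRL_RTM_DISABLE;   // force transactions to abort
            *      val |= IA32_TSX_CTRL_CPUID_CLEAR;   // hide TSX from cpuid
            *      wrmsr(MSR_IA32_TSX_CTRL, val);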
1435 1435   *
1436 1436   * We track whether or not we should do this based on what cpuid pass we're in.
1437 1437   * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1438 1438   * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1439 1439   * should happen twice: once in the normal cpuid_pass_basic() code and then a
1440 1440   * second time after we do the initial microcode update.  As a result we need to
1441 1441   * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1442 1442   * suitable microcode on the current CPU (which happens prior to
1443 1443   * cpuid_pass_ucode()).
1444 1444   *
1445 1445   * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1446 1446   * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1447 1447   * unfortunate feature in a number of ways, and taking the opportunity to
1448 1448   * finally be able to turn it off is likely to be of benefit in the future.
1449 1449   *
1450 1450   * SUMMARY
1451 1451   *
1452 1452   * The following table attempts to summarize the mitigations for various issues
1453 1453   * and what's done in various places:
1454 1454   *
1455 1455   *  - Spectre v1: Not currently mitigated
1456 1456   *  - swapgs: lfences after swapgs paths
1457 1457   *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1458 1458   *  - Meltdown: Kernel Page Table Isolation
1459 1459   *  - Spectre v3a: Updated CPU microcode
1460 1460   *  - Spectre v4: Not currently mitigated
1461 1461   *  - SpectreRSB: SMEP and RSB Stuffing
1462 1462   *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1463 1463   *  - MDS: x86_md_clear, requires microcode, disabling SMT
1464 1464   *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1465 1465   *
1466 1466   * The following table indicates the x86 feature set bits that indicate that a
1467 1467   * given problem has been solved or a notable feature is present:
1468 1468   *
1469 1469   *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1470 1470   *  - MDS_NO: All forms of MDS
1471 1471   *  - TAA_NO: TAA
1472 1472   */
1473 1473  
1474 1474  #include <sys/types.h>
1475 1475  #include <sys/archsystm.h>
1476 1476  #include <sys/x86_archext.h>
1477 1477  #include <sys/kmem.h>
1478 1478  #include <sys/systm.h>
1479 1479  #include <sys/cmn_err.h>
1480 1480  #include <sys/sunddi.h>
1481 1481  #include <sys/sunndi.h>
1482 1482  #include <sys/cpuvar.h>
1483 1483  #include <sys/processor.h>
1484 1484  #include <sys/sysmacros.h>
1485 1485  #include <sys/pg.h>
1486 1486  #include <sys/fp.h>
1487 1487  #include <sys/controlregs.h>
1488 1488  #include <sys/bitmap.h>
1489 1489  #include <sys/auxv_386.h>
1490 1490  #include <sys/memnode.h>
1491 1491  #include <sys/pci_cfgspace.h>
1492 1492  #include <sys/comm_page.h>
1493 1493  #include <sys/mach_mmu.h>
1494 1494  #include <sys/ucode.h>
1495 1495  #include <sys/tsc.h>
1496 1496  #include <sys/kobj.h>
1497 1497  #include <sys/asm_misc.h>
1498 1498  
1499 1499  #ifdef __xpv
1500 1500  #include <sys/hypervisor.h>
1501 1501  #else
1502 1502  #include <sys/ontrap.h>
1503 1503  #endif
1504 1504  
1505 1505  uint_t x86_vendor = X86_VENDOR_IntelClone;
1506 1506  uint_t x86_type = X86_TYPE_OTHER;
1507 1507  uint_t x86_clflush_size = 0;
1508 1508  
1509 1509  #if defined(__xpv)
1510 1510  int x86_use_pcid = 0;
1511 1511  int x86_use_invpcid = 0;
1512 1512  #else
1513 1513  int x86_use_pcid = -1;
1514 1514  int x86_use_invpcid = -1;
1515 1515  #endif
1516 1516  
1517 1517  typedef enum {
1518 1518          X86_SPECTREV2_RETPOLINE,
1519 1519          X86_SPECTREV2_ENHANCED_IBRS,
1520 1520          X86_SPECTREV2_AUTO_IBRS,
1521 1521          X86_SPECTREV2_DISABLED
1522 1522  } x86_spectrev2_mitigation_t;
1523 1523  
1524 1524  uint_t x86_disable_spectrev2 = 0;
1525 1525  static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1526 1526      X86_SPECTREV2_RETPOLINE;
1527 1527  
1528 1528  /*
1529 1529   * The mitigation status for TAA:
1530 1530   * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1531 1531   * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1532 1532   * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1533 1533   * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1534 1534   * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1535 1535   * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1536 1536   */
1537 1537  typedef enum {
1538 1538          X86_TAA_NOTHING,
1539 1539          X86_TAA_DISABLED,
1540 1540          X86_TAA_MD_CLEAR,
1541 1541          X86_TAA_TSX_FORCE_ABORT,
1542 1542          X86_TAA_TSX_DISABLE,
1543 1543          X86_TAA_HW_MITIGATED
1544 1544  } x86_taa_mitigation_t;
1545 1545  
1546 1546  uint_t x86_disable_taa = 0;
1547 1547  static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1548 1548  
1549 1549  uint_t pentiumpro_bug4046376;
1550 1550  
1551 1551  uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1552 1552  
1553 1553  static char *x86_feature_names[NUM_X86_FEATURES] = {
1554 1554          "lgpg",
1555 1555          "tsc",
1556 1556          "msr",
1557 1557          "mtrr",
1558 1558          "pge",
1559 1559          "de",
1560 1560          "cmov",
1561 1561          "mmx",
1562 1562          "mca",
1563 1563          "pae",
1564 1564          "cv8",
1565 1565          "pat",
1566 1566          "sep",
1567 1567          "sse",
1568 1568          "sse2",
1569 1569          "htt",
1570 1570          "asysc",
1571 1571          "nx",
1572 1572          "sse3",
1573 1573          "cx16",
1574 1574          "cmp",
1575 1575          "tscp",
1576 1576          "mwait",
1577 1577          "sse4a",
1578 1578          "cpuid",
1579 1579          "ssse3",
1580 1580          "sse4_1",
1581 1581          "sse4_2",
1582 1582          "1gpg",
1583 1583          "clfsh",
1584 1584          "64",
1585 1585          "aes",
1586 1586          "pclmulqdq",
1587 1587          "xsave",
1588 1588          "avx",
1589 1589          "vmx",
1590 1590          "svm",
1591 1591          "topoext",
1592 1592          "f16c",
1593 1593          "rdrand",
1594 1594          "x2apic",
1595 1595          "avx2",
1596 1596          "bmi1",
1597 1597          "bmi2",
1598 1598          "fma",
1599 1599          "smep",
1600 1600          "smap",
1601 1601          "adx",
1602 1602          "rdseed",
1603 1603          "mpx",
1604 1604          "avx512f",
1605 1605          "avx512dq",
1606 1606          "avx512pf",
1607 1607          "avx512er",
1608 1608          "avx512cd",
1609 1609          "avx512bw",
1610 1610          "avx512vl",
1611 1611          "avx512fma",
1612 1612          "avx512vbmi",
1613 1613          "avx512_vpopcntdq",
1614 1614          "avx512_4vnniw",
1615 1615          "avx512_4fmaps",
1616 1616          "xsaveopt",
1617 1617          "xsavec",
1618 1618          "xsaves",
1619 1619          "sha",
1620 1620          "umip",
1621 1621          "pku",
1622 1622          "ospke",
1623 1623          "pcid",
1624 1624          "invpcid",
1625 1625          "ibrs",
1626 1626          "ibpb",
1627 1627          "stibp",
1628 1628          "ssbd",
1629 1629          "ssbd_virt",
1630 1630          "rdcl_no",
1631 1631          "ibrs_all",
1632 1632          "rsba",
1633 1633          "ssb_no",
1634 1634          "stibp_all",
1635 1635          "flush_cmd",
1636 1636          "l1d_vmentry_no",
1637 1637          "fsgsbase",
1638 1638          "clflushopt",
1639 1639          "clwb",
1640 1640          "monitorx",
1641 1641          "clzero",
1642 1642          "xop",
1643 1643          "fma4",
1644 1644          "tbm",
1645 1645          "avx512_vnni",
1646 1646          "amd_pcec",
1647 1647          "md_clear",
1648 1648          "mds_no",
1649 1649          "core_thermal",
1650 1650          "pkg_thermal",
1651 1651          "tsx_ctrl",
1652 1652          "taa_no",
1653 1653          "ppin",
1654 1654          "vaes",
1655 1655          "vpclmulqdq",
1656 1656          "lfence_serializing",
1657 1657          "gfni",
1658 1658          "avx512_vp2intersect",
1659 1659          "avx512_bitalg",
1660 1660          "avx512_vbmi2",
1661 1661          "avx512_bf16",
1662 1662          "auto_ibrs"
1663 1663  };
1664 1664  
1665 1665  boolean_t
1666 1666  is_x86_feature(void *featureset, uint_t feature)
1667 1667  {
1668 1668          ASSERT(feature < NUM_X86_FEATURES);
1669 1669          return (BT_TEST((ulong_t *)featureset, feature));
1670 1670  }
1671 1671  
1672 1672  void
1673 1673  add_x86_feature(void *featureset, uint_t feature)
1674 1674  {
1675 1675          ASSERT(feature < NUM_X86_FEATURES);
1676 1676          BT_SET((ulong_t *)featureset, feature);
1677 1677  }
1678 1678  
1679 1679  void
1680 1680  remove_x86_feature(void *featureset, uint_t feature)
1681 1681  {
1682 1682          ASSERT(feature < NUM_X86_FEATURES);
1683 1683          BT_CLEAR((ulong_t *)featureset, feature);
1684 1684  }
1685 1685  
1686 1686  boolean_t
1687 1687  compare_x86_featureset(void *setA, void *setB)
1688 1688  {
1689 1689          /*
1690 1690           * We assume that the unused bits of the bitmap are always zero.
1691 1691           */
1692 1692          if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1693 1693                  return (B_TRUE);
1694 1694          } else {
1695 1695                  return (B_FALSE);
1696 1696          }
1697 1697  }
1698 1698  
1699 1699  void
1700 1700  print_x86_featureset(void *featureset)
1701 1701  {
1702 1702          uint_t i;
1703 1703  
1704 1704          for (i = 0; i < NUM_X86_FEATURES; i++) {
1705 1705                  if (is_x86_feature(featureset, i)) {
1706 1706                          cmn_err(CE_CONT, "?x86_feature: %s\n",
1707 1707                              x86_feature_names[i]);
1708 1708                  }
1709 1709          }
1710 1710  }
1711 1711  
1712 1712  /* Note: This is the maximum size for the CPU, not the size of the structure. */
1713 1713  static size_t xsave_state_size = 0;
1714 1714  uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1715 1715  boolean_t xsave_force_disable = B_FALSE;
1716 1716  extern int disable_smap;
1717 1717  
1718 1718  /*
1719 1719   * This is set to platform type we are running on.
1720 1720   */
1721 1721  static int platform_type = -1;
1722 1722  
1723 1723  #if !defined(__xpv)
1724 1724  /*
1725 1725   * Variable to patch if hypervisor platform detection needs to be
1726 1726   * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1727 1727   */
1728 1728  int enable_platform_detection = 1;
1729 1729  #endif
1730 1730  
1731 1731  /*
1732 1732   * monitor/mwait info.
1733 1733   *
1734 1734   * size_actual and buf_actual are the real address and size allocated to get
1735 1735   * proper mwait_buf alignment.  buf_actual and size_actual should be passed
1736 1736   * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1737 1737   * processor cache-line alignment, but this is not guaranteed in the future.
1738 1738   */
1739 1739  struct mwait_info {
1740 1740          size_t          mon_min;        /* min size to avoid missed wakeups */
1741 1741          size_t          mon_max;        /* size to avoid false wakeups */
1742 1742          size_t          size_actual;    /* size actually allocated */
1743 1743          void            *buf_actual;    /* memory actually allocated */
1744 1744          uint32_t        support;        /* processor support of monitor/mwait */
1745 1745  };
1746 1746  
1747 1747  /*
1748 1748   * xsave/xrestor info.
1749 1749   *
1750 1750   * This structure contains HW feature bits and the size of the xsave save area.
1751 1751   * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1752 1752   * (xsave_state) to describe the xsave layout. However, at runtime the
1753 1753   * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1754 1754   * xsave_state structure simply represents the legacy layout of the beginning
1755 1755   * of the xsave area.
1756 1756   */
1757 1757  struct xsave_info {
1758 1758          uint32_t        xsav_hw_features_low;   /* Supported HW features */
1759 1759          uint32_t        xsav_hw_features_high;  /* Supported HW features */
1760 1760          size_t          xsav_max_size;  /* max size save area for HW features */
1761 1761          size_t          ymm_size;       /* AVX: size of ymm save area */
1762 1762          size_t          ymm_offset;     /* AVX: offset for ymm save area */
1763 1763          size_t          bndregs_size;   /* MPX: size of bndregs save area */
1764 1764          size_t          bndregs_offset; /* MPX: offset for bndregs save area */
1765 1765          size_t          bndcsr_size;    /* MPX: size of bndcsr save area */
1766 1766          size_t          bndcsr_offset;  /* MPX: offset for bndcsr save area */
1767 1767          size_t          opmask_size;    /* AVX512: size of opmask save */
1768 1768          size_t          opmask_offset;  /* AVX512: offset for opmask save */
1769 1769          size_t          zmmlo_size;     /* AVX512: size of zmm 256 save */
1770 1770          size_t          zmmlo_offset;   /* AVX512: offset for zmm 256 save */
1771 1771          size_t          zmmhi_size;     /* AVX512: size of zmm hi reg save */
1772 1772          size_t          zmmhi_offset;   /* AVX512: offset for zmm hi reg save */
1773 1773  };
1774 1774  
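           /*
            * For reference, a hedged sketch of how the sizes that feed xsav_max_size
            * can be obtained from CPUID leaf 0xD, sub-leaf 0 (per the Intel/AMD
            * definitions of that leaf; this is illustrative, not the code used below):
            *
            *      struct cpuid_regs cp = { 0 };
            *
            *      cp.cp_eax = 0xD;
            *      cp.cp_ecx = 0;
            *      (void) __cpuid_insn(&cp);
            *
            *      size_t cur_size = cp.cp_ebx;    // size for XCR0-enabled features
            *      size_t max_size = cp.cp_ecx;    // size for all supported features
            */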
1775 1775  
1776 1776  /*
1777 1777   * These constants determine how many of the elements of the
1778 1778   * cpuid we cache in the cpuid_info data structure; the
1779 1779   * remaining elements are accessible via the cpuid instruction.
1780 1780   */
1781 1781  
1782 1782  #define NMAX_CPI_STD    8               /* eax = 0 .. 7 */
1783 1783  #define NMAX_CPI_EXTD   0x22            /* eax = 0x80000000 .. 0x80000021 */
1784 1784  
1785 1785  /*
1786 1786   * See the big theory statement for a more detailed explanation of what some of
1787 1787   * these members mean.
1788 1788   */
1789 1789  struct cpuid_info {
1790 1790          uint_t cpi_pass;                /* last pass completed */
1791 1791          /*
1792 1792           * standard function information
1793 1793           */
1794 1794          uint_t cpi_maxeax;              /* fn 0: %eax */
1795 1795          char cpi_vendorstr[13];         /* fn 0: %ebx:%ecx:%edx */
1796 1796          uint_t cpi_vendor;              /* enum of cpi_vendorstr */
1797 1797  
1798 1798          uint_t cpi_family;              /* fn 1: extended family */
1799 1799          uint_t cpi_model;               /* fn 1: extended model */
1800 1800          uint_t cpi_step;                /* fn 1: stepping */
1801 1801          chipid_t cpi_chipid;            /* fn 1: %ebx:  Intel: chip # */
1802 1802                                          /*              AMD: package/socket # */
1803 1803          uint_t cpi_brandid;             /* fn 1: %ebx: brand ID */
1804 1804          int cpi_clogid;                 /* fn 1: %ebx: thread # */
1805 1805          uint_t cpi_ncpu_per_chip;       /* fn 1: %ebx: logical cpu count */
1806 1806          uint8_t cpi_cacheinfo[16];      /* fn 2: intel-style cache desc */
1807 1807          uint_t cpi_ncache;              /* fn 2: number of elements */
1808 1808          uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1809 1809          id_t cpi_last_lvl_cacheid;      /* fn 4: %eax: derived cache id */
1810 1810          uint_t cpi_cache_leaf_size;     /* Number of cache elements */
1811 1811                                          /* Intel fn: 4, AMD fn: 8000001d */
1812 1812          struct cpuid_regs **cpi_cache_leaves;   /* Actual leaves from above */
1813 1813          struct cpuid_regs cpi_std[NMAX_CPI_STD];        /* 0 .. 7 */
1814 1814          struct cpuid_regs cpi_sub7[1];  /* Leaf 7, sub-leaf 1 */
1815 1815          /*
1816 1816           * extended function information
1817 1817           */
1818 1818          uint_t cpi_xmaxeax;             /* fn 0x80000000: %eax */
1819 1819          char cpi_brandstr[49];          /* fn 0x8000000[234] */
1820 1820          uint8_t cpi_pabits;             /* fn 0x80000008: %eax */
1821 1821          uint8_t cpi_vabits;             /* fn 0x80000008: %eax */
1822 1822          uint8_t cpi_fp_amd_save;        /* AMD: FP error pointer save rqd. */
1823 1823          struct  cpuid_regs cpi_extd[NMAX_CPI_EXTD];     /* 0x800000XX */
1824 1824  
1825 1825          id_t cpi_coreid;                /* same coreid => strands share core */
1826 1826          int cpi_pkgcoreid;              /* core number within single package */
1827 1827          uint_t cpi_ncore_per_chip;      /* AMD: fn 0x80000008: %ecx[7-0] */
1828 1828                                          /* Intel: fn 4: %eax[31-26] */
1829 1829  
1830 1830          /*
1831 1831           * These values represent the number of bits that are required to store
1832 1832           * information about the number of cores and threads.
1833 1833           */
1834 1834          uint_t cpi_ncore_bits;
1835 1835          uint_t cpi_nthread_bits;
1836 1836          /*
1837 1837           * supported feature information
1838 1838           */
1839 1839          uint32_t cpi_support[6];
1840 1840  #define STD_EDX_FEATURES        0
1841 1841  #define AMD_EDX_FEATURES        1
1842 1842  #define TM_EDX_FEATURES         2
1843 1843  #define STD_ECX_FEATURES        3
1844 1844  #define AMD_ECX_FEATURES        4
1845 1845  #define STD_EBX_FEATURES        5
1846 1846          /*
1847 1847           * Synthesized information, where known.
1848 1848           */
1849 1849          x86_chiprev_t cpi_chiprev;      /* See X86_CHIPREV_* in x86_archext.h */
1850 1850          const char *cpi_chiprevstr;     /* May be NULL if chiprev unknown */
1851 1851          uint32_t cpi_socket;            /* Chip package/socket type */
1852 1852          x86_uarchrev_t cpi_uarchrev;    /* Microarchitecture and revision */
1853 1853  
1854 1854          struct mwait_info cpi_mwait;    /* fn 5: monitor/mwait info */
1855 1855          uint32_t cpi_apicid;
1856 1856          uint_t cpi_procnodeid;          /* AMD: nodeID on HT, Intel: chipid */
1857 1857          uint_t cpi_procnodes_per_pkg;   /* AMD: # of nodes in the package */
1858 1858                                          /* Intel: 1 */
1859 1859          uint_t cpi_compunitid;          /* AMD: ComputeUnit ID, Intel: coreid */
1860 1860          uint_t cpi_cores_per_compunit;  /* AMD: # of cores in the ComputeUnit */
1861 1861  
1862 1862          struct xsave_info cpi_xsave;    /* fn D: xsave/xrstor info */
1863 1863  };
1864 1864  
1865 1865  
1866 1866  static struct cpuid_info cpuid_info0;
1867 1867  
1868 1868  /*
1869 1869   * These bit fields are defined by the Intel Application Note AP-485
1870 1870   * "Intel Processor Identification and the CPUID Instruction"
1871 1871   */
1872 1872  #define CPI_FAMILY_XTD(cpi)     BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1873 1873  #define CPI_MODEL_XTD(cpi)      BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1874 1874  #define CPI_TYPE(cpi)           BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1875 1875  #define CPI_FAMILY(cpi)         BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1876 1876  #define CPI_STEP(cpi)           BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1877 1877  #define CPI_MODEL(cpi)          BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
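/*
 * Illustrative sketch, not part of the upstream code: one common way (the
 * Intel convention; AMD differs slightly) to combine the base and extended
 * family/model fields above into the "display" values described by AP-485.
 * The CPUID_ILLUSTRATION guard and the helper names are hypothetical.
 */
#ifdef CPUID_ILLUSTRATION
static uint_t
cpi_display_family(const struct cpuid_info *cpi)
{
        uint_t family = CPI_FAMILY(cpi);

        /* The extended family is only added when the base family is 0xf. */
        if (family == 0xf)
                family += CPI_FAMILY_XTD(cpi);
        return (family);
}

static uint_t
cpi_display_model(const struct cpuid_info *cpi)
{
        uint_t model = CPI_MODEL(cpi);

        /* The extended model is prepended for base families 0x6 and 0xf. */
        if (CPI_FAMILY(cpi) == 0x6 || CPI_FAMILY(cpi) == 0xf)
                model += CPI_MODEL_XTD(cpi) << 4;
        return (model);
}
#endif  /* CPUID_ILLUSTRATION */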
1878 1878  
1879 1879  #define CPI_FEATURES_EDX(cpi)           ((cpi)->cpi_std[1].cp_edx)
1880 1880  #define CPI_FEATURES_ECX(cpi)           ((cpi)->cpi_std[1].cp_ecx)
1881 1881  #define CPI_FEATURES_XTD_EDX(cpi)       ((cpi)->cpi_extd[1].cp_edx)
1882 1882  #define CPI_FEATURES_XTD_ECX(cpi)       ((cpi)->cpi_extd[1].cp_ecx)
1883 1883  #define CPI_FEATURES_7_0_EBX(cpi)       ((cpi)->cpi_std[7].cp_ebx)
1884 1884  #define CPI_FEATURES_7_0_ECX(cpi)       ((cpi)->cpi_std[7].cp_ecx)
1885 1885  #define CPI_FEATURES_7_0_EDX(cpi)       ((cpi)->cpi_std[7].cp_edx)
1886 1886  #define CPI_FEATURES_7_1_EAX(cpi)       ((cpi)->cpi_sub7[0].cp_eax)
1887 1887  
1888 1888  #define CPI_BRANDID(cpi)        BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1889 1889  #define CPI_CHUNKS(cpi)         BITX((cpi)->cpi_std[1].cp_ebx, 15, 8)
1890 1890  #define CPI_CPU_COUNT(cpi)      BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1891 1891  #define CPI_APIC_ID(cpi)        BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1892 1892  
1893 1893  #define CPI_MAXEAX_MAX          0x100           /* sanity control */
1894 1894  #define CPI_XMAXEAX_MAX         0x80000100
1895 1895  #define CPI_FN4_ECX_MAX         0x20            /* sanity: max fn 4 levels */
1896 1896  #define CPI_FNB_ECX_MAX         0x20            /* sanity: max fn B levels */
1897 1897  
1898 1898  /*
1899 1899   * Function 4 (Deterministic Cache Parameters) macros
1900 1900   * Defined by Intel Application Note AP-485
1901 1901   */
1902 1902  #define CPI_NUM_CORES(regs)             BITX((regs)->cp_eax, 31, 26)
1903 1903  #define CPI_NTHR_SHR_CACHE(regs)        BITX((regs)->cp_eax, 25, 14)
1904 1904  #define CPI_FULL_ASSOC_CACHE(regs)      BITX((regs)->cp_eax, 9, 9)
1905 1905  #define CPI_SELF_INIT_CACHE(regs)       BITX((regs)->cp_eax, 8, 8)
1906 1906  #define CPI_CACHE_LVL(regs)             BITX((regs)->cp_eax, 7, 5)
1907 1907  #define CPI_CACHE_TYPE(regs)            BITX((regs)->cp_eax, 4, 0)
1908 1908  #define CPI_CPU_LEVEL_TYPE(regs)        BITX((regs)->cp_ecx, 15, 8)
1909 1909  
1910 1910  #define CPI_CACHE_WAYS(regs)            BITX((regs)->cp_ebx, 31, 22)
1911 1911  #define CPI_CACHE_PARTS(regs)           BITX((regs)->cp_ebx, 21, 12)
1912 1912  #define CPI_CACHE_COH_LN_SZ(regs)       BITX((regs)->cp_ebx, 11, 0)
1913 1913  
1914 1914  #define CPI_CACHE_SETS(regs)            BITX((regs)->cp_ecx, 31, 0)
1915 1915  
1916 1916  #define CPI_PREFCH_STRIDE(regs)         BITX((regs)->cp_edx, 9, 0)
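/*
 * Illustrative sketch, not part of the upstream code: the ways, partitions,
 * line size, and sets fields above are each encoded as (value - 1), so the
 * size in bytes of the cache described by a leaf 4 (or AMD 0x8000001d)
 * sub-leaf is the product of the decoded fields. The CPUID_ILLUSTRATION
 * guard and the helper name are hypothetical.
 */
#ifdef CPUID_ILLUSTRATION
static uint64_t
cpi_cache_size(const struct cpuid_regs *regs)
{
        return ((uint64_t)(CPI_CACHE_WAYS(regs) + 1) *
            (CPI_CACHE_PARTS(regs) + 1) *
            (CPI_CACHE_COH_LN_SZ(regs) + 1) *
            (CPI_CACHE_SETS(regs) + 1));
}
#endif  /* CPUID_ILLUSTRATION */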
1917 1917  
1918 1918  
1919 1919  /*
1920 1920   * A couple of shorthand macros to identify "later" P6-family chips
1921 1921   * like the Pentium M and Core.  First, the "older" P6-based stuff
1922 1922   * (loosely defined as "pre-Pentium-4"):
1923 1923   * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1924 1924   */
1925 1925  #define IS_LEGACY_P6(cpi) (                     \
1926 1926          cpi->cpi_family == 6 &&                 \
1927 1927                  (cpi->cpi_model == 1 ||         \
1928 1928                  cpi->cpi_model == 3 ||          \
1929 1929                  cpi->cpi_model == 5 ||          \
1930 1930                  cpi->cpi_model == 6 ||          \
1931 1931                  cpi->cpi_model == 7 ||          \
1932 1932                  cpi->cpi_model == 8 ||          \
1933 1933                  cpi->cpi_model == 0xA ||        \
1934 1934                  cpi->cpi_model == 0xB)          \
1935 1935  )
1936 1936  
1937 1937  /* A "new F6" is everything with family 6 that's not the above */
1938 1938  #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1939 1939  
1940 1940  /* Extended family/model support */
1941 1941  #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1942 1942          cpi->cpi_family >= 0xf)
1943 1943  
1944 1944  /*
1945 1945   * Info for monitor/mwait idle loop.
1946 1946   *
1947 1947   * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1948 1948   * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1949 1949   * 2006.
1950 1950   * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1951 1951   * Documentation Updates" #33633, Rev 2.05, December 2006.
1952 1952   */
1953 1953  #define MWAIT_SUPPORT           (0x00000001)    /* mwait supported */
1954 1954  #define MWAIT_EXTENSIONS        (0x00000002)    /* extension supported */
1955 1955  #define MWAIT_ECX_INT_ENABLE    (0x00000004)    /* ecx 1 extension supported */
1956 1956  #define MWAIT_SUPPORTED(cpi)    ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1957 1957  #define MWAIT_INT_ENABLE(cpi)   ((cpi)->cpi_std[5].cp_ecx & 0x2)
1958 1958  #define MWAIT_EXTENSION(cpi)    ((cpi)->cpi_std[5].cp_ecx & 0x1)
1959 1959  #define MWAIT_SIZE_MIN(cpi)     BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1960 1960  #define MWAIT_SIZE_MAX(cpi)     BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1961 1961  /*
1962 1962   * Number of sub-cstates for a given c-state.
1963 1963   */
1964 1964  #define MWAIT_NUM_SUBC_STATES(cpi, c_state)                     \
1965 1965          BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
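/*
 * Illustrative sketch, not part of the upstream code: leaf 5 %edx packs the
 * number of sub C-states for C-state n into bits [4n + 3:4n], so the second
 * argument to MWAIT_NUM_SUBC_STATES() is a bit offset (4 * n) rather than
 * the C-state number itself. The CPUID_ILLUSTRATION guard and the helper
 * name are hypothetical.
 */
#ifdef CPUID_ILLUSTRATION
static uint_t
cpi_mwait_num_substates(const struct cpuid_info *cpi, uint_t cstate)
{
        return (MWAIT_NUM_SUBC_STATES(cpi, cstate * 4));
}
#endif  /* CPUID_ILLUSTRATION */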
1966 1966  
1967 1967  /*
1968 1968   * XSAVE leaf 0xD enumeration
1969 1969   */
1970 1970  #define CPUID_LEAFD_2_YMM_OFFSET        576
1971 1971  #define CPUID_LEAFD_2_YMM_SIZE          256
1972 1972  
1973 1973  /*
1974 1974   * Common extended leaf names to cut down on typos.
1975 1975   */
1976 1976  #define CPUID_LEAF_EXT_0                0x80000000
1977 1977  #define CPUID_LEAF_EXT_8                0x80000008
1978 1978  #define CPUID_LEAF_EXT_1d               0x8000001d
1979 1979  #define CPUID_LEAF_EXT_1e               0x8000001e
1980 1980  #define CPUID_LEAF_EXT_21               0x80000021
1981 1981  
1982 1982  /*
1983 1983   * Functions we consume from cpuid_subr.c; don't publish these in a header
1984 1984   * file to try and keep people using the expected cpuid_* interfaces.
1985 1985   */
1986 1986  extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1987 1987  extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1988 1988  extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1989 1989  extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1990 1990  extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
1991 1991  extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1992 1992  
1993 1993  /*
1994 1994   * Apply various platform-dependent restrictions where the
1995 1995   * underlying platform restrictions mean the CPU can be marked
1996 1996   * as less capable than its cpuid instruction would imply.
1997 1997   */
1998 1998  #if defined(__xpv)
1999 1999  static void
2000 2000  platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
2001 2001  {
2002 2002          switch (eax) {
2003 2003          case 1: {
2004 2004                  uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
2005 2005                      0 : CPUID_INTC_EDX_MCA;
2006 2006                  cp->cp_edx &=
2007 2007                      ~(mcamask |
2008 2008                      CPUID_INTC_EDX_PSE |
2009 2009                      CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2010 2010                      CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
2011 2011                      CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
2012 2012                      CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2013 2013                      CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
2014 2014                  break;
2015 2015          }
2016 2016  
2017 2017          case 0x80000001:
2018 2018                  cp->cp_edx &=
2019 2019                      ~(CPUID_AMD_EDX_PSE |
2020 2020                      CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2021 2021                      CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
2022 2022                      CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
2023 2023                      CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2024 2024                      CPUID_AMD_EDX_TSCP);
2025 2025                  cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
2026 2026                  break;
2027 2027          default:
2028 2028                  break;
2029 2029          }
2030 2030  
2031 2031          switch (vendor) {
2032 2032          case X86_VENDOR_Intel:
2033 2033                  switch (eax) {
2034 2034                  case 4:
2035 2035                          /*
2036 2036                           * Zero out the (ncores-per-chip - 1) field
2037 2037                           */
2038 2038                          cp->cp_eax &= 0x03ffffff;
2039 2039                          break;
2040 2040                  default:
2041 2041                          break;
2042 2042                  }
2043 2043                  break;
2044 2044          case X86_VENDOR_AMD:
2045 2045          case X86_VENDOR_HYGON:
2046 2046                  switch (eax) {
2047 2047  
2048 2048                  case 0x80000001:
2049 2049                          cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
2050 2050                          break;
2051 2051  
2052 2052                  case CPUID_LEAF_EXT_8:
2053 2053                          /*
2054 2054                           * Zero out the (ncores-per-chip - 1) field
2055 2055                           */
2056 2056                          cp->cp_ecx &= 0xffffff00;
2057 2057                          break;
2058 2058                  default:
2059 2059                          break;
2060 2060                  }
2061 2061                  break;
2062 2062          default:
2063 2063                  break;
2064 2064          }
2065 2065  }
2066 2066  #else
2067 2067  #define platform_cpuid_mangle(vendor, eax, cp)  /* nothing */
2068 2068  #endif
2069 2069  
2070 2070  /*
2071 2071   *  Some undocumented ways of patching the results of the cpuid
2072 2072   *  instruction to permit running Solaris 10 on future cpus that
2073 2073   *  we don't currently support.  Could be set to non-zero values
2074 2074   *  via settings in eeprom.
2075 2075   */
2076 2076  
2077 2077  uint32_t cpuid_feature_ecx_include;
2078 2078  uint32_t cpuid_feature_ecx_exclude;
2079 2079  uint32_t cpuid_feature_edx_include;
2080 2080  uint32_t cpuid_feature_edx_exclude;
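/*
 * Illustrative sketch, not part of the upstream code: these overrides adjust
 * the mask of leaf 1 feature bits that the kernel is willing to believe;
 * include bits are allowed through the mask and exclude bits are filtered
 * out before the mask is ANDed onto the feature words, e.g.
 * cpi_adjust_feature_mask(mask_ecx, cpuid_feature_ecx_include,
 * cpuid_feature_ecx_exclude). The CPUID_ILLUSTRATION guard and the helper
 * name are hypothetical.
 */
#ifdef CPUID_ILLUSTRATION
static uint32_t
cpi_adjust_feature_mask(uint32_t mask, uint32_t include, uint32_t exclude)
{
        return ((mask | include) & ~exclude);
}
#endif  /* CPUID_ILLUSTRATION */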
2081 2081  
2082 2082  /*
2083 2083   * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2084 2084   */
2085 2085  void
2086 2086  cpuid_alloc_space(cpu_t *cpu)
2087 2087  {
2088 2088          /*
2089 2089           * By convention, cpu0 is the boot cpu, which is set up
2090 2090           * before memory allocation is available.  All other cpus get
2091 2091           * their cpuid_info struct allocated here.
2092 2092           */
2093 2093          ASSERT(cpu->cpu_id != 0);
2094 2094          ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2095 2095          cpu->cpu_m.mcpu_cpi =
2096 2096              kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2097 2097  }
2098 2098  
2099 2099  void
2100 2100  cpuid_free_space(cpu_t *cpu)
2101 2101  {
2102 2102          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2103 2103          int i;
2104 2104  
2105 2105          ASSERT(cpi != NULL);
2106 2106          ASSERT(cpi != &cpuid_info0);
2107 2107  
2108 2108          /*
2109 2109           * Free up any cache leaf related dynamic storage. The first entry was
2110 2110           * cached from the standard cpuid storage, so we should not free it.
2111 2111           */
2112 2112          for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2113 2113                  kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2114 2114          if (cpi->cpi_cache_leaf_size > 0)
2115 2115                  kmem_free(cpi->cpi_cache_leaves,
2116 2116                      cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2117 2117  
2118 2118          kmem_free(cpi, sizeof (*cpi));
2119 2119          cpu->cpu_m.mcpu_cpi = NULL;
2120 2120  }
2121 2121  
2122 2122  #if !defined(__xpv)
2123 2123  /*
2124 2124   * Determine the type of the underlying platform. This is used to customize
2125 2125   * initialization of various subsystems (e.g. TSC). determine_platform() must
2126 2126   * only ever be called once to prevent two processors from seeing different
2127 2127   * values of platform_type. Must be called before cpuid_pass_ident(), the
2128 2128   * earliest consumer to execute; the identification pass will call
2129 2129   * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2130 2130   */
2131 2131  void
2132 2132  determine_platform(void)
2133 2133  {
2134 2134          struct cpuid_regs cp;
2135 2135          uint32_t base;
2136 2136          uint32_t regs[4];
2137 2137          char *hvstr = (char *)regs;
2138 2138  
2139 2139          ASSERT(platform_type == -1);
2140 2140  
2141 2141          platform_type = HW_NATIVE;
2142 2142  
2143 2143          if (!enable_platform_detection)
2144 2144                  return;
2145 2145  
2146 2146          /*
2147 2147           * If Hypervisor CPUID bit is set, try to determine hypervisor
2148 2148           * vendor signature, and set platform type accordingly.
2149 2149           *
2150 2150           * References:
2151 2151           * http://lkml.org/lkml/2008/10/1/246
2152 2152           * http://kb.vmware.com/kb/1009458
2153 2153           */
2154 2154          cp.cp_eax = 0x1;
2155 2155          (void) __cpuid_insn(&cp);
2156 2156          if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2157 2157                  cp.cp_eax = 0x40000000;
2158 2158                  (void) __cpuid_insn(&cp);
2159 2159                  regs[0] = cp.cp_ebx;
2160 2160                  regs[1] = cp.cp_ecx;
2161 2161                  regs[2] = cp.cp_edx;
2162 2162                  regs[3] = 0;
2163 2163                  if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2164 2164                          platform_type = HW_XEN_HVM;
2165 2165                          return;
2166 2166                  }
2167 2167                  if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2168 2168                          platform_type = HW_VMWARE;
2169 2169                          return;
2170 2170                  }
2171 2171                  if (strcmp(hvstr, HVSIG_KVM) == 0) {
2172 2172                          platform_type = HW_KVM;
2173 2173                          return;
2174 2174                  }
2175 2175                  if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2176 2176                          platform_type = HW_BHYVE;
2177 2177                          return;
2178 2178                  }
2179 2179                  if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
2180 2180                          platform_type = HW_MICROSOFT;
2181 2181          } else {
2182 2182                  /*
2183 2183                   * Check older VMware hardware versions. VMware hypervisor is
2184 2184                   * detected by performing an IN operation to VMware hypervisor
2185 2185                   * port and checking that value returned in %ebx is VMware
2186 2186                   * hypervisor magic value.
2187 2187                   *
2188 2188                   * References: http://kb.vmware.com/kb/1009458
2189 2189                   */
2190 2190                  vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2191 2191                  if (regs[1] == VMWARE_HVMAGIC) {
2192 2192                          platform_type = HW_VMWARE;
2193 2193                          return;
2194 2194                  }
2195 2195          }
2196 2196  
2197 2197          /*
2198 2198           * Check Xen hypervisor. In a fully virtualized domain,
2199 2199           * Xen's pseudo-cpuid function returns a string representing the
2200 2200           * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2201 2201           * supported cpuid function. We need at least a (base + 2) leaf value
2202 2202           * to do what we want to do. Try different base values, since the
2203 2203           * hypervisor might use a different one depending on whether Hyper-V
2204 2204           * emulation is switched on by default or not.
2205 2205           */
2206 2206          for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2207 2207                  cp.cp_eax = base;
2208 2208                  (void) __cpuid_insn(&cp);
2209 2209                  regs[0] = cp.cp_ebx;
2210 2210                  regs[1] = cp.cp_ecx;
2211 2211                  regs[2] = cp.cp_edx;
2212 2212                  regs[3] = 0;
2213 2213                  if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2214 2214                      cp.cp_eax >= (base + 2)) {
2215 2215                          platform_type &= ~HW_NATIVE;
2216 2216                          platform_type |= HW_XEN_HVM;
2217 2217                          return;
2218 2218                  }
2219 2219          }
2220 2220  }
2221 2221  
2222 2222  int
2223 2223  get_hwenv(void)
2224 2224  {
2225 2225          ASSERT(platform_type != -1);
2226 2226          return (platform_type);
2227 2227  }
2228 2228  
2229 2229  int
2230 2230  is_controldom(void)
2231 2231  {
2232 2232          return (0);
2233 2233  }
2234 2234  
2235 2235  #else
2236 2236  
2237 2237  int
2238 2238  get_hwenv(void)
2239 2239  {
2240 2240          return (HW_XEN_PV);
2241 2241  }
2242 2242  
2243 2243  int
2244 2244  is_controldom(void)
2245 2245  {
2246 2246          return (DOMAIN_IS_INITDOMAIN(xen_info));
2247 2247  }
2248 2248  
2249 2249  #endif  /* __xpv */
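/*
 * Illustrative usage sketch, not part of the upstream code: once
 * determine_platform() has run, callers can test the platform type through
 * get_hwenv(), for example to special-case a particular hypervisor. The
 * CPUID_ILLUSTRATION guard and the helper name are hypothetical.
 */
#ifdef CPUID_ILLUSTRATION
static boolean_t
cpi_running_on_kvm(void)
{
        return ((get_hwenv() & HW_KVM) != 0);
}
#endif  /* CPUID_ILLUSTRATION */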
2250 2250  
2251 2251  /*
2252 2252   * Make sure that we have gathered all of the CPUID leaves that we might need to
2253 2253   * determine topology. We assume that the standard leaf 1 has already been done
2254 2254   * and that xmaxeax has already been calculated.
2255 2255   */
2256 2256  static void
2257 2257  cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2258 2258  {
2259 2259          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2260 2260  
2261 2261          if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2262 2262                  struct cpuid_regs *cp;
2263 2263  
2264 2264                  cp = &cpi->cpi_extd[8];
2265 2265                  cp->cp_eax = CPUID_LEAF_EXT_8;
2266 2266                  (void) __cpuid_insn(cp);
2267 2267                  platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2268 2268          }
2269 2269  
2270 2270          if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2271 2271              cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2272 2272                  struct cpuid_regs *cp;
2273 2273  
2274 2274                  cp = &cpi->cpi_extd[0x1e];
2275 2275                  cp->cp_eax = CPUID_LEAF_EXT_1e;
2276 2276                  (void) __cpuid_insn(cp);
2277 2277          }
2278 2278  }
2279 2279  
2280 2280  /*
2281 2281   * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2282 2282   * it to everything else. If not, and we're on an AMD system where 8000001e is
2283 2283   * valid, then we use that. Otherwise, we fall back to the default value for the
2284 2284   * APIC ID in leaf 1.
2285 2285   */
2286 2286  static uint32_t
2287 2287  cpuid_gather_apicid(struct cpuid_info *cpi)
2288 2288  {
2289 2289          /*
2290 2290           * Leaf B changes based on the arguments to it. Because we don't cache
2291 2291           * it, we need to gather it again.
2292 2292           */
2293 2293          if (cpi->cpi_maxeax >= 0xB) {
2294 2294                  struct cpuid_regs regs;
2295 2295                  struct cpuid_regs *cp;
2296 2296  
2297 2297                  cp = &regs;
2298 2298                  cp->cp_eax = 0xB;
2299 2299                  cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2300 2300                  (void) __cpuid_insn(cp);
2301 2301  
2302 2302                  if (cp->cp_ebx != 0) {
2303 2303                          return (cp->cp_edx);
2304 2304                  }
2305 2305          }
2306 2306  
2307 2307          if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2308 2308              cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2309 2309              is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2310 2310              cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2311 2311                  return (cpi->cpi_extd[0x1e].cp_eax);
2312 2312          }
2313 2313  
2314 2314          return (CPI_APIC_ID(cpi));
2315 2315  }
2316 2316  
2317 2317  /*
2318 2318   * For AMD processors, attempt to calculate the number of chips and cores that
2319 2319   * exist. The way that we do this varies based on the generation, because the
2320 2320   * generations themselves have changed dramatically.
2321 2321   *
2322 2322   * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2323 2323   * However, with the advent of family 17h (Zen) it actually tells us the number
2324 2324   * of threads, so we need to look at leaf 0x8000001e if available to determine
2325 2325   * its value. Otherwise, for all prior families, the number of enabled cores is
2326 2326   * the same as threads.
2327 2327   *
2328 2328   * If we do not have leaf 0x80000008, then we assume that this processor does
2329 2329   * not have anything. AMD's older CPUID specification says there's no reason to
2330 2330   * fall back to leaf 1.
2331 2331   *
2332 2332   * In some virtualization cases we will not have leaf 8000001e or it will be
2333 2333   * zero. When that happens we assume the number of threads is one.
2334 2334   */
2335 2335  static void
2336 2336  cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2337 2337  {
2338 2338          uint_t nthreads, nthread_per_core;
2339 2339  
2340 2340          nthreads = nthread_per_core = 1;
2341 2341  
2342 2342          if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2343 2343                  nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2344 2344          } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2345 2345                  nthreads = CPI_CPU_COUNT(cpi);
2346 2346          }
2347 2347  
2348 2348          /*
2349 2349           * For us to have threads, and know about it, we have to be at least at
2350 2350           * family 17h and have the cpuid bit that says we have extended
2351 2351           * topology.
2352 2352           */
2353 2353          if (cpi->cpi_family >= 0x17 &&
2354 2354              is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2355 2355              cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2356 2356                  nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2357 2357          }
2358 2358  
2359 2359          *ncpus = nthreads;
2360 2360          *ncores = nthreads / nthread_per_core;
2361 2361  }
2362 2362  
2363 2363  /*
2364 2364   * Seed the initial values for the cores and threads for an Intel based
2365 2365   * processor. These values will be overwritten if we detect that the processor
2366 2366   * supports CPUID leaf 0xb.
2367 2367   */
2368 2368  static void
2369 2369  cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2370 2370  {
2371 2371          /*
2372 2372           * Only seed the number of physical cores from the first level leaf 4
2373 2373           * information. The number of threads there indicate how many share the
2374 2374           * L1 cache, which may or may not have anything to do with the number of
2375 2375           * logical CPUs per core.
2376 2376           */
2377 2377          if (cpi->cpi_maxeax >= 4) {
2378 2378                  *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2379 2379          } else {
2380 2380                  *ncores = 1;
2381 2381          }
2382 2382  
2383 2383          if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2384 2384                  *ncpus = CPI_CPU_COUNT(cpi);
2385 2385          } else {
2386 2386                  *ncpus = *ncores;
2387 2387          }
2388 2388  }
2389 2389  
2390 2390  static boolean_t
2391 2391  cpuid_leafB_getids(cpu_t *cpu)
2392 2392  {
2393 2393          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2394 2394          struct cpuid_regs regs;
2395 2395          struct cpuid_regs *cp;
2396 2396  
2397 2397          if (cpi->cpi_maxeax < 0xB)
2398 2398                  return (B_FALSE);
2399 2399  
2400 2400          cp = &regs;
2401 2401          cp->cp_eax = 0xB;
2402 2402          cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2403 2403  
2404 2404          (void) __cpuid_insn(cp);
2405 2405  
2406 2406          /*
2407 2407           * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2408 2408           * indicates that the extended topology enumeration leaf is
2409 2409           * available.
2410 2410           */
2411 2411          if (cp->cp_ebx != 0) {
2412 2412                  uint32_t x2apic_id = 0;
2413 2413                  uint_t coreid_shift = 0;
2414 2414                  uint_t ncpu_per_core = 1;
2415 2415                  uint_t chipid_shift = 0;
2416 2416                  uint_t ncpu_per_chip = 1;
2417 2417                  uint_t i;
2418 2418                  uint_t level;
2419 2419  
2420 2420                  for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2421 2421                          cp->cp_eax = 0xB;
2422 2422                          cp->cp_ecx = i;
2423 2423  
2424 2424                          (void) __cpuid_insn(cp);
2425 2425                          level = CPI_CPU_LEVEL_TYPE(cp);
2426 2426  
2427 2427                          if (level == 1) {
2428 2428                                  x2apic_id = cp->cp_edx;
2429 2429                                  coreid_shift = BITX(cp->cp_eax, 4, 0);
2430 2430                                  ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2431 2431                          } else if (level == 2) {
2432 2432                                  x2apic_id = cp->cp_edx;
2433 2433                                  chipid_shift = BITX(cp->cp_eax, 4, 0);
2434 2434                                  ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2435 2435                          }
2436 2436                  }
2437 2437  
2438 2438                  /*
2439 2439                   * cpi_apicid is taken care of in cpuid_gather_apicid.
2440 2440                   */
2441 2441                  cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2442 2442                  cpi->cpi_ncore_per_chip = ncpu_per_chip /
2443 2443                      ncpu_per_core;
2444 2444                  cpi->cpi_chipid = x2apic_id >> chipid_shift;
2445 2445                  cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2446 2446                  cpi->cpi_coreid = x2apic_id >> coreid_shift;
2447 2447                  cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2448 2448                  cpi->cpi_procnodeid = cpi->cpi_chipid;
2449 2449                  cpi->cpi_compunitid = cpi->cpi_coreid;
2450 2450  
2451 2451                  if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2452 2452                          cpi->cpi_nthread_bits = coreid_shift;
2453 2453                          cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2454 2454                  }
2455 2455  
2456 2456                  return (B_TRUE);
2457 2457          } else {
2458 2458                  return (B_FALSE);
2459 2459          }
2460 2460  }
2461 2461  
2462 2462  static void
2463 2463  cpuid_intel_getids(cpu_t *cpu, void *feature)
2464 2464  {
2465 2465          uint_t i;
2466 2466          uint_t chipid_shift = 0;
2467 2467          uint_t coreid_shift = 0;
2468 2468          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2469 2469  
2470 2470          /*
2471 2471           * There are no compute units or processor nodes currently on Intel.
2472 2472           * Always set these to one.
2473 2473           */
2474 2474          cpi->cpi_procnodes_per_pkg = 1;
2475 2475          cpi->cpi_cores_per_compunit = 1;
2476 2476  
2477 2477          /*
2478 2478           * If cpuid Leaf B is present, use that to try and get this information.
2479 2479           * It will be the most accurate for Intel CPUs.
2480 2480           */
2481 2481          if (cpuid_leafB_getids(cpu))
2482 2482                  return;
2483 2483  
2484 2484          /*
2485 2485           * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2486 2486           * and ncore_per_chip. These represent the largest power of two values
2487 2487           * that we need to cover all of the IDs in the system. Therefore, we use
2488 2488           * those values to seed the number of bits needed to cover information
2489 2489           * in the case when leaf B is not available. These values will probably
2490 2490           * be larger than required, but that's OK.
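         * (For example, ddi_fls(8) is 4, one more bit than the 3 strictly
         * needed to cover eight IDs.)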
2491 2491           */
2492 2492          cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2493 2493          cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2494 2494  
2495 2495          for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2496 2496                  chipid_shift++;
2497 2497  
2498 2498          cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2499 2499          cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2500 2500  
2501 2501          if (is_x86_feature(feature, X86FSET_CMP)) {
2502 2502                  /*
2503 2503                   * Multi-core (and possibly multi-threaded)
2504 2504                   * processors.
2505 2505                   */
2506 2506                  uint_t ncpu_per_core = 0;
2507 2507  
2508 2508                  if (cpi->cpi_ncore_per_chip == 1)
2509 2509                          ncpu_per_core = cpi->cpi_ncpu_per_chip;
2510 2510                  else if (cpi->cpi_ncore_per_chip > 1)
2511 2511                          ncpu_per_core = cpi->cpi_ncpu_per_chip /
2512 2512                              cpi->cpi_ncore_per_chip;
2513 2513                  /*
2514 2514                   * 8bit APIC IDs on dual core Pentiums
2515 2515                   * look like this:
2516 2516                   *
2517 2517                   * +-----------------------+------+------+
2518 2518                   * | Physical Package ID   |  MC  |  HT  |
2519 2519                   * +-----------------------+------+------+
2520 2520                   * <------- chipid -------->
2521 2521                   * <------- coreid --------------->
2522 2522                   *                         <--- clogid -->
2523 2523                   *                         <------>
2524 2524                   *                         pkgcoreid
2525 2525                   *
2526 2526                   * Where the number of bits necessary to
2527 2527                   * represent MC and HT fields together equals
2528 2528                   * to the minimum number of bits necessary to
2529 2529                   * store the value of cpi->cpi_ncpu_per_chip.
2530 2530                   * Of those bits, the MC part uses the number
2531 2531                   * of bits necessary to store the value of
2532 2532                   * cpi->cpi_ncore_per_chip.
2533 2533                   */
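                /*
                 * Worked example (illustrative): with cpi_ncpu_per_chip = 4
                 * and cpi_ncore_per_chip = 2, chipid_shift is 2 and
                 * coreid_shift below ends up as 1, so APIC ID 0b0110
                 * decomposes into chipid 0b01, clogid 0b10, coreid 0b011
                 * and pkgcoreid 0b1.
                 */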
2534 2534                  for (i = 1; i < ncpu_per_core; i <<= 1)
2535 2535                          coreid_shift++;
2536 2536                  cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2537 2537                  cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2538 2538          } else if (is_x86_feature(feature, X86FSET_HTT)) {
2539 2539                  /*
2540 2540                   * Single-core multi-threaded processors.
2541 2541                   */
2542 2542                  cpi->cpi_coreid = cpi->cpi_chipid;
2543 2543                  cpi->cpi_pkgcoreid = 0;
2544 2544          } else {
2545 2545                  /*
2546 2546                   * Single-core single-thread processors.
2547 2547                   */
2548 2548                  cpi->cpi_coreid = cpu->cpu_id;
2549 2549                  cpi->cpi_pkgcoreid = 0;
2550 2550          }
2551 2551          cpi->cpi_procnodeid = cpi->cpi_chipid;
2552 2552          cpi->cpi_compunitid = cpi->cpi_coreid;
2553 2553  }
2554 2554  
2555 2555  /*
2556 2556   * Historically, AMD has had CMP chips with only a single thread per core.
2557 2557   * However, starting in family 17h (Zen), this has changed and they now have
2558 2558   * multiple threads. Our internal core id needs to be a unique value.
2559 2559   *
2560 2560   * To determine the core id of an AMD system, if we're from a family before 17h,
2561 2561   * then we just use the cpu id, as that gives us a good value that will be
2562 2562   * unique for each core. If instead, we're on family 17h or later, then we need
2563 2563   * to do something more complicated. CPUID leaf 0x8000001e can tell us
2564 2564   * how many threads are in the system. Based on that, we'll shift the APIC ID.
2565 2565   * We can't use the normal core id in that leaf as it's only unique within the
2566 2566   * socket, which is perfect for cpi_pkgcoreid, but not us.
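 * (For example, on a part with two threads per core, APIC IDs 8 and 9, the
 * two threads of one core, both shift down to core id 4.)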
2567 2567   */
2568 2568  static id_t
2569 2569  cpuid_amd_get_coreid(cpu_t *cpu)
2570 2570  {
2571 2571          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2572 2572  
2573 2573          if (cpi->cpi_family >= 0x17 &&
2574 2574              is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2575 2575              cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2576 2576                  uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2577 2577                  if (nthreads > 1) {
2578 2578                          VERIFY3U(nthreads, ==, 2);
2579 2579                          return (cpi->cpi_apicid >> 1);
2580 2580                  }
2581 2581          }
2582 2582  
2583 2583          return (cpu->cpu_id);
2584 2584  }
2585 2585  
2586 2586  /*
2587 2587   * Determining IDs on AMD is a more challenging task. This is notable because
2588 2588   * of the following two facts:
2589 2589   *
2590 2590   *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2591 2591   *     also no way to get an actual unique core id from the system. As such, we
2592 2592   *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2593 2593   *     however, guarantee that sibling cores of a chip will have sequential
2594 2594   *     coreids starting at a multiple of the number of cores per chip - that is
2595 2595   *     usually the case, but if the APIC IDs have been set up in a different
2596 2596   *     order then we need to perform a few more gymnastics for the pkgcoreid.
2597 2597   *
2598 2598   *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2599 2599   *     called compute units. These compute units share the L1I cache, L2 cache,
2600 2600   *     and the FPU. To deal with this, a new topology leaf was added in
2601 2601   *     0x8000001e. However, parts of this leaf have different meanings
2602 2602   *     once we get to family 0x17.
2603 2603   */
2604 2604  
2605 2605  static void
2606 2606  cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2607 2607  {
2608 2608          int i, first_half, coreidsz;
2609 2609          uint32_t nb_caps_reg;
2610 2610          uint_t node2_1;
2611 2611          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2612 2612          struct cpuid_regs *cp;
2613 2613  
2614 2614          /*
2615 2615           * Calculate the core id (this comes from hardware in family 0x17 if it
2616 2616           * hasn't been stripped by virtualization). We always set the compute
2617 2617           * unit id to the same value. Also, initialize the default number of
2618 2618           * cores per compute unit and nodes per package. This will be
2619 2619           * overwritten when we know information about a particular family.
2620 2620           */
2621 2621          cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2622 2622          cpi->cpi_compunitid = cpi->cpi_coreid;
2623 2623          cpi->cpi_cores_per_compunit = 1;
2624 2624          cpi->cpi_procnodes_per_pkg = 1;
2625 2625  
2626 2626          /*
2627 2627           * To construct the logical ID, we need to determine how many APIC IDs
2628 2628           * are dedicated to the cores and threads. This is provided for us in
2629 2629           * 0x80000008. However, if it's not present (say due to virtualization),
2630 2630           * then we assume it's one. This should be present on all 64-bit AMD
2631 2631           * processors.  It was added in family 0xf (Hammer).
2632 2632           */
2633 2633          if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2634 2634                  coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2635 2635  
2636 2636                  /*
2637 2637                   * In AMD parlance chip is really a node while illumos
2638 2638                   * uses chip as equivalent to socket/package.
2639 2639                   */
2640 2640                  if (coreidsz == 0) {
2641 2641                          /* Use legacy method */
2642 2642                          for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2643 2643                                  coreidsz++;
2644 2644                          if (coreidsz == 0)
2645 2645                                  coreidsz = 1;
2646 2646                  }
2647 2647          } else {
2648 2648                  /* Assume single-core part */
2649 2649                  coreidsz = 1;
2650 2650          }
2651 2651          cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2652 2652  
2653 2653          /*
2654 2654           * The package core ID varies depending on the family. While it may be
2655 2655           * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2656 2656           * this value is the core id in the given node. For non-virtualized
2657 2657           * family 17h, we need to take the logical core id and shift off the
2658 2658           * threads like we do when getting the core id.  Otherwise, we can use
2659 2659           * the clogid as is. When family 17h is virtualized, the clogid should
2660 2660           * be sufficient as if we don't have valid data in the leaf, then we
2661 2661           * won't think we have SMT, in which case the cpi_clogid should be
2662 2662           * sufficient.
2663 2663           */
2664 2664          if (cpi->cpi_family >= 0x17 &&
2665 2665              is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2666 2666              cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2667 2667              cpi->cpi_extd[0x1e].cp_ebx != 0) {
2668 2668                  uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2669 2669                  if (nthreads > 1) {
2670 2670                          VERIFY3U(nthreads, ==, 2);
2671 2671                          cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2672 2672                  } else {
2673 2673                          cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2674 2674                  }
2675 2675          } else {
2676 2676                  cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2677 2677          }
2678 2678  
2679 2679          /*
2680 2680           * Obtain the node ID and compute unit IDs. If we're on family 0x15
2681 2681           * (bulldozer) or newer, then we can derive all of this from leaf
2682 2682           * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2683 2683           */
2684 2684          if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2685 2685              cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2686 2686                  cp = &cpi->cpi_extd[0x1e];
2687 2687  
2688 2688                  cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2689 2689                  cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2690 2690  
2691 2691                  /*
2692 2692                   * For Bulldozer-era CPUs, recalculate the compute unit
2693 2693                   * information.
2694 2694                   */
2695 2695                  if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2696 2696                          cpi->cpi_cores_per_compunit =
2697 2697                              BITX(cp->cp_ebx, 15, 8) + 1;
2698 2698                          cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2699 2699                              (cpi->cpi_ncore_per_chip /
2700 2700                              cpi->cpi_cores_per_compunit) *
2701 2701                              (cpi->cpi_procnodeid /
2702 2702                              cpi->cpi_procnodes_per_pkg);
2703 2703                  }
2704 2704          } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2705 2705                  cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2706 2706          } else if (cpi->cpi_family == 0x10) {
2707 2707                  /*
2708 2708                   * See if we are a multi-node processor.
2709 2709                   * All processors in the system have the same number of nodes
2710 2710                   */
2711 2711                  nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2712 2712                  if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2713 2713                          /* Single-node */
2714 2714                          cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2715 2715                              coreidsz);
2716 2716                  } else {
2717 2717  
2718 2718                          /*
2719 2719                           * Multi-node revision D (2 nodes per package
2720 2720                           * are supported)
2721 2721                           */
2722 2722                          cpi->cpi_procnodes_per_pkg = 2;
2723 2723  
2724 2724                          first_half = (cpi->cpi_pkgcoreid <=
2725 2725                              (cpi->cpi_ncore_per_chip/2 - 1));
2726 2726  
2727 2727                          if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2728 2728                                  /* We are BSP */
2729 2729                                  cpi->cpi_procnodeid = (first_half ? 0 : 1);
2730 2730                          } else {
2731 2731  
2732 2732                                  /* We are AP */
2733 2733                                  /* NodeId[2:1] bits to use for reading F3xe8 */
2734 2734                                  node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2735 2735  
2736 2736                                  nb_caps_reg =
2737 2737                                      pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2738 2738  
2739 2739                                  /*
2740 2740                                   * Check IntNodeNum bit (31:30, but bit 31 is
2741 2741                                   * always 0 on dual-node processors)
2742 2742                                   */
2743 2743                                  if (BITX(nb_caps_reg, 30, 30) == 0)
2744 2744                                          cpi->cpi_procnodeid = node2_1 +
2745 2745                                              !first_half;
2746 2746                                  else
2747 2747                                          cpi->cpi_procnodeid = node2_1 +
2748 2748                                              first_half;
2749 2749                          }
2750 2750                  }
2751 2751          } else {
2752 2752                  cpi->cpi_procnodeid = 0;
2753 2753          }
2754 2754  
2755 2755          cpi->cpi_chipid =
2756 2756              cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2757 2757  
2758 2758          cpi->cpi_ncore_bits = coreidsz;
2759 2759          cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2760 2760              cpi->cpi_ncore_per_chip);
2761 2761  }
2762 2762  
2763 2763  static void
2764 2764  spec_uarch_flush_noop(void)
2765 2765  {
2766 2766  }
2767 2767  
2768 2768  /*
2769 2769   * When microcode is present that mitigates MDS, this wrmsr will also flush the
2770 2770   * MDS-related micro-architectural state that would normally happen by calling
2771 2771   * x86_md_clear().
2772 2772   */
2773 2773  static void
2774 2774  spec_uarch_flush_msr(void)
2775 2775  {
2776 2776          wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2777 2777  }
2778 2778  
2779 2779  /*
2780 2780   * This function pointer points to a function that will flush certain
2781 2781   * micro-architectural state on the processor. This flush is used to mitigate
2782 2782   * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2783 2783   * function can point to one of three functions:
2784 2784   *
2785 2785   * - A noop, which is used either because we are vulnerable but do not have
2786 2786   *   microcode available to help deal with a fix, or because we aren't
2787 2787   *   vulnerable.
2788 2788   *
2789 2789   * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2790 2790   *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2791 2791   *   however, it only flushes the MDS related micro-architectural state on the
2792 2792   *   current hyperthread, it does not do anything for the twin.
2793 2793   *
2794 2794   * - x86_md_clear which will flush the MDS related state. This is done when we
2795 2795   *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2796 2796   *   (RDCL_NO is set).
2797 2797   */
2798 2798  void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2799 2799  
2800 2800  static void
2801 2801  cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2802 2802  {
2803 2803          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2804 2804  
2805 2805          /*
2806 2806           * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2807 2807           * has been fixed in hardware, it doesn't cover everything related to
2808 2808           * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2809 2809           * need to mitigate this.
2810 2810           */
2811 2811          if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2812 2812              is_x86_feature(featureset, X86FSET_MDS_NO)) {
2813 2813                  return;
2814 2814          }
2815 2815  
2816 2816          if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2817 2817                  const uint8_t nop = NOP_INSTR;
2818 2818                  uint8_t *md = (uint8_t *)x86_md_clear;
2819 2819  
2820 2820                  *md = nop;
2821 2821          }
2822 2822  
2823 2823          membar_producer();
2824 2824  }
2825 2825  
2826 2826  static void
2827 2827  cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2828 2828  {
2829 2829          boolean_t need_l1d, need_mds;
2830 2830          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2831 2831  
2832 2832          /*
2833 2833           * If we're not on Intel or we've mitigated both RDCL and MDS in
2834 2834           * hardware, then there's nothing left for us to do for enabling the
2835 2835           * flush. We can also go ahead and say that SMT exclusion is
2836 2836           * unnecessary.
2837 2837           */
2838 2838          if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2839 2839              (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2840 2840              is_x86_feature(featureset, X86FSET_MDS_NO))) {
2841 2841                  extern int smt_exclusion;
2842 2842                  smt_exclusion = 0;
2843 2843                  spec_uarch_flush = spec_uarch_flush_noop;
2844 2844                  membar_producer();
2845 2845                  return;
2846 2846          }
2847 2847  
2848 2848          /*
2849 2849           * The locations where we need to perform an L1D flush are required both
2850 2850           * for mitigating L1TF and MDS. When verw support is present in
2851 2851           * microcode, then the L1D flush will take care of doing that as well.
2852 2852           * However, if we have a system where RDCL_NO is present, but we don't
2853 2853           * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2854 2854           * L1D flush.
2855 2855           */
2856 2856          if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2857 2857              is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2858 2858              !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2859 2859                  need_l1d = B_TRUE;
2860 2860          } else {
2861 2861                  need_l1d = B_FALSE;
2862 2862          }
2863 2863  
2864 2864          if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2865 2865              is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2866 2866                  need_mds = B_TRUE;
2867 2867          } else {
2868 2868                  need_mds = B_FALSE;
2869 2869          }
2870 2870  
2871 2871          if (need_l1d) {
2872 2872                  spec_uarch_flush = spec_uarch_flush_msr;
2873 2873          } else if (need_mds) {
2874 2874                  spec_uarch_flush = x86_md_clear;
2875 2875          } else {
2876 2876                  /*
2877 2877                   * We have no hardware mitigations available to us.
2878 2878                   */
2879 2879                  spec_uarch_flush = spec_uarch_flush_noop;
2880 2880          }
2881 2881          membar_producer();
2882 2882  }
2883 2883  
2884 2884  /*
2885 2885   * We default to enabling RSB mitigations.
2886 2886   *
2887 2887   * NOTE: We used to skip RSB mitigations with eIBRS, but developments around
2888 2888   * post-barrier RSB guessing suggests we should enable RSB mitigations always
2889 2889   * post-barrier RSB guessing suggest we should enable RSB mitigations always
2890 2890   *
2891 2891   * AMD indicates that when Automatic IBRS is enabled we do not need to implement
2892 2892   * return stack buffer clearing for VMEXIT as it takes care of it. The manual
2893 2893   * also states that as long as SMEP is enabled and we maintain at least one
2894 2894   * page between the kernel and user space (we have much more of a red zone),
2895 2895   * then we do not need to clear the RSB. We constrain this to only when
2896 2896   * Automatic IBRS is present.
2897 2897   */
2898 2898  static void
2899 2899  cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2900 2900  {
2901 2901          const uint8_t ret = RET_INSTR;
2902 2902          uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2903 2903  
2904 2904          switch (mit) {
2905 2905          case X86_SPECTREV2_AUTO_IBRS:
2906 2906          case X86_SPECTREV2_DISABLED:
2907 2907                  *stuff = ret;
2908 2908                  break;
2909 2909          default:
2910 2910                  break;
2911 2911          }
2912 2912  }
2913 2913  
2914 2914  static void
2915 2915  cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2916 2916  {
2917 2917          const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2918 2918              "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2919 2919              "_r14", "_r15" };
2920 2920          const uint_t nthunks = ARRAY_SIZE(thunks);
2921 2921          const char *type;
2922 2922          uint_t i;
2923 2923  
2924 2924          if (mit == x86_spectrev2_mitigation)
2925 2925                  return;
2926 2926  
2927 2927          switch (mit) {
2928 2928          case X86_SPECTREV2_RETPOLINE:
2929 2929                  type = "gen";
2930 2930                  break;
2931 2931          case X86_SPECTREV2_AUTO_IBRS:
2932 2932          case X86_SPECTREV2_ENHANCED_IBRS:
2933 2933          case X86_SPECTREV2_DISABLED:
2934 2934                  type = "jmp";
2935 2935                  break;
2936 2936          default:
2937 2937                  panic("asked to update retpoline state with unknown state!");
2938 2938          }
2939 2939  
2940 2940          for (i = 0; i < nthunks; i++) {
2941 2941                  uintptr_t source, dest;
2942 2942                  int ssize, dsize;
2943 2943                  char sourcebuf[64], destbuf[64];
2944 2944  
2945 2945                  (void) snprintf(destbuf, sizeof (destbuf),
2946 2946                      "__x86_indirect_thunk%s", thunks[i]);
2947 2947                  (void) snprintf(sourcebuf, sizeof (sourcebuf),
2948 2948                      "__x86_indirect_thunk_%s%s", type, thunks[i]);
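                /*
                 * For example, for the "_rax" thunk with type "gen" this
                 * yields sourcebuf "__x86_indirect_thunk_gen_rax" and
                 * destbuf "__x86_indirect_thunk_rax"; the chosen thunk body
                 * is then copied over the plain thunk below.
                 */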
2949 2949  
2950 2950                  source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2951 2951                  dest = kobj_getelfsym(destbuf, NULL, &dsize);
2952 2952                  VERIFY3U(source, !=, 0);
2953 2953                  VERIFY3U(dest, !=, 0);
2954 2954                  VERIFY3S(dsize, >=, ssize);
2955 2955                  bcopy((void *)source, (void *)dest, ssize);
2956 2956          }
2957 2957  }
2958 2958  
2959 2959  static void
2960 2960  cpuid_enable_enhanced_ibrs(void)
2961 2961  {
2962 2962          uint64_t val;
2963 2963  
2964 2964          val = rdmsr(MSR_IA32_SPEC_CTRL);
2965 2965          val |= IA32_SPEC_CTRL_IBRS;
2966 2966          wrmsr(MSR_IA32_SPEC_CTRL, val);
2967 2967  }
2968 2968  
2969 2969  static void
2970 2970  cpuid_enable_auto_ibrs(void)
2971 2971  {
2972 2972          uint64_t val;
2973 2973  
2974 2974          val = rdmsr(MSR_AMD_EFER);
2975 2975          val |= AMD_EFER_AIBRSE;
2976 2976          wrmsr(MSR_AMD_EFER, val);
2977 2977  }
2978 2978  
2979 2979  /*
2980 2980   * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2981 2981   * we can disable TSX, we do so.
2982 2982   *
2983 2983   * This determination is done only on the boot CPU, potentially after loading
2984 2984   * updated microcode.
2985 2985   */
2986 2986  static void
2987 2987  cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2988 2988  {
2989 2989          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2990 2990  
2991 2991          VERIFY(cpu->cpu_id == 0);
2992 2992  
2993 2993          if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2994 2994                  x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2995 2995                  return;
2996 2996          }
2997 2997  
2998 2998          if (x86_disable_taa) {
2999 2999                  x86_taa_mitigation = X86_TAA_DISABLED;
3000 3000                  return;
3001 3001          }
3002 3002  
3003 3003          /*
3004 3004           * If we do not have the ability to disable TSX, then our only
3005 3005           * mitigation options are in hardware (TAA_NO), or by using our existing
3006 3006           * MDS mitigation as described above.  The latter relies upon us having
3007 3007           * configured MDS mitigations correctly! This includes disabling SMT if
3008 3008           * we want cross-CPU-thread protection.
3009 3009           */
3010 3010          if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3011 3011                  /*
3012 3012                   * It's not clear whether any parts will enumerate TAA_NO
3013 3013                   * *without* TSX_CTRL, but let's mark it as such if we see this.
3014 3014                   */
3015 3015                  if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3016 3016                          x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3017 3017                          return;
3018 3018                  }
3019 3019  
3020 3020                  if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3021 3021                      !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3022 3022                          x86_taa_mitigation = X86_TAA_MD_CLEAR;
3023 3023                  } else {
3024 3024                          x86_taa_mitigation = X86_TAA_NOTHING;
3025 3025                  }
3026 3026                  return;
3027 3027          }
3028 3028  
3029 3029          /*
3030 3030           * We have TSX_CTRL, but we can only fully disable TSX if we're early
3031 3031           * enough in boot.
3032 3032           *
3033 3033           * Otherwise, we'll fall back to causing transactions to abort as our
3034 3034           * mitigation. TSX-using code will always take the fallback path.
3035 3035           */
3036 3036          if (cpi->cpi_pass < 4) {
3037 3037                  x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3038 3038          } else {
3039 3039                  x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3040 3040          }
3041 3041  }
3042 3042  
3043 3043  /*
3044 3044   * As mentioned, we should only touch the MSR when we've got a suitable
3045 3045   * microcode loaded on this CPU.
3046 3046   */
3047 3047  static void
3048 3048  cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3049 3049  {
3050 3050          uint64_t val;
3051 3051  
3052 3052          switch (taa) {
3053 3053          case X86_TAA_TSX_DISABLE:
3054 3054                  if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3055 3055                          return;
3056 3056                  val = rdmsr(MSR_IA32_TSX_CTRL);
3057 3057                  val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3058 3058                  wrmsr(MSR_IA32_TSX_CTRL, val);
3059 3059                  break;
3060 3060          case X86_TAA_TSX_FORCE_ABORT:
3061 3061                  if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3062 3062                          return;
3063 3063                  val = rdmsr(MSR_IA32_TSX_CTRL);
3064 3064                  val |= IA32_TSX_CTRL_RTM_DISABLE;
3065 3065                  wrmsr(MSR_IA32_TSX_CTRL, val);
3066 3066                  break;
3067 3067          case X86_TAA_HW_MITIGATED:
3068 3068          case X86_TAA_MD_CLEAR:
3069 3069          case X86_TAA_DISABLED:
3070 3070          case X86_TAA_NOTHING:
3071 3071                  break;
3072 3072          }
3073 3073  }
3074 3074  
3075 3075  static void
3076 3076  cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3077 3077  {
3078 3078          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3079 3079          x86_spectrev2_mitigation_t v2mit;
3080 3080  
3081 3081          if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3082 3082              cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3083 3083              cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3084 3084                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3085 3085                          add_x86_feature(featureset, X86FSET_IBPB);
3086 3086                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3087 3087                          add_x86_feature(featureset, X86FSET_IBRS);
3088 3088                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3089 3089                          add_x86_feature(featureset, X86FSET_STIBP);
3090 3090                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3091 3091                          add_x86_feature(featureset, X86FSET_STIBP_ALL);
3092 3092                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3093 3093                          add_x86_feature(featureset, X86FSET_SSBD);
3094 3094                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3095 3095                          add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3096 3096                  if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3097 3097                          add_x86_feature(featureset, X86FSET_SSB_NO);
3098 3098  
3099 3099                  /*
3100 3100                   * Rather than Enhanced IBRS, AMD has a different feature that
3101 3101                   * is a bit in EFER that can be enabled and will basically do
3102 3102                   * the right thing while executing in the kernel.
3103 3103                   */
3104 3104                  if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3105 3105                      (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3106 3106                      cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3107 3107                      (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3108 3108                          add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3109 3109                  }
3110 3110  
3111 3111          } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3112 3112              cpi->cpi_maxeax >= 7) {
3113 3113                  struct cpuid_regs *ecp;
3114 3114                  ecp = &cpi->cpi_std[7];
3115 3115  
3116 3116                  if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3117 3117                          add_x86_feature(featureset, X86FSET_MD_CLEAR);
3118 3118                  }
3119 3119  
3120 3120                  if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3121 3121                          add_x86_feature(featureset, X86FSET_IBRS);
3122 3122                          add_x86_feature(featureset, X86FSET_IBPB);
3123 3123                  }
3124 3124  
3125 3125                  if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3126 3126                          add_x86_feature(featureset, X86FSET_STIBP);
3127 3127                  }
3128 3128  
3129 3129                  /*
3130 3130                   * Don't read the arch caps MSR on xpv where we lack the
3131 3131                   * on_trap().
3132 3132                   */
3133 3133  #ifndef __xpv
3134 3134                  if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3135 3135                          on_trap_data_t otd;
3136 3136  
3137 3137                          /*
3138 3138                           * Be paranoid and assume we'll get a #GP.
3139 3139                           */
3140 3140                          if (!on_trap(&otd, OT_DATA_ACCESS)) {
3141 3141                                  uint64_t reg;
3142 3142  
3143 3143                                  reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3144 3144                                  if (reg & IA32_ARCH_CAP_RDCL_NO) {
3145 3145                                          add_x86_feature(featureset,
3146 3146                                              X86FSET_RDCL_NO);
3147 3147                                  }
3148 3148                                  if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3149 3149                                          add_x86_feature(featureset,
3150 3150                                              X86FSET_IBRS_ALL);
3151 3151                                  }
3152 3152                                  if (reg & IA32_ARCH_CAP_RSBA) {
3153 3153                                          add_x86_feature(featureset,
3154 3154                                              X86FSET_RSBA);
3155 3155                                  }
3156 3156                                  if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3157 3157                                          add_x86_feature(featureset,
3158 3158                                              X86FSET_L1D_VM_NO);
3159 3159                                  }
3160 3160                                  if (reg & IA32_ARCH_CAP_SSB_NO) {
3161 3161                                          add_x86_feature(featureset,
3162 3162                                              X86FSET_SSB_NO);
3163 3163                                  }
3164 3164                                  if (reg & IA32_ARCH_CAP_MDS_NO) {
3165 3165                                          add_x86_feature(featureset,
3166 3166                                              X86FSET_MDS_NO);
3167 3167                                  }
3168 3168                                  if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3169 3169                                          add_x86_feature(featureset,
3170 3170                                              X86FSET_TSX_CTRL);
3171 3171                                  }
3172 3172                                  if (reg & IA32_ARCH_CAP_TAA_NO) {
3173 3173                                          add_x86_feature(featureset,
3174 3174                                              X86FSET_TAA_NO);
3175 3175                                  }
3176 3176                          }
3177 3177                          no_trap();
3178 3178                  }
3179 3179  #endif  /* !__xpv */
3180 3180  
3181 3181                  if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3182 3182                          add_x86_feature(featureset, X86FSET_SSBD);
3183 3183  
3184 3184                  if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3185 3185                          add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3186 3186          }
3187 3187  
3188 3188          /*
3189 3189           * Take care of certain mitigations on the non-boot CPU. The boot CPU
3190 3190           * will have already run this function and determined what we need to
3191 3191           * do. This gives us a hook for per-HW thread mitigations such as
3192 3192           * enhanced IBRS, or disabling TSX.
3193 3193           */
3194 3194          if (cpu->cpu_id != 0) {
3195 3195                  switch (x86_spectrev2_mitigation) {
3196 3196                  case X86_SPECTREV2_ENHANCED_IBRS:
3197 3197                          cpuid_enable_enhanced_ibrs();
3198 3198                          break;
3199 3199                  case X86_SPECTREV2_AUTO_IBRS:
3200 3200                          cpuid_enable_auto_ibrs();
3201 3201                          break;
3202 3202                  default:
3203 3203                          break;
3204 3204                  }
3205 3205  
3206 3206                  cpuid_apply_tsx(x86_taa_mitigation, featureset);
3207 3207                  return;
3208 3208          }
3209 3209  
3210 3210          /*
3211 3211           * Go through and initialize various security mechanisms that we should
3212 3212           * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3213 3213           * TAA.
3214 3214           */
3215 3215  
3216 3216          /*
3217 3217           * By default we've come in with retpolines enabled. Check whether we
3218 3218           * should disable them or enable enhanced or automatic IBRS. RSB
3219 3219           * stuffing is enabled by default. Note, we do not allow the use of AMD
3220 3220           * optimized retpolines as it was disclosed by AMD in March 2022 that
3221 3221           * they were still vulnerable. Prior to that point, we used them.
3222 3222           */
3223 3223          if (x86_disable_spectrev2 != 0) {
3224 3224                  v2mit = X86_SPECTREV2_DISABLED;
3225 3225          } else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3226 3226                  cpuid_enable_auto_ibrs();
3227 3227                  v2mit = X86_SPECTREV2_AUTO_IBRS;
3228 3228          } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3229 3229                  cpuid_enable_enhanced_ibrs();
3230 3230                  v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3231 3231          } else {
3232 3232                  v2mit = X86_SPECTREV2_RETPOLINE;
3233 3233          }
3234 3234  
3235 3235          cpuid_patch_retpolines(v2mit);
3236 3236          cpuid_patch_rsb(v2mit);
3237 3237          x86_spectrev2_mitigation = v2mit;
3238 3238          membar_producer();
3239 3239  
3240 3240          /*
3241 3241           * We need to determine what changes are required for mitigating L1TF
3242 3242           * and MDS. If the CPU suffers from either of them, then SMT exclusion
3243 3243           * is required.
3244 3244           *
3245 3245           * If any of these are present, then we need to flush u-arch state at
3246 3246           * various points. For MDS, we need to do so whenever we change to a
3247 3247           * lesser privilege level or we are halting the CPU. For L1TF we need to
3248 3248           * flush the L1D cache at VM entry. When we have microcode that handles
3249 3249           * MDS, the L1D flush also clears the other u-arch state that the
3250 3250           * md_clear does.
3251 3251           */
3252 3252  
3253 3253          /*
3254 3254           * Update whether or not we need to be taking explicit action against
3255 3255           * MDS.
3256 3256           */
3257 3257          cpuid_update_md_clear(cpu, featureset);
3258 3258  
3259 3259          /*
3260 3260           * Determine whether SMT exclusion is required and whether or not we
3261 3261           * need to perform an l1d flush.
3262 3262           */
3263 3263          cpuid_update_l1d_flush(cpu, featureset);
3264 3264  
3265 3265          /*
3266 3266           * Determine what our mitigation strategy should be for TAA and then
3267 3267           * also apply TAA mitigations.
3268 3268           */
3269 3269          cpuid_update_tsx(cpu, featureset);
3270 3270          cpuid_apply_tsx(x86_taa_mitigation, featureset);
3271 3271  }
3272 3272  
3273 3273  /*
3274 3274   * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3275 3275   */
3276 3276  void
3277 3277  setup_xfem(void)
3278 3278  {
3279 3279          uint64_t flags = XFEATURE_LEGACY_FP;
3280 3280  
3281 3281          ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3282 3282  
3283 3283          if (is_x86_feature(x86_featureset, X86FSET_SSE))
3284 3284                  flags |= XFEATURE_SSE;
3285 3285  
3286 3286          if (is_x86_feature(x86_featureset, X86FSET_AVX))
3287 3287                  flags |= XFEATURE_AVX;
3288 3288  
3289 3289          if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3290 3290                  flags |= XFEATURE_AVX512;
3291 3291  
3292 3292          set_xcr(XFEATURE_ENABLED_MASK, flags);
3293 3293  
3294 3294          xsave_bv_all = flags;
3295 3295  }
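
As a worked example of the mask built by setup_xfem(): architecturally, XCR0 bit 0 covers legacy x87 state, bit 1 SSE (%xmm) state, bit 2 AVX (%ymm) state, and bits 5 through 7 the three AVX-512 components, so a CPU with SSE, AVX, and AVX512F ends up with XCR0 = 0x1 | 0x2 | 0x4 | 0xe0 = 0xe7. The sketch below assumes the XFEATURE_* constants used above carry those architectural values:

#include <stdio.h>
#include <stdint.h>

/*
 * Architectural XCR0 bit positions (assumed to match the kernel's
 * XFEATURE_* definitions).
 */
#define	XF_LEGACY_FP	0x01	/* x87 state */
#define	XF_SSE		0x02	/* XMM state */
#define	XF_AVX		0x04	/* YMM high halves */
#define	XF_AVX512	0xe0	/* opmask, ZMM_Hi256, Hi16_ZMM */

int
main(void)
{
	uint64_t flags = XF_LEGACY_FP;

	/* Mirror the feature checks in setup_xfem() for a CPU with all three. */
	flags |= XF_SSE;
	flags |= XF_AVX;
	flags |= XF_AVX512;

	(void) printf("XCR0 value: 0x%llx\n", (unsigned long long)flags);
	return (0);
}
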
3296 3296  
3297 3297  static void
3298 3298  cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3299 3299  {
3300 3300          struct cpuid_info *cpi;
3301 3301  
3302 3302          cpi = cpu->cpu_m.mcpu_cpi;
3303 3303  
3304 3304          if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3305 3305              cpi->cpi_vendor == X86_VENDOR_HYGON) {
3306 3306                  cpuid_gather_amd_topology_leaves(cpu);
3307 3307          }
3308 3308  
3309 3309          cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3310 3310  
3311 3311          /*
3312 3312           * Before we can calculate the IDs that we should assign to this
3313 3313           * processor, we need to understand how many cores and threads it has.
3314 3314           */
3315 3315          switch (cpi->cpi_vendor) {
3316 3316          case X86_VENDOR_Intel:
3317 3317                  cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3318 3318                      &cpi->cpi_ncore_per_chip);
3319 3319                  break;
3320 3320          case X86_VENDOR_AMD:
3321 3321          case X86_VENDOR_HYGON:
3322 3322                  cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3323 3323                      &cpi->cpi_ncore_per_chip);
3324 3324                  break;
3325 3325          default:
3326 3326                  /*
3327 3327                   * If we have some other x86-compatible chip, it's not clear how
3328 3328                   * it would behave. The most common case is virtualization
3329 3329                   * today, though there are also 64-bit VIA chips. Assume that
3330 3330                   * all we can get is the basic Leaf 1 HTT information.
3331 3331                   */
3332 3332                  if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3333 3333                          cpi->cpi_ncore_per_chip = 1;
3334 3334                          cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3335 3335                  }
3336 3336                  break;
3337 3337          }
3338 3338  
3339 3339          /*
3340 3340           * Based on the calculated number of threads and cores, potentially
3341 3341           * assign the HTT and CMT features.
3342 3342           */
3343 3343          if (cpi->cpi_ncore_per_chip > 1) {
3344 3344                  add_x86_feature(featureset, X86FSET_CMP);
3345 3345          }
3346 3346  
3347 3347          if (cpi->cpi_ncpu_per_chip > 1 &&
3348 3348              cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3349 3349                  add_x86_feature(featureset, X86FSET_HTT);
3350 3350          }
3351 3351  
3352 3352          /*
3353 3353           * Now that the thread and core counts have been set up, we need to go
3354 3354           * through and calculate the rest of the parameters. If we think the CPU
3355 3355           * doesn't have either SMT (HTT) or CMP, then we basically go through and
3356 3356           * fake up information in some way. The most likely case for this is
3357 3357           * virtualization where we have a lot of partial topology information.
3358 3358           */
3359 3359          if (!is_x86_feature(featureset, X86FSET_HTT) &&
3360 3360              !is_x86_feature(featureset, X86FSET_CMP)) {
3361 3361                  /*
3362 3362                   * This is a single core, single-threaded processor.
3363 3363                   */
3364 3364                  cpi->cpi_procnodes_per_pkg = 1;
3365 3365                  cpi->cpi_cores_per_compunit = 1;
3366 3366                  cpi->cpi_compunitid = 0;
3367 3367                  cpi->cpi_chipid = -1;
3368 3368                  cpi->cpi_clogid = 0;
3369 3369                  cpi->cpi_coreid = cpu->cpu_id;
3370 3370                  cpi->cpi_pkgcoreid = 0;
3371 3371                  if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3372 3372                      cpi->cpi_vendor == X86_VENDOR_HYGON) {
3373 3373                          cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3374 3374                  } else {
3375 3375                          cpi->cpi_procnodeid = cpi->cpi_chipid;
3376 3376                  }
3377 3377          } else {
3378 3378                  switch (cpi->cpi_vendor) {
3379 3379                  case X86_VENDOR_Intel:
3380 3380                          cpuid_intel_getids(cpu, featureset);
3381 3381                          break;
3382 3382                  case X86_VENDOR_AMD:
3383 3383                  case X86_VENDOR_HYGON:
3384 3384                          cpuid_amd_getids(cpu, featureset);
3385 3385                          break;
3386 3386                  default:
3387 3387                          /*
3388 3388                           * In this case, it's hard to say what we should do.
3389 3389                           * We're going to model them to the OS as single-core
3390 3390                           * threads. We don't have a good identifier for them, so
3391 3391                           * we're just going to use the cpu id, treating them all as
3392 3392                           * being on a single chip.
3393 3393                           *
3394 3394                           * This case has historically been different from the
3395 3395                           * case above where we don't have HTT or CMP. While they
3396 3396                           * could be combined, we've opted to keep it separate to
3397 3397                           * minimize the risk of topology changes in weird cases.
3398 3398                           */
3399 3399                          cpi->cpi_procnodes_per_pkg = 1;
3400 3400                          cpi->cpi_cores_per_compunit = 1;
3401 3401                          cpi->cpi_chipid = 0;
3402 3402                          cpi->cpi_coreid = cpu->cpu_id;
3403 3403                          cpi->cpi_clogid = cpu->cpu_id;
3404 3404                          cpi->cpi_pkgcoreid = cpu->cpu_id;
3405 3405                          cpi->cpi_procnodeid = cpi->cpi_chipid;
3406 3406                          cpi->cpi_compunitid = cpi->cpi_coreid;
3407 3407                          break;
3408 3408                  }
3409 3409          }
3410 3410  }
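
To make the CMP/HTT assignment above concrete, the small sketch below replays the same two checks on a few illustrative core/thread counts; the sample counts are made up, while the threshold logic is copied from cpuid_basic_topology():

#include <stdio.h>
#include <stdbool.h>

/* Mirror the CMP/HTT decision in cpuid_basic_topology() (illustrative). */
static void
classify(unsigned ncore_per_chip, unsigned ncpu_per_chip)
{
	bool cmp = ncore_per_chip > 1;
	bool htt = ncpu_per_chip > 1 && ncpu_per_chip != ncore_per_chip;

	(void) printf("%u cores / %u cpus: CMP=%d HTT=%d\n",
	    ncore_per_chip, ncpu_per_chip, cmp, htt);
}

int
main(void)
{
	classify(8, 16);	/* multi-core with SMT: CMP and HTT */
	classify(4, 4);		/* multi-core, no SMT: CMP only */
	classify(1, 2);		/* single core with SMT: HTT only */
	return (0);
}
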
3411 3411  
3412 3412  /*
3413 3413   * Gather relevant CPU features from leaf 6 which covers thermal information. We
3414 3414   * always gather leaf 6 if it's supported; however, we only look for features on
3415 3415   * Intel systems as AMD does not currently define any of the features we look
3416 3416   * for below.
3417 3417   */
3418 3418  static void
3419 3419  cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3420 3420  {
3421 3421          struct cpuid_regs *cp;
3422 3422          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3423 3423  
3424 3424          if (cpi->cpi_maxeax < 6) {
3425 3425                  return;
3426 3426          }
3427 3427  
3428 3428          cp = &cpi->cpi_std[6];
3429 3429          cp->cp_eax = 6;
3430 3430          cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3431 3431          (void) __cpuid_insn(cp);
3432 3432          platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3433 3433  
3434 3434          if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3435 3435                  return;
3436 3436          }
3437 3437  
3438 3438          if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3439 3439                  add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3440 3440          }
3441 3441  
3442 3442          if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3443 3443                  add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3444 3444          }
3445 3445  }
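
For reference, the two leaf 6 %eax bits tested above are architecturally defined: bit 0 enumerates the digital thermal sensor and bit 6 package thermal management, which is what CPUID_INTC_EAX_DTS and CPUID_INTC_EAX_PTM are assumed to expand to. A minimal decoding sketch:

#include <stdio.h>
#include <stdint.h>

/* Assumed leaf 6 %eax bit positions (Intel SDM). */
#define	LEAF6_EAX_DTS	(1u << 0)	/* digital thermal sensor */
#define	LEAF6_EAX_PTM	(1u << 6)	/* package thermal management */

int
main(void)
{
	uint32_t eax = 0x41;	/* example leaf 6 %eax with DTS and PTM set */

	(void) printf("core thermal: %s\n",
	    (eax & LEAF6_EAX_DTS) ? "yes" : "no");
	(void) printf("pkg thermal:  %s\n",
	    (eax & LEAF6_EAX_PTM) ? "yes" : "no");
	return (0);
}
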
3446 3446  
3447 3447  /*
3448 3448   * This is used when we discover that we have AVX support in cpuid. This
3449 3449   * proceeds to scan for the rest of the AVX derived features.
3450 3450   */
3451 3451  static void
3452 3452  cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3453 3453  {
3454 3454          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3455 3455  
3456 3456          /*
3457 3457           * If we don't have AVX, don't bother with most of this.
3458 3458           */
3459 3459          if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3460 3460                  return;
3461 3461  
3462 3462          add_x86_feature(featureset, X86FSET_AVX);
3463 3463  
3464 3464          /*
3465 3465           * Intel says we can't check these without also
3466 3466           * checking AVX.
3467 3467           */
3468 3468          if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3469 3469                  add_x86_feature(featureset, X86FSET_F16C);
3470 3470  
3471 3471          if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3472 3472                  add_x86_feature(featureset, X86FSET_FMA);
3473 3473  
3474 3474          if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3475 3475                  add_x86_feature(featureset, X86FSET_BMI1);
3476 3476  
3477 3477          if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3478 3478                  add_x86_feature(featureset, X86FSET_BMI2);
3479 3479  
3480 3480          if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3481 3481                  add_x86_feature(featureset, X86FSET_AVX2);
3482 3482  
3483 3483          if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3484 3484                  add_x86_feature(featureset, X86FSET_VAES);
3485 3485  
3486 3486          if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3487 3487                  add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3488 3488  
3489 3489          /*
3490 3490           * The rest of the AVX features require AVX512. Do not check them unless
3491 3491           * it is present.
3492 3492           */
3493 3493          if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3494 3494                  return;
3495 3495          add_x86_feature(featureset, X86FSET_AVX512F);
3496 3496  
3497 3497          if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3498 3498                  add_x86_feature(featureset, X86FSET_AVX512DQ);
3499 3499  
3500 3500          if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3501 3501                  add_x86_feature(featureset, X86FSET_AVX512FMA);
3502 3502  
3503 3503          if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3504 3504                  add_x86_feature(featureset, X86FSET_AVX512PF);
3505 3505  
3506 3506          if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3507 3507                  add_x86_feature(featureset, X86FSET_AVX512ER);
3508 3508  
3509 3509          if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3510 3510                  add_x86_feature(featureset, X86FSET_AVX512CD);
3511 3511  
3512 3512          if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3513 3513                  add_x86_feature(featureset, X86FSET_AVX512BW);
3514 3514  
3515 3515          if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3516 3516                  add_x86_feature(featureset, X86FSET_AVX512VL);
3517 3517  
3518 3518          if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3519 3519                  add_x86_feature(featureset, X86FSET_AVX512VBMI);
3520 3520  
3521 3521          if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3522 3522                  add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3523 3523  
3524 3524          if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3525 3525                  add_x86_feature(featureset, X86FSET_AVX512VNNI);
3526 3526  
3527 3527          if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3528 3528                  add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3529 3529  
3530 3530          if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3531 3531                  add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3532 3532  
3533 3533          if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3534 3534                  add_x86_feature(featureset, X86FSET_AVX512NNIW);
3535 3535  
3536 3536          if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3537 3537                  add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3538 3538  
3539 3539          /*
3540 3540           * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3541 3541           * we don't need to.
3542 3542           */
3543 3543          if (cpi->cpi_std[7].cp_eax < 1)
3544 3544                  return;
3545 3545  
3546 3546          if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3547 3547                  add_x86_feature(featureset, X86FSET_AVX512_BF16);
3548 3548  }
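
The final check above leans on the fact that leaf 7, subleaf 0 reports the maximum supported subleaf in %eax, so subleaf 1 is only queried when that value is at least 1. Below is a minimal userland sketch of the same gating, using the compiler-provided <cpuid.h> helpers; the AVX512_BF16 bit position (subleaf 1 %eax bit 5) is taken from the architectural documentation rather than the kernel headers:

#include <stdio.h>
#include <cpuid.h>	/* GCC/Clang __get_cpuid_count() */

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 7, subleaf 0: %eax holds the maximum supported subleaf. */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
		(void) printf("leaf 7 not supported\n");
		return (0);
	}

	if (eax >= 1 && __get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx)) {
		/* %eax bit 5 of subleaf 1 enumerates AVX512_BF16 (assumed). */
		(void) printf("AVX512_BF16: %s\n",
		    (eax & (1u << 5)) ? "yes" : "no");
	} else {
		(void) printf("leaf 7 subleaf 1 not present\n");
	}
	return (0);
}
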
3549 3549  
3550 3550  /*
3551 3551   * PPIN is the protected processor inventory number. On AMD this is an actual
3552 3552   * feature bit. However, on Intel systems we need to read the platform
3553 3553   * information MSR if we're on a specific model.
3554 3554   */
3555 3555  #if !defined(__xpv)
3556 3556  static void
3557 3557  cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3558 3558  {
3559 3559          on_trap_data_t otd;
3560 3560          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3561 3561  
3562 3562          switch (cpi->cpi_vendor) {
3563 3563          case X86_VENDOR_AMD:
3564 3564                  /*
3565 3565                   * This leaf will have already been gathered in the topology
3566 3566                   * functions.
3567 3567                   */
3568 3568                  if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3569 3569                          if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3570 3570                                  add_x86_feature(featureset, X86FSET_PPIN);
3571 3571                          }
3572 3572                  }
3573 3573                  break;
3574 3574          case X86_VENDOR_Intel:
3575 3575                  if (cpi->cpi_family != 6)
3576 3576                          break;
3577 3577                  switch (cpi->cpi_model) {
3578 3578                  case INTC_MODEL_IVYBRIDGE_XEON:
3579 3579                  case INTC_MODEL_HASWELL_XEON:
3580 3580                  case INTC_MODEL_BROADWELL_XEON:
3581 3581                  case INTC_MODEL_BROADWELL_XEON_D:
3582 3582                  case INTC_MODEL_SKYLAKE_XEON:
3583 3583                  case INTC_MODEL_ICELAKE_XEON:
3584 3584                          if (!on_trap(&otd, OT_DATA_ACCESS)) {
3585 3585                                  uint64_t value;
3586 3586  
3587 3587                                  value = rdmsr(MSR_PLATFORM_INFO);
3588 3588                                  if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3589 3589                                          add_x86_feature(featureset,
3590 3590                                              X86FSET_PPIN);
3591 3591                                  }
3592 3592                          }
3593 3593                          no_trap();
3594 3594                          break;
3595 3595                  default:
3596 3596                          break;
3597 3597                  }
3598 3598                  break;
3599 3599          default:
3600 3600                  break;
3601 3601          }
3602 3602  }
3603 3603  #endif  /* ! __xpv */
3604 3604  
3605 3605  static void
3606 3606  cpuid_pass_prelude(cpu_t *cpu, void *arg)
3607 3607  {
3608 3608          uchar_t *featureset = (uchar_t *)arg;
3609 3609  
3610 3610          /*
3611 3611           * We don't run on any processor that doesn't have cpuid, and could not
3612 3612           * possibly have arrived here.
3613 3613           */
3614 3614          add_x86_feature(featureset, X86FSET_CPUID);
3615 3615  }
3616 3616  
3617 3617  static void
3618 3618  cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3619 3619  {
3620 3620          struct cpuid_info *cpi;
3621 3621          struct cpuid_regs *cp;
3622 3622  
3623 3623          /*
3624 3624           * We require that virtual/native detection be complete and that PCI
3625 3625           * config space access has been set up; at present there is no reliable
3626 3626           * way to determine the latter.
3627 3627           */
3628 3628  #if !defined(__xpv)
3629 3629          ASSERT3S(platform_type, !=, -1);
3630 3630  #endif  /* !__xpv */
3631 3631  
3632 3632          cpi = cpu->cpu_m.mcpu_cpi;
3633 3633          ASSERT(cpi != NULL);
3634 3634  
3635 3635          cp = &cpi->cpi_std[0];
3636 3636          cp->cp_eax = 0;
3637 3637          cpi->cpi_maxeax = __cpuid_insn(cp);
3638 3638          {
3639 3639                  uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3640 3640                  *iptr++ = cp->cp_ebx;
3641 3641                  *iptr++ = cp->cp_edx;
3642 3642                  *iptr++ = cp->cp_ecx;
3643 3643                  *(char *)&cpi->cpi_vendorstr[12] = '\0';
3644 3644          }
3645 3645  
3646 3646          cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3647 3647          x86_vendor = cpi->cpi_vendor; /* for compatibility */
3648 3648  
3649 3649          /*
3650 3650           * Limit the range in case of weird hardware
3651 3651           */
3652 3652          if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3653 3653                  cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3654 3654          if (cpi->cpi_maxeax < 1)
3655 3655                  return;
3656 3656  
3657 3657          cp = &cpi->cpi_std[1];
3658 3658          cp->cp_eax = 1;
3659 3659          (void) __cpuid_insn(cp);
3660 3660  
3661 3661          /*
3662 3662           * Extract identifying constants for easy access.
3663 3663           */
3664 3664          cpi->cpi_model = CPI_MODEL(cpi);
3665 3665          cpi->cpi_family = CPI_FAMILY(cpi);
3666 3666  
3667 3667          if (cpi->cpi_family == 0xf)
3668 3668                  cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3669 3669  
3670 3670          /*
3671 3671           * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3672 3672           * Intel, and presumably everyone else, uses model == 0xf, as
3673 3673           * one would expect (max value means possible overflow).  Sigh.
3674 3674           */
3675 3675  
3676 3676          switch (cpi->cpi_vendor) {
3677 3677          case X86_VENDOR_Intel:
3678 3678                  if (IS_EXTENDED_MODEL_INTEL(cpi))
3679 3679                          cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3680 3680                  break;
3681 3681          case X86_VENDOR_AMD:
3682 3682                  if (CPI_FAMILY(cpi) == 0xf)
3683 3683                          cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3684 3684                  break;
3685 3685          case X86_VENDOR_HYGON:
3686 3686                  cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3687 3687                  break;
3688 3688          default:
3689 3689                  if (cpi->cpi_model == 0xf)
3690 3690                          cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3691 3691                  break;
3692 3692          }
3693 3693  
3694 3694          cpi->cpi_step = CPI_STEP(cpi);
3695 3695          cpi->cpi_brandid = CPI_BRANDID(cpi);
3696 3696  
3697 3697          /*
3698 3698           * Synthesize chip "revision" and socket type
3699 3699           */
3700 3700          cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3701 3701              cpi->cpi_model, cpi->cpi_step);
3702 3702          cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3703 3703              cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3704 3704          cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3705 3705              cpi->cpi_model, cpi->cpi_step);
3706 3706          cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
3707 3707              cpi->cpi_model, cpi->cpi_step);
3708 3708  }
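
A worked decode may help make the family/model synthesis above concrete. The sketch below assumes the architectural leaf 1 %eax layout (stepping in bits 3:0, model in 7:4, family in 11:8, extended model in 19:16, extended family in 27:20) and uses an illustrative Intel signature of 0x000906ea, which decodes to family 0x6, model 0x9e, stepping 0xa; it also shows why assembling %ebx, %edx, %ecx in that order yields the "GenuineIntel" vendor string:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int
main(void)
{
	/* Example leaf 1 %eax signature (an Intel Coffee Lake part). */
	uint32_t eax = 0x000906ea;
	uint32_t step = eax & 0xf;
	uint32_t model = (eax >> 4) & 0xf;
	uint32_t family = (eax >> 8) & 0xf;
	uint32_t xmodel = (eax >> 16) & 0xf;
	uint32_t xfamily = (eax >> 20) & 0xff;
	/* Leaf 0 vendor string registers, in %ebx, %edx, %ecx order. */
	uint32_t regs[3] = { 0x756e6547, 0x49656e69, 0x6c65746e };
	char vendor[13];

	/*
	 * Family 0xf adds the extended family; on Intel, families 0x6 and
	 * 0xf also fold in the extended model, mirroring cpuid_pass_ident().
	 */
	if (family == 0xf)
		family += xfamily;
	if (family == 0x6 || family == 0xf)
		model += xmodel << 4;

	(void) printf("family 0x%x model 0x%x stepping 0x%x\n",
	    family, model, step);

	(void) memcpy(vendor, regs, sizeof (regs));
	vendor[12] = '\0';
	(void) printf("vendor: %s\n", vendor);	/* prints "GenuineIntel" */
	return (0);
}
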
3709 3709  
3710 3710  static void
3711 3711  cpuid_pass_basic(cpu_t *cpu, void *arg)
3712 3712  {
3713 3713          uchar_t *featureset = (uchar_t *)arg;
3714 3714          uint32_t mask_ecx, mask_edx;
3715 3715          struct cpuid_info *cpi;
3716 3716          struct cpuid_regs *cp;
3717 3717          int xcpuid;
3718 3718  #if !defined(__xpv)
3719 3719          extern int idle_cpu_prefer_mwait;
3720 3720  #endif
3721 3721  
3722 3722          cpi = cpu->cpu_m.mcpu_cpi;
3723 3723          ASSERT(cpi != NULL);
3724 3724  
3725 3725          if (cpi->cpi_maxeax < 1)
3726 3726                  return;
3727 3727  
3728 3728          /*
3729 3729           * This was filled during the identification pass.
3730 3730           */
3731 3731          cp = &cpi->cpi_std[1];
3732 3732  
3733 3733          /*
3734 3734           * *default* assumptions:
3735 3735           * - believe %edx feature word
3736 3736           * - ignore %ecx feature word
3737 3737           * - 32-bit virtual and physical addressing
3738 3738           */
3739 3739          mask_edx = 0xffffffff;
3740 3740          mask_ecx = 0;
3741 3741  
3742 3742          cpi->cpi_pabits = cpi->cpi_vabits = 32;
3743 3743  
3744 3744          switch (cpi->cpi_vendor) {
3745 3745          case X86_VENDOR_Intel:
3746 3746                  if (cpi->cpi_family == 5)
3747 3747                          x86_type = X86_TYPE_P5;
3748 3748                  else if (IS_LEGACY_P6(cpi)) {
3749 3749                          x86_type = X86_TYPE_P6;
3750 3750                          pentiumpro_bug4046376 = 1;
3751 3751                          /*
3752 3752                           * Clear the SEP bit when it was set erroneously
3753 3753                           */
3754 3754                          if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3755 3755                                  cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3756 3756                  } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3757 3757                          x86_type = X86_TYPE_P4;
3758 3758                          /*
3759 3759                           * We don't currently depend on any of the %ecx
3760 3760                           * features until Prescott, so we'll only check
3761 3761                           * this from P4 onwards.  We might want to revisit
3762 3762                           * that idea later.
3763 3763                           */
3764 3764                          mask_ecx = 0xffffffff;
3765 3765                  } else if (cpi->cpi_family > 0xf)
3766 3766                          mask_ecx = 0xffffffff;
3767 3767                  /*
3768 3768                   * We don't support MONITOR/MWAIT if leaf 5 is not available
3769 3769                   * to obtain the monitor linesize.
3770 3770                   */
3771 3771                  if (cpi->cpi_maxeax < 5)
3772 3772                          mask_ecx &= ~CPUID_INTC_ECX_MON;
3773 3773                  break;
3774 3774          case X86_VENDOR_IntelClone:
3775 3775          default:
3776 3776                  break;
3777 3777          case X86_VENDOR_AMD:
3778 3778  #if defined(OPTERON_ERRATUM_108)
3779 3779                  if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3780 3780                          cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3781 3781                          cpi->cpi_model = 0xc;
3782 3782                  } else
3783 3783  #endif
3784 3784                  if (cpi->cpi_family == 5) {
3785 3785                          /*
3786 3786                           * AMD K5 and K6
3787 3787                           *
3788 3788                           * These CPUs have an incomplete implementation
3789 3789                           * of MCA/MCE which we mask away.
3790 3790                           */
3791 3791                          mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3792 3792  
3793 3793                          /*
3794 3794                           * Model 0 uses the wrong (APIC) bit
3795 3795                           * to indicate PGE.  Fix it here.
3796 3796                           */
3797 3797                          if (cpi->cpi_model == 0) {
3798 3798                                  if (cp->cp_edx & 0x200) {
3799 3799                                          cp->cp_edx &= ~0x200;
3800 3800                                          cp->cp_edx |= CPUID_INTC_EDX_PGE;
3801 3801                                  }
3802 3802                          }
3803 3803  
3804 3804                          /*
3805 3805                           * Early models had problems w/ MMX; disable.
3806 3806                           */
3807 3807                          if (cpi->cpi_model < 6)
3808 3808                                  mask_edx &= ~CPUID_INTC_EDX_MMX;
3809 3809                  }
3810 3810  
3811 3811                  /*
3812 3812                   * For newer families, SSE3 and CX16, at least, are valid;
3813 3813                   * enable all
3814 3814                   */
3815 3815                  if (cpi->cpi_family >= 0xf)
3816 3816                          mask_ecx = 0xffffffff;
3817 3817                  /*
3818 3818                   * We don't support MONITOR/MWAIT if leaf 5 is not available
3819 3819                   * to obtain the monitor linesize.
3820 3820                   */
3821 3821                  if (cpi->cpi_maxeax < 5)
3822 3822                          mask_ecx &= ~CPUID_INTC_ECX_MON;
3823 3823  
3824 3824  #if !defined(__xpv)
3825 3825                  /*
3826 3826                   * AMD has not historically used MWAIT in the CPU's idle loop.
3827 3827                   * Pre-family-10h Opterons do not have the MWAIT instruction. We
3828 3828                   * know for certain that in at least family 17h, per AMD, mwait
3829 3829                   * is preferred. Families in-between are less certain.
3830 3830                   */
3831 3831                  if (cpi->cpi_family < 0x17) {
3832 3832                          idle_cpu_prefer_mwait = 0;
3833 3833                  }
3834 3834  #endif
3835 3835  
3836 3836                  break;
3837 3837          case X86_VENDOR_HYGON:
3838 3838                  /* Enable all for Hygon Dhyana CPU */
3839 3839                  mask_ecx = 0xffffffff;
3840 3840                  break;
3841 3841          case X86_VENDOR_TM:
3842 3842                  /*
3843 3843                   * workaround the NT workaround in CMS 4.1
3844 3844                   */
3845 3845                  if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3846 3846                      (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3847 3847                          cp->cp_edx |= CPUID_INTC_EDX_CX8;
3848 3848                  break;
3849 3849          case X86_VENDOR_Centaur:
3850 3850                  /*
3851 3851                   * workaround the NT workarounds again
3852 3852                   */
3853 3853                  if (cpi->cpi_family == 6)
3854 3854                          cp->cp_edx |= CPUID_INTC_EDX_CX8;
3855 3855                  break;
3856 3856          case X86_VENDOR_Cyrix:
3857 3857                  /*
3858 3858                   * We rely heavily on the probing in locore
3859 3859                   * to actually figure out what parts, if any,
3860 3860                   * of the Cyrix cpuid instruction to believe.
3861 3861                   */
3862 3862                  switch (x86_type) {
3863 3863                  case X86_TYPE_CYRIX_486:
3864 3864                          mask_edx = 0;
3865 3865                          break;
3866 3866                  case X86_TYPE_CYRIX_6x86:
3867 3867                          mask_edx = 0;
3868 3868                          break;
3869 3869                  case X86_TYPE_CYRIX_6x86L:
3870 3870                          mask_edx =
3871 3871                              CPUID_INTC_EDX_DE |
3872 3872                              CPUID_INTC_EDX_CX8;
3873 3873                          break;
3874 3874                  case X86_TYPE_CYRIX_6x86MX:
3875 3875                          mask_edx =
3876 3876                              CPUID_INTC_EDX_DE |
3877 3877                              CPUID_INTC_EDX_MSR |
3878 3878                              CPUID_INTC_EDX_CX8 |
3879 3879                              CPUID_INTC_EDX_PGE |
3880 3880                              CPUID_INTC_EDX_CMOV |
3881 3881                              CPUID_INTC_EDX_MMX;
3882 3882                          break;
3883 3883                  case X86_TYPE_CYRIX_GXm:
3884 3884                          mask_edx =
3885 3885                              CPUID_INTC_EDX_MSR |
3886 3886                              CPUID_INTC_EDX_CX8 |
3887 3887                              CPUID_INTC_EDX_CMOV |
3888 3888                              CPUID_INTC_EDX_MMX;
3889 3889                          break;
3890 3890                  case X86_TYPE_CYRIX_MediaGX:
3891 3891                          break;
3892 3892                  case X86_TYPE_CYRIX_MII:
3893 3893                  case X86_TYPE_VIA_CYRIX_III:
3894 3894                          mask_edx =
3895 3895                              CPUID_INTC_EDX_DE |
3896 3896                              CPUID_INTC_EDX_TSC |
3897 3897                              CPUID_INTC_EDX_MSR |
3898 3898                              CPUID_INTC_EDX_CX8 |
3899 3899                              CPUID_INTC_EDX_PGE |
3900 3900                              CPUID_INTC_EDX_CMOV |
3901 3901                              CPUID_INTC_EDX_MMX;
3902 3902                          break;
3903 3903                  default:
3904 3904                          break;
3905 3905                  }
3906 3906                  break;
3907 3907          }
3908 3908  
3909 3909  #if defined(__xpv)
3910 3910          /*
3911 3911           * Do not support MONITOR/MWAIT under a hypervisor
3912 3912           */
3913 3913          mask_ecx &= ~CPUID_INTC_ECX_MON;
3914 3914          /*
3915 3915           * Do not support XSAVE under a hypervisor for now
3916 3916           */
3917 3917          xsave_force_disable = B_TRUE;
3918 3918  
3919 3919  #endif  /* __xpv */
3920 3920  
3921 3921          if (xsave_force_disable) {
3922 3922                  mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3923 3923                  mask_ecx &= ~CPUID_INTC_ECX_AVX;
3924 3924                  mask_ecx &= ~CPUID_INTC_ECX_F16C;
3925 3925                  mask_ecx &= ~CPUID_INTC_ECX_FMA;
3926 3926          }
3927 3927  
3928 3928          /*
3929 3929           * Now that we've figured out the masks that determine
3930 3930           * which bits we choose to believe, apply the masks
3931 3931           * to the feature words, then map the kernel's view
3932 3932           * of these feature words into its feature word.
3933 3933           */
3934 3934          cp->cp_edx &= mask_edx;
3935 3935          cp->cp_ecx &= mask_ecx;
3936 3936  
3937 3937          /*
3938 3938           * apply any platform restrictions (we don't call this
3939 3939           * immediately after __cpuid_insn here, because we need the
3940 3940           * workarounds applied above first)
3941 3941           */
3942 3942          platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3943 3943  
3944 3944          /*
3945 3945           * In addition to ecx and edx, Intel and AMD are storing a bunch of
3946 3946           * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
3947 3947           * 7 has sub-leaves determined by ecx.
3948 3948           */
3949 3949          if (cpi->cpi_maxeax >= 7) {
3950 3950                  struct cpuid_regs *ecp;
3951 3951                  ecp = &cpi->cpi_std[7];
3952 3952                  ecp->cp_eax = 7;
3953 3953                  ecp->cp_ecx = 0;
3954 3954                  (void) __cpuid_insn(ecp);
3955 3955  
3956 3956                  /*
3957 3957                   * If XSAVE has been disabled, just ignore all of the
3958 3958                   * extended-save-area dependent flags here. By removing most of
3959 3959                   * the leaf 7, sub-leaf 0 flags, that will ensure that we don't
3960 3960                   * end up looking at additional xsave dependent leaves right
3961 3961                   * now.
3962 3962                   */
3963 3963                  if (xsave_force_disable) {
3964 3964                          ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3965 3965                          ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3966 3966                          ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3967 3967                          ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3968 3968                          ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3969 3969                          ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3970 3970                          ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3971 3971                          ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
3972 3972                          ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
3973 3973                          ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
3974 3974                  }
3975 3975  
3976 3976                  if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3977 3977                          add_x86_feature(featureset, X86FSET_SMEP);
3978 3978  
3979 3979                  /*
3980 3980                   * We check disable_smap here in addition to in startup_smap()
3981 3981                   * to ensure CPUs that aren't the boot CPU don't accidentally
3982 3982                   * include it in the feature set and thus generate a mismatched
3983 3983                   * x86 feature set across CPUs.
3984 3984                   */
3985 3985                  if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3986 3986                      disable_smap == 0)
3987 3987                          add_x86_feature(featureset, X86FSET_SMAP);
3988 3988  
3989 3989                  if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3990 3990                          add_x86_feature(featureset, X86FSET_RDSEED);
3991 3991  
3992 3992                  if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3993 3993                          add_x86_feature(featureset, X86FSET_ADX);
3994 3994  
3995 3995                  if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3996 3996                          add_x86_feature(featureset, X86FSET_FSGSBASE);
3997 3997  
3998 3998                  if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3999 3999                          add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4000 4000  
4001 4001                  if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4002 4002                          add_x86_feature(featureset, X86FSET_INVPCID);
4003 4003  
4004 4004                  if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4005 4005                          add_x86_feature(featureset, X86FSET_UMIP);
4006 4006                  if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4007 4007                          add_x86_feature(featureset, X86FSET_PKU);
4008 4008                  if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4009 4009                          add_x86_feature(featureset, X86FSET_OSPKE);
4010 4010                  if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4011 4011                          add_x86_feature(featureset, X86FSET_GFNI);
4012 4012  
4013 4013                  if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4014 4014                          add_x86_feature(featureset, X86FSET_CLWB);
4015 4015  
4016 4016                  if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4017 4017                          if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4018 4018                                  add_x86_feature(featureset, X86FSET_MPX);
4019 4019                  }
4020 4020  
4021 4021                  /*
4022 4022                   * If we have subleaf 1 available, grab and store that. This is
4023 4023                   * used for more AVX and related features.
4024 4024                   */
4025 4025                  if (ecp->cp_eax >= 1) {
4026 4026                          struct cpuid_regs *c71;
4027 4027                          c71 = &cpi->cpi_sub7[0];
4028 4028                          c71->cp_eax = 7;
4029 4029                          c71->cp_ecx = 1;
4030 4030                          (void) __cpuid_insn(c71);
4031 4031                  }
4032 4032          }
4033 4033  
4034 4034          /*
4035 4035           * fold in overrides from the "eeprom" mechanism
4036 4036           */
4037 4037          cp->cp_edx |= cpuid_feature_edx_include;
4038 4038          cp->cp_edx &= ~cpuid_feature_edx_exclude;
4039 4039  
4040 4040          cp->cp_ecx |= cpuid_feature_ecx_include;
4041 4041          cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4042 4042  
4043 4043          if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4044 4044                  add_x86_feature(featureset, X86FSET_LARGEPAGE);
4045 4045          }
4046 4046          if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4047 4047                  add_x86_feature(featureset, X86FSET_TSC);
4048 4048          }
4049 4049          if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4050 4050                  add_x86_feature(featureset, X86FSET_MSR);
4051 4051          }
4052 4052          if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4053 4053                  add_x86_feature(featureset, X86FSET_MTRR);
4054 4054          }
4055 4055          if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4056 4056                  add_x86_feature(featureset, X86FSET_PGE);
4057 4057          }
4058 4058          if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4059 4059                  add_x86_feature(featureset, X86FSET_CMOV);
4060 4060          }
4061 4061          if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4062 4062                  add_x86_feature(featureset, X86FSET_MMX);
4063 4063          }
4064 4064          if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4065 4065              (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4066 4066                  add_x86_feature(featureset, X86FSET_MCA);
4067 4067          }
4068 4068          if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4069 4069                  add_x86_feature(featureset, X86FSET_PAE);
4070 4070          }
4071 4071          if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4072 4072                  add_x86_feature(featureset, X86FSET_CX8);
4073 4073          }
4074 4074          if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4075 4075                  add_x86_feature(featureset, X86FSET_CX16);
4076 4076          }
4077 4077          if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4078 4078                  add_x86_feature(featureset, X86FSET_PAT);
4079 4079          }
4080 4080          if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4081 4081                  add_x86_feature(featureset, X86FSET_SEP);
4082 4082          }
4083 4083          if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4084 4084                  /*
4085 4085                   * In our implementation, fxsave/fxrstor
4086 4086                   * are prerequisites before we'll even
4087 4087                   * try and do SSE things.
4088 4088                   */
4089 4089                  if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4090 4090                          add_x86_feature(featureset, X86FSET_SSE);
4091 4091                  }
4092 4092                  if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4093 4093                          add_x86_feature(featureset, X86FSET_SSE2);
4094 4094                  }
4095 4095                  if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4096 4096                          add_x86_feature(featureset, X86FSET_SSE3);
4097 4097                  }
4098 4098                  if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4099 4099                          add_x86_feature(featureset, X86FSET_SSSE3);
4100 4100                  }
4101 4101                  if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4102 4102                          add_x86_feature(featureset, X86FSET_SSE4_1);
4103 4103                  }
4104 4104                  if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4105 4105                          add_x86_feature(featureset, X86FSET_SSE4_2);
4106 4106                  }
4107 4107                  if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4108 4108                          add_x86_feature(featureset, X86FSET_AES);
4109 4109                  }
4110 4110                  if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4111 4111                          add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4112 4112                  }
4113 4113  
4114 4114                  if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4115 4115                          add_x86_feature(featureset, X86FSET_SHA);
4116 4116  
4117 4117                  if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4118 4118                          add_x86_feature(featureset, X86FSET_XSAVE);
4119 4119  
4120 4120                          /* We only test AVX & AVX512 when there is XSAVE */
4121 4121                          cpuid_basic_avx(cpu, featureset);
4122 4122                  }
4123 4123          }
4124 4124  
4125 4125          if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4126 4126                  add_x86_feature(featureset, X86FSET_PCID);
4127 4127          }
4128 4128  
4129 4129          if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4130 4130                  add_x86_feature(featureset, X86FSET_X2APIC);
4131 4131          }
4132 4132          if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4133 4133                  add_x86_feature(featureset, X86FSET_DE);
4134 4134          }
4135 4135  #if !defined(__xpv)
4136 4136          if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4137 4137  
4138 4138                  /*
4139 4139                   * We require the CLFLUSH instruction for the erratum workaround
4140 4140                   * needed to use MONITOR/MWAIT.
4141 4141                   */
4142 4142                  if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4143 4143                          cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4144 4144                          add_x86_feature(featureset, X86FSET_MWAIT);
4145 4145                  } else {
4146 4146                          extern int idle_cpu_assert_cflush_monitor;
4147 4147  
4148 4148                          /*
4149 4149                           * All processors we are aware of which have
4150 4150                           * MONITOR/MWAIT also have CLFLUSH.
4151 4151                           */
4152 4152                          if (idle_cpu_assert_cflush_monitor) {
4153 4153                                  ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4154 4154                                      (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4155 4155                          }
4156 4156                  }
4157 4157          }
4158 4158  #endif  /* __xpv */
4159 4159  
4160 4160          if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4161 4161                  add_x86_feature(featureset, X86FSET_VMX);
4162 4162          }
4163 4163  
4164 4164          if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4165 4165                  add_x86_feature(featureset, X86FSET_RDRAND);
4166 4166  
4167 4167          /*
4168 4168           * Only needed the first time; the rest of the CPUs follow suit.
4169 4169           * We only capture this for the boot CPU.
4170 4170           */
4171 4171          if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4172 4172                  add_x86_feature(featureset, X86FSET_CLFSH);
4173 4173                  x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4174 4174          }
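
CPUID.01H:EBX[15:8] reports the CLFLUSH line size in units of 8-byte quadwords, which is why the code above multiplies the extracted field by 8 (a value of 8 yields the usual 64-byte line). A minimal user-level sketch of the same decode; the cpuid() wrapper here is illustrative only and not an illumos interface:

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative CPUID wrapper, not part of the kernel code above. */
    static void
    cpuid(uint32_t leaf, uint32_t subleaf, uint32_t r[4])
    {
            __asm__ __volatile__("cpuid"
                : "=a" (r[0]), "=b" (r[1]), "=c" (r[2]), "=d" (r[3])
                : "a" (leaf), "c" (subleaf));
    }

    int
    main(void)
    {
            uint32_t r[4];

            cpuid(1, 0, r);
            /* EBX[15:8] counts 8-byte chunks; a value of 8 means 64 bytes. */
            printf("clflush line size: %u bytes\n", ((r[1] >> 8) & 0xff) * 8);
            return (0);
    }
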
4175 4175          if (is_x86_feature(featureset, X86FSET_PAE))
4176 4176                  cpi->cpi_pabits = 36;
4177 4177  
4178 4178          if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4179 4179                  struct cpuid_regs r, *ecp;
4180 4180  
4181 4181                  ecp = &r;
4182 4182                  ecp->cp_eax = 0xD;
4183 4183                  ecp->cp_ecx = 1;
4184 4184                  ecp->cp_edx = ecp->cp_ebx = 0;
4185 4185                  (void) __cpuid_insn(ecp);
4186 4186  
4187 4187                  if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4188 4188                          add_x86_feature(featureset, X86FSET_XSAVEOPT);
4189 4189                  if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4190 4190                          add_x86_feature(featureset, X86FSET_XSAVEC);
4191 4191                  if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4192 4192                          add_x86_feature(featureset, X86FSET_XSAVES);
4193 4193  
4194 4194                  /*
4195 4195                   * Zen 2 family processors suffer from erratum 1386 that causes
4196 4196                   * xsaves to not function correctly in some circumstances. There
4197 4197                   * are no supervisor states in Zen 2 and earlier. Practically
4198 4198                   * speaking this has no impact for us as we currently do not
4199 4199                   * leverage compressed xsave formats. To safeguard against
4200 4200                   * issues in the future where we may opt to using it, we remove
4201 4201                   * it from the feature set now. While Matisse has a microcode
4202 4202                   * update available with a fix, not all Zen 2 CPUs do so it's
4203 4203                   * simpler for the moment to unconditionally remove it.
4204 4204                   */
4205 4205                  if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4206 4206                      uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4207 4207                          remove_x86_feature(featureset, X86FSET_XSAVES);
4208 4208                  }
4209 4209          }
4210 4210  
4211 4211          /*
4212 4212           * Work on the "extended" feature information, doing
4213 4213           * some basic initialization to be used in the extended pass.
4214 4214           */
4215 4215          xcpuid = 0;
4216 4216          switch (cpi->cpi_vendor) {
4217 4217          case X86_VENDOR_Intel:
4218 4218                  /*
4219 4219                   * On KVM we know we will have proper support for extended
4220 4220                   * cpuid.
4221 4221                   */
4222 4222                  if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4223 4223                      (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4224 4224                      (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4225 4225                          xcpuid++;
4226 4226                  break;
4227 4227          case X86_VENDOR_AMD:
4228 4228                  if (cpi->cpi_family > 5 ||
4229 4229                      (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4230 4230                          xcpuid++;
4231 4231                  break;
4232 4232          case X86_VENDOR_Cyrix:
4233 4233                  /*
4234 4234                   * Only these Cyrix CPUs are -known- to support
4235 4235                   * extended cpuid operations.
4236 4236                   */
4237 4237                  if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4238 4238                      x86_type == X86_TYPE_CYRIX_GXm)
4239 4239                          xcpuid++;
4240 4240                  break;
4241 4241          case X86_VENDOR_HYGON:
4242 4242          case X86_VENDOR_Centaur:
4243 4243          case X86_VENDOR_TM:
4244 4244          default:
4245 4245                  xcpuid++;
4246 4246                  break;
4247 4247          }
4248 4248  
4249 4249          if (xcpuid) {
4250 4250                  cp = &cpi->cpi_extd[0];
4251 4251                  cp->cp_eax = CPUID_LEAF_EXT_0;
4252 4252                  cpi->cpi_xmaxeax = __cpuid_insn(cp);
4253 4253          }
4254 4254  
4255 4255          if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4256 4256  
4257 4257                  if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4258 4258                          cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4259 4259  
4260 4260                  switch (cpi->cpi_vendor) {
4261 4261                  case X86_VENDOR_Intel:
4262 4262                  case X86_VENDOR_AMD:
4263 4263                  case X86_VENDOR_HYGON:
4264 4264                          if (cpi->cpi_xmaxeax < 0x80000001)
4265 4265                                  break;
4266 4266                          cp = &cpi->cpi_extd[1];
4267 4267                          cp->cp_eax = 0x80000001;
4268 4268                          (void) __cpuid_insn(cp);
4269 4269  
4270 4270                          if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4271 4271                              cpi->cpi_family == 5 &&
4272 4272                              cpi->cpi_model == 6 &&
4273 4273                              cpi->cpi_step == 6) {
4274 4274                                  /*
4275 4275                                   * K6 model 6 uses bit 10 to indicate SYSC.
4276 4276                                   * Later models use bit 11. Fix it here.
4277 4277                                   */
4278 4278                                  if (cp->cp_edx & 0x400) {
4279 4279                                          cp->cp_edx &= ~0x400;
4280 4280                                          cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4281 4281                                  }
4282 4282                          }
4283 4283  
4284 4284                          platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4285 4285  
4286 4286                          /*
4287 4287                           * Compute the additions to the kernel's feature word.
4288 4288                           */
4289 4289                          if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4290 4290                                  add_x86_feature(featureset, X86FSET_NX);
4291 4291                          }
4292 4292  
4293 4293                          /*
4294 4294                          * Regardless of whether or not we boot 64-bit,
4295 4295                           * we should have a way to identify whether
4296 4296                           * the CPU is capable of running 64-bit.
4297 4297                           */
4298 4298                          if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4299 4299                                  add_x86_feature(featureset, X86FSET_64);
4300 4300                          }
4301 4301  
4302 4302                          /* 1 GB large page - enable only for 64 bit kernel */
4303 4303                          if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4304 4304                                  add_x86_feature(featureset, X86FSET_1GPG);
4305 4305                          }
4306 4306  
4307 4307                          if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4308 4308                              cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4309 4309                              (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4310 4310                              (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4311 4311                                  add_x86_feature(featureset, X86FSET_SSE4A);
4312 4312                          }
4313 4313  
4314 4314                          /*
4315 4315                           * It's really tricky to support syscall/sysret in
4316 4316                           * the i386 kernel; we rely on sysenter/sysexit
4317 4317                           * instead.  In the amd64 kernel, things are -way-
4318 4318                           * better.
4319 4319                           */
4320 4320                          if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4321 4321                                  add_x86_feature(featureset, X86FSET_ASYSC);
4322 4322                          }
4323 4323  
4324 4324                          /*
4325 4325                           * While we're thinking about system calls, note
4326 4326                           * that AMD processors don't support sysenter
4327 4327                           * in long mode at all, so don't try to program them.
4328 4328                           */
4329 4329                          if (x86_vendor == X86_VENDOR_AMD ||
4330 4330                              x86_vendor == X86_VENDOR_HYGON) {
4331 4331                                  remove_x86_feature(featureset, X86FSET_SEP);
4332 4332                          }
4333 4333  
4334 4334                          if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4335 4335                                  add_x86_feature(featureset, X86FSET_TSCP);
4336 4336                          }
4337 4337  
4338 4338                          if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4339 4339                                  add_x86_feature(featureset, X86FSET_SVM);
4340 4340                          }
4341 4341  
4342 4342                          if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4343 4343                                  add_x86_feature(featureset, X86FSET_TOPOEXT);
4344 4344                          }
4345 4345  
4346 4346                          if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4347 4347                                  add_x86_feature(featureset, X86FSET_AMD_PCEC);
4348 4348                          }
4349 4349  
4350 4350                          if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4351 4351                                  add_x86_feature(featureset, X86FSET_XOP);
4352 4352                          }
4353 4353  
4354 4354                          if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4355 4355                                  add_x86_feature(featureset, X86FSET_FMA4);
4356 4356                          }
4357 4357  
4358 4358                          if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4359 4359                                  add_x86_feature(featureset, X86FSET_TBM);
4360 4360                          }
4361 4361  
4362 4362                          if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4363 4363                                  add_x86_feature(featureset, X86FSET_MONITORX);
4364 4364                          }
4365 4365                          break;
4366 4366                  default:
4367 4367                          break;
4368 4368                  }
4369 4369  
4370 4370                  /*
4371 4371                   * Get CPUID data about processor cores and hyperthreads.
4372 4372                   */
4373 4373                  switch (cpi->cpi_vendor) {
4374 4374                  case X86_VENDOR_Intel:
4375 4375                          if (cpi->cpi_maxeax >= 4) {
4376 4376                                  cp = &cpi->cpi_std[4];
4377 4377                                  cp->cp_eax = 4;
4378 4378                                  cp->cp_ecx = 0;
4379 4379                                  (void) __cpuid_insn(cp);
4380 4380                                  platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4381 4381                          }
4382 4382                          /*FALLTHROUGH*/
4383 4383                  case X86_VENDOR_AMD:
4384 4384                  case X86_VENDOR_HYGON:
4385 4385                          if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4386 4386                                  break;
4387 4387                          cp = &cpi->cpi_extd[8];
4388 4388                          cp->cp_eax = CPUID_LEAF_EXT_8;
4389 4389                          (void) __cpuid_insn(cp);
4390 4390                          platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4391 4391                              cp);
4392 4392  
4393 4393                          /*
4394 4394                           * AMD uses ebx for some extended functions.
4395 4395                           */
4396 4396                          if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4397 4397                              cpi->cpi_vendor == X86_VENDOR_HYGON) {
4398 4398                                  /*
4399 4399                                   * While we're here, check for the AMD "Error
4400 4400                                   * Pointer Zero/Restore" feature. This can be
4401 4401                                   * used to set up the FP save handlers
4402 4402                                   * appropriately.
4403 4403                                   */
4404 4404                                  if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4405 4405                                          cpi->cpi_fp_amd_save = 0;
4406 4406                                  } else {
4407 4407                                          cpi->cpi_fp_amd_save = 1;
4408 4408                                  }
4409 4409  
4410 4410                                  if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4411 4411                                          add_x86_feature(featureset,
4412 4412                                              X86FSET_CLZERO);
4413 4413                                  }
4414 4414                          }
4415 4415  
4416 4416                          /*
4417 4417                           * Virtual and physical address limits from
4418 4418                           * cpuid override previously guessed values.
4419 4419                           */
4420 4420                          cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4421 4421                          cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4422 4422                          break;
4423 4423                  default:
4424 4424                          break;
4425 4425                  }
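
Leaf 0x80000008 packs the physical address width into EAX[7:0] and the linear (virtual) width into EAX[15:8]; the two BITX() extractions above mirror that layout. A hedged userland sketch of the same decode, guarding on the maximum extended leaf first (the cpuid() helper is illustrative):

    #include <stdio.h>
    #include <stdint.h>

    static void
    cpuid(uint32_t leaf, uint32_t r[4])
    {
            __asm__ __volatile__("cpuid"
                : "=a" (r[0]), "=b" (r[1]), "=c" (r[2]), "=d" (r[3])
                : "a" (leaf), "c" (0));
    }

    int
    main(void)
    {
            uint32_t r[4];

            cpuid(0x80000000u, r);          /* EAX = maximum extended leaf */
            if (r[0] < 0x80000008u)
                    return (1);
            cpuid(0x80000008u, r);
            printf("physical address bits: %u\n", r[0] & 0xff);
            printf("virtual address bits:  %u\n", (r[0] >> 8) & 0xff);
            return (0);
    }
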
4426 4426  
4427 4427                  /*
4428 4428                   * Get CPUID data about TSC Invariance in Deep C-State.
4429 4429                   */
4430 4430                  switch (cpi->cpi_vendor) {
4431 4431                  case X86_VENDOR_Intel:
4432 4432                  case X86_VENDOR_AMD:
4433 4433                  case X86_VENDOR_HYGON:
4434 4434                          if (cpi->cpi_maxeax >= 7) {
4435 4435                                  cp = &cpi->cpi_extd[7];
4436 4436                                  cp->cp_eax = 0x80000007;
4437 4437                                  cp->cp_ecx = 0;
4438 4438                                  (void) __cpuid_insn(cp);
4439 4439                          }
4440 4440                          break;
4441 4441                  default:
4442 4442                          break;
4443 4443                  }
4444 4444          }
4445 4445  
4446 4446          /*
4447 4447           * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4448 4448           * run and thus gathered some of its dependent leaves.
4449 4449           */
4450 4450          cpuid_basic_topology(cpu, featureset);
4451 4451          cpuid_basic_thermal(cpu, featureset);
4452 4452  #if !defined(__xpv)
4453 4453          cpuid_basic_ppin(cpu, featureset);
4454 4454  #endif
4455 4455  
4456 4456          if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4457 4457              cpi->cpi_vendor == X86_VENDOR_HYGON) {
4458 4458                  if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4459 4459                      cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4460 4460                          /* Special handling for AMD FP not necessary. */
4461 4461                          cpi->cpi_fp_amd_save = 0;
4462 4462                  } else {
4463 4463                          cpi->cpi_fp_amd_save = 1;
4464 4464                  }
4465 4465          }
4466 4466  
4467 4467          /*
4468 4468           * Check (and potentially set) if lfence is serializing.
4469 4469           * This is useful for accurate rdtsc measurements and AMD retpolines.
4470 4470           */
4471 4471          if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4472 4472              cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4473 4473              is_x86_feature(featureset, X86FSET_SSE2)) {
4474 4474                  /*
4475 4475                   * The AMD white paper Software Techniques For Managing
4476 4476                   * Speculation on AMD Processors details circumstances for when
4477 4477                   * lfence instructions are serializing.
4478 4478                   *
4479 4479                   * On family 0xf and 0x11, it is inherently so.  On family 0x10
4480 4480                   * and later (excluding 0x11), a bit in the DE_CFG MSR
4481 4481                   * determines the lfence behavior.  Per that whitepaper, AMD has
4482 4482                   * committed to supporting that MSR on all later CPUs.
4483 4483                   */
4484 4484                  if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4485 4485                          add_x86_feature(featureset, X86FSET_LFENCE_SER);
4486 4486                  } else if (cpi->cpi_family >= 0x10) {
4487 4487  #if !defined(__xpv)
4488 4488                          uint64_t val;
4489 4489  
4490 4490                          /*
4491 4491                           * Be careful when attempting to enable the bit, and
4492 4492                           * verify that it was actually set in case we are
4493 4493                           * running in a hypervisor which is less than faithful
4494 4494                           * about its emulation of this feature.
4495 4495                           */
4496 4496                          on_trap_data_t otd;
4497 4497                          if (!on_trap(&otd, OT_DATA_ACCESS)) {
4498 4498                                  val = rdmsr(MSR_AMD_DE_CFG);
4499 4499                                  val |= AMD_DE_CFG_LFENCE_DISPATCH;
4500 4500                                  wrmsr(MSR_AMD_DE_CFG, val);
4501 4501                                  val = rdmsr(MSR_AMD_DE_CFG);
4502 4502                          } else {
4503 4503                                  val = 0;
4504 4504                          }
4505 4505                          no_trap();
4506 4506  
4507 4507                          if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4508 4508                                  add_x86_feature(featureset, X86FSET_LFENCE_SER);
4509 4509                          }
4510 4510  #endif
4511 4511                  }
4512 4512          } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4513 4513              is_x86_feature(featureset, X86FSET_SSE2)) {
4514 4514                  /*
4515 4515                   * Documentation and other OSes indicate that lfence is always
4516 4516                   * serializing on Intel CPUs.
4517 4517                   */
4518 4518                  add_x86_feature(featureset, X86FSET_LFENCE_SER);
4519 4519          }
4520 4520  
4521 4521  
4522 4522          /*
4523 4523           * Check the processor leaves that are used for security features. Grab
4524 4524           * any additional processor-specific leaves that we may not have yet.
4525 4525           */
4526 4526          switch (cpi->cpi_vendor) {
4527 4527          case X86_VENDOR_AMD:
4528 4528          case X86_VENDOR_HYGON:
4529 4529                  if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4530 4530                          cp = &cpi->cpi_extd[7];
4531 4531                          cp->cp_eax = CPUID_LEAF_EXT_21;
4532 4532                          cp->cp_ecx = 0;
4533 4533                          (void) __cpuid_insn(cp);
4534 4534                  }
4535 4535                  break;
4536 4536          default:
4537 4537                  break;
4538 4538          }
4539 4539  
4540 4540          cpuid_scan_security(cpu, featureset);
4541 4541  }
4542 4542  
4543 4543  /*
4544 4544   * Make copies of the cpuid table entries we depend on, in
4545 4545   * part for ease of parsing now, in part so that we have only
4546 4546   * one place to correct any of it, in part for ease of
4547 4547   * later export to userland, and in part so we can look at
4548 4548   * this stuff in a crash dump.
4549 4549   */
4550 4550  
4551 4551  static void
4552 4552  cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4553 4553  {
4554 4554          uint_t n, nmax;
4555 4555          int i;
4556 4556          struct cpuid_regs *cp;
4557 4557          uint8_t *dp;
4558 4558          uint32_t *iptr;
4559 4559          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4560 4560  
4561 4561          if (cpi->cpi_maxeax < 1)
4562 4562                  return;
4563 4563  
4564 4564          if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4565 4565                  nmax = NMAX_CPI_STD;
4566 4566          /*
4567 4567           * (We already handled n == 0 and n == 1 in the basic pass)
4568 4568           */
4569 4569          for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4570 4570                  /*
4571 4571                   * leaves 6 and 7 were handled in the basic pass
4572 4572                   */
4573 4573                  if (n == 6 || n == 7)
4574 4574                          continue;
4575 4575  
4576 4576                  cp->cp_eax = n;
4577 4577  
4578 4578                  /*
4579 4579                   * CPUID function 4 expects %ecx to be initialized
4580 4580                   * with an index which indicates which cache to return
4581 4581                   * information about. The OS is expected to call function 4
4582 4582                   * with %ecx set to 0, 1, 2, ... until it returns with
4583 4583                   * EAX[4:0] set to 0, which indicates there are no more
4584 4584                   * caches.
4585 4585                   *
4586 4586                   * Here, populate cpi_std[4] with the information returned by
4587 4587                   * function 4 when %ecx == 0, and do the rest in a later pass
4588 4588                   * when dynamic memory allocation becomes available.
4589 4589                   *
4590 4590                   * Note: we need to explicitly initialize %ecx here, since
4591 4591                   * function 4 may have been previously invoked.
4592 4592                   */
4593 4593                  if (n == 4)
4594 4594                          cp->cp_ecx = 0;
4595 4595  
4596 4596                  (void) __cpuid_insn(cp);
4597 4597                  platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4598 4598                  switch (n) {
4599 4599                  case 2:
4600 4600                          /*
4601 4601                           * "the lower 8 bits of the %eax register
4602 4602                           * contain a value that identifies the number
4603 4603                           * of times the cpuid [instruction] has to be
4604 4604                           * executed to obtain a complete image of the
4605 4605                           * processor's caching systems."
4606 4606                           *
4607 4607                           * How *do* they make this stuff up?
4608 4608                           */
4609 4609                          cpi->cpi_ncache = sizeof (*cp) *
4610 4610                              BITX(cp->cp_eax, 7, 0);
4611 4611                          if (cpi->cpi_ncache == 0)
4612 4612                                  break;
4613 4613                          cpi->cpi_ncache--;      /* skip count byte */
4614 4614  
4615 4615                          /*
4616 4616                           * Well, for now, rather than attempt to implement
4617 4617                           * this slightly dubious algorithm, we just look
4618 4618                           * at the first 15 ..
4619 4619                           */
4620 4620                          if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4621 4621                                  cpi->cpi_ncache = sizeof (*cp) - 1;
4622 4622  
4623 4623                          dp = cpi->cpi_cacheinfo;
4624 4624                          if (BITX(cp->cp_eax, 31, 31) == 0) {
4625 4625                                  uint8_t *p = (void *)&cp->cp_eax;
4626 4626                                  for (i = 1; i < 4; i++)
4627 4627                                          if (p[i] != 0)
4628 4628                                                  *dp++ = p[i];
4629 4629                          }
4630 4630                          if (BITX(cp->cp_ebx, 31, 31) == 0) {
4631 4631                                  uint8_t *p = (void *)&cp->cp_ebx;
4632 4632                                  for (i = 0; i < 4; i++)
4633 4633                                          if (p[i] != 0)
4634 4634                                                  *dp++ = p[i];
4635 4635                          }
4636 4636                          if (BITX(cp->cp_ecx, 31, 31) == 0) {
4637 4637                                  uint8_t *p = (void *)&cp->cp_ecx;
4638 4638                                  for (i = 0; i < 4; i++)
4639 4639                                          if (p[i] != 0)
4640 4640                                                  *dp++ = p[i];
4641 4641                          }
4642 4642                          if (BITX(cp->cp_edx, 31, 31) == 0) {
4643 4643                                  uint8_t *p = (void *)&cp->cp_edx;
4644 4644                                  for (i = 0; i < 4; i++)
4645 4645                                          if (p[i] != 0)
4646 4646                                                  *dp++ = p[i];
4647 4647                          }
4648 4648                          break;
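
Leaf 2 hands back cache and TLB descriptors one byte at a time: AL holds the number of times CPUID.02H must be executed, a register whose bit 31 is set carries no descriptors, and every remaining nonzero byte names an entry in Intel's descriptor tables. The loop above skips AL (the "count byte") in %eax and harvests the rest. A standalone sketch of that walk, assuming AL == 1 as on the hardware this code cares about; descriptor meanings come from Intel's tables and are not decoded here, and the cpuid_leaf2() helper is illustrative:

    #include <stdio.h>
    #include <stdint.h>

    static void
    cpuid_leaf2(uint32_t r[4])
    {
            __asm__ __volatile__("cpuid"
                : "=a" (r[0]), "=b" (r[1]), "=c" (r[2]), "=d" (r[3])
                : "a" (2), "c" (0));
    }

    int
    main(void)
    {
            uint32_t r[4];
            int reg, byte;

            cpuid_leaf2(r);
            for (reg = 0; reg < 4; reg++) {
                    if (r[reg] & (1u << 31))        /* no descriptors here */
                            continue;
                    /* Byte 0 of %eax is the count byte, not a descriptor. */
                    for (byte = (reg == 0) ? 1 : 0; byte < 4; byte++) {
                            uint8_t d = (r[reg] >> (8 * byte)) & 0xff;
                            if (d != 0)
                                    printf("descriptor 0x%02x\n", d);
                    }
            }
            return (0);
    }
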
4649 4649  
4650 4650                  case 3: /* Processor serial number, if PSN supported */
4651 4651                          break;
4652 4652  
4653 4653                  case 4: /* Deterministic cache parameters */
4654 4654                          break;
4655 4655  
4656 4656                  case 5: /* Monitor/Mwait parameters */
4657 4657                  {
4658 4658                          size_t mwait_size;
4659 4659  
4660 4660                          /*
4661 4661                           * check cpi_mwait.support, which was set in
4662 4662                           * cpuid_pass_basic()
4663 4663                           */
4664 4664                          if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4665 4665                                  break;
4666 4666  
4667 4667                          /*
4668 4668                           * Protect ourselves from an insane mwait line size.
4669 4669                           * Workaround for incomplete hardware emulator(s).
4670 4670                           */
4671 4671                          mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4672 4672                          if (mwait_size < sizeof (uint32_t) ||
4673 4673                              !ISP2(mwait_size)) {
4674 4674  #if DEBUG
4675 4675                                  cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4676 4676                                      "size %ld", cpu->cpu_id, (long)mwait_size);
4677 4677  #endif
4678 4678                                  break;
4679 4679                          }
4680 4680  
4681 4681                          cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4682 4682                          cpi->cpi_mwait.mon_max = mwait_size;
4683 4683                          if (MWAIT_EXTENSION(cpi)) {
4684 4684                                  cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4685 4685                                  if (MWAIT_INT_ENABLE(cpi))
4686 4686                                          cpi->cpi_mwait.support |=
4687 4687                                              MWAIT_ECX_INT_ENABLE;
4688 4688                          }
4689 4689                          break;
4690 4690                  }
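
Leaf 5 reports the smallest monitor-line size in EAX[15:0] and the largest in EBX[15:0], while ECX bit 0 advertises the MONITOR/MWAIT extensions and bit 1 the ability to treat interrupts as break events even when masked; those are the fields the MWAIT_SIZE_MIN/MAX, MWAIT_EXTENSION and MWAIT_INT_ENABLE macros pick apart. A hedged sketch of the raw decode, not the kernel's accessors (the cpuid() helper is illustrative):

    #include <stdio.h>
    #include <stdint.h>

    static void
    cpuid(uint32_t leaf, uint32_t r[4])
    {
            __asm__ __volatile__("cpuid"
                : "=a" (r[0]), "=b" (r[1]), "=c" (r[2]), "=d" (r[3])
                : "a" (leaf), "c" (0));
    }

    int
    main(void)
    {
            uint32_t r[4];

            cpuid(5, r);
            printf("monitor line size: min %u, max %u bytes\n",
                r[0] & 0xffff, r[1] & 0xffff);
            printf("extensions: %s, interrupts as break events: %s\n",
                (r[2] & 0x1) ? "yes" : "no", (r[2] & 0x2) ? "yes" : "no");
            return (0);
    }
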
4691 4691                  default:
4692 4692                          break;
4693 4693                  }
4694 4694          }
4695 4695  
4696 4696          /*
4697 4697           * XSAVE enumeration
4698 4698           */
4699 4699          if (cpi->cpi_maxeax >= 0xD) {
4700 4700                  struct cpuid_regs regs;
4701 4701                  boolean_t cpuid_d_valid = B_TRUE;
4702 4702  
4703 4703                  cp = &regs;
4704 4704                  cp->cp_eax = 0xD;
4705 4705                  cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4706 4706  
4707 4707                  (void) __cpuid_insn(cp);
4708 4708  
4709 4709                  /*
4710 4710                   * Sanity checks for debug
4711 4711                   */
4712 4712                  if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4713 4713                      (cp->cp_eax & XFEATURE_SSE) == 0) {
4714 4714                          cpuid_d_valid = B_FALSE;
4715 4715                  }
4716 4716  
4717 4717                  cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4718 4718                  cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4719 4719                  cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4720 4720  
4721 4721                  /*
4722 4722                   * If the hw supports AVX, get the size and offset in the save
4723 4723                   * area for the ymm state.
4724 4724                   */
4725 4725                  if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4726 4726                          cp->cp_eax = 0xD;
4727 4727                          cp->cp_ecx = 2;
4728 4728                          cp->cp_edx = cp->cp_ebx = 0;
4729 4729  
4730 4730                          (void) __cpuid_insn(cp);
4731 4731  
4732 4732                          if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4733 4733                              cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4734 4734                                  cpuid_d_valid = B_FALSE;
4735 4735                          }
4736 4736  
4737 4737                          cpi->cpi_xsave.ymm_size = cp->cp_eax;
4738 4738                          cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4739 4739                  }
4740 4740  
4741 4741                  /*
4742 4742                   * If the hw supports MPX, get the size and offset in the
4743 4743                   * save area for BNDREGS and BNDCSR.
4744 4744                   */
4745 4745                  if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4746 4746                          cp->cp_eax = 0xD;
4747 4747                          cp->cp_ecx = 3;
4748 4748                          cp->cp_edx = cp->cp_ebx = 0;
4749 4749  
4750 4750                          (void) __cpuid_insn(cp);
4751 4751  
4752 4752                          cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4753 4753                          cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4754 4754  
4755 4755                          cp->cp_eax = 0xD;
4756 4756                          cp->cp_ecx = 4;
4757 4757                          cp->cp_edx = cp->cp_ebx = 0;
4758 4758  
4759 4759                          (void) __cpuid_insn(cp);
4760 4760  
4761 4761                          cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4762 4762                          cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4763 4763                  }
4764 4764  
4765 4765                  /*
4766 4766                   * If the hw supports AVX512, get the size and offset in the
4767 4767                   * save area for the opmask registers and zmm state.
4768 4768                   */
4769 4769                  if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4770 4770                          cp->cp_eax = 0xD;
4771 4771                          cp->cp_ecx = 5;
4772 4772                          cp->cp_edx = cp->cp_ebx = 0;
4773 4773  
4774 4774                          (void) __cpuid_insn(cp);
4775 4775  
4776 4776                          cpi->cpi_xsave.opmask_size = cp->cp_eax;
4777 4777                          cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4778 4778  
4779 4779                          cp->cp_eax = 0xD;
4780 4780                          cp->cp_ecx = 6;
4781 4781                          cp->cp_edx = cp->cp_ebx = 0;
4782 4782  
4783 4783                          (void) __cpuid_insn(cp);
4784 4784  
4785 4785                          cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4786 4786                          cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4787 4787  
4788 4788                          cp->cp_eax = 0xD;
4789 4789                          cp->cp_ecx = 7;
4790 4790                          cp->cp_edx = cp->cp_ebx = 0;
4791 4791  
4792 4792                          (void) __cpuid_insn(cp);
4793 4793  
4794 4794                          cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4795 4795                          cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4796 4796                  }
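
Each of the sub-leaf queries above follows one architectural pattern: for XSAVE state component i (YMM is component 2, the MPX BNDREGS/BNDCSR pair is 3 and 4, and the AVX-512 opmask/ZMM states are 5 through 7), CPUID.0DH with ECX = i returns the component's size in EAX and its offset within the standard, non-compressed save area in EBX. A compact sketch that loops over whatever components sub-leaf 0 advertises; the cpuid() helper is illustrative:

    #include <stdio.h>
    #include <stdint.h>

    static void
    cpuid(uint32_t leaf, uint32_t subleaf, uint32_t r[4])
    {
            __asm__ __volatile__("cpuid"
                : "=a" (r[0]), "=b" (r[1]), "=c" (r[2]), "=d" (r[3])
                : "a" (leaf), "c" (subleaf));
    }

    int
    main(void)
    {
            uint32_t r[4];
            uint64_t features;
            int i;

            cpuid(0xd, 0, r);
            features = ((uint64_t)r[3] << 32) | r[0];   /* EDX:EAX mask */

            /* Components 0 (x87) and 1 (SSE) live in the legacy area. */
            for (i = 2; i < 63; i++) {
                    if (!(features & (1ull << i)))
                            continue;
                    cpuid(0xd, i, r);
                    printf("component %d: size %u, offset %u\n",
                        i, r[0], r[1]);
            }
            return (0);
    }
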
4797 4797  
4798 4798                  if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4799 4799                          xsave_state_size = 0;
4800 4800                  } else if (cpuid_d_valid) {
4801 4801                          xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4802 4802                  } else {
4803 4803                          /* Broken CPUID 0xD, probably in HVM */
4804 4804                          cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4805 4805                              "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4806 4806                              ", ymm_size = %d, ymm_offset = %d\n",
4807 4807                              cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4808 4808                              cpi->cpi_xsave.xsav_hw_features_high,
4809 4809                              (int)cpi->cpi_xsave.xsav_max_size,
4810 4810                              (int)cpi->cpi_xsave.ymm_size,
4811 4811                              (int)cpi->cpi_xsave.ymm_offset);
4812 4812  
4813 4813                          if (xsave_state_size != 0) {
4814 4814                                  /*
4815 4815                                   * This must be a non-boot CPU. We cannot
4816 4816                                   * continue, because the boot CPU has already
4817 4817                                   * enabled XSAVE.
4818 4818                                   */
4819 4819                                  ASSERT(cpu->cpu_id != 0);
4820 4820                                  cmn_err(CE_PANIC, "cpu%d: we have already "
4821 4821                                      "enabled XSAVE on boot cpu, cannot "
4822 4822                                      "continue.", cpu->cpu_id);
4823 4823                          } else {
4824 4824                                  /*
4825 4825                                   * If we reached here on the boot CPU, it's also
4826 4826                                   * almost certain that we'll reach here on the
4827 4827                                   * non-boot CPUs. When we're here on a boot CPU
4828 4828                                   * we should disable the feature; on a non-boot
4829 4829                                   * CPU we need to confirm that we have.
4830 4830                                   */
4831 4831                                  if (cpu->cpu_id == 0) {
4832 4832                                          remove_x86_feature(x86_featureset,
4833 4833                                              X86FSET_XSAVE);
4834 4834                                          remove_x86_feature(x86_featureset,
4835 4835                                              X86FSET_AVX);
4836 4836                                          remove_x86_feature(x86_featureset,
4837 4837                                              X86FSET_F16C);
4838 4838                                          remove_x86_feature(x86_featureset,
4839 4839                                              X86FSET_BMI1);
4840 4840                                          remove_x86_feature(x86_featureset,
4841 4841                                              X86FSET_BMI2);
4842 4842                                          remove_x86_feature(x86_featureset,
4843 4843                                              X86FSET_FMA);
4844 4844                                          remove_x86_feature(x86_featureset,
4845 4845                                              X86FSET_AVX2);
4846 4846                                          remove_x86_feature(x86_featureset,
4847 4847                                              X86FSET_MPX);
4848 4848                                          remove_x86_feature(x86_featureset,
4849 4849                                              X86FSET_AVX512F);
4850 4850                                          remove_x86_feature(x86_featureset,
4851 4851                                              X86FSET_AVX512DQ);
4852 4852                                          remove_x86_feature(x86_featureset,
4853 4853                                              X86FSET_AVX512PF);
4854 4854                                          remove_x86_feature(x86_featureset,
4855 4855                                              X86FSET_AVX512ER);
4856 4856                                          remove_x86_feature(x86_featureset,
4857 4857                                              X86FSET_AVX512CD);
4858 4858                                          remove_x86_feature(x86_featureset,
4859 4859                                              X86FSET_AVX512BW);
4860 4860                                          remove_x86_feature(x86_featureset,
4861 4861                                              X86FSET_AVX512VL);
4862 4862                                          remove_x86_feature(x86_featureset,
4863 4863                                              X86FSET_AVX512FMA);
4864 4864                                          remove_x86_feature(x86_featureset,
4865 4865                                              X86FSET_AVX512VBMI);
4866 4866                                          remove_x86_feature(x86_featureset,
4867 4867                                              X86FSET_AVX512VNNI);
4868 4868                                          remove_x86_feature(x86_featureset,
4869 4869                                              X86FSET_AVX512VPOPCDQ);
4870 4870                                          remove_x86_feature(x86_featureset,
4871 4871                                              X86FSET_AVX512NNIW);
4872 4872                                          remove_x86_feature(x86_featureset,
4873 4873                                              X86FSET_AVX512FMAPS);
4874 4874                                          remove_x86_feature(x86_featureset,
4875 4875                                              X86FSET_VAES);
4876 4876                                          remove_x86_feature(x86_featureset,
4877 4877                                              X86FSET_VPCLMULQDQ);
4878 4878                                          remove_x86_feature(x86_featureset,
4879 4879                                              X86FSET_GFNI);
4880 4880                                          remove_x86_feature(x86_featureset,
4881 4881                                              X86FSET_AVX512_VP2INT);
4882 4882                                          remove_x86_feature(x86_featureset,
4883 4883                                              X86FSET_AVX512_BITALG);
4884 4884                                          remove_x86_feature(x86_featureset,
4885 4885                                              X86FSET_AVX512_VBMI2);
4886 4886                                          remove_x86_feature(x86_featureset,
4887 4887                                              X86FSET_AVX512_BF16);
4888 4888  
4889 4889                                          xsave_force_disable = B_TRUE;
4890 4890                                  } else {
4891 4891                                          VERIFY(is_x86_feature(x86_featureset,
4892 4892                                              X86FSET_XSAVE) == B_FALSE);
4893 4893                                  }
4894 4894                          }
4895 4895                  }
4896 4896          }
4897 4897  
4898 4898  
4899 4899          if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4900 4900                  return;
4901 4901  
4902 4902          if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4903 4903                  nmax = NMAX_CPI_EXTD;
4904 4904          /*
4905 4905           * Copy the extended properties, fixing them as we go. We start at 2
4906 4906           * because leaves 0 and 1 were handled in the basic pass; the few others
4907 4907           * that pass also touched (e.g. 0x8, 0x21) are simply grabbed again here.
4908 4908           */
4909 4909          iptr = (void *)cpi->cpi_brandstr;
4910 4910          for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4911 4911                  cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4912 4912                  (void) __cpuid_insn(cp);
4913 4913                  platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4914 4914                      cp);
4915 4915                  switch (n) {
4916 4916                  case 2:
4917 4917                  case 3:
4918 4918                  case 4:
4919 4919                          /*
4920 4920                           * Extract the brand string
4921 4921                           */
4922 4922                          *iptr++ = cp->cp_eax;
4923 4923                          *iptr++ = cp->cp_ebx;
4924 4924                          *iptr++ = cp->cp_ecx;
4925 4925                          *iptr++ = cp->cp_edx;
4926 4926                          break;
4927 4927                  case 5:
4928 4928                          switch (cpi->cpi_vendor) {
4929 4929                          case X86_VENDOR_AMD:
4930 4930                                  /*
4931 4931                                   * The Athlon and Duron were the first
4932 4932                                   * parts to report the sizes of the
4933 4933                                   * TLB for large pages. Before then,
4934 4934                                   * we don't trust the data.
4935 4935                                   */
4936 4936                                  if (cpi->cpi_family < 6 ||
4937 4937                                      (cpi->cpi_family == 6 &&
4938 4938                                      cpi->cpi_model < 1))
4939 4939                                          cp->cp_eax = 0;
4940 4940                                  break;
4941 4941                          default:
4942 4942                                  break;
4943 4943                          }
4944 4944                          break;
4945 4945                  case 6:
4946 4946                          switch (cpi->cpi_vendor) {
4947 4947                          case X86_VENDOR_AMD:
4948 4948                                  /*
4949 4949                                   * The Athlon and Duron were the first
4950 4950                                   * AMD parts with L2 TLBs.
4951 4951                                   * Before then, don't trust the data.
4952 4952                                   */
4953 4953                                  if (cpi->cpi_family < 6 ||
4954 4954                                      (cpi->cpi_family == 6 &&
4955 4955                                      cpi->cpi_model < 1))
4956 4956                                          cp->cp_eax = cp->cp_ebx = 0;
4957 4957                                  /*
4958 4958                                   * AMD Duron rev A0 reports L2
4959 4959                                   * cache size incorrectly as 1K
4960 4960                                   * when it is really 64K
4961 4961                                   */
4962 4962                                  if (cpi->cpi_family == 6 &&
4963 4963                                      cpi->cpi_model == 3 &&
4964 4964                                      cpi->cpi_step == 0) {
4965 4965                                          cp->cp_ecx &= 0xffff;
4966 4966                                          cp->cp_ecx |= 0x400000;
4967 4967                                  }
4968 4968                                  break;
4969 4969                          case X86_VENDOR_Cyrix:  /* VIA C3 */
4970 4970                                  /*
4971 4971                                   * VIA C3 processors are a bit messed
4972 4972                                   * up w.r.t. encoding cache sizes in %ecx
4973 4973                                   */
4974 4974                                  if (cpi->cpi_family != 6)
4975 4975                                          break;
4976 4976                                  /*
4977 4977                                   * models 7 and 8 were incorrectly encoded
4978 4978                                   *
4979 4979                                   * xxx is model 8 really broken?
4980 4980                                   */
4981 4981                                  if (cpi->cpi_model == 7 ||
4982 4982                                      cpi->cpi_model == 8)
4983 4983                                          cp->cp_ecx =
4984 4984                                              BITX(cp->cp_ecx, 31, 24) << 16 |
4985 4985                                              BITX(cp->cp_ecx, 23, 16) << 12 |
4986 4986                                              BITX(cp->cp_ecx, 15, 8) << 8 |
4987 4987                                              BITX(cp->cp_ecx, 7, 0);
4988 4988                                  /*
4989 4989                                   * model 9 stepping 1 has wrong associativity
4990 4990                                   */
4991 4991                                  if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4992 4992                                          cp->cp_ecx |= 8 << 12;
4993 4993                                  break;
4994 4994                          case X86_VENDOR_Intel:
4995 4995                                  /*
4996 4996                                   * Extended L2 Cache features function.
4997 4997                                   * First appeared on Prescott.
4998 4998                                   */
4999 4999                          default:
5000 5000                                  break;
5001 5001                          }
5002 5002                          break;
5003 5003                  default:
5004 5004                          break;
5005 5005                  }
5006 5006          }
5007 5007  }
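
The brand-string arm (cases 2 through 4 above) relies on leaves 0x80000002 through 0x80000004 each returning sixteen bytes of the 48-byte processor brand string in EAX, EBX, ECX, EDX order, which is exactly the order the iptr walk stores them into cpi_brandstr. A minimal userland sketch of the same assembly, assuming those extended leaves exist; the cpuid() helper is illustrative:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    static void
    cpuid(uint32_t leaf, uint32_t r[4])
    {
            __asm__ __volatile__("cpuid"
                : "=a" (r[0]), "=b" (r[1]), "=c" (r[2]), "=d" (r[3])
                : "a" (leaf), "c" (0));
    }

    int
    main(void)
    {
            char brand[49];
            uint32_t r[4];
            int i;

            memset(brand, 0, sizeof (brand));
            for (i = 0; i < 3; i++) {
                    cpuid(0x80000002u + i, r);
                    memcpy(brand + 16 * i, r, 16);  /* EAX..EDX per leaf */
            }
            printf("%s\n", brand);
            return (0);
    }
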
5008 5008  
5009 5009  static const char *
5010 5010  intel_cpubrand(const struct cpuid_info *cpi)
5011 5011  {
5012 5012          int i;
5013 5013  
5014 5014          ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5015 5015  
5016 5016          switch (cpi->cpi_family) {
5017 5017          case 5:
5018 5018                  return ("Intel Pentium(r)");
5019 5019          case 6:
5020 5020                  switch (cpi->cpi_model) {
5021 5021                          uint_t celeron, xeon;
5022 5022                          const struct cpuid_regs *cp;
5023 5023                  case 0:
5024 5024                  case 1:
5025 5025                  case 2:
5026 5026                          return ("Intel Pentium(r) Pro");
5027 5027                  case 3:
5028 5028                  case 4:
5029 5029                          return ("Intel Pentium(r) II");
5030 5030                  case 6:
5031 5031                          return ("Intel Celeron(r)");
5032 5032                  case 5:
5033 5033                  case 7:
5034 5034                          celeron = xeon = 0;
5035 5035                          cp = &cpi->cpi_std[2];  /* cache info */
5036 5036  
5037 5037                          for (i = 1; i < 4; i++) {
5038 5038                                  uint_t tmp;
5039 5039  
5040 5040                                  tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5041 5041                                  if (tmp == 0x40)
5042 5042                                          celeron++;
5043 5043                                  if (tmp >= 0x44 && tmp <= 0x45)
5044 5044                                          xeon++;
5045 5045                          }
5046 5046  
5047 5047                          for (i = 0; i < 2; i++) {
5048 5048                                  uint_t tmp;
5049 5049  
5050 5050                                  tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5051 5051                                  if (tmp == 0x40)
5052 5052                                          celeron++;
5053 5053                                  else if (tmp >= 0x44 && tmp <= 0x45)
5054 5054                                          xeon++;
5055 5055                          }
5056 5056  
5057 5057                          for (i = 0; i < 4; i++) {
5058 5058                                  uint_t tmp;
5059 5059  
5060 5060                                  tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5061 5061                                  if (tmp == 0x40)
5062 5062                                          celeron++;
5063 5063                                  else if (tmp >= 0x44 && tmp <= 0x45)
5064 5064                                          xeon++;
5065 5065                          }
5066 5066  
5067 5067                          for (i = 0; i < 4; i++) {
5068 5068                                  uint_t tmp;
5069 5069  
5070 5070                                  tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5071 5071                                  if (tmp == 0x40)
5072 5072                                          celeron++;
5073 5073                                  else if (tmp >= 0x44 && tmp <= 0x45)
5074 5074                                          xeon++;
5075 5075                          }
5076 5076  
5077 5077                          if (celeron)
5078 5078                                  return ("Intel Celeron(r)");
5079 5079                          if (xeon)
5080 5080                                  return (cpi->cpi_model == 5 ?
5081 5081                                      "Intel Pentium(r) II Xeon(tm)" :
5082 5082                                      "Intel Pentium(r) III Xeon(tm)");
5083 5083                          return (cpi->cpi_model == 5 ?
5084 5084                              "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5085 5085                              "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5086 5086                  default:
5087 5087                          break;
5088 5088                  }
5089 5089          default:
5090 5090                  break;
5091 5091          }
5092 5092  
5093 5093          /* BrandID is present if the field is nonzero */
5094 5094          if (cpi->cpi_brandid != 0) {
5095 5095                  static const struct {
5096 5096                          uint_t bt_bid;
5097 5097                          const char *bt_str;
5098 5098                  } brand_tbl[] = {
5099 5099                          { 0x1,  "Intel(r) Celeron(r)" },
5100 5100                          { 0x2,  "Intel(r) Pentium(r) III" },
5101 5101                          { 0x3,  "Intel(r) Pentium(r) III Xeon(tm)" },
5102 5102                          { 0x4,  "Intel(r) Pentium(r) III" },
5103 5103                          { 0x6,  "Mobile Intel(r) Pentium(r) III" },
5104 5104                          { 0x7,  "Mobile Intel(r) Celeron(r)" },
5105 5105                          { 0x8,  "Intel(r) Pentium(r) 4" },
5106 5106                          { 0x9,  "Intel(r) Pentium(r) 4" },
5107 5107                          { 0xa,  "Intel(r) Celeron(r)" },
5108 5108                          { 0xb,  "Intel(r) Xeon(tm)" },
5109 5109                          { 0xc,  "Intel(r) Xeon(tm) MP" },
5110 5110                          { 0xe,  "Mobile Intel(r) Pentium(r) 4" },
5111 5111                          { 0xf,  "Mobile Intel(r) Celeron(r)" },
5112 5112                          { 0x11, "Mobile Genuine Intel(r)" },
5113 5113                          { 0x12, "Intel(r) Celeron(r) M" },
5114 5114                          { 0x13, "Mobile Intel(r) Celeron(r)" },
5115 5115                          { 0x14, "Intel(r) Celeron(r)" },
5116 5116                          { 0x15, "Mobile Genuine Intel(r)" },
5117 5117                          { 0x16, "Intel(r) Pentium(r) M" },
5118 5118                          { 0x17, "Mobile Intel(r) Celeron(r)" }
5119 5119                  };
5120 5120                  uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5121 5121                  uint_t sgn;
5122 5122  
5123 5123                  sgn = (cpi->cpi_family << 8) |
5124 5124                      (cpi->cpi_model << 4) | cpi->cpi_step;
5125 5125  
5126 5126                  for (i = 0; i < btblmax; i++)
5127 5127                          if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5128 5128                                  break;
5129 5129                  if (i < btblmax) {
5130 5130                          if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5131 5131                                  return ("Intel(r) Celeron(r)");
5132 5132                          if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5133 5133                                  return ("Intel(r) Xeon(tm) MP");
5134 5134                          if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5135 5135                                  return ("Intel(r) Xeon(tm)");
5136 5136                          return (brand_tbl[i].bt_str);
5137 5137                  }
5138 5138          }
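
As a worked example of the signature check above: a family 6, model 0xb, stepping 1 part gives sgn = (6 << 8) | (0xb << 4) | 1 = 0x6b1, so if it also reports brand ID 3 the table's "Intel(r) Pentium(r) III Xeon(tm)" entry is overridden and "Intel(r) Celeron(r)" is returned instead. The two sgn < 0xf13 cases similarly remap brand IDs 0xb and 0xe on early Pentium 4 signatures.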
5139 5139  
5140 5140          return (NULL);
5141 5141  }
5142 5142  
5143 5143  static const char *
5144 5144  amd_cpubrand(const struct cpuid_info *cpi)
5145 5145  {
5146 5146          ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5147 5147  
5148 5148          switch (cpi->cpi_family) {
5149 5149          case 5:
5150 5150                  switch (cpi->cpi_model) {
5151 5151                  case 0:
5152 5152                  case 1:
5153 5153                  case 2:
5154 5154                  case 3:
5155 5155                  case 4:
5156 5156                  case 5:
5157 5157                          return ("AMD-K5(r)");
5158 5158                  case 6:
5159 5159                  case 7:
5160 5160                          return ("AMD-K6(r)");
5161 5161                  case 8:
5162 5162                          return ("AMD-K6(r)-2");
5163 5163                  case 9:
5164 5164                          return ("AMD-K6(r)-III");
5165 5165                  default:
5166 5166                          return ("AMD (family 5)");
5167 5167                  }
5168 5168          case 6:
5169 5169                  switch (cpi->cpi_model) {
5170 5170                  case 1:
5171 5171                          return ("AMD-K7(tm)");
5172 5172                  case 0:
5173 5173                  case 2:
5174 5174                  case 4:
5175 5175                          return ("AMD Athlon(tm)");
5176 5176                  case 3:
5177 5177                  case 7:
5178 5178                          return ("AMD Duron(tm)");
5179 5179                  case 6:
5180 5180                  case 8:
5181 5181                  case 10:
5182 5182                          /*
5183 5183                           * Use the L2 cache size to distinguish
5184 5184                           */
5185 5185                          return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5186 5186                              "AMD Athlon(tm)" : "AMD Duron(tm)");
5187 5187                  default:
5188 5188                          return ("AMD (family 6)");
5189 5189                  }
5190 5190          default:
5191 5191                  break;
5192 5192          }
5193 5193  
5194 5194          if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5195 5195              cpi->cpi_brandid != 0) {
5196 5196                  switch (BITX(cpi->cpi_brandid, 7, 5)) {
5197 5197                  case 3:
5198 5198                          return ("AMD Opteron(tm) UP 1xx");
5199 5199                  case 4:
5200 5200                          return ("AMD Opteron(tm) DP 2xx");
5201 5201                  case 5:
5202 5202                          return ("AMD Opteron(tm) MP 8xx");
5203 5203                  default:
5204 5204                          return ("AMD Opteron(tm)");
5205 5205                  }
5206 5206          }
5207 5207  
5208 5208          return (NULL);
5209 5209  }
5210 5210  
5211 5211  static const char *
5212 5212  cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5213 5213  {
5214 5214          ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5215 5215  
5216 5216          switch (type) {
5217 5217          case X86_TYPE_CYRIX_6x86:
5218 5218                  return ("Cyrix 6x86");
5219 5219          case X86_TYPE_CYRIX_6x86L:
5220 5220                  return ("Cyrix 6x86L");
5221 5221          case X86_TYPE_CYRIX_6x86MX:
5222 5222                  return ("Cyrix 6x86MX");
5223 5223          case X86_TYPE_CYRIX_GXm:
5224 5224                  return ("Cyrix GXm");
5225 5225          case X86_TYPE_CYRIX_MediaGX:
5226 5226                  return ("Cyrix MediaGX");
5227 5227          case X86_TYPE_CYRIX_MII:
5228 5228                  return ("Cyrix M2");
5229 5229          case X86_TYPE_VIA_CYRIX_III:
5230 5230                  return ("VIA Cyrix M3");
5231 5231          default:
5232 5232                  /*
5233 5233                   * Have another wild guess ..
5234 5234                   */
5235 5235                  if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5236 5236                          return ("Cyrix 5x86");
5237 5237                  else if (cpi->cpi_family == 5) {
5238 5238                          switch (cpi->cpi_model) {
5239 5239                          case 2:
5240 5240                                  return ("Cyrix 6x86");  /* Cyrix M1 */
5241 5241                          case 4:
5242 5242                                  return ("Cyrix MediaGX");
5243 5243                          default:
5244 5244                                  break;
5245 5245                          }
5246 5246                  } else if (cpi->cpi_family == 6) {
5247 5247                          switch (cpi->cpi_model) {
5248 5248                          case 0:
5249 5249                                  return ("Cyrix 6x86MX"); /* Cyrix M2? */
5250 5250                          case 5:
5251 5251                          case 6:
5252 5252                          case 7:
5253 5253                          case 8:
5254 5254                          case 9:
5255 5255                                  return ("VIA C3");
5256 5256                          default:
5257 5257                                  break;
5258 5258                          }
5259 5259                  }
5260 5260                  break;
5261 5261          }
5262 5262          return (NULL);
5263 5263  }
5264 5264  
5265 5265  /*
5266 5266   * This only gets called when the CPU's extended feature brand
5267 5267   * string leaves (0x80000002, 0x80000003, 0x80000004) aren't
5268 5268   * available, or contain null bytes for some reason.
5269 5269   */
5270 5270  static void
5271 5271  fabricate_brandstr(struct cpuid_info *cpi)
5272 5272  {
5273 5273          const char *brand = NULL;
5274 5274  
5275 5275          switch (cpi->cpi_vendor) {
5276 5276          case X86_VENDOR_Intel:
5277 5277                  brand = intel_cpubrand(cpi);
5278 5278                  break;
5279 5279          case X86_VENDOR_AMD:
5280 5280                  brand = amd_cpubrand(cpi);
5281 5281                  break;
5282 5282          case X86_VENDOR_Cyrix:
5283 5283                  brand = cyrix_cpubrand(cpi, x86_type);
5284 5284                  break;
5285 5285          case X86_VENDOR_NexGen:
5286 5286                  if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5287 5287                          brand = "NexGen Nx586";
5288 5288                  break;
5289 5289          case X86_VENDOR_Centaur:
5290 5290                  if (cpi->cpi_family == 5)
5291 5291                          switch (cpi->cpi_model) {
5292 5292                          case 4:
5293 5293                                  brand = "Centaur C6";
5294 5294                                  break;
5295 5295                          case 8:
5296 5296                                  brand = "Centaur C2";
5297 5297                                  break;
5298 5298                          case 9:
5299 5299                                  brand = "Centaur C3";
5300 5300                                  break;
5301 5301                          default:
5302 5302                                  break;
5303 5303                          }
5304 5304                  break;
5305 5305          case X86_VENDOR_Rise:
5306 5306                  if (cpi->cpi_family == 5 &&
5307 5307                      (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5308 5308                          brand = "Rise mP6";
5309 5309                  break;
5310 5310          case X86_VENDOR_SiS:
5311 5311                  if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5312 5312                          brand = "SiS 55x";
5313 5313                  break;
5314 5314          case X86_VENDOR_TM:
5315 5315                  if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5316 5316                          brand = "Transmeta Crusoe TM3x00 or TM5x00";
5317 5317                  break;
5318 5318          case X86_VENDOR_NSC:
5319 5319          case X86_VENDOR_UMC:
5320 5320          default:
5321 5321                  break;
5322 5322          }
5323 5323          if (brand) {
5324 5324                  (void) strcpy((char *)cpi->cpi_brandstr, brand);
5325 5325                  return;
5326 5326          }
5327 5327  
5328 5328          /*
5329 5329           * If all else fails ...
5330 5330           */
5331 5331          (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5332 5332              "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5333 5333              cpi->cpi_model, cpi->cpi_step);
5334 5334  }
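For illustration only (not part of this change), the snprintf() fallback above simply glues the vendor string to the decimal family/model/stepping. A minimal user-space sketch of the same format, using made-up values:

#include <stdio.h>

int
main(void)
{
        char buf[64];

        /* hypothetical values; mirrors the "%s %d.%d.%d" fallback above */
        (void) snprintf(buf, sizeof (buf), "%s %d.%d.%d",
            "AuthenticAMD", 15, 5, 1);
        (void) printf("%s\n", buf);     /* prints "AuthenticAMD 15.5.1" */
        return (0);
}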
5335 5335  
5336 5336  /*
5337 5337   * This routine is called just after kernel memory allocation
5338 5338   * becomes available on cpu0, and as part of mp_startup() on
5339 5339   * the other cpus.
5340 5340   *
5341 5341   * Fixup the brand string, and collect any information from cpuid
5342 5342   * that requires dynamically allocated storage to represent.
5343 5343   */
5344 5344  
5345 5345  static void
5346 5346  cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5347 5347  {
5348 5348          int     i, max, shft, level, size;
5349 5349          struct cpuid_regs regs;
5350 5350          struct cpuid_regs *cp;
5351 5351          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5352 5352  
5353 5353          /*
5354 5354           * Deterministic cache parameters
5355 5355           *
5356 5356           * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5357 5357           * values that are present are currently defined to be the same. This
5358 5358           * means we can use the same logic to parse it as long as we use the
5359 5359           * appropriate leaf to get the data. If you're updating this, make sure
5360 5360           * you're careful about which vendor supports which aspect.
5361 5361           *
5362 5362           * Take this opportunity to detect the number of threads sharing the
5363 5363           * last level cache, and construct a corresponding cache id. The
5364 5364           * respective cpuid_info members are initialized to the default case of
5365 5365           * "no last level cache sharing".
5366 5366           */
5367 5367          cpi->cpi_ncpu_shr_last_cache = 1;
5368 5368          cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5369 5369  
5370 5370          if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5371 5371              ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5372 5372              cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5373 5373              cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5374 5374              is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5375 5375                  uint32_t leaf;
5376 5376  
5377 5377                  if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5378 5378                          leaf = 4;
5379 5379                  } else {
5380 5380                          leaf = CPUID_LEAF_EXT_1d;
5381 5381                  }
5382 5382  
5383 5383                  /*
5384 5384                   * Find the # of elements (size) returned by the leaf and along
5385 5385                   * the way detect last level cache sharing details.
5386 5386                   */
5387 5387                  bzero(&regs, sizeof (regs));
5388 5388                  cp = &regs;
5389 5389                  for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5390 5390                          cp->cp_eax = leaf;
5391 5391                          cp->cp_ecx = i;
5392 5392  
5393 5393                          (void) __cpuid_insn(cp);
5394 5394  
5395 5395                          if (CPI_CACHE_TYPE(cp) == 0)
5396 5396                                  break;
5397 5397                          level = CPI_CACHE_LVL(cp);
5398 5398                          if (level > max) {
5399 5399                                  max = level;
5400 5400                                  cpi->cpi_ncpu_shr_last_cache =
5401 5401                                      CPI_NTHR_SHR_CACHE(cp) + 1;
5402 5402                          }
5403 5403                  }
5404 5404                  cpi->cpi_cache_leaf_size = size = i;
5405 5405  
5406 5406                  /*
5407 5407                   * Allocate the cpi_cache_leaves array. The first element
5408 5408                   * references the regs for the corresponding leaf with %ecx set
5409 5409                   * to 0. This was gathered in cpuid_pass_extended().
5410 5410                   */
5411 5411                  if (size > 0) {
5412 5412                          cpi->cpi_cache_leaves =
5413 5413                              kmem_alloc(size * sizeof (cp), KM_SLEEP);
5414 5414                          if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5415 5415                                  cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5416 5416                          } else {
5417 5417                                  cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5418 5418                          }
5419 5419  
5420 5420                          /*
5421 5421                           * Allocate storage to hold the additional regs
5422 5422                           * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5423 5423                           *
5424 5424                           * The regs for the leaf, %ecx == 0 has already
5425 5425                           * been allocated as indicated above.
5426 5426                           */
5427 5427                          for (i = 1; i < size; i++) {
5428 5428                                  cp = cpi->cpi_cache_leaves[i] =
5429 5429                                      kmem_zalloc(sizeof (regs), KM_SLEEP);
5430 5430                                  cp->cp_eax = leaf;
5431 5431                                  cp->cp_ecx = i;
5432 5432  
5433 5433                                  (void) __cpuid_insn(cp);
5434 5434                          }
5435 5435                  }
5436 5436                  /*
5437 5437                   * Determine the number of bits needed to represent
5438 5438                   * the number of CPUs sharing the last level cache.
5439 5439                   *
5440 5440                   * Shift off that number of bits from the APIC id to
5441 5441                   * derive the cache id.
5442 5442                   */
5443 5443                  shft = 0;
5444 5444                  for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5445 5445                          shft++;
5446 5446                  cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5447 5447          }
5448 5448  
5449 5449          /*
5450 5450           * Now fixup the brand string
5451 5451           */
5452 5452          if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5453 5453                  fabricate_brandstr(cpi);
5454 5454          } else {
5455 5455  
5456 5456                  /*
5457 5457                   * If we successfully extracted a brand string from the cpuid
5458 5458                   * instruction, clean it up by removing leading spaces and
5459 5459                   * similar junk.
5460 5460                   */
5461 5461                  if (cpi->cpi_brandstr[0]) {
5462 5462                          size_t maxlen = sizeof (cpi->cpi_brandstr);
5463 5463                          char *src, *dst;
5464 5464  
5465 5465                          dst = src = (char *)cpi->cpi_brandstr;
5466 5466                          src[maxlen - 1] = '\0';
5467 5467                          /*
5468 5468                           * strip leading spaces
5469 5469                           */
5470 5470                          while (*src == ' ')
5471 5471                                  src++;
5472 5472                          /*
5473 5473                           * Remove any 'Genuine' or "Authentic" prefixes
5474 5474                           */
5475 5475                          if (strncmp(src, "Genuine ", 8) == 0)
5476 5476                                  src += 8;
5477 5477                          if (strncmp(src, "Authentic ", 10) == 0)
5478 5478                                  src += 10;
5479 5479  
5480 5480                          /*
5481 5481                           * Now do an in-place copy.
5482 5482                           * Map (R) to (r) and (TM) to (tm).
5483 5483                           * The era of teletypes is long gone, and there's
5484 5484                           * -really- no need to shout.
5485 5485                           */
5486 5486                          while (*src != '\0') {
5487 5487                                  if (src[0] == '(') {
5488 5488                                          if (strncmp(src + 1, "R)", 2) == 0) {
5489 5489                                                  (void) strncpy(dst, "(r)", 3);
5490 5490                                                  src += 3;
5491 5491                                                  dst += 3;
5492 5492                                                  continue;
5493 5493                                          }
5494 5494                                          if (strncmp(src + 1, "TM)", 3) == 0) {
5495 5495                                                  (void) strncpy(dst, "(tm)", 4);
5496 5496                                                  src += 4;
5497 5497                                                  dst += 4;
5498 5498                                                  continue;
5499 5499                                          }
5500 5500                                  }
5501 5501                                  *dst++ = *src++;
5502 5502                          }
5503 5503                          *dst = '\0';
5504 5504  
5505 5505                          /*
5506 5506                           * Finally, remove any trailing spaces
5507 5507                           */
5508 5508                          while (--dst > cpi->cpi_brandstr)
5509 5509                                  if (*dst == ' ')
5510 5510                                          *dst = '\0';
5511 5511                                  else
5512 5512                                          break;
5513 5513                  } else
5514 5514                          fabricate_brandstr(cpi);
5515 5515          }
5516 5516  }
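The last-level cache id computation above rounds the sharing count up to the next power of two and shifts that many bits off the APIC id. A standalone sketch of the same arithmetic, with hypothetical values:

#include <stdio.h>
#include <stdint.h>

static uint32_t
last_lvl_cacheid(uint32_t apicid, uint32_t ncpu_shr_last_cache)
{
        uint32_t i, shft = 0;

        /* number of bits needed to represent the sharing count */
        for (i = 1; i < ncpu_shr_last_cache; i <<= 1)
                shft++;
        return (apicid >> shft);
}

int
main(void)
{
        /* 12 CPUs sharing the LLC -> shift by 4: APIC id 0x35 -> cache id 0x3 */
        (void) printf("0x%x\n", last_lvl_cacheid(0x35, 12));
        return (0);
}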
5517 5517  
5518 5518  typedef struct {
5519 5519          uint32_t avm_av;
5520 5520          uint32_t avm_feat;
5521 5521  } av_feat_map_t;
5522 5522  
5523 5523  /*
5524 5524   * These arrays are used to map features that we should add based on x86
5525 5525   * features that are present. As a large number depend on kernel features,
5526 5526   * rather than rechecking and clearing CPUID everywhere, we simply map these.
5527 5527   * There is an array of these for each hwcap word. Some features aren't tracked
5528 5528   * in the kernel x86 featureset and that's ok. They will not show up in here.
5529 5529   */
5530 5530  static const av_feat_map_t x86fset_to_av1[] = {
5531 5531          { AV_386_CX8, X86FSET_CX8 },
5532 5532          { AV_386_SEP, X86FSET_SEP },
5533 5533          { AV_386_AMD_SYSC, X86FSET_ASYSC },
5534 5534          { AV_386_CMOV, X86FSET_CMOV },
5535 5535          { AV_386_FXSR, X86FSET_SSE },
5536 5536          { AV_386_SSE, X86FSET_SSE },
5537 5537          { AV_386_SSE2, X86FSET_SSE2 },
5538 5538          { AV_386_SSE3, X86FSET_SSE3 },
5539 5539          { AV_386_CX16, X86FSET_CX16 },
5540 5540          { AV_386_TSCP, X86FSET_TSCP },
5541 5541          { AV_386_AMD_SSE4A, X86FSET_SSE4A },
5542 5542          { AV_386_SSSE3, X86FSET_SSSE3 },
5543 5543          { AV_386_SSE4_1, X86FSET_SSE4_1 },
5544 5544          { AV_386_SSE4_2, X86FSET_SSE4_2 },
5545 5545          { AV_386_AES, X86FSET_AES },
5546 5546          { AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5547 5547          { AV_386_XSAVE, X86FSET_XSAVE },
5548 5548          { AV_386_AVX, X86FSET_AVX },
5549 5549          { AV_386_VMX, X86FSET_VMX },
5550 5550          { AV_386_AMD_SVM, X86FSET_SVM }
5551 5551  };
5552 5552  
5553 5553  static const av_feat_map_t x86fset_to_av2[] = {
5554 5554          { AV_386_2_F16C, X86FSET_F16C },
5555 5555          { AV_386_2_RDRAND, X86FSET_RDRAND },
5556 5556          { AV_386_2_BMI1, X86FSET_BMI1 },
5557 5557          { AV_386_2_BMI2, X86FSET_BMI2 },
5558 5558          { AV_386_2_FMA, X86FSET_FMA },
5559 5559          { AV_386_2_AVX2, X86FSET_AVX2 },
5560 5560          { AV_386_2_ADX, X86FSET_ADX },
5561 5561          { AV_386_2_RDSEED, X86FSET_RDSEED },
5562 5562          { AV_386_2_AVX512F, X86FSET_AVX512F },
5563 5563          { AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5564 5564          { AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5565 5565          { AV_386_2_AVX512PF, X86FSET_AVX512PF },
5566 5566          { AV_386_2_AVX512ER, X86FSET_AVX512ER },
5567 5567          { AV_386_2_AVX512CD, X86FSET_AVX512CD },
5568 5568          { AV_386_2_AVX512BW, X86FSET_AVX512BW },
5569 5569          { AV_386_2_AVX512VL, X86FSET_AVX512VL },
5570 5570          { AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5571 5571          { AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5572 5572          { AV_386_2_SHA, X86FSET_SHA },
5573 5573          { AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5574 5574          { AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5575 5575          { AV_386_2_CLWB, X86FSET_CLWB },
5576 5576          { AV_386_2_MONITORX, X86FSET_MONITORX },
5577 5577          { AV_386_2_CLZERO, X86FSET_CLZERO },
5578 5578          { AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5579 5579          { AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5580 5580          { AV_386_2_VAES, X86FSET_VAES },
5581 5581          { AV_386_2_GFNI, X86FSET_GFNI },
5582 5582          { AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5583 5583          { AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5584 5584  };
5585 5585  
5586 5586  static const av_feat_map_t x86fset_to_av3[] = {
5587 5587          { AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5588 5588          { AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5589 5589  };
5590 5590  
5591 5591  /*
5592 5592   * This routine is called out of bind_hwcap() much later in the life
5593 5593   * of the kernel (post_startup()).  The job of this routine is to resolve
5594 5594   * the hardware feature support and kernel support for those features into
5595 5595   * what we're actually going to tell applications via the aux vector.
5596 5596   *
5597 5597   * Most of the aux vector is derived from the x86_featureset array vector where
5598 5598   * a given feature indicates that an aux vector should be plumbed through. This
5599 5599   * allows the kernel to use one tracking mechanism for these based on whether or
5600 5600   * not it has the required hardware support (most often xsave). Most newer
5601 5601   * features are added there in case we need them in the kernel. Otherwise,
5602 5602   * features are evaluated based on looking at the cpuid features that remain. If
5603 5603   * you find yourself wanting to clear out cpuid features for some reason, they
5604 5604   * should instead be driven by the feature set so we have a consistent view.
5605 5605   */
5606 5606  
5607 5607  static void
5608 5608  cpuid_pass_resolve(cpu_t *cpu, void *arg)
5609 5609  {
5610 5610          uint_t *hwcap_out = (uint_t *)arg;
5611 5611          struct cpuid_info *cpi;
5612 5612          uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5613 5613  
5614 5614          cpi = cpu->cpu_m.mcpu_cpi;
5615 5615  
5616 5616          for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5617 5617                  if (is_x86_feature(x86_featureset,
5618 5618                      x86fset_to_av1[i].avm_feat)) {
5619 5619                          hwcap_flags |= x86fset_to_av1[i].avm_av;
5620 5620                  }
5621 5621          }
5622 5622  
5623 5623          for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5624 5624                  if (is_x86_feature(x86_featureset,
5625 5625                      x86fset_to_av2[i].avm_feat)) {
5626 5626                          hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5627 5627                  }
5628 5628          }
5629 5629  
5630 5630          for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5631 5631                  if (is_x86_feature(x86_featureset,
5632 5632                      x86fset_to_av3[i].avm_feat)) {
5633 5633                          hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
5634 5634                  }
5635 5635          }
5636 5636  
5637 5637          /*
5638 5638           * From here on out we're working through features that don't have
5639 5639           * corresponding kernel feature flags for various reasons that are
5640 5640           * mostly just due to the historical implementation.
5641 5641           */
5642 5642          if (cpi->cpi_maxeax >= 1) {
5643 5643                  uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5644 5644                  uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5645 5645  
5646 5646                  *edx = CPI_FEATURES_EDX(cpi);
5647 5647                  *ecx = CPI_FEATURES_ECX(cpi);
5648 5648  
5649 5649                  /*
5650 5650                   * [no explicit support required beyond x87 fp context]
5651 5651                   */
5652 5652                  if (!fpu_exists)
5653 5653                          *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5654 5654  
5655 5655                  /*
5656 5656                   * Now map the supported feature vector to things that we
5657 5657                   * think userland will care about.
5658 5658                   */
5659 5659                  if (*ecx & CPUID_INTC_ECX_MOVBE)
5660 5660                          hwcap_flags |= AV_386_MOVBE;
5661 5661  
5662 5662                  if (*ecx & CPUID_INTC_ECX_POPCNT)
5663 5663                          hwcap_flags |= AV_386_POPCNT;
5664 5664                  if (*edx & CPUID_INTC_EDX_FPU)
5665 5665                          hwcap_flags |= AV_386_FPU;
5666 5666                  if (*edx & CPUID_INTC_EDX_MMX)
5667 5667                          hwcap_flags |= AV_386_MMX;
5668 5668                  if (*edx & CPUID_INTC_EDX_TSC)
5669 5669                          hwcap_flags |= AV_386_TSC;
5670 5670          }
5671 5671  
5672 5672          /* Detect systems with a potential CPUID limit  */
5673 5673          if (cpi->cpi_vendor == X86_VENDOR_Intel && cpi->cpi_maxeax < 4) {
5674 5674                  cmn_err(CE_NOTE, "CPUID limit detected, "
5675 5675                      "see the CPUID(7D) man page for details\n");
5676 5676          }
5677 5677  
5678 5678          /*
5679 5679           * Check a few miscellaneous features.
5680 5680           */
5681 5681          if (cpi->cpi_xmaxeax < 0x80000001)
5682 5682                  goto resolve_done;
5683 5683  
5684 5684          switch (cpi->cpi_vendor) {
5685 5685                  uint32_t *edx, *ecx;
5686 5686  
5687 5687          case X86_VENDOR_Intel:
5688 5688                  /*
5689 5689                   * Seems like Intel duplicated what was necessary
5690 5690                   * here to make the initial crop of 64-bit OSes work.
5691 5691                   * Hopefully, those are the only "extended" bits
5692 5692                   * they'll add.
5693 5693                   */
5694 5694                  /*FALLTHROUGH*/
5695 5695  
5696 5696          case X86_VENDOR_AMD:
5697 5697          case X86_VENDOR_HYGON:
5698 5698                  edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5699 5699                  ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5700 5700  
5701 5701                  *edx = CPI_FEATURES_XTD_EDX(cpi);
5702 5702                  *ecx = CPI_FEATURES_XTD_ECX(cpi);
5703 5703  
5704 5704                  /*
5705 5705                   * [no explicit support required beyond
5706 5706                   * x87 fp context and exception handlers]
5707 5707                   */
5708 5708                  if (!fpu_exists)
5709 5709                          *edx &= ~(CPUID_AMD_EDX_MMXamd |
5710 5710                              CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5711 5711  
5712 5712                  /*
5713 5713                   * Now map the supported feature vector to
5714 5714                   * things that we think userland will care about.
5715 5715                   */
5716 5716                  if (*edx & CPUID_AMD_EDX_MMXamd)
5717 5717                          hwcap_flags |= AV_386_AMD_MMX;
5718 5718                  if (*edx & CPUID_AMD_EDX_3DNow)
5719 5719                          hwcap_flags |= AV_386_AMD_3DNow;
5720 5720                  if (*edx & CPUID_AMD_EDX_3DNowx)
5721 5721                          hwcap_flags |= AV_386_AMD_3DNowx;
5722 5722  
5723 5723                  switch (cpi->cpi_vendor) {
5724 5724                  case X86_VENDOR_AMD:
5725 5725                  case X86_VENDOR_HYGON:
5726 5726                          if (*ecx & CPUID_AMD_ECX_AHF64)
5727 5727                                  hwcap_flags |= AV_386_AHF;
5728 5728                          if (*ecx & CPUID_AMD_ECX_LZCNT)
5729 5729                                  hwcap_flags |= AV_386_AMD_LZCNT;
5730 5730                          break;
5731 5731  
5732 5732                  case X86_VENDOR_Intel:
5733 5733                          if (*ecx & CPUID_AMD_ECX_LZCNT)
5734 5734                                  hwcap_flags |= AV_386_AMD_LZCNT;
5735 5735                          /*
5736 5736                           * Aarrgh.
5737 5737                           * Intel uses a different bit in the same word.
5738 5738                           */
5739 5739                          if (*ecx & CPUID_INTC_ECX_AHF64)
5740 5740                                  hwcap_flags |= AV_386_AHF;
5741 5741                          break;
5742 5742                  default:
5743 5743                          break;
5744 5744                  }
5745 5745                  break;
5746 5746  
5747 5747          default:
5748 5748                  break;
5749 5749          }
5750 5750  
5751 5751  resolve_done:
5752 5752          if (hwcap_out != NULL) {
5753 5753                  hwcap_out[0] = hwcap_flags;
5754 5754                  hwcap_out[1] = hwcap_flags_2;
5755 5755                  hwcap_out[2] = hwcap_flags_3;
5756 5756          }
5757 5757  }
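For reference, the capability words assembled here are handed to userland via the aux vector, and applications can query them with getisax(2). A minimal sketch, assuming the <sys/auxv.h>/<sys/auxv_386.h> definitions:

#include <sys/types.h>
#include <sys/auxv.h>
#include <stdio.h>

int
main(void)
{
        uint32_t av[3] = { 0 };
        uint_t n = getisax(av, 3);

        /* AV_386_2_AVX2 lives in the second capability word */
        if (n >= 2 && (av[1] & AV_386_2_AVX2) != 0)
                (void) printf("AVX2 advertised via the aux vector\n");
        return (0);
}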
5758 5758  
5759 5759  
5760 5760  /*
5761 5761   * Simulate the cpuid instruction using the data we previously
5762 5762   * captured about this CPU.  We try our best to return the truth
5763 5763   * about the hardware, independently of kernel support.
5764 5764   */
5765 5765  uint32_t
5766 5766  cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5767 5767  {
5768 5768          struct cpuid_info *cpi;
5769 5769          struct cpuid_regs *xcp;
5770 5770  
5771 5771          if (cpu == NULL)
5772 5772                  cpu = CPU;
5773 5773          cpi = cpu->cpu_m.mcpu_cpi;
5774 5774  
5775 5775          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5776 5776  
5777 5777          /*
5778 5778           * CPUID data is cached in two separate places: cpi_std for standard
5779 5779           * CPUID leaves, and cpi_extd for extended CPUID leaves.
5780 5780           */
5781 5781          if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5782 5782                  xcp = &cpi->cpi_std[cp->cp_eax];
5783 5783          } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5784 5784              cp->cp_eax <= cpi->cpi_xmaxeax &&
5785 5785              cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5786 5786                  xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5787 5787          } else {
5788 5788                  /*
5789 5789                   * The caller is asking for data from an input parameter which
5790 5790                   * the kernel has not cached.  In this case we go fetch from
5791 5791                   * the hardware and return the data directly to the user.
5792 5792                   */
5793 5793                  return (__cpuid_insn(cp));
5794 5794          }
5795 5795  
5796 5796          cp->cp_eax = xcp->cp_eax;
5797 5797          cp->cp_ebx = xcp->cp_ebx;
5798 5798          cp->cp_ecx = xcp->cp_ecx;
5799 5799          cp->cp_edx = xcp->cp_edx;
5800 5800          return (cp->cp_eax);
5801 5801  }
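A kernel-context fragment (illustrative only, relying on declarations already in scope in this file and on the DYNAMIC pass having completed) showing how a caller reads the cached leaf 1 data through the simulator rather than executing cpuid directly:

static uint32_t
example_read_leaf1_edx(void)
{
        struct cpuid_regs cr;

        bzero(&cr, sizeof (cr));
        cr.cp_eax = 1;                  /* standard leaf 1: feature flags */
        (void) cpuid_insn(NULL, &cr);   /* NULL selects the current CPU */
        return (cr.cp_edx);
}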
5802 5802  
5803 5803  boolean_t
5804 5804  cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
5805 5805  {
5806 5806          return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5807 5807              cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5808 5808  }
5809 5809  
5810 5810  int
5811 5811  cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5812 5812  {
5813 5813          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5814 5814  
5815 5815          return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5816 5816  }
5817 5817  
5818 5818  int
5819 5819  cpuid_is_cmt(cpu_t *cpu)
5820 5820  {
5821 5821          if (cpu == NULL)
5822 5822                  cpu = CPU;
5823 5823  
5824 5824          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5825 5825  
5826 5826          return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5827 5827  }
5828 5828  
5829 5829  /*
5830 5830   * AMD and Intel both implement the 64-bit variant of the syscall
5831 5831   * instruction (syscallq), so if there's -any- support for syscall,
5832 5832   * cpuid currently says "yes, we support this".
5833 5833   *
5834 5834   * However, Intel decided to -not- implement the 32-bit variant of the
5835 5835   * syscall instruction, so we provide a predicate to allow our caller
5836 5836   * to test that subtlety here.
5837 5837   *
5838 5838   * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5839 5839   *      even in the case where the hardware would in fact support it.
5840 5840   */
5841 5841  /*ARGSUSED*/
5842 5842  int
5843 5843  cpuid_syscall32_insn(cpu_t *cpu)
5844 5844  {
5845 5845          ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
5846 5846  
5847 5847  #if !defined(__xpv)
5848 5848          if (cpu == NULL)
5849 5849                  cpu = CPU;
5850 5850  
5851 5851          /*CSTYLED*/
5852 5852          {
5853 5853                  struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5854 5854  
5855 5855                  if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5856 5856                      cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5857 5857                      cpi->cpi_xmaxeax >= 0x80000001 &&
5858 5858                      (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5859 5859                          return (1);
5860 5860          }
5861 5861  #endif
5862 5862          return (0);
5863 5863  }
5864 5864  
5865 5865  int
5866 5866  cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5867 5867  {
5868 5868          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5869 5869  
5870 5870          static const char fmt[] =
5871 5871              "x86 (%s %X family %d model %d step %d clock %d MHz)";
5872 5872          static const char fmt_ht[] =
5873 5873              "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5874 5874  
5875 5875          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5876 5876  
5877 5877          if (cpuid_is_cmt(cpu))
5878 5878                  return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5879 5879                      cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5880 5880                      cpi->cpi_family, cpi->cpi_model,
5881 5881                      cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5882 5882          return (snprintf(s, n, fmt,
5883 5883              cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5884 5884              cpi->cpi_family, cpi->cpi_model,
5885 5885              cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5886 5886  }
5887 5887  
5888 5888  const char *
5889 5889  cpuid_getvendorstr(cpu_t *cpu)
5890 5890  {
5891 5891          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5892 5892          return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5893 5893  }
5894 5894  
5895 5895  uint_t
5896 5896  cpuid_getvendor(cpu_t *cpu)
5897 5897  {
5898 5898          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5899 5899          return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5900 5900  }
5901 5901  
5902 5902  uint_t
5903 5903  cpuid_getfamily(cpu_t *cpu)
5904 5904  {
5905 5905          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5906 5906          return (cpu->cpu_m.mcpu_cpi->cpi_family);
5907 5907  }
5908 5908  
5909 5909  uint_t
5910 5910  cpuid_getmodel(cpu_t *cpu)
5911 5911  {
5912 5912          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5913 5913          return (cpu->cpu_m.mcpu_cpi->cpi_model);
5914 5914  }
5915 5915  
5916 5916  uint_t
5917 5917  cpuid_get_ncpu_per_chip(cpu_t *cpu)
5918 5918  {
5919 5919          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5920 5920          return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5921 5921  }
5922 5922  
5923 5923  uint_t
5924 5924  cpuid_get_ncore_per_chip(cpu_t *cpu)
5925 5925  {
5926 5926          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5927 5927          return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5928 5928  }
5929 5929  
5930 5930  uint_t
5931 5931  cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5932 5932  {
5933 5933          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5934 5934          return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5935 5935  }
5936 5936  
5937 5937  id_t
5938 5938  cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5939 5939  {
5940 5940          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5941 5941          return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5942 5942  }
5943 5943  
5944 5944  uint_t
5945 5945  cpuid_getstep(cpu_t *cpu)
5946 5946  {
5947 5947          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5948 5948          return (cpu->cpu_m.mcpu_cpi->cpi_step);
5949 5949  }
5950 5950  
5951 5951  uint_t
5952 5952  cpuid_getsig(struct cpu *cpu)
5953 5953  {
5954 5954          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5955 5955          return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5956 5956  }
5957 5957  
5958 5958  uint32_t
5959 5959  cpuid_getchiprev(struct cpu *cpu)
5960 5960  {
5961 5961          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5962 5962          return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5963 5963  }
5964 5964  
5965 5965  const char *
5966 5966  cpuid_getchiprevstr(struct cpu *cpu)
5967 5967  {
5968 5968          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5969 5969          return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5970 5970  }
5971 5971  
5972 5972  uint32_t
5973 5973  cpuid_getsockettype(struct cpu *cpu)
5974 5974  {
5975 5975          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5976 5976          return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5977 5977  }
5978 5978  
5979 5979  const char *
5980 5980  cpuid_getsocketstr(cpu_t *cpu)
5981 5981  {
5982 5982          static const char *socketstr = NULL;
5983 5983          struct cpuid_info *cpi;
5984 5984  
5985 5985          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5986 5986          cpi = cpu->cpu_m.mcpu_cpi;
5987 5987  
5988 5988          /* Assume that socket types are the same across the system */
5989 5989          if (socketstr == NULL)
5990 5990                  socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5991 5991                      cpi->cpi_model, cpi->cpi_step);
5992 5992  
5993 5993  
5994 5994          return (socketstr);
5995 5995  }
5996 5996  
5997 5997  x86_uarchrev_t
5998 5998  cpuid_getuarchrev(cpu_t *cpu)
5999 5999  {
6000 6000          return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
6001 6001  }
6002 6002  
6003 6003  int
6004 6004  cpuid_get_chipid(cpu_t *cpu)
6005 6005  {
6006 6006          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6007 6007  
6008 6008          if (cpuid_is_cmt(cpu))
6009 6009                  return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6010 6010          return (cpu->cpu_id);
6011 6011  }
6012 6012  
6013 6013  id_t
6014 6014  cpuid_get_coreid(cpu_t *cpu)
6015 6015  {
6016 6016          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6017 6017          return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6018 6018  }
6019 6019  
6020 6020  int
6021 6021  cpuid_get_pkgcoreid(cpu_t *cpu)
6022 6022  {
6023 6023          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6024 6024          return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6025 6025  }
6026 6026  
6027 6027  int
6028 6028  cpuid_get_clogid(cpu_t *cpu)
6029 6029  {
6030 6030          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6031 6031          return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6032 6032  }
6033 6033  
6034 6034  int
6035 6035  cpuid_get_cacheid(cpu_t *cpu)
6036 6036  {
6037 6037          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6038 6038          return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6039 6039  }
6040 6040  
6041 6041  uint_t
6042 6042  cpuid_get_procnodeid(cpu_t *cpu)
6043 6043  {
6044 6044          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6045 6045          return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6046 6046  }
6047 6047  
6048 6048  uint_t
6049 6049  cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6050 6050  {
6051 6051          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6052 6052          return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6053 6053  }
6054 6054  
6055 6055  uint_t
6056 6056  cpuid_get_compunitid(cpu_t *cpu)
6057 6057  {
6058 6058          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6059 6059          return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6060 6060  }
6061 6061  
6062 6062  uint_t
6063 6063  cpuid_get_cores_per_compunit(cpu_t *cpu)
6064 6064  {
6065 6065          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6066 6066          return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6067 6067  }
6068 6068  
6069 6069  uint32_t
6070 6070  cpuid_get_apicid(cpu_t *cpu)
6071 6071  {
6072 6072          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6073 6073          if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6074 6074                  return (UINT32_MAX);
6075 6075          } else {
6076 6076                  return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6077 6077          }
6078 6078  }
6079 6079  
6080 6080  void
6081 6081  cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6082 6082  {
6083 6083          struct cpuid_info *cpi;
6084 6084  
6085 6085          if (cpu == NULL)
6086 6086                  cpu = CPU;
6087 6087          cpi = cpu->cpu_m.mcpu_cpi;
6088 6088  
6089 6089          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6090 6090  
6091 6091          if (pabits)
6092 6092                  *pabits = cpi->cpi_pabits;
6093 6093          if (vabits)
6094 6094                  *vabits = cpi->cpi_vabits;
6095 6095  }
6096 6096  
6097 6097  size_t
6098      -cpuid_get_xsave_size()
     6098 +cpuid_get_xsave_size(void)
6099 6099  {
6100 6100          return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6101 6101              sizeof (struct xsave_state)));
6102 6102  }
6103 6103  
6104 6104  /*
     6105 + * Export information about known offsets to the kernel. We only care about
     6106 + * things we have actually enabled support for in %xcr0.
     6107 + */
     6108 +void
     6109 +cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
     6110 +{
     6111 +        size_t size, off;
     6112 +
     6113 +        VERIFY3U(bit & xsave_bv_all, !=, 0);
     6114 +
     6115 +        if (sizep == NULL)
     6116 +                sizep = &size;
     6117 +        if (offp == NULL)
     6118 +                offp = &off;
     6119 +
     6120 +        switch (bit) {
     6121 +        case XFEATURE_LEGACY_FP:
     6122 +        case XFEATURE_SSE:
     6123 +                *sizep = sizeof (struct fxsave_state);
     6124 +                *offp = 0;
     6125 +                break;
     6126 +        case XFEATURE_AVX:
     6127 +                *sizep = cpuid_info0.cpi_xsave.ymm_size;
     6128 +                *offp = cpuid_info0.cpi_xsave.ymm_offset;
     6129 +                break;
     6130 +        case XFEATURE_AVX512_OPMASK:
     6131 +                *sizep = cpuid_info0.cpi_xsave.opmask_size;
     6132 +                *offp = cpuid_info0.cpi_xsave.opmask_offset;
     6133 +                break;
     6134 +        case XFEATURE_AVX512_ZMM:
     6135 +                *sizep = cpuid_info0.cpi_xsave.zmmlo_size;
     6136 +                *offp = cpuid_info0.cpi_xsave.zmmlo_offset;
     6137 +                break;
     6138 +        case XFEATURE_AVX512_HI_ZMM:
     6139 +                *sizep = cpuid_info0.cpi_xsave.zmmhi_size;
     6140 +                *offp = cpuid_info0.cpi_xsave.zmmhi_offset;
     6141 +                break;
     6142 +        default:
     6143 +                panic("asked for unsupported xsave feature: 0x%lx", bit);
     6144 +        }
     6145 +}
     6146 +
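A kernel-context sketch (illustrative only; the buffer argument is hypothetical) of how a consumer might use the new function to locate one component inside an xsave area:

static uint8_t *
example_ymm_area(void *xsave_buf)
{
        size_t size, off;

        /* only valid for features enabled in %xcr0, per the VERIFY above */
        cpuid_get_xsave_info(XFEATURE_AVX, &size, &off);
        return ((uint8_t *)xsave_buf + off);
}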
     6147 +/*
6105 6148   * Return true if the CPUs on this system require 'pointer clearing' for the
6106 6149   * floating point error pointer exception handling. In the past, this has been
6107 6150   * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6108 6151   * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6109 6152   * feature bit and is reflected in the cpi_fp_amd_save member.
6110 6153   */
6111 6154  boolean_t
6112      -cpuid_need_fp_excp_handling()
     6155 +cpuid_need_fp_excp_handling(void)
6113 6156  {
6114 6157          return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6115 6158              cpuid_info0.cpi_fp_amd_save != 0);
6116 6159  }
6117 6160  
6118 6161  /*
6119 6162   * Returns the number of data TLB entries for a corresponding
6120 6163   * pagesize.  If it can't be computed, or isn't known, the
6121 6164   * routine returns zero.  If you ask about an architecturally
6122 6165   * impossible pagesize, the routine will panic (so that the
6123 6166   * hat implementor knows that things are inconsistent.)
6124 6167   */
6125 6168  uint_t
6126 6169  cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6127 6170  {
6128 6171          struct cpuid_info *cpi;
6129 6172          uint_t dtlb_nent = 0;
6130 6173  
6131 6174          if (cpu == NULL)
6132 6175                  cpu = CPU;
6133 6176          cpi = cpu->cpu_m.mcpu_cpi;
6134 6177  
6135 6178          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6136 6179  
6137 6180          /*
6138 6181           * Check the L2 TLB info
6139 6182           */
6140 6183          if (cpi->cpi_xmaxeax >= 0x80000006) {
6141 6184                  struct cpuid_regs *cp = &cpi->cpi_extd[6];
6142 6185  
6143 6186                  switch (pagesize) {
6144 6187  
6145 6188                  case 4 * 1024:
6146 6189                          /*
6147 6190                           * All zero in the top 16 bits of the register
6148 6191                           * indicates a unified TLB. Size is in low 16 bits.
6149 6192                           */
6150 6193                          if ((cp->cp_ebx & 0xffff0000) == 0)
6151 6194                                  dtlb_nent = cp->cp_ebx & 0x0000ffff;
6152 6195                          else
6153 6196                                  dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6154 6197                          break;
6155 6198  
6156 6199                  case 2 * 1024 * 1024:
6157 6200                          if ((cp->cp_eax & 0xffff0000) == 0)
6158 6201                                  dtlb_nent = cp->cp_eax & 0x0000ffff;
6159 6202                          else
6160 6203                                  dtlb_nent = BITX(cp->cp_eax, 27, 16);
6161 6204                          break;
6162 6205  
6163 6206                  default:
6164 6207                          panic("unknown L2 pagesize");
6165 6208                          /*NOTREACHED*/
6166 6209                  }
6167 6210          }
6168 6211  
6169 6212          if (dtlb_nent != 0)
6170 6213                  return (dtlb_nent);
6171 6214  
6172 6215          /*
6173 6216           * No L2 TLB support for this size, try L1.
6174 6217           */
6175 6218          if (cpi->cpi_xmaxeax >= 0x80000005) {
6176 6219                  struct cpuid_regs *cp = &cpi->cpi_extd[5];
6177 6220  
6178 6221                  switch (pagesize) {
6179 6222                  case 4 * 1024:
6180 6223                          dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6181 6224                          break;
6182 6225                  case 2 * 1024 * 1024:
6183 6226                          dtlb_nent = BITX(cp->cp_eax, 23, 16);
6184 6227                          break;
6185 6228                  default:
6186 6229                          panic("unknown L1 d-TLB pagesize");
6187 6230                          /*NOTREACHED*/
6188 6231                  }
6189 6232          }
6190 6233  
6191 6234          return (dtlb_nent);
6192 6235  }
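A kernel-context sketch of the intended use: a caller (e.g. the hat layer sizing its structures, as the comment above suggests) asks for the dTLB entry count per supported pagesize on the current CPU, where zero means the count is unknown:

static void
example_dtlb_sizes(uint_t *dtlb4k, uint_t *dtlb2m)
{
        *dtlb4k = cpuid_get_dtlb_nent(NULL, 4 * 1024);
        *dtlb2m = cpuid_get_dtlb_nent(NULL, 2 * 1024 * 1024);
}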
6193 6236  
6194 6237  /*
6195 6238   * Return 0 if the erratum is not present or not applicable, positive
6196 6239   * if it is, and negative if the status of the erratum is unknown.
6197 6240   *
6198 6241   * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6199 6242   * Processors" #25759, Rev 3.57, August 2005
6200 6243   */
6201 6244  int
6202 6245  cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6203 6246  {
6204 6247          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6205 6248          uint_t eax;
6206 6249  
6207 6250          /*
6208 6251           * Bail out if this CPU isn't an AMD CPU, or if it's
6209 6252           * a legacy (32-bit) AMD CPU.
6210 6253           */
6211 6254          if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6212 6255              cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6213 6256              cpi->cpi_family == 6) {
6214 6257                  return (0);
6215 6258          }
6216 6259  
6217 6260          eax = cpi->cpi_std[1].cp_eax;
6218 6261  
6219 6262  #define SH_B0(eax)      (eax == 0xf40 || eax == 0xf50)
6220 6263  #define SH_B3(eax)      (eax == 0xf51)
6221 6264  #define B(eax)          (SH_B0(eax) || SH_B3(eax))
6222 6265  
6223 6266  #define SH_C0(eax)      (eax == 0xf48 || eax == 0xf58)
6224 6267  
6225 6268  #define SH_CG(eax)      (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6226 6269  #define DH_CG(eax)      (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6227 6270  #define CH_CG(eax)      (eax == 0xf82 || eax == 0xfb2)
6228 6271  #define CG(eax)         (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6229 6272  
6230 6273  #define SH_D0(eax)      (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6231 6274  #define DH_D0(eax)      (eax == 0x10fc0 || eax == 0x10ff0)
6232 6275  #define CH_D0(eax)      (eax == 0x10f80 || eax == 0x10fb0)
6233 6276  #define D0(eax)         (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6234 6277  
6235 6278  #define SH_E0(eax)      (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6236 6279  #define JH_E1(eax)      (eax == 0x20f10)        /* JH8_E0 had 0x20f30 */
6237 6280  #define DH_E3(eax)      (eax == 0x20fc0 || eax == 0x20ff0)
6238 6281  #define SH_E4(eax)      (eax == 0x20f51 || eax == 0x20f71)
6239 6282  #define BH_E4(eax)      (eax == 0x20fb1)
6240 6283  #define SH_E5(eax)      (eax == 0x20f42)
6241 6284  #define DH_E6(eax)      (eax == 0x20ff2 || eax == 0x20fc2)
6242 6285  #define JH_E6(eax)      (eax == 0x20f12 || eax == 0x20f32)
6243 6286  #define EX(eax)         (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6244 6287                              SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6245 6288                              DH_E6(eax) || JH_E6(eax))
6246 6289  
6247 6290  #define DR_AX(eax)      (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6248 6291  #define DR_B0(eax)      (eax == 0x100f20)
6249 6292  #define DR_B1(eax)      (eax == 0x100f21)
6250 6293  #define DR_BA(eax)      (eax == 0x100f2a)
6251 6294  #define DR_B2(eax)      (eax == 0x100f22)
6252 6295  #define DR_B3(eax)      (eax == 0x100f23)
6253 6296  #define RB_C0(eax)      (eax == 0x100f40)
6254 6297  
6255 6298          switch (erratum) {
6256 6299          case 1:
6257 6300                  return (cpi->cpi_family < 0x10);
6258 6301          case 51:        /* what does the asterisk mean? */
6259 6302                  return (B(eax) || SH_C0(eax) || CG(eax));
6260 6303          case 52:
6261 6304                  return (B(eax));
6262 6305          case 57:
6263 6306                  return (cpi->cpi_family <= 0x11);
6264 6307          case 58:
6265 6308                  return (B(eax));
6266 6309          case 60:
6267 6310                  return (cpi->cpi_family <= 0x11);
6268 6311          case 61:
6269 6312          case 62:
6270 6313          case 63:
6271 6314          case 64:
6272 6315          case 65:
6273 6316          case 66:
6274 6317          case 68:
6275 6318          case 69:
6276 6319          case 70:
6277 6320          case 71:
6278 6321                  return (B(eax));
6279 6322          case 72:
6280 6323                  return (SH_B0(eax));
6281 6324          case 74:
6282 6325                  return (B(eax));
6283 6326          case 75:
6284 6327                  return (cpi->cpi_family < 0x10);
6285 6328          case 76:
6286 6329                  return (B(eax));
6287 6330          case 77:
6288 6331                  return (cpi->cpi_family <= 0x11);
6289 6332          case 78:
6290 6333                  return (B(eax) || SH_C0(eax));
6291 6334          case 79:
6292 6335                  return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6293 6336          case 80:
6294 6337          case 81:
6295 6338          case 82:
6296 6339                  return (B(eax));
6297 6340          case 83:
6298 6341                  return (B(eax) || SH_C0(eax) || CG(eax));
6299 6342          case 85:
6300 6343                  return (cpi->cpi_family < 0x10);
6301 6344          case 86:
6302 6345                  return (SH_C0(eax) || CG(eax));
6303 6346          case 88:
6304 6347                  return (B(eax) || SH_C0(eax));
6305 6348          case 89:
6306 6349                  return (cpi->cpi_family < 0x10);
6307 6350          case 90:
6308 6351                  return (B(eax) || SH_C0(eax) || CG(eax));
6309 6352          case 91:
6310 6353          case 92:
6311 6354                  return (B(eax) || SH_C0(eax));
6312 6355          case 93:
6313 6356                  return (SH_C0(eax));
6314 6357          case 94:
6315 6358                  return (B(eax) || SH_C0(eax) || CG(eax));
6316 6359          case 95:
6317 6360                  return (B(eax) || SH_C0(eax));
6318 6361          case 96:
6319 6362                  return (B(eax) || SH_C0(eax) || CG(eax));
6320 6363          case 97:
6321 6364          case 98:
6322 6365                  return (SH_C0(eax) || CG(eax));
6323 6366          case 99:
6324 6367                  return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6325 6368          case 100:
6326 6369                  return (B(eax) || SH_C0(eax));
6327 6370          case 101:
6328 6371          case 103:
6329 6372                  return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6330 6373          case 104:
6331 6374                  return (SH_C0(eax) || CG(eax) || D0(eax));
6332 6375          case 105:
6333 6376          case 106:
6334 6377          case 107:
6335 6378                  return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6336 6379          case 108:
6337 6380                  return (DH_CG(eax));
6338 6381          case 109:
6339 6382                  return (SH_C0(eax) || CG(eax) || D0(eax));
6340 6383          case 110:
6341 6384                  return (D0(eax) || EX(eax));
6342 6385          case 111:
6343 6386                  return (CG(eax));
6344 6387          case 112:
6345 6388                  return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6346 6389          case 113:
6347 6390                  return (eax == 0x20fc0);
6348 6391          case 114:
6349 6392                  return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6350 6393          case 115:
6351 6394                  return (SH_E0(eax) || JH_E1(eax));
6352 6395          case 116:
6353 6396                  return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6354 6397          case 117:
6355 6398                  return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6356 6399          case 118:
6357 6400                  return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6358 6401                      JH_E6(eax));
6359 6402          case 121:
6360 6403                  return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6361 6404          case 122:
6362 6405                  return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6363 6406          case 123:
6364 6407                  return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6365 6408          case 131:
6366 6409                  return (cpi->cpi_family < 0x10);
6367 6410          case 6336786:
6368 6411  
6369 6412                  /*
6370 6413                   * Test for AdvPowerMgmtInfo.TscPStateInvariant
6371 6414                   * if this is a K8 family or newer processor. We're testing for
6372 6415                   * this 'erratum' to determine whether or not we have a constant
6373 6416                   * TSC.
6374 6417                   *
6375 6418                   * Our current fix for this is to disable the C1-Clock ramping.
6376 6419                   * However, this doesn't work on newer processor families nor
6377 6420                   * does it work when virtualized as those devices don't exist.
6378 6421                   */
6379 6422                  if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6380 6423                          return (0);
6381 6424                  }
6382 6425  
6383 6426                  if (CPI_FAMILY(cpi) == 0xf) {
6384 6427                          struct cpuid_regs regs;
6385 6428                          regs.cp_eax = 0x80000007;
6386 6429                          (void) __cpuid_insn(&regs);
6387 6430                          return (!(regs.cp_edx & 0x100));
6388 6431                  }
6389 6432                  return (0);
6390 6433          case 147:
6391 6434                  /*
6392 6435                   * This erratum (K8 #147) is not present on family 10 and newer.
6393 6436                   */
6394 6437                  if (cpi->cpi_family >= 0x10) {
6395 6438                          return (0);
6396 6439                  }
6397 6440                  return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6398 6441                      (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6399 6442  
6400 6443          case 6671130:
6401 6444                  /*
6402 6445                   * check for processors (pre-Shanghai) that do not provide
6403 6446                   * optimal management of 1gb ptes in its tlb.
6404 6447                   */
6405 6448                  return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6406 6449  
6407 6450          case 298:
6408 6451                  return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6409 6452                      DR_B2(eax) || RB_C0(eax));
6410 6453  
6411 6454          case 721:
6412 6455                  return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6413 6456  
6414 6457          default:
6415 6458                  return (-1);
6416 6459  
6417 6460          }
6418 6461  }
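A worked example of the revision test under case 147 above, using an illustrative eax value rather than any particular shipping part:

    eax = 0x20f32  (family 0xf, ext-model 2, model 3, stepping 2)
    ((eax >> 12) & 0xff00) + (eax & 0xf00)    = 0x0000 + 0x0f00 = 0x0f00
    ((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0) = 0x03 | 0x20     = 0x23
    composite                                 = 0x0f00 | 0x23   = 0xf23

Since 0xf23 < 0xf40, the function reports the erratum as present for this revision.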
6419 6462  
6420 6463  /*
6421 6464   * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6422 6465   * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6423 6466   */
6424 6467  int
6425 6468  osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6426 6469  {
6427 6470          struct cpuid_info       *cpi;
6428 6471          uint_t                  osvwid;
6429 6472          static int              osvwfeature = -1;
6430 6473          uint64_t                osvwlength;
6431 6474  
6432 6475  
6433 6476          cpi = cpu->cpu_m.mcpu_cpi;
6434 6477  
6435 6478          /* confirm OSVW supported */
6436 6479          if (osvwfeature == -1) {
6437 6480                  osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6438 6481          } else {
6439 6482                  /* assert that osvw feature setting is consistent on all cpus */
6440 6483                  ASSERT(osvwfeature ==
6441 6484                      (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6442 6485          }
6443 6486          if (!osvwfeature)
6444 6487                  return (-1);
6445 6488  
6446 6489          osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6447 6490  
6448 6491          switch (erratum) {
6449 6492          case 298:       /* osvwid is 0 */
6450 6493                  osvwid = 0;
6451 6494                  if (osvwlength <= (uint64_t)osvwid) {
6452 6495                          /* osvwid 0 is unknown */
6453 6496                          return (-1);
6454 6497                  }
6455 6498  
6456 6499                  /*
6457 6500                   * Check the OSVW STATUS MSR to determine the state
6458 6501                   * of the erratum where:
6459 6502                   *   0 - fixed by HW
6460 6503                   *   1 - BIOS has applied the workaround when BIOS
6461 6504                   *   workaround is available. (Or for other errata,
6462 6505                   *   OS workaround is required.)
6463 6506                   * For a value of 1, caller will confirm that the
6464 6507                   * erratum 298 workaround has indeed been applied by BIOS.
6465 6508                   *
6466 6509                   * A 1 may be set in cpus that have a HW fix
6467 6510                   * in a mixed cpu system. Regarding erratum 298:
6468 6511                   *   In a multiprocessor platform, the workaround above
6469 6512                   *   should be applied to all processors regardless of
6470 6513                   *   silicon revision when an affected processor is
6471 6514                   *   present.
6472 6515                   */
6473 6516  
6474 6517                  return (rdmsr(MSR_AMD_OSVW_STATUS +
6475 6518                      (osvwid / OSVW_ID_CNT_PER_MSR)) &
6476 6519                      (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6477 6520  
6478 6521          default:
6479 6522                  return (-1);
6480 6523          }
6481 6524  }
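To illustrate the id-to-MSR mapping used in the OSVW lookup above, assuming OSVW_ID_CNT_PER_MSR is 64 (one status bit per erratum id in each 64-bit status MSR):

    osvwid 0   -> MSR_AMD_OSVW_STATUS + 0, bit 0    (the erratum 298 case)
    osvwid 100 -> MSR_AMD_OSVW_STATUS + 1, bit 36   (100 / 64 = 1, 100 % 64 = 36)

A set status bit only means a workaround is required or has been applied; as the comment notes, for erratum 298 the caller still has to confirm whether BIOS applied it.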
6482 6525  
6483 6526  static const char assoc_str[] = "associativity";
6484 6527  static const char line_str[] = "line-size";
6485 6528  static const char size_str[] = "size";
6486 6529  
6487 6530  static void
6488 6531  add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6489 6532      uint32_t val)
6490 6533  {
6491 6534          char buf[128];
6492 6535  
6493 6536          /*
6494 6537           * ndi_prop_update_int() is used because it is desirable for
6495 6538           * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6496 6539           */
6497 6540          if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6498 6541                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6499 6542  }
6500 6543  
6501 6544  /*
6502 6545   * Intel-style cache/tlb description
6503 6546   *
6504 6547   * Standard cpuid level 2 gives a randomly ordered
6505 6548   * selection of tags that index into a table that describes
6506 6549   * cache and tlb properties.
6507 6550   */
6508 6551  
6509 6552  static const char l1_icache_str[] = "l1-icache";
6510 6553  static const char l1_dcache_str[] = "l1-dcache";
6511 6554  static const char l2_cache_str[] = "l2-cache";
6512 6555  static const char l3_cache_str[] = "l3-cache";
6513 6556  static const char itlb4k_str[] = "itlb-4K";
6514 6557  static const char dtlb4k_str[] = "dtlb-4K";
6515 6558  static const char itlb2M_str[] = "itlb-2M";
6516 6559  static const char itlb4M_str[] = "itlb-4M";
6517 6560  static const char dtlb4M_str[] = "dtlb-4M";
6518 6561  static const char dtlb24_str[] = "dtlb0-2M-4M";
6519 6562  static const char itlb424_str[] = "itlb-4K-2M-4M";
6520 6563  static const char itlb24_str[] = "itlb-2M-4M";
6521 6564  static const char dtlb44_str[] = "dtlb-4K-4M";
6522 6565  static const char sl1_dcache_str[] = "sectored-l1-dcache";
6523 6566  static const char sl2_cache_str[] = "sectored-l2-cache";
6524 6567  static const char itrace_str[] = "itrace-cache";
6525 6568  static const char sl3_cache_str[] = "sectored-l3-cache";
6526 6569  static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6527 6570  
6528 6571  static const struct cachetab {
6529 6572          uint8_t         ct_code;
6530 6573          uint8_t         ct_assoc;
6531 6574          uint16_t        ct_line_size;
6532 6575          size_t          ct_size;
6533 6576          const char      *ct_label;
6534 6577  } intel_ctab[] = {
6535 6578          /*
6536 6579           * maintain descending order!
6537 6580           *
6538 6581           * Codes ignored - Reason
6539 6582           * ----------------------
6540 6583           * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6541 6584           * f0H/f1H - Currently we do not interpret prefetch size by design
6542 6585           */
6543 6586          { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6544 6587          { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6545 6588          { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6546 6589          { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6547 6590          { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6548 6591          { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6549 6592          { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6550 6593          { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6551 6594          { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6552 6595          { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6553 6596          { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6554 6597          { 0xd0, 4, 64, 512*1024, l3_cache_str},
6555 6598          { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6556 6599          { 0xc0, 4, 0, 8, dtlb44_str },
6557 6600          { 0xba, 4, 0, 64, dtlb4k_str },
6558 6601          { 0xb4, 4, 0, 256, dtlb4k_str },
6559 6602          { 0xb3, 4, 0, 128, dtlb4k_str },
6560 6603          { 0xb2, 4, 0, 64, itlb4k_str },
6561 6604          { 0xb0, 4, 0, 128, itlb4k_str },
6562 6605          { 0x87, 8, 64, 1024*1024, l2_cache_str},
6563 6606          { 0x86, 4, 64, 512*1024, l2_cache_str},
6564 6607          { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6565 6608          { 0x84, 8, 32, 1024*1024, l2_cache_str},
6566 6609          { 0x83, 8, 32, 512*1024, l2_cache_str},
6567 6610          { 0x82, 8, 32, 256*1024, l2_cache_str},
6568 6611          { 0x80, 8, 64, 512*1024, l2_cache_str},
6569 6612          { 0x7f, 2, 64, 512*1024, l2_cache_str},
6570 6613          { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6571 6614          { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6572 6615          { 0x7b, 8, 64, 512*1024, sl2_cache_str},
6573 6616          { 0x7a, 8, 64, 256*1024, sl2_cache_str},
6574 6617          { 0x79, 8, 64, 128*1024, sl2_cache_str},
6575 6618          { 0x78, 8, 64, 1024*1024, l2_cache_str},
6576 6619          { 0x73, 8, 0, 64*1024, itrace_str},
6577 6620          { 0x72, 8, 0, 32*1024, itrace_str},
6578 6621          { 0x71, 8, 0, 16*1024, itrace_str},
6579 6622          { 0x70, 8, 0, 12*1024, itrace_str},
6580 6623          { 0x68, 4, 64, 32*1024, sl1_dcache_str},
6581 6624          { 0x67, 4, 64, 16*1024, sl1_dcache_str},
6582 6625          { 0x66, 4, 64, 8*1024, sl1_dcache_str},
6583 6626          { 0x60, 8, 64, 16*1024, sl1_dcache_str},
6584 6627          { 0x5d, 0, 0, 256, dtlb44_str},
6585 6628          { 0x5c, 0, 0, 128, dtlb44_str},
6586 6629          { 0x5b, 0, 0, 64, dtlb44_str},
6587 6630          { 0x5a, 4, 0, 32, dtlb24_str},
6588 6631          { 0x59, 0, 0, 16, dtlb4k_str},
6589 6632          { 0x57, 4, 0, 16, dtlb4k_str},
6590 6633          { 0x56, 4, 0, 16, dtlb4M_str},
6591 6634          { 0x55, 0, 0, 7, itlb24_str},
6592 6635          { 0x52, 0, 0, 256, itlb424_str},
6593 6636          { 0x51, 0, 0, 128, itlb424_str},
6594 6637          { 0x50, 0, 0, 64, itlb424_str},
6595 6638          { 0x4f, 0, 0, 32, itlb4k_str},
6596 6639          { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6597 6640          { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6598 6641          { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6599 6642          { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6600 6643          { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6601 6644          { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6602 6645          { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6603 6646          { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6604 6647          { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6605 6648          { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6606 6649          { 0x44, 4, 32, 1024*1024, l2_cache_str},
6607 6650          { 0x43, 4, 32, 512*1024, l2_cache_str},
6608 6651          { 0x42, 4, 32, 256*1024, l2_cache_str},
6609 6652          { 0x41, 4, 32, 128*1024, l2_cache_str},
6610 6653          { 0x3e, 4, 64, 512*1024, sl2_cache_str},
6611 6654          { 0x3d, 6, 64, 384*1024, sl2_cache_str},
6612 6655          { 0x3c, 4, 64, 256*1024, sl2_cache_str},
6613 6656          { 0x3b, 2, 64, 128*1024, sl2_cache_str},
6614 6657          { 0x3a, 6, 64, 192*1024, sl2_cache_str},
6615 6658          { 0x39, 4, 64, 128*1024, sl2_cache_str},
6616 6659          { 0x30, 8, 64, 32*1024, l1_icache_str},
6617 6660          { 0x2c, 8, 64, 32*1024, l1_dcache_str},
6618 6661          { 0x29, 8, 64, 4096*1024, sl3_cache_str},
6619 6662          { 0x25, 8, 64, 2048*1024, sl3_cache_str},
6620 6663          { 0x23, 8, 64, 1024*1024, sl3_cache_str},
6621 6664          { 0x22, 4, 64, 512*1024, sl3_cache_str},
6622 6665          { 0x0e, 6, 64, 24*1024, l1_dcache_str},
6623 6666          { 0x0d, 4, 32, 16*1024, l1_dcache_str},
6624 6667          { 0x0c, 4, 32, 16*1024, l1_dcache_str},
6625 6668          { 0x0b, 4, 0, 4, itlb4M_str},
6626 6669          { 0x0a, 2, 32, 8*1024, l1_dcache_str},
6627 6670          { 0x08, 4, 32, 16*1024, l1_icache_str},
6628 6671          { 0x06, 4, 32, 8*1024, l1_icache_str},
6629 6672          { 0x05, 4, 0, 32, dtlb4M_str},
6630 6673          { 0x04, 4, 0, 8, dtlb4M_str},
6631 6674          { 0x03, 4, 0, 64, dtlb4k_str},
6632 6675          { 0x02, 4, 0, 2, itlb4M_str},
6633 6676          { 0x01, 4, 0, 32, itlb4k_str},
6634 6677          { 0 }
6635 6678  };
6636 6679  
6637 6680  static const struct cachetab cyrix_ctab[] = {
6638 6681          { 0x70, 4, 0, 32, "tlb-4K" },
6639 6682          { 0x80, 4, 16, 16*1024, "l1-cache" },
6640 6683          { 0 }
6641 6684  };
6642 6685  
6643 6686  /*
6644 6687   * Search a cache table for a matching entry
6645 6688   */
6646 6689  static const struct cachetab *
6647 6690  find_cacheent(const struct cachetab *ct, uint_t code)
6648 6691  {
6649 6692          if (code != 0) {
6650 6693                  for (; ct->ct_code != 0; ct++)
6651 6694                          if (ct->ct_code <= code)
6652 6695                                  break;
6653 6696                  if (ct->ct_code == code)
6654 6697                          return (ct);
6655 6698          }
6656 6699          return (NULL);
6657 6700  }
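A usage sketch of the lookup above (the descriptor value is picked purely for illustration); the descending order that intel_ctab maintains is what lets the scan stop at the first code less than or equal to the one requested:

    const struct cachetab *ct = find_cacheent(intel_ctab, 0x2c);
    /* matches the 0x2c entry: an 8-way, 64-byte line, 32K l1-dcache */

If the scan instead stops on a smaller code, the descriptor is unknown and NULL is returned.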
6658 6701  
6659 6702  /*
6660 6703   * Populate cachetab entry with L2 or L3 cache-information using
6661 6704   * cpuid function 4. This function is called from intel_walk_cacheinfo()
6662 6705   * when descriptor 0x49 is encountered. It returns 0 if no such cache
6663 6706   * information is found.
6664 6707   */
6665 6708  static int
6666 6709  intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6667 6710  {
6668 6711          uint32_t level, i;
6669 6712          int ret = 0;
6670 6713  
6671 6714          for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6672 6715                  level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6673 6716  
6674 6717                  if (level == 2 || level == 3) {
6675 6718                          ct->ct_assoc =
6676 6719                              CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6677 6720                          ct->ct_line_size =
6678 6721                              CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6679 6722                          ct->ct_size = ct->ct_assoc *
6680 6723                              (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6681 6724                              ct->ct_line_size *
6682 6725                              (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6683 6726  
6684 6727                          if (level == 2) {
6685 6728                                  ct->ct_label = l2_cache_str;
6686 6729                          } else if (level == 3) {
6687 6730                                  ct->ct_label = l3_cache_str;
6688 6731                          }
6689 6732                          ret = 1;
6690 6733                  }
6691 6734          }
6692 6735  
6693 6736          return (ret);
6694 6737  }
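A worked example of the leaf 4 size computation above, with illustrative field values rather than output from any specific processor:

    ways       = CPI_CACHE_WAYS(...) + 1       = 8
    partitions = CPI_CACHE_PARTS(...) + 1      = 1
    line size  = CPI_CACHE_COH_LN_SZ(...) + 1  = 64
    sets       = cp_ecx + 1                    = 8192

    ct_size = 8 * 1 * 64 * 8192 = 4194304 bytes (a 4M level-2 or level-3 cache)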
6695 6738  
6696 6739  /*
6697 6740   * Walk the cacheinfo descriptor, applying 'func' to every valid element
6698 6741   * The walk is terminated if the walker returns non-zero.
6699 6742   */
6700 6743  static void
6701 6744  intel_walk_cacheinfo(struct cpuid_info *cpi,
6702 6745      void *arg, int (*func)(void *, const struct cachetab *))
6703 6746  {
6704 6747          const struct cachetab *ct;
6705 6748          struct cachetab des_49_ct, des_b1_ct;
6706 6749          uint8_t *dp;
6707 6750          int i;
6708 6751  
6709 6752          if ((dp = cpi->cpi_cacheinfo) == NULL)
6710 6753                  return;
6711 6754          for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6712 6755                  /*
6713 6756                   * For overloaded descriptor 0x49 we use cpuid function 4
6714 6757                   * if supported by the current processor, to create
6715 6758                   * cache information.
6716 6759                   * For overloaded descriptor 0xb1 we use X86_PAE flag
6717 6760                   * to disambiguate the cache information.
6718 6761                   */
6719 6762                  if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6720 6763                      intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6721 6764                          ct = &des_49_ct;
6722 6765                  } else if (*dp == 0xb1) {
6723 6766                          des_b1_ct.ct_code = 0xb1;
6724 6767                          des_b1_ct.ct_assoc = 4;
6725 6768                          des_b1_ct.ct_line_size = 0;
6726 6769                          if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6727 6770                                  des_b1_ct.ct_size = 8;
6728 6771                                  des_b1_ct.ct_label = itlb2M_str;
6729 6772                          } else {
6730 6773                                  des_b1_ct.ct_size = 4;
6731 6774                                  des_b1_ct.ct_label = itlb4M_str;
6732 6775                          }
6733 6776                          ct = &des_b1_ct;
6734 6777                  } else {
6735 6778                          if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6736 6779                                  continue;
6737 6780                          }
6738 6781                  }
6739 6782  
6740 6783                  if (func(arg, ct) != 0) {
6741 6784                          break;
6742 6785                  }
6743 6786          }
6744 6787  }
6745 6788  
6746 6789  /*
6747 6790   * (Like the Intel one, except for Cyrix CPUs)
6748 6791   */
6749 6792  static void
6750 6793  cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6751 6794      void *arg, int (*func)(void *, const struct cachetab *))
6752 6795  {
6753 6796          const struct cachetab *ct;
6754 6797          uint8_t *dp;
6755 6798          int i;
6756 6799  
6757 6800          if ((dp = cpi->cpi_cacheinfo) == NULL)
6758 6801                  return;
6759 6802          for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6760 6803                  /*
6761 6804                   * Search Cyrix-specific descriptor table first ..
6762 6805                   */
6763 6806                  if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6764 6807                          if (func(arg, ct) != 0)
6765 6808                                  break;
6766 6809                          continue;
6767 6810                  }
6768 6811                  /*
6769 6812                   * .. else fall back to the Intel one
6770 6813                   */
6771 6814                  if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6772 6815                          if (func(arg, ct) != 0)
6773 6816                                  break;
6774 6817                          continue;
6775 6818                  }
6776 6819          }
6777 6820  }
6778 6821  
6779 6822  /*
6780 6823   * A cacheinfo walker that adds associativity, line-size, and size properties
6781 6824   * to the devinfo node it is passed as an argument.
6782 6825   */
6783 6826  static int
6784 6827  add_cacheent_props(void *arg, const struct cachetab *ct)
6785 6828  {
6786 6829          dev_info_t *devi = arg;
6787 6830  
6788 6831          add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6789 6832          if (ct->ct_line_size != 0)
6790 6833                  add_cache_prop(devi, ct->ct_label, line_str,
6791 6834                      ct->ct_line_size);
6792 6835          add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6793 6836          return (0);
6794 6837  }
6795 6838  
6796 6839  
6797 6840  static const char fully_assoc[] = "fully-associative?";
6798 6841  
6799 6842  /*
6800 6843   * AMD style cache/tlb description
6801 6844   *
6802 6845   * Extended functions 5 and 6 directly describe properties of
6803 6846   * tlbs and various cache levels.
6804 6847   */
6805 6848  static void
6806 6849  add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6807 6850  {
6808 6851          switch (assoc) {
6809 6852          case 0: /* reserved; ignore */
6810 6853                  break;
6811 6854          default:
6812 6855                  add_cache_prop(devi, label, assoc_str, assoc);
6813 6856                  break;
6814 6857          case 0xff:
6815 6858                  add_cache_prop(devi, label, fully_assoc, 1);
6816 6859                  break;
6817 6860          }
6818 6861  }
6819 6862  
6820 6863  static void
6821 6864  add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6822 6865  {
6823 6866          if (size == 0)
6824 6867                  return;
6825 6868          add_cache_prop(devi, label, size_str, size);
6826 6869          add_amd_assoc(devi, label, assoc);
6827 6870  }
6828 6871  
6829 6872  static void
6830 6873  add_amd_cache(dev_info_t *devi, const char *label,
6831 6874      uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6832 6875  {
6833 6876          if (size == 0 || line_size == 0)
6834 6877                  return;
6835 6878          add_amd_assoc(devi, label, assoc);
6836 6879          /*
6837 6880           * Most AMD parts have a sectored cache. Multiple cache lines are
6838 6881           * associated with each tag. A sector consists of all cache lines
6839 6882           * associated with a tag. For example, the AMD K6-III has a sector
6840 6883           * size of 2 cache lines per tag.
6841 6884           */
6842 6885          if (lines_per_tag != 0)
6843 6886                  add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6844 6887          add_cache_prop(devi, label, line_str, line_size);
6845 6888          add_cache_prop(devi, label, size_str, size * 1024);
6846 6889  }
6847 6890  
6848 6891  static void
6849 6892  add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6850 6893  {
6851 6894          switch (assoc) {
6852 6895          case 0: /* off */
6853 6896                  break;
6854 6897          case 1:
6855 6898          case 2:
6856 6899          case 4:
6857 6900                  add_cache_prop(devi, label, assoc_str, assoc);
6858 6901                  break;
6859 6902          case 6:
6860 6903                  add_cache_prop(devi, label, assoc_str, 8);
6861 6904                  break;
6862 6905          case 8:
6863 6906                  add_cache_prop(devi, label, assoc_str, 16);
6864 6907                  break;
6865 6908          case 0xf:
6866 6909                  add_cache_prop(devi, label, fully_assoc, 1);
6867 6910                  break;
6868 6911          default: /* reserved; ignore */
6869 6912                  break;
6870 6913          }
6871 6914  }
6872 6915  
6873 6916  static void
6874 6917  add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6875 6918  {
6876 6919          if (size == 0 || assoc == 0)
6877 6920                  return;
6878 6921          add_amd_l2_assoc(devi, label, assoc);
6879 6922          add_cache_prop(devi, label, size_str, size);
6880 6923  }
6881 6924  
6882 6925  static void
6883 6926  add_amd_l2_cache(dev_info_t *devi, const char *label,
6884 6927      uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6885 6928  {
6886 6929          if (size == 0 || assoc == 0 || line_size == 0)
6887 6930                  return;
6888 6931          add_amd_l2_assoc(devi, label, assoc);
6889 6932          if (lines_per_tag != 0)
6890 6933                  add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6891 6934          add_cache_prop(devi, label, line_str, line_size);
6892 6935          add_cache_prop(devi, label, size_str, size * 1024);
6893 6936  }
6894 6937  
6895 6938  static void
6896 6939  amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6897 6940  {
6898 6941          struct cpuid_regs *cp;
6899 6942  
6900 6943          if (cpi->cpi_xmaxeax < 0x80000005)
6901 6944                  return;
6902 6945          cp = &cpi->cpi_extd[5];
6903 6946  
6904 6947          /*
6905 6948           * 4M/2M L1 TLB configuration
6906 6949           *
6907 6950           * We report the size for 2M pages because AMD uses two
6908 6951           * TLB entries for one 4M page.
6909 6952           */
6910 6953          add_amd_tlb(devi, "dtlb-2M",
6911 6954              BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6912 6955          add_amd_tlb(devi, "itlb-2M",
6913 6956              BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6914 6957  
6915 6958          /*
6916 6959           * 4K L1 TLB configuration
6917 6960           */
6918 6961  
6919 6962          switch (cpi->cpi_vendor) {
6920 6963                  uint_t nentries;
6921 6964          case X86_VENDOR_TM:
6922 6965                  if (cpi->cpi_family >= 5) {
6923 6966                          /*
6924 6967                           * Crusoe processors have 256 TLB entries, but
6925 6968                           * cpuid data format constrains them to only
6926 6969                           * reporting 255 of them.
6927 6970                           */
6928 6971                          if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6929 6972                                  nentries = 256;
6930 6973                          /*
6931 6974                           * Crusoe processors also have a unified TLB
6932 6975                           */
6933 6976                          add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6934 6977                              nentries);
6935 6978                          break;
6936 6979                  }
6937 6980                  /*FALLTHROUGH*/
6938 6981          default:
6939 6982                  add_amd_tlb(devi, itlb4k_str,
6940 6983                      BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6941 6984                  add_amd_tlb(devi, dtlb4k_str,
6942 6985                      BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6943 6986                  break;
6944 6987          }
6945 6988  
6946 6989          /*
6947 6990           * data L1 cache configuration
6948 6991           */
6949 6992  
6950 6993          add_amd_cache(devi, l1_dcache_str,
6951 6994              BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6952 6995              BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6953 6996  
6954 6997          /*
6955 6998           * code L1 cache configuration
6956 6999           */
6957 7000  
6958 7001          add_amd_cache(devi, l1_icache_str,
6959 7002              BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6960 7003              BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6961 7004  
6962 7005          if (cpi->cpi_xmaxeax < 0x80000006)
6963 7006                  return;
6964 7007          cp = &cpi->cpi_extd[6];
6965 7008  
6966 7009          /* Check for a unified L2 TLB for large pages */
6967 7010  
6968 7011          if (BITX(cp->cp_eax, 31, 16) == 0)
6969 7012                  add_amd_l2_tlb(devi, "l2-tlb-2M",
6970 7013                      BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6971 7014          else {
6972 7015                  add_amd_l2_tlb(devi, "l2-dtlb-2M",
6973 7016                      BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6974 7017                  add_amd_l2_tlb(devi, "l2-itlb-2M",
6975 7018                      BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6976 7019          }
6977 7020  
6978 7021          /* Check for a unified L2 TLB for 4K pages */
6979 7022  
6980 7023          if (BITX(cp->cp_ebx, 31, 16) == 0) {
6981 7024                  add_amd_l2_tlb(devi, "l2-tlb-4K",
6982 7025                      BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6983 7026          } else {
6984 7027                  add_amd_l2_tlb(devi, "l2-dtlb-4K",
6985 7028                      BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6986 7029                  add_amd_l2_tlb(devi, "l2-itlb-4K",
6987 7030                      BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6988 7031          }
6989 7032  
6990 7033          add_amd_l2_cache(devi, l2_cache_str,
6991 7034              BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6992 7035              BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6993 7036  }
6994 7037  
6995 7038  /*
6996 7039   * There are two basic ways that the x86 world describes its cache
6997 7040   * and tlb architecture - Intel's way and AMD's way.
6998 7041   *
6999 7042   * Return which flavor of cache architecture we should use
7000 7043   */
7001 7044  static int
7002 7045  x86_which_cacheinfo(struct cpuid_info *cpi)
7003 7046  {
7004 7047          switch (cpi->cpi_vendor) {
7005 7048          case X86_VENDOR_Intel:
7006 7049                  if (cpi->cpi_maxeax >= 2)
7007 7050                          return (X86_VENDOR_Intel);
7008 7051                  break;
7009 7052          case X86_VENDOR_AMD:
7010 7053                  /*
7011 7054                   * The K5 model 1 was the first part from AMD that reported
7012 7055                   * cache sizes via extended cpuid functions.
7013 7056                   */
7014 7057                  if (cpi->cpi_family > 5 ||
7015 7058                      (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7016 7059                          return (X86_VENDOR_AMD);
7017 7060                  break;
7018 7061          case X86_VENDOR_HYGON:
7019 7062                  return (X86_VENDOR_AMD);
7020 7063          case X86_VENDOR_TM:
7021 7064                  if (cpi->cpi_family >= 5)
7022 7065                          return (X86_VENDOR_AMD);
7023 7066                  /*FALLTHROUGH*/
7024 7067          default:
7025 7068                  /*
7026 7069                   * If they have extended CPU data for 0x80000005
7027 7070                   * then we assume they have AMD-format cache
7028 7071                   * information.
7029 7072                   *
7030 7073                   * If not, and the vendor happens to be Cyrix,
7031 7074                   * then try our Cyrix-specific handler.
7032 7075                   *
7033 7076                   * If we're not Cyrix, then assume we're using Intel's
7034 7077                   * table-driven format instead.
7035 7078                   */
7036 7079                  if (cpi->cpi_xmaxeax >= 0x80000005)
7037 7080                          return (X86_VENDOR_AMD);
7038 7081                  else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7039 7082                          return (X86_VENDOR_Cyrix);
7040 7083                  else if (cpi->cpi_maxeax >= 2)
7041 7084                          return (X86_VENDOR_Intel);
7042 7085                  break;
7043 7086          }
7044 7087          return (-1);
7045 7088  }
7046 7089  
7047 7090  void
7048 7091  cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7049 7092      struct cpuid_info *cpi)
7050 7093  {
7051 7094          dev_info_t *cpu_devi;
7052 7095          int create;
7053 7096  
7054 7097          cpu_devi = (dev_info_t *)dip;
7055 7098  
7056 7099          /* device_type */
7057 7100          (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7058 7101              "device_type", "cpu");
7059 7102  
7060 7103          /* reg */
7061 7104          (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7062 7105              "reg", cpu_id);
7063 7106  
7064 7107          /* cpu-mhz, and clock-frequency */
7065 7108          if (cpu_freq > 0) {
7066 7109                  long long mul;
7067 7110  
7068 7111                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7069 7112                      "cpu-mhz", cpu_freq);
7070 7113                  if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7071 7114                          (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7072 7115                              "clock-frequency", (int)mul);
7073 7116          }
7074 7117  
7075 7118          ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7076 7119  
7077 7120          /* vendor-id */
7078 7121          (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7079 7122              "vendor-id", cpi->cpi_vendorstr);
7080 7123  
7081 7124          if (cpi->cpi_maxeax == 0) {
7082 7125                  return;
7083 7126          }
7084 7127  
7085 7128          /*
7086 7129           * family, model, and step
7087 7130           */
7088 7131          (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7089 7132              "family", CPI_FAMILY(cpi));
7090 7133          (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7091 7134              "cpu-model", CPI_MODEL(cpi));
7092 7135          (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7093 7136              "stepping-id", CPI_STEP(cpi));
7094 7137  
7095 7138          /* type */
7096 7139          switch (cpi->cpi_vendor) {
7097 7140          case X86_VENDOR_Intel:
7098 7141                  create = 1;
7099 7142                  break;
7100 7143          default:
7101 7144                  create = 0;
7102 7145                  break;
7103 7146          }
7104 7147          if (create)
7105 7148                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7106 7149                      "type", CPI_TYPE(cpi));
7107 7150  
7108 7151          /* ext-family */
7109 7152          switch (cpi->cpi_vendor) {
7110 7153          case X86_VENDOR_Intel:
7111 7154          case X86_VENDOR_AMD:
7112 7155                  create = cpi->cpi_family >= 0xf;
7113 7156                  break;
7114 7157          case X86_VENDOR_HYGON:
7115 7158                  create = 1;
7116 7159                  break;
7117 7160          default:
7118 7161                  create = 0;
7119 7162                  break;
7120 7163          }
7121 7164          if (create)
7122 7165                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7123 7166                      "ext-family", CPI_FAMILY_XTD(cpi));
7124 7167  
7125 7168          /* ext-model */
7126 7169          switch (cpi->cpi_vendor) {
7127 7170          case X86_VENDOR_Intel:
7128 7171                  create = IS_EXTENDED_MODEL_INTEL(cpi);
7129 7172                  break;
7130 7173          case X86_VENDOR_AMD:
7131 7174                  create = CPI_FAMILY(cpi) == 0xf;
7132 7175                  break;
7133 7176          case X86_VENDOR_HYGON:
7134 7177                  create = 1;
7135 7178                  break;
7136 7179          default:
7137 7180                  create = 0;
7138 7181                  break;
7139 7182          }
7140 7183          if (create)
7141 7184                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7142 7185                      "ext-model", CPI_MODEL_XTD(cpi));
7143 7186  
7144 7187          /* generation */
7145 7188          switch (cpi->cpi_vendor) {
7146 7189          case X86_VENDOR_AMD:
7147 7190          case X86_VENDOR_HYGON:
7148 7191                  /*
7149 7192                   * AMD K5 model 1 was the first part to support this
7150 7193                   */
7151 7194                  create = cpi->cpi_xmaxeax >= 0x80000001;
7152 7195                  break;
7153 7196          default:
7154 7197                  create = 0;
7155 7198                  break;
7156 7199          }
7157 7200          if (create)
7158 7201                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7159 7202                      "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7160 7203  
7161 7204          /* brand-id */
7162 7205          switch (cpi->cpi_vendor) {
7163 7206          case X86_VENDOR_Intel:
7164 7207                  /*
7165 7208                   * brand id first appeared on Pentium III Xeon model 8 and
7166 7209                   * Celeron model 8 processors, and on Opteron
7167 7210                   */
7168 7211                  create = cpi->cpi_family > 6 ||
7169 7212                      (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7170 7213                  break;
7171 7214          case X86_VENDOR_AMD:
7172 7215                  create = cpi->cpi_family >= 0xf;
7173 7216                  break;
7174 7217          case X86_VENDOR_HYGON:
7175 7218                  create = 1;
7176 7219                  break;
7177 7220          default:
7178 7221                  create = 0;
7179 7222                  break;
7180 7223          }
7181 7224          if (create && cpi->cpi_brandid != 0) {
7182 7225                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7183 7226                      "brand-id", cpi->cpi_brandid);
7184 7227          }
7185 7228  
7186 7229          /* chunks, and apic-id */
7187 7230          switch (cpi->cpi_vendor) {
7188 7231                  /*
7189 7232                   * first available on Pentium IV and Opteron (K8)
7190 7233                   */
7191 7234          case X86_VENDOR_Intel:
7192 7235                  create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7193 7236                  break;
7194 7237          case X86_VENDOR_AMD:
7195 7238                  create = cpi->cpi_family >= 0xf;
7196 7239                  break;
7197 7240          case X86_VENDOR_HYGON:
7198 7241                  create = 1;
7199 7242                  break;
7200 7243          default:
7201 7244                  create = 0;
7202 7245                  break;
7203 7246          }
7204 7247          if (create) {
7205 7248                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7206 7249                      "chunks", CPI_CHUNKS(cpi));
7207 7250                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7208 7251                      "apic-id", cpi->cpi_apicid);
7209 7252                  if (cpi->cpi_chipid >= 0) {
7210 7253                          (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7211 7254                              "chip#", cpi->cpi_chipid);
7212 7255                          (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7213 7256                              "clog#", cpi->cpi_clogid);
7214 7257                  }
7215 7258          }
7216 7259  
7217 7260          /* cpuid-features */
7218 7261          (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7219 7262              "cpuid-features", CPI_FEATURES_EDX(cpi));
7220 7263  
7221 7264  
7222 7265          /* cpuid-features-ecx */
7223 7266          switch (cpi->cpi_vendor) {
7224 7267          case X86_VENDOR_Intel:
7225 7268                  create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7226 7269                  break;
7227 7270          case X86_VENDOR_AMD:
7228 7271                  create = cpi->cpi_family >= 0xf;
7229 7272                  break;
7230 7273          case X86_VENDOR_HYGON:
7231 7274                  create = 1;
7232 7275                  break;
7233 7276          default:
7234 7277                  create = 0;
7235 7278                  break;
7236 7279          }
7237 7280          if (create)
7238 7281                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7239 7282                      "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7240 7283  
7241 7284          /* ext-cpuid-features */
7242 7285          switch (cpi->cpi_vendor) {
7243 7286          case X86_VENDOR_Intel:
7244 7287          case X86_VENDOR_AMD:
7245 7288          case X86_VENDOR_HYGON:
7246 7289          case X86_VENDOR_Cyrix:
7247 7290          case X86_VENDOR_TM:
7248 7291          case X86_VENDOR_Centaur:
7249 7292                  create = cpi->cpi_xmaxeax >= 0x80000001;
7250 7293                  break;
7251 7294          default:
7252 7295                  create = 0;
7253 7296                  break;
7254 7297          }
7255 7298          if (create) {
7256 7299                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7257 7300                      "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7258 7301                  (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7259 7302                      "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7260 7303          }
7261 7304  
7262 7305          /*
7263 7306           * Brand String first appeared in Intel Pentium IV, AMD K5
7264 7307           * model 1, and Cyrix GXm.  On earlier models we try and
7265 7308           * simulate something similar .. so this string should always
7266 7309           * say -something- about the processor, however lame.
7267 7310           */
7268 7311          (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7269 7312              "brand-string", cpi->cpi_brandstr);
7270 7313  
7271 7314          /*
7272 7315           * Finally, cache and tlb information
7273 7316           */
7274 7317          switch (x86_which_cacheinfo(cpi)) {
7275 7318          case X86_VENDOR_Intel:
7276 7319                  intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7277 7320                  break;
7278 7321          case X86_VENDOR_Cyrix:
7279 7322                  cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7280 7323                  break;
7281 7324          case X86_VENDOR_AMD:
7282 7325                  amd_cache_info(cpi, cpu_devi);
7283 7326                  break;
7284 7327          default:
7285 7328                  break;
7286 7329          }
7287 7330  }
7288 7331  
7289 7332  struct l2info {
7290 7333          int *l2i_csz;
7291 7334          int *l2i_lsz;
7292 7335          int *l2i_assoc;
7293 7336          int l2i_ret;
7294 7337  };
7295 7338  
7296 7339  /*
7297 7340   * A cacheinfo walker that fetches the size, line-size and associativity
7298 7341   * of the L2 cache
7299 7342   */
7300 7343  static int
7301 7344  intel_l2cinfo(void *arg, const struct cachetab *ct)
7302 7345  {
7303 7346          struct l2info *l2i = arg;
7304 7347          int *ip;
7305 7348  
7306 7349          if (ct->ct_label != l2_cache_str &&
7307 7350              ct->ct_label != sl2_cache_str)
7308 7351                  return (0);     /* not an L2 -- keep walking */
7309 7352  
7310 7353          if ((ip = l2i->l2i_csz) != NULL)
7311 7354                  *ip = ct->ct_size;
7312 7355          if ((ip = l2i->l2i_lsz) != NULL)
7313 7356                  *ip = ct->ct_line_size;
7314 7357          if ((ip = l2i->l2i_assoc) != NULL)
7315 7358                  *ip = ct->ct_assoc;
7316 7359          l2i->l2i_ret = ct->ct_size;
7317 7360          return (1);             /* was an L2 -- terminate walk */
7318 7361  }
7319 7362  
7320 7363  /*
7321 7364   * AMD L2/L3 Cache and TLB Associativity Field Definition:
7322 7365   *
7323 7366   *      Unlike the associativity for the L1 cache and tlb where the 8 bit
7324 7367   *      value is the associativity, the associativity for the L2 cache and
7325 7368   *      tlb is encoded in the following table. The 4 bit L2 value serves as
7326 7369   *      an index into the amd_afd[] array to determine the associativity.
7327 7370   *      -1 is undefined. 0 is fully associative.
7328 7371   */
7329 7372  
7330 7373  static int amd_afd[] =
7331 7374          {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
7332 7375  
7333 7376  static void
7334 7377  amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7335 7378  {
7336 7379          struct cpuid_regs *cp;
7337 7380          uint_t size, assoc;
7338 7381          int i;
7339 7382          int *ip;
7340 7383  
7341 7384          if (cpi->cpi_xmaxeax < 0x80000006)
7342 7385                  return;
7343 7386          cp = &cpi->cpi_extd[6];
7344 7387  
7345 7388          if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7346 7389              (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7347 7390                  uint_t cachesz = size * 1024;
7348 7391                  assoc = amd_afd[i];
7349 7392  
7350 7393                  ASSERT(assoc != -1);
7351 7394  
7352 7395                  if ((ip = l2i->l2i_csz) != NULL)
7353 7396                          *ip = cachesz;
7354 7397                  if ((ip = l2i->l2i_lsz) != NULL)
7355 7398                          *ip = BITX(cp->cp_ecx, 7, 0);
7356 7399                  if ((ip = l2i->l2i_assoc) != NULL)
7357 7400                          *ip = assoc;
7358 7401                  l2i->l2i_ret = cachesz;
7359 7402          }
7360 7403  }
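For illustration, decoding an invented leaf 0x80000006 %ecx value of 0x02006140 with the table and code above:

    BITX(ecx, 31, 16) = 0x0200 -> 512K cache
    BITX(ecx, 15, 12) = 6      -> amd_afd[6] = 8 (8-way set associative)
    BITX(ecx, 7, 0)   = 0x40   -> 64-byte lines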
7361 7404  
7362 7405  int
7363 7406  getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7364 7407  {
7365 7408          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7366 7409          struct l2info __l2info, *l2i = &__l2info;
7367 7410  
7368 7411          l2i->l2i_csz = csz;
7369 7412          l2i->l2i_lsz = lsz;
7370 7413          l2i->l2i_assoc = assoc;
7371 7414          l2i->l2i_ret = -1;
7372 7415  
7373 7416          switch (x86_which_cacheinfo(cpi)) {
7374 7417          case X86_VENDOR_Intel:
7375 7418                  intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7376 7419                  break;
7377 7420          case X86_VENDOR_Cyrix:
7378 7421                  cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7379 7422                  break;
7380 7423          case X86_VENDOR_AMD:
7381 7424                  amd_l2cacheinfo(cpi, l2i);
7382 7425                  break;
7383 7426          default:
7384 7427                  break;
7385 7428          }
7386 7429          return (l2i->l2i_ret);
7387 7430  }
7388 7431  
7389 7432  #if !defined(__xpv)
7390 7433  
7391 7434  uint32_t *
7392 7435  cpuid_mwait_alloc(cpu_t *cpu)
7393 7436  {
7394 7437          uint32_t        *ret;
7395 7438          size_t          mwait_size;
7396 7439  
7397 7440          ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7398 7441  
7399 7442          mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7400 7443          if (mwait_size == 0)
7401 7444                  return (NULL);
7402 7445  
7403 7446          /*
7404 7447           * kmem_alloc() returns cache line size aligned data for mwait_size
7405 7448           * allocations.  mwait_size is currently cache line sized.  Neither
7406 7449           * of these implementation details is guaranteed to be true in the
7407 7450           * future.
7408 7451           *
7409 7452           * First try allocating mwait_size as kmem_alloc() currently returns
7410 7453           * correctly aligned memory.  If kmem_alloc() does not return
7411 7454           * mwait_size aligned memory, then use mwait_size ROUNDUP.
7412 7455           *
7413 7456           * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7414 7457           * decide to free this memory.
7415 7458           */
7416 7459          ret = kmem_zalloc(mwait_size, KM_SLEEP);
7417 7460          if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7418 7461                  cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7419 7462                  cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7420 7463                  *ret = MWAIT_RUNNING;
7421 7464                  return (ret);
7422 7465          } else {
7423 7466                  kmem_free(ret, mwait_size);
7424 7467                  ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7425 7468                  cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7426 7469                  cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7427 7470                  ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7428 7471                  *ret = MWAIT_RUNNING;
7429 7472                  return (ret);
7430 7473          }
7431 7474  }
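To illustrate the alignment check above (the addresses and a mwait_size of 64 are made up for the example):

    ret == 0x1040: P2ROUNDUP(0x1040, 64) == 0x1040, already aligned, used as is
    ret == 0x1050: P2ROUNDUP(0x1050, 64) == 0x1080, not equal, so the buffer is
                   freed, a doubled allocation is made, and the rounded-up
                   address inside it is used instead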
7432 7475  
7433 7476  void
7434 7477  cpuid_mwait_free(cpu_t *cpu)
7435 7478  {
7436 7479          if (cpu->cpu_m.mcpu_cpi == NULL) {
7437 7480                  return;
7438 7481          }
7439 7482  
7440 7483          if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7441 7484              cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7442 7485                  kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7443 7486                      cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7444 7487          }
7445 7488  
7446 7489          cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7447 7490          cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7448 7491  }
7449 7492  
7450 7493  void
7451 7494  patch_tsc_read(int flag)
7452 7495  {
7453 7496          size_t cnt;
7454 7497  
7455 7498          switch (flag) {
7456 7499          case TSC_NONE:
7457 7500                  cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7458 7501                  (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7459 7502                  break;
7460 7503          case TSC_RDTSC_LFENCE:
7461 7504                  cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7462 7505                  (void) memcpy((void *)tsc_read,
7463 7506                      (void *)&_tsc_lfence_start, cnt);
7464 7507                  break;
7465 7508          case TSC_TSCP:
7466 7509                  cnt = &_tscp_end - &_tscp_start;
7467 7510                  (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7468 7511                  break;
7469 7512          default:
7470 7513                  /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7471 7514                  cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7472 7515                  break;
7473 7516          }
7474 7517          tsc_type = flag;
7475 7518  }
7476 7519  
7477 7520  int
7478 7521  cpuid_deep_cstates_supported(void)
7479 7522  {
7480 7523          struct cpuid_info *cpi;
7481 7524          struct cpuid_regs regs;
7482 7525  
7483 7526          ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7484 7527          ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7485 7528  
7486 7529          cpi = CPU->cpu_m.mcpu_cpi;
7487 7530  
7488 7531          switch (cpi->cpi_vendor) {
7489 7532          case X86_VENDOR_Intel:
7490 7533                  if (cpi->cpi_xmaxeax < 0x80000007)
7491 7534                          return (0);
7492 7535  
7493 7536                  /*
7494 7537                   * Does TSC run at a constant rate in all C-states?
7495 7538                   */
7496 7539                  regs.cp_eax = 0x80000007;
7497 7540                  (void) __cpuid_insn(&regs);
7498 7541                  return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7499 7542  
7500 7543          default:
7501 7544                  return (0);
7502 7545          }
7503 7546  }
7504 7547  
7505 7548  #endif  /* !__xpv */
7506 7549  
7507 7550  void
7508 7551  post_startup_cpu_fixups(void)
7509 7552  {
7510 7553  #ifndef __xpv
7511 7554          /*
7512 7555           * Some AMD processors support C1E state. Entering this state will
7513 7556           * cause the local APIC timer to stop, which we can't deal with at
7514 7557           * this time.
7515 7558           */
7516 7559          if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7517 7560                  on_trap_data_t otd;
7518 7561                  uint64_t reg;
7519 7562  
7520 7563                  if (!on_trap(&otd, OT_DATA_ACCESS)) {
7521 7564                          reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7522 7565                          /* Disable C1E state if it is enabled by BIOS */
7523 7566                          if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7524 7567                              AMD_ACTONCMPHALT_MASK) {
7525 7568                                  reg &= ~(AMD_ACTONCMPHALT_MASK <<
7526 7569                                      AMD_ACTONCMPHALT_SHIFT);
7527 7570                                  wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7528 7571                          }
7529 7572                  }
7530 7573                  no_trap();
7531 7574          }
7532 7575  #endif  /* !__xpv */
7533 7576  }
7534 7577  
7535 7578  void
7536 7579  enable_pcid(void)
7537 7580  {
7538 7581          if (x86_use_pcid == -1)
7539 7582                  x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7540 7583  
7541 7584          if (x86_use_invpcid == -1) {
7542 7585                  x86_use_invpcid = is_x86_feature(x86_featureset,
7543 7586                      X86FSET_INVPCID);
7544 7587          }
7545 7588  
7546 7589          if (!x86_use_pcid)
7547 7590                  return;
7548 7591  
7549 7592          /*
7550 7593           * Intel say that on setting PCIDE, it immediately starts using the PCID
7551 7594           * bits; better make sure there's nothing there.
7552 7595           */
7553 7596          ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7554 7597  
7555 7598          setcr4(getcr4() | CR4_PCIDE);
7556 7599  }
7557 7600  
7558 7601  /*
7559 7602   * Setup necessary registers to enable XSAVE feature on this processor.
7560 7603   * This function needs to be called early enough, so that no xsave/xrstor
7561 7604   * ops will execute on the processor before the MSRs are properly set up.
7562 7605   *
7563 7606   * Current implementation has the following assumption:
7564 7607   * - cpuid_pass_basic() is done, so that X86 features are known.
7565 7608   * - fpu_probe() is done, so that fp_save_mech is chosen.
7566 7609   */
7567 7610  void
7568 7611  xsave_setup_msr(cpu_t *cpu)
7569 7612  {
7570 7613          ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7571 7614          ASSERT(fp_save_mech == FP_XSAVE);
7572 7615          ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7573 7616  
7574 7617          /* Enable OSXSAVE in CR4. */
7575 7618          setcr4(getcr4() | CR4_OSXSAVE);
7576 7619          /*
7577 7620           * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7578 7621           * correct value.
7579 7622           */
7580 7623          cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7581 7624          setup_xfem();
7582 7625  }
7583 7626  
7584 7627  /*
7585 7628   * Starting with the Westmere processor the local
7586 7629   * APIC timer will continue running in all C-states,
7587 7630   * including the deepest C-states.
7588 7631   */
7589 7632  int
7590 7633  cpuid_arat_supported(void)
7591 7634  {
7592 7635          struct cpuid_info *cpi;
7593 7636          struct cpuid_regs regs;
7594 7637  
7595 7638          ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7596 7639          ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7597 7640  
7598 7641          cpi = CPU->cpu_m.mcpu_cpi;
7599 7642  
7600 7643          switch (cpi->cpi_vendor) {
7601 7644          case X86_VENDOR_Intel:
7602 7645                  /*
7603 7646                   * Always-running Local APIC Timer is
7604 7647                   * indicated by CPUID.6.EAX[2].
7605 7648                   */
7606 7649                  if (cpi->cpi_maxeax >= 6) {
7607 7650                          regs.cp_eax = 6;
7608 7651                          (void) cpuid_insn(NULL, &regs);
7609 7652                          return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7610 7653                  } else {
7611 7654                          return (0);
7612 7655                  }
7613 7656          default:
7614 7657                  return (0);
7615 7658          }
7616 7659  }
7617 7660  
7618 7661  /*
7619 7662   * Check support for Intel ENERGY_PERF_BIAS feature
7620 7663   */
7621 7664  int
7622 7665  cpuid_iepb_supported(struct cpu *cp)
7623 7666  {
7624 7667          struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7625 7668          struct cpuid_regs regs;
7626 7669  
7627 7670          ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7628 7671          ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7629 7672  
7630 7673          if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7631 7674                  return (0);
7632 7675          }
7633 7676  
7634 7677          /*
7635 7678           * Intel ENERGY_PERF_BIAS MSR is indicated by
7636 7679           * capability bit CPUID.6.ECX.3
7637 7680           */
7638 7681          if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7639 7682                  return (0);
7640 7683  
7641 7684          regs.cp_eax = 0x6;
7642 7685          (void) cpuid_insn(NULL, &regs);
7643 7686          return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7644 7687  }
7645 7688  
7646 7689  /*
7647 7690   * Check support for TSC deadline timer
7648 7691   *
7649 7692   * TSC deadline timer provides a superior software programming
7650 7693   * model over local APIC timer that eliminates "time drifts".
7651 7694   * Instead of specifying a relative time, software specifies an
7652 7695   * absolute time as the target at which the processor should
7653 7696   * generate a timer event.
7654 7697   */
7655 7698  int
7656 7699  cpuid_deadline_tsc_supported(void)
7657 7700  {
7658 7701          struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7659 7702          struct cpuid_regs regs;
7660 7703  
7661 7704          ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7662 7705          ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7663 7706  
7664 7707          switch (cpi->cpi_vendor) {
7665 7708          case X86_VENDOR_Intel:
7666 7709                  if (cpi->cpi_maxeax >= 1) {
7667 7710                          regs.cp_eax = 1;
7668 7711                          (void) cpuid_insn(NULL, &regs);
7669 7712                          return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7670 7713                  } else {
7671 7714                          return (0);
7672 7715                  }
7673 7716          default:
7674 7717                  return (0);
7675 7718          }
7676 7719  }
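
To make the absolute-time model concrete: arming the timer is a single write
of a target TSC value rather than a countdown. A minimal sketch, assuming the
LAPIC LVT timer has already been switched into TSC-deadline mode, that
IA32_TSC_DEADLINE is MSR 0x6e0 (per the Intel SDM), and that tsc_read() and
wrmsr() are available to kernel code; none of these details come from this
change:

        #define MSR_IA32_TSC_DEADLINE   0x6e0   /* per Intel SDM */

        /*
         * Illustrative only: arm a one-shot timer event 'delta' TSC ticks
         * from now by writing an absolute target.  Writing 0 disarms it.
         */
        static void
        tsc_deadline_arm(uint64_t delta)
        {
                if (cpuid_deadline_tsc_supported() != 0)
                        wrmsr(MSR_IA32_TSC_DEADLINE, (uint64_t)tsc_read() + delta);
        }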
7677 7720  
7678 7721  #if !defined(__xpv)
7679 7722  /*
7680 7723   * Patch in versions of bcopy for high-performance Intel Nehalem (Nhm)
7681 7724   * processors and later.
7682 7725   */
7683 7726  void
7684 7727  patch_memops(uint_t vendor)
7685 7728  {
7686 7729          size_t cnt, i;
7687 7730          caddr_t to, from;
7688 7731  
7689 7732          if ((vendor == X86_VENDOR_Intel) &&
7690 7733              is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7691 7734                  cnt = &bcopy_patch_end - &bcopy_patch_start;
7692 7735                  to = &bcopy_ck_size;
7693 7736                  from = &bcopy_patch_start;
7694 7737                  for (i = 0; i < cnt; i++) {
7695 7738                          *to++ = *from++;
7696 7739                  }
7697 7740          }
7698 7741  }
7699 7742  #endif  /*  !__xpv */
7700 7743  
7701 7744  /*
7702 7745   * We're being asked to tell the system how many bits are required to represent
7703 7746   * the various core and strand IDs. While it's tempting to derive this based
7704 7747   * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7705 7748   * correct. Instead, this needs to be based on the number of bits that the APIC
7706 7749   * allows for these different configurations. We only update these to a larger
7707 7750   * value if we find one.
7708 7751   */
7709 7752  void
7710 7753  cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7711 7754  {
7712 7755          struct cpuid_info *cpi;
7713 7756  
7714 7757          VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7715 7758          cpi = cpu->cpu_m.mcpu_cpi;
7716 7759  
7717 7760          if (cpi->cpi_ncore_bits > *core_nbits) {
7718 7761                  *core_nbits = cpi->cpi_ncore_bits;
7719 7762          }
7720 7763  
7721 7764          if (cpi->cpi_nthread_bits > *strand_nbits) {
7722 7765                  *strand_nbits = cpi->cpi_nthread_bits;
7723 7766          }
7724 7767  }
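
The widths reported here are exactly what a caller needs to carve an APIC ID
into package/core/strand fields, which is why they come from the APIC-derived
cpi_ncore_bits and cpi_nthread_bits rather than the per-chip counts. A small
self-contained sketch of that decomposition (the helper is illustrative, not
part of this change); for example, with core_nbits = 3 and strand_nbits = 1,
APIC ID 0x1d decodes to package 1, core 6, strand 1:

        /*
         * Illustrative only: split an APIC ID using the bit widths reported
         * by cpuid_get_ext_topo().  Strand bits form the lowest field, core
         * bits sit above them, and the remainder identifies the package.
         */
        static void
        apicid_decompose(uint32_t apicid, uint_t core_nbits, uint_t strand_nbits,
            uint_t *pkg, uint_t *core, uint_t *strand)
        {
                *strand = apicid & ((1U << strand_nbits) - 1);
                *core = (apicid >> strand_nbits) & ((1U << core_nbits) - 1);
                *pkg = apicid >> (core_nbits + strand_nbits);
        }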
7725 7768  
7726 7769  void
7727 7770  cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7728 7771  {
7729 7772          struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7730 7773          struct cpuid_regs cp;
7731 7774  
7732 7775          /*
7733 7776           * Reread the CPUID portions that we need for various security
7734 7777           * information.
7735 7778           */
7736 7779          if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7737 7780                  /*
7738 7781                   * Check if we now have leaf 7 available to us.
7739 7782                   */
7740 7783                  if (cpi->cpi_maxeax < 7) {
7741 7784                          bzero(&cp, sizeof (cp));
7742 7785                          cp.cp_eax = 0;
7743 7786                          cpi->cpi_maxeax = __cpuid_insn(&cp);
7744 7787                          if (cpi->cpi_maxeax < 7)
7745 7788                                  return;
7746 7789                  }
7747 7790  
7748 7791                  bzero(&cp, sizeof (cp));
7749 7792                  cp.cp_eax = 7;
7750 7793                  cp.cp_ecx = 0;
7751 7794                  (void) __cpuid_insn(&cp);
7752 7795                  cpi->cpi_std[7] = cp;
7753 7796          } else if (cpi->cpi_vendor == X86_VENDOR_AMD ||
7754 7797              cpi->cpi_vendor == X86_VENDOR_HYGON) {
7755 7798                  /* No xcpuid support */
7756 7799                  if (cpi->cpi_family < 5 ||
7757 7800                      (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7758 7801                          return;
7759 7802  
7760 7803                  if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7761 7804                          bzero(&cp, sizeof (cp));
7762 7805                          cp.cp_eax = CPUID_LEAF_EXT_0;
7763 7806                          cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7764 7807                          if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7765 7808                                  return;
7766 7809                          }
7767 7810                  }
7768 7811  
7769 7812                  /*
7770 7813                   * Most AMD features are in leaf 8. Automatic IBRS was added in
7771 7814                   * leaf 0x21. So we also check that.
7772 7815                   */
7773 7816                  bzero(&cp, sizeof (cp));
7774 7817                  cp.cp_eax = CPUID_LEAF_EXT_8;
7775 7818                  (void) __cpuid_insn(&cp);
7776 7819                  platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7777 7820                  cpi->cpi_extd[8] = cp;
7778 7821  
7779 7822                  if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21) {
7780 7823                          return;
7781 7824                  }
7782 7825  
7783 7826                  bzero(&cp, sizeof (cp));
7784 7827                  cp.cp_eax = CPUID_LEAF_EXT_21;
7785 7828                  (void) __cpuid_insn(&cp);
7786 7829                  platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
7787 7830                  cpi->cpi_extd[0x21] = cp;
7788 7831          } else {
7789 7832                  /*
7790 7833                   * Nothing to do here. Return an empty set which has already
7791 7834                   * been zeroed for us.
7792 7835                   */
7793 7836                  return;
7794 7837          }
7795 7838          cpuid_scan_security(cpu, fset);
7796 7839  }
7797 7840  
7798 7841  /* ARGSUSED */
7799 7842  static int
7800 7843  cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7801 7844  {
7802 7845          uchar_t *fset;
7803 7846          boolean_t first_pass = (boolean_t)arg1;
7804 7847  
7805 7848          fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7806 7849          if (first_pass && CPU->cpu_id != 0)
7807 7850                  return (0);
7808 7851          if (!first_pass && CPU->cpu_id == 0)
7809 7852                  return (0);
7810 7853          cpuid_pass_ucode(CPU, fset);
7811 7854  
7812 7855          return (0);
7813 7856  }
7814 7857  
7815 7858  /*
7816 7859   * After a microcode update where the version has changed, we need to
7817 7860   * rescan CPUID. To do this we check every CPU to make sure that they have the
7818 7861   * same microcode. Then we perform a cross call to all such CPUs. It's the
7819 7862   * caller's job to make sure that no one else can end up doing an update while
7820 7863   * this is going on.
7821 7864   *
7822 7865   * We assume that the system is microcode capable if we're called.
7823 7866   */
7824 7867  void
7825 7868  cpuid_post_ucodeadm(void)
7826 7869  {
7827 7870          uint32_t rev;
7828 7871          int i;
7829 7872          struct cpu *cpu;
7830 7873          cpuset_t cpuset;
7831 7874          void *argdata;
7832 7875          uchar_t *f0;
7833 7876  
7834 7877          argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7835 7878  
7836 7879          mutex_enter(&cpu_lock);
7837 7880          cpu = cpu_get(0);
7838 7881          rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7839 7882          CPUSET_ONLY(cpuset, 0);
7840 7883          for (i = 1; i < max_ncpus; i++) {
7841 7884                  if ((cpu = cpu_get(i)) == NULL)
7842 7885                          continue;
7843 7886  
7844 7887                  if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7845 7888                          panic("post microcode update CPU %d has differing "
7846 7889                              "microcode revision (%u) from CPU 0 (%u)",
7847 7890                              i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7848 7891                  }
7849 7892                  CPUSET_ADD(cpuset, i);
7850 7893          }
7851 7894  
7852 7895          /*
7853 7896           * We do the cross calls in two passes. The first pass is only for the
7854 7897           * boot CPU. The second pass is for all of the other CPUs. This allows
7855 7898           * the boot CPU to go through and change behavior related to patching or
7856 7899   * whether or not Enhanced IBRS needs to be enabled, and then allows all
7857 7900           * other CPUs to follow suit.
7858 7901           */
7859 7902          kpreempt_disable();
7860 7903          xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7861 7904              cpuid_post_ucodeadm_xc);
7862 7905          xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7863 7906              cpuid_post_ucodeadm_xc);
7864 7907          kpreempt_enable();
7865 7908  
7866 7909          /*
7867 7910           * OK, now look at each CPU and see if their feature sets are equal.
7868 7911           */
7869 7912          f0 = argdata;
7870 7913          for (i = 1; i < max_ncpus; i++) {
7871 7914                  uchar_t *fset;
7872 7915                  if (!CPU_IN_SET(cpuset, i))
7873 7916                          continue;
7874 7917  
7875 7918                  fset = (uchar_t *)((uintptr_t)argdata +
7876 7919                      sizeof (x86_featureset) * i);
7877 7920  
7878 7921                  if (!compare_x86_featureset(f0, fset)) {
7879 7922                          panic("Post microcode update CPU %d has "
7880 7923                              "differing security feature (%p) set from CPU 0 "
7881 7924                              "(%p), not appending to feature set", i,
7882 7925                              (void *)fset, (void *)f0);
7883 7926                  }
7884 7927          }
7885 7928  
7886 7929          mutex_exit(&cpu_lock);
7887 7930  
7888 7931          for (i = 0; i < NUM_X86_FEATURES; i++) {
7889 7932                  cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7890 7933                      x86_feature_names[i]);
7891 7934                  if (is_x86_feature(f0, i)) {
7892 7935                          add_x86_feature(x86_featureset, i);
7893 7936                  }
7894 7937          }
7895 7938          kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7896 7939  }
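
As the block comment above notes, serialization against concurrent updates is
the caller's responsibility; cpuid_post_ucodeadm() only takes cpu_lock around
the revision checks and the feature-set comparison. A hedged sketch of the
expected calling pattern; the ucode_update_lock name and the update step are
hypothetical placeholders, not code from this diff:

        #include <sys/mutex.h>

        static kmutex_t ucode_update_lock;      /* hypothetical */

        /*
         * Illustrative only: apply new microcode to all CPUs under a single
         * update lock, then rescan CPUID security features exactly once.
         */
        static void
        ucode_apply_and_rescan(void)
        {
                mutex_enter(&ucode_update_lock);
                /* ... load and apply the new microcode image on every CPU ... */
                cpuid_post_ucodeadm();
                mutex_exit(&ucode_update_lock);
        }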
7897 7940  
7898 7941  typedef void (*cpuid_pass_f)(cpu_t *, void *);
7899 7942  
7900 7943  typedef struct cpuid_pass_def {
7901 7944          cpuid_pass_t cpd_pass;
7902 7945          cpuid_pass_f cpd_func;
7903 7946  } cpuid_pass_def_t;
7904 7947  
7905 7948  /*
7906 7949   * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
7907 7950   * normal sense and should not appear here.
7908 7951   */
7909 7952  static const cpuid_pass_def_t cpuid_pass_defs[] = {
7910 7953          { CPUID_PASS_PRELUDE, cpuid_pass_prelude },
7911 7954          { CPUID_PASS_IDENT, cpuid_pass_ident },
7912 7955          { CPUID_PASS_BASIC, cpuid_pass_basic },
7913 7956          { CPUID_PASS_EXTENDED, cpuid_pass_extended },
7914 7957          { CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
7915 7958          { CPUID_PASS_RESOLVE, cpuid_pass_resolve },
7916 7959  };
7917 7960  
7918 7961  void
7919 7962  cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
7920 7963  {
7921 7964          VERIFY3S(pass, !=, CPUID_PASS_NONE);
7922 7965  
7923 7966          if (cp == NULL)
7924 7967                  cp = CPU;
7925 7968  
7926 7969          /*
7927 7970           * Space statically allocated for BSP, ensure pointer is set
7928 7971           */
7929 7972          if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
7930 7973                  cp->cpu_m.mcpu_cpi = &cpuid_info0;
7931 7974  
7932 7975          ASSERT(cpuid_checkpass(cp, pass - 1));
7933 7976  
7934 7977          for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
7935 7978                  if (cpuid_pass_defs[i].cpd_pass == pass) {
7936 7979                          cpuid_pass_defs[i].cpd_func(cp, arg);
7937 7980                          cp->cpu_m.mcpu_cpi->cpi_pass = pass;
7938 7981                          return;
7939 7982                  }
7940 7983          }
7941 7984  
7942 7985          panic("unable to execute invalid cpuid pass %d on cpu%d\n",
7943 7986              pass, cp->cpu_id);
7944 7987  }
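
For orientation: the dispatcher above is driven in pass order by boot and
CPU-online code, and the ordering is enforced by the
ASSERT(cpuid_checkpass(cp, pass - 1)) rather than by the table itself. A
minimal sketch of the calling sequence on the current CPU; passing NULL for
every pass argument is a simplification, not something this change specifies:

        /*
         * Illustrative only: run each CPUID pass, in order, on the current
         * CPU.  Real callers hand some passes a real argument (e.g. a
         * feature set pointer) where NULL appears below.
         */
        static void
        cpuid_run_all_passes(void)
        {
                cpuid_execpass(NULL, CPUID_PASS_PRELUDE, NULL);
                cpuid_execpass(NULL, CPUID_PASS_IDENT, NULL);
                cpuid_execpass(NULL, CPUID_PASS_BASIC, NULL);
                cpuid_execpass(NULL, CPUID_PASS_EXTENDED, NULL);
                cpuid_execpass(NULL, CPUID_PASS_DYNAMIC, NULL);
                cpuid_execpass(NULL, CPUID_PASS_RESOLVE, NULL);
        }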
7945 7988  
7946 7989  /*
7947 7990   * Extract the processor family from a chiprev.  Processor families are not the
7948 7991   * same as cpuid families; see comments above and in x86_archext.h.
7949 7992   */
7950 7993  x86_processor_family_t
7951 7994  chiprev_family(const x86_chiprev_t cr)
7952 7995  {
7953 7996          return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
7954 7997  }
7955 7998  
7956 7999  /*
7957 8000   * A chiprev matches its template if the vendor and family are identical and the
7958 8001   * revision of the chiprev matches one of the bits set in the template.  Callers
7959 8002   * may bitwise-OR together chiprevs of the same vendor and family to form the
7960 8003   * template, or use the _ANY variant.  It is not possible to match chiprevs of
7961 8004   * multiple vendors or processor families with a single call.  Note that this
7962 8005   * function operates on processor families, not cpuid families.
7963 8006   */
7964 8007  boolean_t
7965 8008  chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
7966 8009  {
7967 8010          return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
7968 8011              _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
7969 8012              (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
7970 8013  }
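
To make the template convention concrete: two revisions of the same vendor and
processor family are OR-ed into one template and tested with a single call.
The X86_CHIPREV_EXAMPLE_* constants below are placeholders and
cpuid_getchiprev() is assumed to be the per-CPU chiprev accessor; neither is
established by this diff:

        /*
         * Illustrative only: does this CPU need a workaround that applies to
         * two specific revisions of one (hypothetical) processor family?
         */
        static boolean_t
        needs_rev_workaround(cpu_t *cp)
        {
                return (chiprev_matches(cpuid_getchiprev(cp),
                    X86_CHIPREV_EXAMPLE_REV_A0 | X86_CHIPREV_EXAMPLE_REV_B0));
        }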
7971 8014  
7972 8015  /*
7973 8016   * A chiprev is at least min if the vendor and family are identical and the
7974 8017   * revision of the chiprev is at least as recent as that of min.  Processor
7975 8018   * families are considered unordered and cannot be compared using this function.
7976 8019   * Note that this function operates on processor families, not cpuid families.
7977 8020   * Use of the _ANY chiprev variant with this function is not useful; it will
7978 8021   * always return B_FALSE if the _ANY variant is supplied as the minimum
7979 8022   * revision.  To determine only whether a chiprev is of a given processor
7980 8023   * family, test the return value of chiprev_family() instead.
7981 8024   */
7982 8025  boolean_t
7983 8026  chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
7984 8027  {
7985 8028          return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
7986 8029              _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
7987 8030              _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
7988 8031  }
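
And the ordered counterpart, for gating behavior on a minimum revision within
one family rather than on an exact set of revisions (again with a placeholder
constant and the assumed cpuid_getchiprev() accessor):

        /*
         * Illustrative only: take a fast path on revision C0 or newer of one
         * (hypothetical) processor family; older revisions take the slow path.
         */
        static boolean_t
        can_use_fast_path(cpu_t *cp)
        {
                return (chiprev_at_least(cpuid_getchiprev(cp),
                    X86_CHIPREV_EXAMPLE_REV_C0));
        }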
7989 8032  
7990 8033  /*
7991 8034   * The uarch functions operate in a manner similar to the chiprev functions
7992 8035   * above.  While it is tempting to allow these to operate on microarchitectures
7993 8036   * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
7994 8037   * than ZEN2), we elect not to do so because a manufacturer may supply
7995 8038   * processors of multiple different microarchitecture families, each of which may
7996 8039   * be internally ordered but unordered with respect to those of other families.
7997 8040   */
7998 8041  x86_uarch_t
7999 8042  uarchrev_uarch(const x86_uarchrev_t ur)
8000 8043  {
8001 8044          return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
8002 8045  }
8003 8046  
8004 8047  boolean_t
8005 8048  uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
8006 8049  {
8007 8050          return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
8008 8051              _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
8009 8052              (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
8010 8053  }
8011 8054  
8012 8055  boolean_t
8013 8056  uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
8014 8057  {
8015 8058          return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
8016 8059              _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
8017 8060              _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
8018 8061  }
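
The same two idioms apply at microarchitecture granularity: first pin the
microarchitecture family with uarchrev_uarch(), then compare steppings only
within that family. A hedged sketch; X86_UARCH_EXAMPLE, X86_UARCHREV_EXAMPLE_B1
and the cpuid_getuarchrev() accessor are assumed names, not part of this diff:

        /*
         * Illustrative only: a workaround needed by steppings of one
         * (hypothetical) microarchitecture that precede its B1 revision.
         */
        static boolean_t
        uarch_needs_workaround(cpu_t *cp)
        {
                x86_uarchrev_t ur = cpuid_getuarchrev(cp);

                if (uarchrev_uarch(ur) != X86_UARCH_EXAMPLE)
                        return (B_FALSE);

                return (!uarchrev_at_least(ur, X86_UARCHREV_EXAMPLE_B1));
        }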
  
    (1896 lines elided)