1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 * Copyright 2020 Joyent, Inc.
27 * Copyright 2023 Oxide Computer Company
28 * Copyright 2022 MNX Cloud, Inc.
29 */
30 /*
31 * Copyright (c) 2010, Intel Corporation.
32 * All rights reserved.
33 */
34 /*
35 * Portions Copyright 2009 Advanced Micro Devices, Inc.
36 */
37
38 /*
39 * CPU Identification logic
40 *
41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42 * with the identification of CPUs, their features, and their topologies. More
43 * specifically, this file helps drive the following:
44 *
45 * 1. Enumeration of features of the processor which are used by the kernel to
46 * determine what features to enable or disable. These may be instruction set
47 * enhancements or features that we use.
48 *
49 * 2. Enumeration of instruction set architecture (ISA) additions that userland
50 * will be told about through the auxiliary vector.
51 *
52 * 3. Understanding the physical topology of the CPU such as the number of
 * caches, how many cores it has, whether or not it supports simultaneous
 * multi-threading (SMT), etc.
55 *
56 * ------------------------
57 * CPUID History and Basics
58 * ------------------------
59 *
60 * The cpuid instruction was added by Intel roughly around the time that the
 * original Pentium was introduced. The purpose of cpuid was to provide, in a
 * programmatic fashion, information about the CPU that previously had to be
 * guessed at. For example, an important part of cpuid is that we can know what
64 * extensions to the ISA exist. If you use an invalid opcode you would get a
65 * #UD, so this method allows a program (whether a user program or the kernel)
66 * to determine what exists without crashing or getting a SIGILL. Of course,
67 * this was also during the era of the clones and the AMD Am5x86. The vendor
68 * name shows up first in cpuid for a reason.
69 *
70 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72 * its own meaning. The different leaves are broken down into different regions:
73 *
74 * [ 0, 7fffffff ] This region is called the 'basic'
75 * region. This region is generally defined
76 * by Intel, though some of the original
77 * portions have different meanings based
78 * on the manufacturer. These days, Intel
79 * adds most new features to this region.
80 * AMD adds non-Intel compatible
81 * information in the third, extended
82 * region. Intel uses this for everything
83 * including ISA extensions, CPU
84 * features, cache information, topology,
85 * and more.
86 *
87 * There is a hole carved out of this
88 * region which is reserved for
89 * hypervisors.
90 *
91 * [ 40000000, 4fffffff ] This region, which is found in the
92 * middle of the previous region, is
93 * explicitly promised to never be used by
94 * CPUs. Instead, it is used by hypervisors
95 * to communicate information about
96 * themselves to the operating system. The
97 * values and details are unique for each
98 * hypervisor.
99 *
100 * [ 80000000, ffffffff ] This region is called the 'extended'
101 * region. Some of the low leaves mirror
102 * parts of the basic leaves. This region
103 * has generally been used by AMD for
104 * various extensions. For example, AMD-
105 * specific information about caches,
106 * features, and topology are found in this
107 * region.
108 *
109 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
110 * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111 * the ranges, one of the primary things returned is the maximum valid leaf in
112 * that range. This allows for discovery of what range of CPUID is valid.
113 *
114 * The CPUs have potentially surprising behavior when using an invalid leaf or
115 * unimplemented leaf. If the requested leaf is within the valid basic or
116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117 * set to zero. However, if you specify a leaf that is outside of a valid range,
118 * then instead it will be filled with the last valid _basic_ leaf. For example,
119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120 * an invalid extended leaf will return the information for leaf 3.
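 *
 * As a concrete illustration of the above, the following userland sketch
 * (this is not how the kernel itself issues cpuid; it assumes a GCC or Clang
 * environment that provides <cpuid.h>) discovers the maximum basic leaf
 * before asking for a higher one:
 *
 *     #include <stdio.h>
 *     #include <cpuid.h>
 *
 *     int
 *     main(void)
 *     {
 *             unsigned int eax, ebx, ecx, edx, max_basic;
 *
 *             if (__get_cpuid(0, &eax, &ebx, &ecx, &edx) == 0)
 *                     return (1);
 *             max_basic = eax;  // %eax of leaf 0 is the maximum basic leaf
 *
 *             // Only ask for leaf 7 if it is in range; an out-of-range
 *             // request would silently return the last valid basic leaf.
 *             if (max_basic >= 7) {
 *                     __cpuid_count(7, 0, eax, ebx, ecx, edx);
 *                     (void) printf("leaf 7, sub-leaf 0: ebx=%#x\n", ebx);
 *             }
 *             return (0);
 *     }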
121 *
122 * Some leaves are broken down into sub-leaves. This means that the value
123 * depends on both the leaf asked for in %eax and a secondary register. For
124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125 * additional information. Or when getting topology information in leaf 0xb, the
126 * initial value in %ecx changes which level of the topology that you are
127 * getting information about.
128 *
129 * cpuid values are always kept to 32 bits regardless of whether or not the
130 * program is in 64-bit mode. When executing in 64-bit mode, the upper
131 * 32 bits of the register are always set to zero so that way the values are the
132 * same regardless of execution mode.
133 *
134 * ----------------------
135 * Identifying Processors
136 * ----------------------
137 *
138 * We can identify a processor in two steps. The first step looks at cpuid leaf
139 * 0. Leaf 0 contains the processor's vendor information. This is done by
 * putting a 12 character string in %ebx, %edx, and %ecx, in that order. On
 * AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
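 *
 * A hedged fragment that assembles that string (reusing the userland
 * <cpuid.h> helpers from the sketch above, plus memcpy() from <string.h>):
 *
 *     char vendor[13];
 *     unsigned int eax, ebx, ecx, edx;
 *
 *     if (__get_cpuid(0, &eax, &ebx, &ecx, &edx) != 0) {
 *             (void) memcpy(vendor, &ebx, 4);      // "Genu" or "Auth"
 *             (void) memcpy(vendor + 4, &edx, 4);  // "ineI" or "enti"
 *             (void) memcpy(vendor + 8, &ecx, 4);  // "ntel" or "cAMD"
 *             vendor[12] = '\0';
 *     }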
142 *
143 * From there, a processor is identified by a combination of three different
144 * values:
145 *
146 * 1. Family
147 * 2. Model
148 * 3. Stepping
149 *
150 * Each vendor uses the family and model to uniquely identify a processor. The
151 * way that family and model are changed depends on the vendor. For example,
 * Intel has been using family 0x6 for almost all of their processors since the
153 * Pentium Pro/Pentium II era, often called the P6. The model is used to
154 * identify the exact processor. Different models are often used for the client
155 * (consumer) and server parts. Even though each processor often has major
156 * architectural differences, they still are considered the same family by
157 * Intel.
158 *
159 * On the other hand, each major AMD architecture generally has its own family.
 * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it
161 * the model number is used to help identify specific processors. As AMD's
162 * product lines have expanded, they have started putting a mixed bag of
163 * processors into the same family, with each processor under a single
164 * identifying banner (e.g., Milan, Cezanne) using a range of model numbers. We
165 * refer to each such collection as a processor family, distinct from cpuid
166 * family. Importantly, each processor family has a BIOS and Kernel Developer's
167 * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168 * defines the processor family's non-architectural features. In general, we'll
169 * use "family" here to mean the family number reported by the cpuid instruction
170 * and distinguish the processor family from it where appropriate.
171 *
172 * The stepping is used to refer to a revision of a specific microprocessor. The
173 * term comes from equipment used to produce masks that are used to create
174 * integrated circuits.
175 *
176 * The information is present in leaf 1, %eax. In technical documentation you
177 * will see the terms extended model and extended family. The original family,
 * model, and stepping fields were each 4 bits wide. If the base family value
 * is 0xf, then one must consult the extended family and extended model, which
 * occupy previously reserved bits: the extended family is added to the base
 * family, and the extended model supplies the upper four bits of the model.
 * (Intel also uses the extended model when the base family is 0x6.)
182 *
183 * When we process this information, we store the full family, model, and
184 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185 * cpi_step, respectively. Whenever you are performing comparisons with the
186 * family, model, and stepping, you should use these members and not the raw
187 * values from cpuid. If you must use the raw values from cpuid directly, you
188 * must make sure that you add the extended model and family to the base model
189 * and family.
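 *
 * A hedged sketch of that computation from leaf 1 %eax, reusing the userland
 * <cpuid.h> helpers from earlier (bit positions are per the vendor manuals;
 * the kernel's actual logic in this file also copes with vendor-specific
 * quirks, so treat this only as an illustration):
 *
 *     unsigned int eax, ebx, ecx, edx;
 *     unsigned int family, model, stepping;
 *
 *     __cpuid(1, eax, ebx, ecx, edx);
 *
 *     stepping = eax & 0xf;
 *     model = (eax >> 4) & 0xf;
 *     family = (eax >> 8) & 0xf;
 *
 *     if (family == 0xf) {
 *             family += (eax >> 20) & 0xff;       // extended family
 *             model |= ((eax >> 16) & 0xf) << 4;  // extended model
 *     } else if (family == 0x6) {
 *             model |= ((eax >> 16) & 0xf) << 4;  // Intel also uses it here
 *     }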
190 *
191 * In general, we do not use information about the family, model, and stepping
192 * to determine whether or not a feature is present; that is generally driven by
193 * specific leaves. However, when something we care about on the processor is
194 * not considered 'architectural' meaning that it is specific to a set of
195 * processors and not promised in the architecture model to be consistent from
196 * generation to generation, then we will fall back on this information. The
197 * most common cases where this comes up is when we have to workaround errata in
198 * the processor, are dealing with processor-specific features such as CPU
199 * performance counters, or we want to provide additional information for things
200 * such as fault management.
201 *
202 * While processors also do have a brand string, which is the name that people
203 * are familiar with when buying the processor, they are not meant for
204 * programmatic consumption. That is what the family, model, and stepping are
205 * for.
206 *
207 * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208 * and stepping(s) that refer to a single or very closely related set of silicon
209 * implementations; while there are sometimes more specific ways to learn of the
210 * presence or absence of a particular erratum or workaround, one may generally
211 * assume that all processors of the same chiprev have the same errata and we
212 * have chosen to represent them this way precisely because that is how AMD
213 * groups them in their revision guides (errata documentation). The processor
214 * family (x86_processor_family_t) may be extracted from the chiprev if that
215 * level of detail is not needed. Processor families are considered unordered
216 * but revisions within a family may be compared for either an exact match or at
217 * least as recent as a reference revision. See the chiprev_xxx() functions
218 * below.
219 *
220 * Similarly, each processor family implements a particular microarchitecture,
221 * which itself may have multiple revisions. In general, non-architectural
222 * features are specific to a processor family, but some may exist across
223 * families containing cores that implement the same microarchitectural revision
224 * (and, such cores share common bugs, too). We provide utility routines
225 * analogous to those for extracting and comparing chiprevs for
226 * microarchitectures as well; see the uarch_xxx() functions.
227 *
228 * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229 * present used and available only for AMD and AMD-like processors.
230 *
231 * ------------
232 * CPUID Passes
233 * ------------
234 *
235 * As part of performing feature detection, we break this into several different
236 * passes. There used to be a pass 0 that was done from assembly in locore.s to
237 * support processors that have a missing or broken cpuid instruction (notably
238 * certain Cyrix processors) but those were all 32-bit processors which are no
239 * longer supported. Passes are no longer numbered explicitly to make it easier
240 * to break them up or move them around as needed; however, they still have a
241 * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242 * x86_archext.h. The external interface to execute a cpuid pass or determine
243 * whether a pass has been completed consists of cpuid_execpass() and
244 * cpuid_checkpass() respectively. The passes now, in that execution order,
245 * are as follows:
246 *
247 * PRELUDE This pass does not have any dependencies on system
248 * setup; in particular, unlike all subsequent passes it is
249 * guaranteed not to require PCI config space access. It
250 * sets the flag indicating that the processor we are
251 * running on supports the cpuid instruction, which all
252 * 64-bit processors do. This would also be the place to
253 * add any other basic state that is required later on and
254 * can be learned without dependencies.
255 *
256 * IDENT Determine which vendor manufactured the CPU, the family,
257 * model, and stepping information, and compute basic
258 * identifying tags from those values. This is done first
259 * so that machine-dependent code can control the features
260 * the cpuid instruction will report during subsequent
261 * passes if needed, and so that any intervening
262 * machine-dependent code that needs basic identity will
263 * have it available. This includes synthesised
264 * identifiers such as chiprev and uarchrev as well as the
265 * values obtained directly from cpuid. Prior to executing
 * this pass, machine-dependent boot code is responsible for
267 * ensuring that the PCI configuration space access
268 * functions have been set up and, if necessary, that
269 * determine_platform() has been called.
270 *
271 * BASIC This is the primary pass and is responsible for doing a
272 * large number of different things:
273 *
274 * 1. Gathering a large number of feature flags to
 * determine which features the CPU supports and which
276 * indicate things that we need to do other work in the OS
277 * to enable. Features detected this way are added to the
278 * x86_featureset which can be queried to
279 * determine what we should do. This includes processing
280 * all of the basic and extended CPU features that we care
281 * about.
282 *
283 * 2. Determining the CPU's topology. This includes
284 * information about how many cores and threads are present
285 * in the package. It also is responsible for figuring out
286 * which logical CPUs are potentially part of the same core
287 * and what other resources they might share. For more
288 * information see the 'Topology' section.
289 *
 * 3. Determining the set of CPU security-specific features
 * that we need to worry about and the appropriate set of
 * workarounds.
293 *
 * This pass on the boot CPU occurs before KMDB is started.
295 *
 * EXTENDED This pass is done after startup(). Here, we check
297 * other miscellaneous features. Most of this is gathering
298 * additional basic and extended features that we'll use in
299 * later passes or for debugging support.
300 *
 * DYNAMIC This pass occurs after the kernel memory allocator
302 * has been fully initialized. This gathers information
303 * where we might need dynamic memory available for our
304 * uses. This includes several varying width leaves that
305 * have cache information and the processor's brand string.
306 *
 * RESOLVE The final normal pass is performed after the
308 * kernel has brought most everything online. This is
309 * invoked from post_startup(). In this pass, we go through
310 * the set of features that we have enabled and turn that
311 * into the hardware auxiliary vector features that
312 * userland receives. This is used by userland, primarily
313 * by the run-time link-editor (RTLD), though userland
314 * software could also refer to it directly.
315 *
316 * The function that performs a pass is currently assumed to be infallible, and
 * all existing implementations are. This simplifies callers by allowing
318 * cpuid_execpass() to return void. Similarly, implementers do not need to check
319 * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320 * Both of these assumptions can be relaxed if needed by future developments.
321 * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322 * error to attempt to execute a pass before all previous passes have been
323 * completed on the specified CPU, or to request cpuid information before the
324 * pass that captures it has been executed. These conditions can be tested
325 * using cpuid_checkpass().
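 *
 * As an illustration of the consumer side, a kernel sketch (example_consumer()
 * is a made-up name, and cpuid_get_ncore_per_chip() is used here simply as a
 * representative member of the cpuid_get*() accessor family described later):
 *
 *     static void
 *     example_consumer(cpu_t *cpu)
 *     {
 *             // It is programmer error to ask before the pass that captures
 *             // this data (BASIC) has completed on this CPU.
 *             ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
 *             cmn_err(CE_CONT, "cores/pkg: %d\n",
 *                 cpuid_get_ncore_per_chip(cpu));
 *     }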
326 *
327 * The Microcode Pass
328 *
329 * After a microcode update, we do a selective rescan of the cpuid leaves to
330 * determine what features have changed. Microcode updates can provide more
331 * details about security related features to deal with issues like Spectre and
332 * L1TF. On occasion, vendors have violated their contract and removed bits.
333 * However, we don't try to detect that because that puts us in a situation that
334 * we really can't deal with. As such, the only thing we rescan are security
335 * related features today. See cpuid_pass_ucode(). This pass may be run in a
336 * different sequence on APs and therefore is not part of the sequential order;
 * it is invoked directly instead of by cpuid_execpass() and its completion
338 * status cannot be checked by cpuid_checkpass(). This could be integrated with
339 * a more complex dependency mechanism if warranted by future developments.
340 *
341 * All of the passes are run on all CPUs. However, for the most part we only
342 * care about what the boot CPU says about this information and use the other
343 * CPUs as a rough guide to sanity check that we have the same feature set.
344 *
345 * We do not support running multiple logical CPUs with disjoint, let alone
346 * different, feature sets.
347 *
348 * ------------------
349 * Processor Topology
350 * ------------------
351 *
352 * One of the important things that we need to do is to understand the topology
353 * of the underlying processor. When we say topology in this case, we're trying
354 * to understand the relationship between the logical CPUs that the operating
355 * system sees and the underlying physical layout. Different logical CPUs may
356 * share different resources which can have important consequences for the
357 * performance of the system. For example, they may share caches, execution
358 * units, and more.
359 *
360 * The topology of the processor changes from generation to generation and
361 * vendor to vendor. Along with that, different vendors use different
362 * terminology, and the operating system itself uses occasionally overlapping
363 * terminology. It's important to understand what this topology looks like so
364 * one can understand the different things that we try to calculate and
365 * determine.
366 *
367 * To get started, let's talk about a little bit of terminology that we've used
368 * so far, is used throughout this file, and is fairly generic across multiple
369 * vendors:
370 *
371 * CPU
372 * A central processing unit (CPU) refers to a logical and/or virtual
373 * entity that the operating system can execute instructions on. The
374 * underlying resources for this CPU may be shared between multiple
375 * entities; however, to the operating system it is a discrete unit.
376 *
377 * PROCESSOR and PACKAGE
378 *
379 * Generally, when we use the term 'processor' on its own, we are referring
380 * to the physical entity that one buys and plugs into a board. However,
381 * because processor has been overloaded and one might see it used to mean
382 * multiple different levels, we will instead use the term 'package' for
383 * the rest of this file. The term package comes from the electrical
384 * engineering side and refers to the physical entity that encloses the
385 * electronics inside. Strictly speaking the package can contain more than
386 * just the CPU, for example, on many processors it may also have what's
387 * called an 'integrated graphical processing unit (GPU)'. Because the
388 * package can encapsulate multiple units, it is the largest physical unit
389 * that we refer to.
390 *
391 * SOCKET
392 *
393 * A socket refers to unit on a system board (generally the motherboard)
394 * that can receive a package. A single package, or processor, is plugged
395 * into a single socket. A system may have multiple sockets. Often times,
396 * the term socket is used interchangeably with package and refers to the
 * electrical component that has been plugged in, and not the receptacle itself.
398 *
399 * CORE
400 *
401 * A core refers to the physical instantiation of a CPU, generally, with a
402 * full set of hardware resources available to it. A package may contain
403 * multiple cores inside of it or it may just have a single one. A
404 * processor with more than one core is often referred to as 'multi-core'.
405 * In illumos, we will use the feature X86FSET_CMP to refer to a system
406 * that has 'multi-core' processors.
407 *
408 * A core may expose a single logical CPU to the operating system, or it
409 * may expose multiple CPUs, which we call threads, defined below.
410 *
411 * Some resources may still be shared by cores in the same package. For
412 * example, many processors will share the level 3 cache between cores.
413 * Some AMD generations share hardware resources between cores. For more
414 * information on that see the section 'AMD Topology'.
415 *
416 * THREAD and STRAND
417 *
 * In this file, generally a thread refers to hardware resources and not
419 * the operating system's logical abstraction. A thread is always exposed
420 * as an independent logical CPU to the operating system. A thread belongs
421 * to a specific core. A core may have more than one thread. When that is
422 * the case, the threads that are part of the same core are often referred
423 * to as 'siblings'.
424 *
425 * When multiple threads exist, this is generally referred to as
426 * simultaneous multi-threading (SMT). When Intel introduced this in their
427 * processors they called it hyper-threading (HT). When multiple threads
428 * are active in a core, they split the resources of the core. For example,
429 * two threads may share the same set of hardware execution units.
430 *
431 * The operating system often uses the term 'strand' to refer to a thread.
432 * This helps disambiguate it from the software concept.
433 *
434 * CHIP
435 *
436 * Unfortunately, the term 'chip' is dramatically overloaded. At its most
 * basic meaning, it is used to refer to a single integrated circuit, which
438 * may or may not be the only thing in the package. In illumos, when you
439 * see the term 'chip' it is almost always referring to the same thing as
440 * the 'package'. However, many vendors may use chip to refer to one of
441 * many integrated circuits that have been placed in the package. As an
442 * example, see the subsequent definition.
443 *
444 * To try and keep things consistent, we will only use chip when referring
445 * to the entire integrated circuit package, with the exception of the
446 * definition of multi-chip module (because it is in the name) and use the
447 * term 'die' when we want the more general, potential sub-component
448 * definition.
449 *
450 * DIE
451 *
452 * A die refers to an integrated circuit. Inside of the package there may
453 * be a single die or multiple dies. This is sometimes called a 'chip' in
454 * vendor's parlance, but in this file, we use the term die to refer to a
455 * subcomponent.
456 *
457 * MULTI-CHIP MODULE
458 *
459 * A multi-chip module (MCM) refers to putting multiple distinct chips that
460 * are connected together in the same package. When a multi-chip design is
461 * used, generally each chip is manufactured independently and then joined
462 * together in the package. For example, on AMD's Zen microarchitecture
463 * (family 0x17), the package contains several dies (the second meaning of
464 * chip from above) that are connected together.
465 *
466 * CACHE
467 *
468 * A cache is a part of the processor that maintains copies of recently
469 * accessed memory. Caches are split into levels and then into types.
470 * Commonly there are one to three levels, called level one, two, and
471 * three. The lower the level, the smaller it is, the closer it is to the
472 * execution units of the CPU, and the faster it is to access. The layout
473 * and design of the cache come in many different flavors, consult other
474 * resources for a discussion of those.
475 *
476 * Caches are generally split into two types, the instruction and data
477 * cache. The caches contain what their names suggest, the instruction
478 * cache has executable program text, while the data cache has all other
479 * memory that the processor accesses. As of this writing, data is kept
480 * coherent between all of the caches on x86, so if one modifies program
481 * text before it is executed, that will be in the data cache, and the
482 * instruction cache will be synchronized with that change when the
483 * processor actually executes those instructions. This coherency also
484 * covers the fact that data could show up in multiple caches.
485 *
486 * Generally, the lowest level caches are specific to a core. However, the
 * last level cache is shared between some number of cores. The number of
488 * CPUs sharing this last level cache is important. This has implications
489 * for the choices that the scheduler makes, as accessing memory that might
490 * be in a remote cache after thread migration can be quite expensive.
491 *
492 * Sometimes, the word cache is abbreviated with a '$', because in US
493 * English the word cache is pronounced the same as cash. So L1D$ refers to
494 * the L1 data cache, and L2$ would be the L2 cache. This will not be used
495 * in the rest of this theory statement for clarity.
496 *
497 * MEMORY CONTROLLER
498 *
499 * The memory controller is a component that provides access to DRAM. Each
500 * memory controller can access a set number of DRAM channels. Each channel
501 * can have a number of DIMMs (sticks of memory) associated with it. A
502 * given package may have more than one memory controller. The association
503 * of the memory controller to a group of cores is important as it is
504 * cheaper to access memory on the controller that you are associated with.
505 *
506 * NUMA
507 *
 * NUMA, or non-uniform memory access, describes a way that systems are
509 * built. On x86, any processor core can address all of the memory in the
 * system. However, when using multiple sockets or possibly within a
511 * multi-chip module, some of that memory is physically closer and some of
512 * it is further. Memory that is further away is more expensive to access.
513 * Consider the following image of multiple sockets with memory:
514 *
515 * +--------+ +--------+
516 * | DIMM A | +----------+ +----------+ | DIMM D |
517 * +--------+-+ | | | | +-+------+-+
518 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
519 * +--------+-+ | | | | +-+------+-+
520 * | DIMM C | +----------+ +----------+ | DIMM F |
521 * +--------+ +--------+
522 *
523 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
524 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to
525 * access DIMMs A-C and more expensive to access D-F as it has to go
526 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
527 * D-F are cheaper than A-C. While the socket form is the most common, when
528 * using multi-chip modules, this can also sometimes occur. For another
529 * example of this that's more involved, see the AMD topology section.
530 *
531 *
532 * Intel Topology
533 * --------------
534 *
 * Most Intel processors since Nehalem (as of this writing the current gen
536 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
537 * the package is a single monolithic die. MCMs currently aren't used. Most
538 * parts have three levels of caches, with the L3 cache being shared between
539 * all of the cores on the package. The L1/L2 cache is generally specific to
540 * an individual core. The following image shows at a simplified level what
541 * this looks like. The memory controller is commonly part of something called
 * the 'Uncore', which used to be separate physical chips that were not a part of
543 * the package, but are now part of the same chip.
544 *
545 * +-----------------------------------------------------------------------+
546 * | Package |
547 * | +-------------------+ +-------------------+ +-------------------+ |
548 * | | Core | | Core | | Core | |
549 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
550 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | |
551 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | |
552 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | |
553 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | |
554 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
555 * | | +--------------+ | | +--------------+ | | +--------------+ | |
556 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | |
557 * | | +--------------+ | | +--------------+ | | +--------------+ | |
558 * | +-------------------+ +-------------------+ +-------------------+ |
559 * | +-------------------------------------------------------------------+ |
560 * | | Shared L3 Cache | |
561 * | +-------------------------------------------------------------------+ |
562 * | +-------------------------------------------------------------------+ |
563 * | | Memory Controller | |
564 * | +-------------------------------------------------------------------+ |
565 * +-----------------------------------------------------------------------+
566 *
567 * A side effect of this current architecture is that what we care about from a
 * scheduling and topology perspective is simplified. In general we care about
569 * understanding which logical CPUs are part of the same core and socket.
570 *
571 * To determine the relationship between threads and cores, Intel initially used
572 * the identifier in the advanced programmable interrupt controller (APIC). They
573 * also added cpuid leaf 4 to give additional information about the number of
574 * threads and CPUs in the processor. With the addition of x2apic (which
575 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
576 * additional cpuid topology leaf 0xB was added.
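 *
 * A hedged userland sketch of walking leaf 0xB (again using the <cpuid.h>
 * helpers and assuming the leaf has already been verified to be within the
 * maximum basic leaf; the kernel's real decoding lives in this file):
 *
 *     unsigned int level, eax, ebx, ecx, edx;
 *     unsigned int smt_shift = 0, core_shift = 0;
 *
 *     for (level = 0; ; level++) {
 *             unsigned int type;
 *
 *             __cpuid_count(0xb, level, eax, ebx, ecx, edx);
 *             type = (ecx >> 8) & 0xff;
 *             if (type == 0)           // no further topology levels
 *                     break;
 *             if (type == 1)           // SMT level: bits covering threads
 *                     smt_shift = eax & 0x1f;
 *             else if (type == 2)      // core: bits covering cores+threads
 *                     core_shift = eax & 0x1f;
 *     }
 *     // %edx of any sub-leaf holds this CPU's full 32-bit x2APIC ID.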
577 *
578 * AMD Topology
579 * ------------
580 *
581 * When discussing AMD topology, we want to break this into three distinct
582 * generations of topology. There's the basic topology that has been used in
583 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
584 * with family 0x15 (Bulldozer), and there's the topology that was introduced
585 * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
 * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
587 * additional terminology that's worth talking about.
588 *
589 * Until the introduction of family 0x17 (Zen), AMD did not implement something
590 * that they considered SMT. Whether or not the AMD processors have SMT
591 * influences many things including scheduling and reliability, availability,
592 * and serviceability (RAS) features.
593 *
594 * NODE
595 *
596 * AMD uses the term node to refer to a die that contains a number of cores
597 * and I/O resources. Depending on the processor family and model, more
598 * than one node can be present in the package. When there is more than one
599 * node this indicates a multi-chip module. Usually each node has its own
600 * access to memory and I/O devices. This is important and generally
601 * different from the corresponding Intel Nehalem-Skylake+ processors. As a
602 * result, we track this relationship in the operating system.
603 *
604 * In processors with an L3 cache, the L3 cache is generally shared across
605 * the entire node, though the way this is carved up varies from generation
606 * to generation.
607 *
608 * BULLDOZER
609 *
610 * Starting with the Bulldozer family (0x15) and continuing until the
611 * introduction of the Zen microarchitecture, AMD introduced the idea of a
612 * compute unit. In a compute unit, two traditional cores share a number of
613 * hardware resources. Critically, they share the FPU, L1 instruction
614 * cache, and the L2 cache. Several compute units were then combined inside
615 * of a single node. Because the integer execution units, L1 data cache,
616 * and some other resources were not shared between the cores, AMD never
617 * considered this to be SMT.
618 *
619 * ZEN
620 *
 * The Zen family (0x17) uses a multi-chip module (MCM) design; the module
622 * is called Zeppelin. These modules are similar to the idea of nodes used
623 * previously. Each of these nodes has two DRAM channels which all of the
624 * cores in the node can access uniformly. These nodes are linked together
625 * in the package, creating a NUMA environment.
626 *
627 * The Zeppelin die itself contains two different 'core complexes'. Each
628 * core complex consists of four cores which each have two threads, for a
629 * total of 8 logical CPUs per complex. Unlike other generations,
630 * where all the logical CPUs in a given node share the L3 cache, here each
631 * core complex has its own shared L3 cache.
632 *
633 * A further thing that we need to consider is that in some configurations,
634 * particularly with the Threadripper line of processors, not every die
635 * actually has its memory controllers wired up to actual memory channels.
636 * This means that some cores have memory attached to them and others
637 * don't.
638 *
639 * To put Zen in perspective, consider the following images:
640 *
641 * +--------------------------------------------------------+
642 * | Core Complex |
643 * | +-------------------+ +-------------------+ +---+ |
644 * | | Core +----+ | | Core +----+ | | | |
645 * | | +--------+ | L2 | | | +--------+ | L2 | | | | |
646 * | | | Thread | +----+ | | | Thread | +----+ | | | |
647 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | |
648 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | |
649 * | | +--------+ +--+ | | +--------+ +--+ | | | |
650 * | +-------------------+ +-------------------+ | C | |
651 * | +-------------------+ +-------------------+ | a | |
652 * | | Core +----+ | | Core +----+ | | c | |
653 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | |
654 * | | | Thread | +----+ | | | Thread | +----+ | | e | |
655 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | |
656 * | | | Thread | |L1| | | | Thread | |L1| | | | |
657 * | | +--------+ +--+ | | +--------+ +--+ | | | |
658 * | +-------------------+ +-------------------+ +---+ |
659 * | |
660 * +--------------------------------------------------------+
661 *
662 * This first image represents a single Zen core complex that consists of four
663 * cores.
664 *
665 *
666 * +--------------------------------------------------------+
667 * | Zeppelin Die |
668 * | +--------------------------------------------------+ |
669 * | | I/O Units (PCIe, SATA, USB, etc.) | |
670 * | +--------------------------------------------------+ |
671 * | HH |
672 * | +-----------+ HH +-----------+ |
673 * | | | HH | | |
674 * | | Core |==========| Core | |
675 * | | Complex |==========| Complex | |
676 * | | | HH | | |
677 * | +-----------+ HH +-----------+ |
678 * | HH |
679 * | +--------------------------------------------------+ |
680 * | | Memory Controller | |
681 * | +--------------------------------------------------+ |
682 * | |
683 * +--------------------------------------------------------+
684 *
 * This image represents a single Zeppelin Die. Note how both core complexes
 * are connected to the same memory controller and I/O units. While each core
687 * complex has its own L3 cache as seen in the first image, they both have
688 * uniform access to memory.
689 *
690 *
691 * PP PP
692 * PP PP
693 * +----------PP---------------------PP---------+
694 * | PP PP |
695 * | +-----------+ +-----------+ |
696 * | | | | | |
697 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
698 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
699 * | | | | | |
700 * | +-----------+ooo ...+-----------+ |
701 * | HH ooo ... HH |
702 * | HH oo.. HH |
703 * | HH ..oo HH |
704 * | HH ... ooo HH |
705 * | +-----------+... ooo+-----------+ |
706 * | | | | | |
707 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
708 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
709 * | | | | | |
710 * | +-----------+ +-----------+ |
711 * | PP PP |
712 * +----------PP---------------------PP---------+
713 * PP PP
714 * PP PP
715 *
716 * This image represents a single Zen package. In this example, it has four
717 * Zeppelin dies, though some configurations only have a single one. In this
718 * example, each die is directly connected to the next. Also, each die is
719 * represented as being connected to memory by the 'M' character and connected
720 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
721 * die is made up of two core complexes, we have multiple different NUMA
722 * domains that we care about for these systems.
723 *
724 * ZEN 2
725 *
 * Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
 * each Zeppelin Die had its own I/O and memory controllers, that
 * functionality has been moved out into a separate I/O die in Zen 2. The
 * actual core complex looks pretty similar, but now the compute die itself
 * looks much simpler:
730 *
731 * +--------------------------------------------------------+
732 * | Zen 2 Core Complex Die HH |
733 * | HH |
734 * | +-----------+ HH +-----------+ |
735 * | | | HH | | |
736 * | | Core |==========| Core | |
737 * | | Complex |==========| Complex | |
738 * | | | HH | | |
739 * | +-----------+ HH +-----------+ |
740 * | HH |
741 * | HH |
742 * +--------------------------------------------------------+
743 *
744 * From here, when we add the central I/O die, this changes things a bit.
745 * Each die is connected to the I/O die, rather than trying to interconnect
746 * them directly. The following image takes the same Zen 1 image that we
747 * had earlier and shows what it looks like with the I/O die instead:
748 *
749 * PP PP
750 * PP PP
751 * +---------------------PP----PP---------------------+
752 * | PP PP |
753 * | +-----------+ PP PP +-----------+ |
754 * | | | PP PP | | |
755 * | | Zen 2 | +-PP----PP-+ | Zen 2 | |
756 * | | Die _| | PP PP | |_ Die | |
757 * | | |o|oooo| |oooo|o| | |
758 * | +-----------+ | | +-----------+ |
759 * | | I/O | |
760 * MMMMMMMMMMMMMMMMMMMMMMMMMM Die MMMMMMMMMMMMMMMMMMMMMMMMMM
761 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM
762 * | | | |
763 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM
764 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM
765 * | | | |
766 * | +-----------+ | | +-----------+ |
767 * | | |o|oooo| PP PP |oooo|o| | |
768 * | | Zen 2 -| +-PP----PP-+ |- Zen 2 | |
769 * | | Die | PP PP | Die | |
770 * | | | PP PP | | |
771 * | +-----------+ PP PP +-----------+ |
772 * | PP PP |
773 * +---------------------PP----PP---------------------+
774 * PP PP
775 * PP PP
776 *
777 * The above has four core complex dies installed, though the Zen 2 EPYC
 * and Threadripper parts allow for up to eight, while the Ryzen parts
779 * generally only have one to two. The more notable difference here is how
780 * everything communicates. Note that memory and PCIe come out of the
781 * central die. This changes the way that one die accesses a resource. It
 * basically always has to go to the I/O die, whereas in Zen 1 it may have
783 * satisfied it locally. In general, this ends up being a better strategy
784 * for most things, though it is possible to still treat everything in four
785 * distinct NUMA domains with each Zen 2 die slightly closer to some memory
786 * and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
787 * now there is only one 'node' present.
788 *
789 * ZEN 3
790 *
791 * From an architectural perspective, Zen 3 is a much smaller change from
792 * Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
793 * its microarchitectural changes. The biggest thing for us is how the die
794 * changes. In Zen 1 and Zen 2, each core complex still had its own L3
795 * cache. However, in Zen 3, the L3 is now shared between the entire core
796 * complex die and is no longer partitioned between each core complex. This
797 * means that all cores on the die can share the same L3 cache. Otherwise,
798 * the general layout of the overall package with various core complexes
799 * and an I/O die stays the same. Here's what the Core Complex Die looks
800 * like in a bit more detail:
801 *
802 * +-------------------------------------------------+
803 * | Zen 3 Core Complex Die |
804 * | +-------------------+ +-------------------+ |
805 * | | Core +----+ | | Core +----+ | |
806 * | | +--------+ | L2 | | | +--------+ | L2 | | |
807 * | | | Thread | +----+ | | | Thread | +----+ | |
808 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
809 * | | | Thread | |L1| | | | Thread | |L1| | |
810 * | | +--------+ +--+ | | +--------+ +--+ | |
811 * | +-------------------+ +-------------------+ |
812 * | +-------------------+ +-------------------+ |
813 * | | Core +----+ | | Core +----+ | |
814 * | | +--------+ | L2 | | | +--------+ | L2 | | |
815 * | | | Thread | +----+ | | | Thread | +----+ | |
816 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
817 * | | | Thread | |L1| | | | Thread | |L1| | |
818 * | | +--------+ +--+ | | +--------+ +--+ | |
819 * | +-------------------+ +-------------------+ |
820 * | |
821 * | +--------------------------------------------+ |
822 * | | L3 Cache | |
823 * | +--------------------------------------------+ |
824 * | |
825 * | +-------------------+ +-------------------+ |
826 * | | Core +----+ | | Core +----+ | |
827 * | | +--------+ | L2 | | | +--------+ | L2 | | |
828 * | | | Thread | +----+ | | | Thread | +----+ | |
829 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
830 * | | | Thread | |L1| | | | Thread | |L1| | |
831 * | | +--------+ +--+ | | +--------+ +--+ | |
832 * | +-------------------+ +-------------------+ |
833 * | +-------------------+ +-------------------+ |
834 * | | Core +----+ | | Core +----+ | |
835 * | | +--------+ | L2 | | | +--------+ | L2 | | |
836 * | | | Thread | +----+ | | | Thread | +----+ | |
837 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
838 * | | | Thread | |L1| | | | Thread | |L1| | |
839 * | | +--------+ +--+ | | +--------+ +--+ | |
840 * | +-------------------+ +-------------------+ |
841 * +-------------------------------------------------+
842 *
843 * While it is not pictured, there are connections from the die to the
844 * broader data fabric and additional functional blocks to support that
845 * communication and coherency.
846 *
847 * CPUID LEAVES
848 *
849 * There are a few different CPUID leaves that we can use to try and understand
850 * the actual state of the world. As part of the introduction of family 0xf, AMD
851 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
852 * processors that are in the system. Because families before Zen didn't have
853 * SMT, this was always the number of cores that were in the system. However, it
854 * should always be thought of as the number of logical threads to be consistent
855 * between generations. In addition we also get the size of the APIC ID that is
856 * used to represent the number of logical processors. This is important for
857 * deriving topology information.
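 *
 * A hedged sketch of extracting those two values (field positions are per the
 * AMD manuals; as before, this is a userland illustration rather than the
 * kernel's code):
 *
 *     unsigned int eax, ebx, ecx, edx;
 *     unsigned int nthreads, apicid_size;
 *
 *     if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx) != 0) {
 *             nthreads = (ecx & 0xff) + 1;      // logical CPUs ("NC" + 1)
 *             apicid_size = (ecx >> 12) & 0xf;  // APIC ID bits covering them
 *             // An apicid_size of 0 means the width must instead be
 *             // derived from nthreads.
 *     }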
858 *
859 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
860 * bit between Bulldozer and later families, but it is quite useful in
861 * determining the topology information. Because this information has changed
862 * across family generations, it's worth calling out what these mean
863 * explicitly. The registers have the following meanings:
864 *
865 * %eax The APIC ID. The entire register is defined to have a 32-bit
866 * APIC ID, even though on systems without x2apic support, it will
867 * be limited to 8 bits.
868 *
869 * %ebx On Bulldozer-era systems this contains information about the
870 * number of cores that are in a compute unit (cores that share
871 * resources). It also contains a per-package compute unit ID that
872 * identifies which compute unit the logical CPU is a part of.
873 *
874 * On Zen-era systems this instead contains the number of threads
875 * per core and the ID of the core that the logical CPU is a part
876 * of. Note, this ID is unique only to the package, it is not
877 * globally unique across the entire system.
878 *
879 * %ecx This contains the number of nodes that exist in the package. It
880 * also contains an ID that identifies which node the logical CPU
881 * is a part of.
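 *
 * A hedged sketch of decoding this leaf on a Zen-era part with the same
 * userland helpers (field widths are per the PPRs; on Bulldozer-era parts
 * %ebx instead carries the compute unit ID and cores per compute unit, as
 * described above):
 *
 *     unsigned int eax, ebx, ecx, edx;
 *     unsigned int coreid, thr_per_core, nodeid, nodes_per_pkg;
 *
 *     if (__get_cpuid(0x8000001e, &eax, &ebx, &ecx, &edx) != 0) {
 *             // %eax is the full extended APIC ID.
 *             coreid = ebx & 0xff;                    // package-local core
 *             thr_per_core = ((ebx >> 8) & 0xff) + 1;
 *             nodeid = ecx & 0xff;                    // node of this CPU
 *             nodes_per_pkg = ((ecx >> 8) & 0x7) + 1;
 *     }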
882 *
883 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
884 * cache layout to determine which logical CPUs are sharing which caches.
885 *
886 * illumos Topology
887 * ----------------
888 *
889 * Based on the above we synthesize the information into several different
890 * variables that we store in the 'struct cpuid_info'. We'll go into the details
891 * of what each member is supposed to represent and their uniqueness. In
892 * general, there are two levels of uniqueness that we care about. We care about
893 * an ID that is globally unique. That means that it will be unique across all
894 * entities in the system. For example, the default logical CPU ID is globally
895 * unique. On the other hand, there is some information that we only care about
896 * being unique within the context of a single package / socket. Here are the
897 * variables that we keep track of and their meaning.
898 *
 * Several of the values that represent an identifier, with the exception of
 * cpi_apicid, are allowed to be synthetic.
901 *
902 *
903 * cpi_apicid
904 *
905 * This is the value of the CPU's APIC id. This should be the full 32-bit
906 * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
907 * APIC ID. This value is globally unique between all logical CPUs across
908 * all packages. This is usually required by the APIC.
909 *
910 * cpi_chipid
911 *
912 * This value indicates the ID of the package that the logical CPU is a
913 * part of. This value is allowed to be synthetic. It is usually derived by
914 * taking the CPU's APIC ID and determining how many bits are used to
915 * represent CPU cores in the package. All logical CPUs that are part of
916 * the same package must have the same value.
917 *
918 * cpi_coreid
919 *
920 * This represents the ID of a CPU core. Two logical CPUs should only have
921 * the same cpi_coreid value if they are part of the same core. These
922 * values may be synthetic. On systems that support SMT, this value is
923 * usually derived from the APIC ID, otherwise it is often synthetic and
924 * just set to the value of the cpu_id in the cpu_t.
925 *
926 * cpi_pkgcoreid
927 *
928 * This is similar to the cpi_coreid in that logical CPUs that are part of
929 * the same core should have the same ID. The main difference is that these
930 * values are only required to be unique to a given socket.
931 *
932 * cpi_clogid
933 *
934 * This represents the logical ID of a logical CPU. This value should be
935 * unique within a given socket for each logical CPU. This is allowed to be
936 * synthetic, though it is usually based off of the CPU's apic ID. The
 * broader system expects that logical CPUs that are part of the same
938 * core have contiguous numbers. For example, if there were two threads per
 * core, then the logical IDs of two sibling threads divided by two should be
 * the same, and modulo two the first should be zero and the second one. For
 * example, IDs 4 and 5
941 * indicate two logical CPUs that are part of the same core. But IDs 5 and
942 * 6 represent two logical CPUs that are part of different cores.
943 *
944 * While it is common for the cpi_coreid and the cpi_clogid to be derived
945 * from the same source, strictly speaking, they don't have to be and the
946 * two values should be considered logically independent. One should not
947 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
948 * some kind of relationship. While this is tempting, we've seen cases on
949 * AMD family 0xf where the system's cpu id is not related to its APIC ID.
950 *
951 * cpi_ncpu_per_chip
952 *
953 * This value indicates the total number of logical CPUs that exist in the
954 * physical package. Critically, this is not the number of logical CPUs
955 * that exist for just the single core.
956 *
957 * This value should be the same for all logical CPUs in the same package.
958 *
959 * cpi_ncore_per_chip
960 *
961 * This value indicates the total number of physical CPU cores that exist
962 * in the package. The system compares this value with cpi_ncpu_per_chip to
963 * determine if simultaneous multi-threading (SMT) is enabled. When
964 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
965 * the X86FSET_HTT feature is not set. If this value is greater than one,
 * then we consider the processor to have the feature X86FSET_CMP, to
967 * indicate that there is support for more than one core.
968 *
969 * This value should be the same for all logical CPUs in the same package.
970 *
971 * cpi_procnodes_per_pkg
972 *
973 * This value indicates the number of 'nodes' that exist in the package.
974 * When processors are actually a multi-chip module, this represents the
975 * number of such modules that exist in the package. Currently, on Intel
976 * based systems this member is always set to 1.
977 *
978 * This value should be the same for all logical CPUs in the same package.
979 *
980 * cpi_procnodeid
981 *
982 * This value indicates the ID of the node that the logical CPU is a part
983 * of. All logical CPUs that are in the same node must have the same value
984 * here. This value must be unique across all of the packages in the
985 * system. On Intel based systems, this is currently set to the value in
986 * cpi_chipid because there is only one node.
987 *
988 * cpi_cores_per_compunit
989 *
990 * This value indicates the number of cores that are part of a compute
991 * unit. See the AMD topology section for this. This member only has real
992 * meaning currently for AMD Bulldozer family processors. For all other
993 * processors, this should currently be set to 1.
994 *
995 * cpi_compunitid
996 *
997 * This indicates the compute unit that the logical CPU belongs to. For
998 * processors without AMD Bulldozer-style compute units this should be set
999 * to the value of cpi_coreid.
1000 *
1001 * cpi_ncpu_shr_last_cache
1002 *
1003 * This indicates the number of logical CPUs that are sharing the same last
1004 * level cache. This value should be the same for all CPUs that are sharing
1005 * that cache. The last cache refers to the cache that is closest to memory
1006 * and furthest away from the CPU.
1007 *
1008 * cpi_last_lvl_cacheid
1009 *
1010 * This indicates the ID of the last cache that the logical CPU uses. This
1011 * cache is often shared between multiple logical CPUs and is the cache
1012 * that is closest to memory and furthest away from the CPU. This value
1013 * should be the same for a group of logical CPUs only if they actually
1014 * share the same last level cache. IDs should not overlap between
1015 * packages.
1016 *
1017 * cpi_ncore_bits
1018 *
1019 * This indicates the number of bits that are required to represent all of
1020 * the cores in the system. As cores are derived based on their APIC IDs,
1021 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1022 * this value to be larger than the actual number of IDs that are present
1023 * in the system. This is used to size tables by the CMI framework. It is
1024 * only filled in for Intel and AMD CPUs.
1025 *
1026 * cpi_nthread_bits
1027 *
1028 * This indicates the number of bits required to represent all of the IDs
1029 * that cover the logical CPUs that exist on a given core. It's OK for this
1030 * value to be larger than the actual number of IDs that are present in the
1031 * system. This is used to size tables by the CMI framework. It is
1032 * only filled in for Intel and AMD CPUs.
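 *
 * To tie a few of the members above together, here is a schematic sketch of
 * the kinds of derivations described (not the literal code in this file,
 * which must cope with many vendor- and generation-specific details;
 * 'pkg_shift' is a made-up name for the APIC ID shift derived elsewhere):
 *
 *     // pkg_shift: APIC ID bits covering every core and thread in a package.
 *     cpi->cpi_chipid = cpi->cpi_apicid >> pkg_shift;
 *
 *     // SMT and multi-core determination, schematically:
 *     if (cpi->cpi_ncpu_per_chip > cpi->cpi_ncore_per_chip)
 *             add_x86_feature(featureset, X86FSET_HTT);
 *     if (cpi->cpi_ncore_per_chip > 1)
 *             add_x86_feature(featureset, X86FSET_CMP);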
1033 *
1034 * -----------
1035 * Hypervisors
1036 * -----------
1037 *
1038 * If trying to manage the differences between vendors wasn't bad enough, it can
1039 * get worse thanks to our friend hardware virtualization. Hypervisors are given
1040 * the ability to interpose on all cpuid instructions and change them to suit
1041 * their purposes. In general, this is necessary as the hypervisor wants to be
1042 * able to present a more uniform set of features or not necessarily give the
1043 * guest operating system kernel knowledge of all features so it can be
1044 * more easily migrated between systems.
1045 *
1046 * When it comes to trying to determine topology information, this can be a
1047 * double edged sword. When a hypervisor doesn't actually implement a cpuid
1048 * leaf, it'll often return all zeros. Because of that, you'll often see various
1049 * checks scattered about fields being non-zero before we assume we can use
1050 * them.
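 *
 * For instance, a consumer of the topology leaves might guard itself roughly
 * like this (a sketch using the userland helpers from earlier;
 * use_legacy_topology() is a made-up placeholder):
 *
 *     unsigned int eax, ebx, ecx, edx;
 *
 *     __cpuid_count(0xb, 0, eax, ebx, ecx, edx);
 *     if (ebx == 0) {
 *             // Leaf 0xb is nominally in range but was not filled in (or a
 *             // hypervisor zeroed it); fall back to the older leaves.
 *             use_legacy_topology();
 *     }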
1051 *
1052 * When it comes to topology information, the hypervisor is often incentivized
1053 * to lie to you about topology. This is because it doesn't always actually
1054 * guarantee that topology at all. The topology path we take in the system
1055 * depends on how the CPU advertises itself. If it advertises itself as an Intel
1056 * or AMD CPU, then we basically do our normal path. However, when they don't
 * use an actual vendor, then we usually end up enumerating multiple one-core
 * CPUs that often appear to be on different sockets. The actual behavior
1059 * depends greatly on what the hypervisor actually exposes to us.
1060 *
1061 * --------------------
1062 * Exposing Information
1063 * --------------------
1064 *
1065 * We expose CPUID information in three different forms in the system.
1066 *
1067 * The first is through the x86_featureset variable. This is used in conjunction
1068 * with the is_x86_feature() function. This is queried by x86-specific functions
1069 * to determine which features are or aren't present in the system and to make
1070 * decisions based upon them. For example, users of this include everything from
1071 * parts of the system dedicated to reliability, availability, and
1072 * serviceability (RAS), to making decisions about how to handle security
1073 * mitigations, to various x86-specific drivers. General purpose or
1074 * architecture independent drivers should never be calling this function.
1075 *
1076 * The second means is through the auxiliary vector. The auxiliary vector is a
1077 * series of tagged data that the kernel passes down to a user program when it
1078 * begins executing. This information is used to indicate to programs what
1079 * instruction set extensions are present. For example, information about the
1080 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1081 * since user programs cannot make use of it. However, things like the AVX
1082 * instruction sets are. Programs use this information to make run-time
1083 * decisions about what features they should use. As an example, the run-time
1084 * link-editor (rtld) can relocate different functions depending on the hardware
1085 * support available.
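 *
 * For example, a userland program on illumos can ask for these bits with
 * getisax(2) rather than executing cpuid itself (a sketch; AV_386_AVX is one
 * of the flag names, and the exact headers required are an assumption here):
 *
 *     #include <stdio.h>
 *     #include <sys/auxv.h>
 *
 *     int
 *     main(void)
 *     {
 *             uint32_t hw[2] = { 0, 0 };
 *
 *             (void) getisax(hw, 2);
 *             if (hw[0] & AV_386_AVX)
 *                     (void) printf("AVX is available\n");
 *             return (0);
 *     }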
1086 *
1087 * The final form is through a series of accessor functions that all have the
1088 * form cpuid_get*. This is used by a number of different subsystems in the
1089 * kernel to determine more detailed information about what we're running on,
1090 * topology information, etc. Some of these subsystems include processor groups
 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1092 * microcode, and performance monitoring. These functions all ASSERT that the
1093 * CPU they're being called on has reached a certain cpuid pass. If the passes
1094 * are rearranged, then this needs to be adjusted.
1095 *
1096 * -----------------------------------------------
1097 * Speculative Execution CPU Side Channel Security
1098 * -----------------------------------------------
1099 *
1100 * With the advent of the Spectre and Meltdown attacks, which exploit
1101 * speculative execution in the CPU to create side channels, there have been a
1102 * number of different attacks and corresponding issues that the operating
1103 * system needs to mitigate against. The following is a common, but not
1104 * exhaustive, list of issues that we know about and for which we have done
1105 * some mitigation work in the system (or still need to do more):
1106 *
1107 * - Spectre v1
1108 * - swapgs (Spectre v1 variant)
1109 * - Spectre v2
1110 * - Meltdown (Spectre v3)
1111 * - Rogue Register Read (Spectre v3a)
1112 * - Speculative Store Bypass (Spectre v4)
1113 * - ret2spec, SpectreRSB
1114 * - L1 Terminal Fault (L1TF)
1115 * - Microarchitectural Data Sampling (MDS)
1116 *
1117 * Each of these requires different sets of mitigations and has different attack
1118 * surfaces. For the most part, this discussion is about protecting the kernel
1119 * from non-kernel executing environments such as user processes and hardware
1120 * virtual machines. Unfortunately, there are a number of user vs. user
1121 * scenarios that exist with these. The rest of this section will describe the
1122 * overall approach that the system has taken to address these as well as their
1123 * shortcomings. Unfortunately, not all of the above have been handled today.
1124 *
1125 * SPECTRE v2, ret2spec, SpectreRSB
1126 *
1127 * The second variant of the spectre attack focuses on performing branch target
1128 * injection. This generally impacts indirect call instructions in the system.
1129 * There are four different ways to mitigate this issue that are commonly
1130 * described today:
1131 *
1132 * 1. Using Indirect Branch Restricted Speculation (IBRS).
1133 * 2. Using Retpolines and RSB Stuffing
1134 * 3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1135 * 4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1136 *
1137 * IBRS uses a feature added to microcode to restrict speculation, among other
1138 * things. This form of mitigation has not been used as it has been generally
1139 * seen as too expensive and requires reactivation upon various transitions in
1140 * the system.
1141 *
1142 * As a less impactful alternative to IBRS, retpolines were developed by
1143 * Google. These basically require one to replace indirect calls with a specific
1144 * trampoline that will cause speculation to fail and break the attack.
1145 * Retpolines require compiler support. We always build with retpolines in the
1146 * external thunk mode. This means that a traditional indirect call is replaced
1147 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1148 * of this is that all indirect function calls are performed through a register.
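 *
 * To make that concrete, ordinary C such as the following (lookup_handler()
 * is hypothetical):
 *
 *	void (*fn)(void) = lookup_handler();
 *	fn();
 *
 * is compiled, in external thunk mode, into a move of fn into a register
 * followed by a direct call to the matching __x86_indirect_thunk_<reg>
 * function, rather than into an indirect call instruction at the callsite.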
1149 *
1150 * We have to use a common external location for the thunk rather than inline
1151 * it into the callsite so that we have a single place to patch these functions.
1152 * As it turns out, we currently have two different forms of retpolines that
1153 * exist in the system:
1154 *
1155 * 1. A full retpoline
1156 * 2. A no-op version
1157 *
1158 * The first one is used in the general case. Historically, there was an
1159 * AMD-specific optimized retpoline variant that was based around using a
1160 * serializing lfence instruction; however, in March 2022 it was announced that
1161 * this was actually still vulnerable to Spectre v2 and therefore we no longer
1162 * use it and it is no longer available in the system.
1163 *
1164 * The third mitigation listed above, eIBRS, is the most curious. It turns out
1165 * that retpolines rely on how speculation is performed on a 'ret' instruction.
1166 * Intel has continued to optimize this
1167 * process (which is partly why we need to have return stack buffer stuffing,
1168 * but more on that in a bit) and in processors starting with Cascade Lake
1169 * on the server side, it's dangerous to rely on retpolines. Instead, a new
1170 * mechanism has been introduced called Enhanced IBRS (eIBRS).
1171 *
1172 * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1173 * physical core. However, if this is the case, we don't want to use retpolines
1174 * any more. Therefore if eIBRS is present, we end up turning each retpoline
1175 * function (called a thunk) into a jmp instruction. This means that we're still
1176 * paying the cost of an extra jump to the external thunk, but it gives us
1177 * flexibility and the ability to have a single kernel image that works across a
1178 * wide variety of systems and hardware features.
1179 *
1180 * Unfortunately, this alone is insufficient. First, Skylake systems have
1181 * additional speculation for the Return Stack Buffer (RSB), which is used to
1182 * return from call instructions and which retpolines take advantage of.
1183 * However, this problem is not limited to Skylake and is actually more pernicious.
1184 * The SpectreRSB paper introduces several more problems that can arise when
1185 * dealing with this. The RSB can be poisoned just like the indirect branch
1186 * predictor. This means that one needs to clear the RSB when transitioning
1187 * between two different privilege domains. Some examples include:
1188 *
1189 * - Switching between two different user processes
1190 * - Going between user land and the kernel
1191 * - Returning to the kernel from a hardware virtual machine
1192 *
1193 * Mitigating this involves combining a couple of different things. The first is
1194 * SMEP (supervisor mode execution protection) which was introduced in Ivy
1195 * Bridge. When an RSB entry refers to a user address and we're executing in the
1196 * kernel, speculation through it will be stopped when SMEP is enabled. This
1197 * protects against a number of the different cases that we would normally be
1198 * worried about such as when we enter the kernel from user land.
1199 *
1200 * To protect against additional manipulation of the RSB from other contexts,
1201 * such as a non-root VMX context attacking the kernel, we first look to
1202 * enhanced IBRS. When eIBRS is present and enabled, there should be
1203 * nothing else that we need to do to protect the kernel at this time.
1204 *
1205 * Unfortunately, eIBRS or not, we need to manually overwrite the contents of
1206 * the return stack buffer. We do this through the x86_rsb_stuff() function.
1207 * Currently this is employed on context switch and vmx_exit. The
1208 * x86_rsb_stuff() function is disabled only when mitigations in general are.
1209 *
1210 * If SMEP is not present, then we would have to stuff the RSB every time we
1211 * transitioned from user mode to the kernel, which isn't very practical right
1212 * now.
1213 *
1214 * To fully protect against user-to-user and vmx-to-vmx attacks from these
1215 * classes of issues, we would also need to allow them to opt into performing
1216 * an Indirect Branch Prediction Barrier (IBPB) on switch. This is not yet wired up.
1217 *
1218 * The fourth form of mitigation here is specific to AMD and is called Automated
1219 * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1220 * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1221 * (extended feature enable register) MSR. This bit basically says that IBRS
1222 * acts as though it is always active when executing at CPL0 and when executing
1223 * in the 'host' context when SEV-SNP is enabled.
1224 *
1225 * When this is active, AMD states that the RSB is cleared on VMEXIT and that
1226 * stuffing it there is therefore unnecessary. While this handles RSB stuffing
1227 * attacks from SVM to the kernel, we must still consider the remaining cases, just
1228 * like above. While traditionally AMD employed a 32 entry RSB allowing the
1229 * traditional technique to work, this is not true on all CPUs. While a write to
1230 * IBRS would clear the RSB if the processor supports more than 32 entries (but
1231 * not otherwise), AMD states that as long as at least a single 4 KiB unmapped
1232 * guard page is present between user and kernel address spaces and SMEP is
1233 * enabled, then there is no need to clear the RSB at all.
1234 *
1235 * By default, the system will enable RSB stuffing and the required variant of
1236 * retpolines and store that information in the x86_spectrev2_mitigation value.
1237 * This will be evaluated after a microcode update as well, though it is
1238 * expected that microcode updates will not take away features. This may mean
1239 * that a late microcode load may not end up in the optimal configuration
1240 * (though this should be rare).
1241 *
1242 * Currently we do not build kmdb with retpolines or perform any additional side
1243 * channel security mitigations for it. One complication with kmdb is that it
1244 * requires its own retpoline thunks and it would need to adjust itself based on
1245 * what the kernel does. The threat model of kmdb is more limited and therefore
1246 * it may make more sense to investigate using prediction barriers as the whole
1247 * system is only executing a single instruction at a time while in kmdb.
1248 *
1249 * SPECTRE v1, v4
1250 *
1251 * The v1 and v4 variants of spectre are not currently mitigated in the
1252 * system and require other classes of changes to occur in the code.
1253 *
1254 * SPECTRE v1 (SWAPGS VARIANT)
1255 *
1256 * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1257 * can generally affect any branch-dependent code. The swapgs issue is one
1258 * variant of this. If we are coming in from userspace, we can have code like
1259 * this:
1260 *
1261 * cmpw $KCS_SEL, REGOFF_CS(%rsp)
1262 * je 1f
1263 * movq $0, REGOFF_SAVFP(%rsp)
1264 * swapgs
1265 * 1:
1266 * movq %gs:CPU_THREAD, %rax
1267 *
1268 * If an attacker can cause a mis-speculation of the branch here, we could skip
1269 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1270 * load. If subsequent code can act as the usual Spectre cache gadget, this
1271 * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1272 * any use of the %gs override.
1273 *
1274 * The other case is also an issue: if we're coming into a trap from kernel
1275 * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1276 * using it. AMD systems are not vulnerable to this version, as a swapgs is
1277 * serializing with respect to subsequent uses. But as AMD /does/ need the other
1278 * case, and the fix is the same in both cases (an lfence at the branch target
1279 * 1: in this example), we'll just do it unconditionally.
1280 *
1281 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1282 * harder for user-space to actually set a useful %gsbase value: although it's
1283 * not entirely clear, it might still be feasible via lwp_setprivate(), so we
1284 * mitigate anyway.
1285 *
1286 * MELTDOWN
1287 *
1288 * Meltdown, or Spectre v3, allowed a user process to read any data in its
1289 * address space regardless of whether or not the page tables in question
1290 * granted the user permission to read them. The solution to Meltdown
1291 * is kernel page table isolation. In this world, there are two page tables that
1292 * are used for a process, one in user land and one in the kernel. To implement
1293 * this we use per-CPU page tables and switch between the user and kernel
1294 * variants when entering and exiting the kernel. For more information about
1295 * this process and how the trampolines work, please see the big theory
1296 * statements and additional comments in:
1297 *
1298 * - uts/i86pc/ml/kpti_trampolines.s
1299 * - uts/i86pc/vm/hat_i86.c
1300 *
1301 * While Meltdown only impacted Intel systems, and some Intel systems have
1302 * Meltdown (also known as Rogue Data Cache Load) fixed, we always have
1303 * kernel page table isolation enabled. While this may at first seem weird, an
1304 * important thing to remember is that you can't speculatively read an address
1305 * if it's never in your page table at all. Having user processes without kernel
1306 * pages present provides us with an important layer of defense in the kernel
1307 * against any other side channel attacks that exist and have yet to be
1308 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1309 * default, no matter the x86 system.
1310 *
1311 * L1 TERMINAL FAULT
1312 *
1313 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1314 * execution uses page table entries. Effectively, it is two different problems.
1315 * The first is that it ignores the not present bit in the page table entries
1316 * when performing speculative execution. This means that something can
1317 * speculatively read the listed physical address if it's present in the L1
1318 * cache under certain conditions (see Intel's documentation for the full set of
1319 * conditions). Secondly, this can be used to bypass hardware virtualization
1320 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1321 * instructions.
1322 *
1323 * For the non-hardware virtualized case, this is relatively easy to deal with.
1324 * We must make sure that all unmapped pages have an address of zero. This means
1325 * that they could read the first 4k of physical memory; however, we never use
1326 * that first page in the operating system and always skip putting it in our
1327 * memory map, even if firmware tells us that it is usable. While
1328 * other systems try to put extra metadata in the address and reserved bits,
1329 * which led to this being problematic in those cases, we do not.
1330 *
1331 * For hardware virtual machines things are more complicated. Because they can
1332 * construct their own page tables, it isn't hard for them to perform this
1333 * attack against any physical address. The one wrinkle is that this physical
1334 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1335 * to flush the L1 data cache. We wrap this up in the function
1336 * spec_uarch_flush(). This function is also used in the mitigation of
1337 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1338 * hypervisors such as KVM or bhyve are responsible for performing this before
1339 * entering the guest.
1340 *
1341 * Because this attack takes place in the L1 cache, there's another wrinkle
1342 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1343 * designs. This means that when a thread enters a hardware virtualized context
1344 * and flushes the L1 data cache, the other thread on the processor may then go
1345 * ahead and put new data in it that can be potentially attacked. While one
1346 * solution is to disable SMT on the system, another option that is available is
1347 * to use a feature for hardware virtualization called 'SMT exclusion'. This
1348 * goes through and makes sure that if an HVM is being scheduled on one thread,
1349 * then whatever is on the other thread is from the same hardware virtual machine.
1350 * If an interrupt comes in or the guest exits to the broader system, then the
1351 * other SMT thread will be kicked out.
1352 *
1353 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1354 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1355 * perform L1TF related mitigations.
1356 *
1357 * MICROARCHITECTURAL DATA SAMPLING
1358 *
1359 * Microarchitectural data sampling (MDS) is a combination of four discrete
1360 * vulnerabilities that are similar issues affecting various parts of the CPU's
1361 * microarchitectural implementation around load, store, and fill buffers.
1362 * Specifically it is made up of the following subcomponents:
1363 *
1364 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1365 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1366 * 3. Microarchitectural Load Port Data Sampling (MLPDS)
1367 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1368 *
1369 * To begin addressing these, Intel has introduced another feature in microcode
1370 * called MD_CLEAR. This changes the verw instruction so that, when it is
1371 * executed in a particular way, it also flushes the state of the affected
1372 * parts. The L1TF L1D flush mechanism is also
1373 * updated when this microcode is present to flush this state.
1374 *
1375 * Primarily we need to flush this state whenever we transition from the kernel
1376 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1377 * little bit different. Here the structures are statically sized when a logical
1378 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1379 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1380 * mwait, or another ACPI method. To perform these flushes, we call
1381 * x86_md_clear() at all of these transition points.
1382 *
1383 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1384 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1385 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1386 * a no-op.
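 *
 * A simplified, illustrative sketch of that selection follows, treating the
 * routines as the function pointers the text above suggests. The noop routine
 * name is hypothetical and the real logic also has to account for MD_CLEAR
 * and L1D flush enumeration; this only shows the shape of the decision:
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_MDS_NO))
 *		x86_md_clear = md_clear_noop;
 *	if (is_x86_feature(x86_featureset, X86FSET_RDCL_NO))
 *		spec_uarch_flush = x86_md_clear;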
1387 *
1388 * Unfortunately, with this issue hyperthreading rears its ugly head. In
1389 * particular, everything we've discussed above is only valid for a single
1390 * thread executing on a core. In the case where you have hyper-threading
1391 * present, this attack can be performed between threads. The theoretical fix
1392 * for this is to ensure that both threads are always in the same security
1393 * domain. This means that they are executing in the same ring and mutually
1394 * trust each other. Practically speaking, this would mean that a system call
1395 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1396 * Rather than implement this, we recommend that one disables hyper-threading
1397 * through the use of psradm -aS.
1398 *
1399 * TSX ASYNCHRONOUS ABORT
1400 *
1401 * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1402 * behaves like MDS, but leverages Intel's transactional instructions as another
1403 * vector. Effectively, when a transaction hits one of these cases (unmapped
1404 * page, various cache snoop activity, etc.) then the same data can be exposed
1405 * as in the case of MDS. This means that you can attack your twin.
1406 *
1407 * Intel has described that there are two different ways that we can mitigate
1408 * this problem on affected processors:
1409 *
1410 * 1) We can use the same techniques used to deal with MDS. Flushing the
1411 * microarchitectural buffers and disabling hyperthreading will mitigate
1412 * this in the same way.
1413 *
1414 * 2) Using microcode to disable TSX.
1415 *
1416 * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1417 * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1418 * That's OK as we're already doing all such mitigations. On the other hand,
1419 * processors with MDS_NO are all supposed to receive microcode updates that
1420 * enumerate support for disabling TSX. In general, we'd rather use this method
1421 * when available as it doesn't require disabling hyperthreading to be
1422 * effective. Currently we are basically relying on microcode for processors
1423 * that enumerate MDS_NO.
1424 *
1425 * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1426 * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1427 * different powers. The first allows us to cause all transactions to
1428 * immediately abort. The second gives us a means of disabling TSX completely,
1429 * which includes removing it from cpuid. If we have support for this in
1430 * microcode during the first cpuid pass, then we'll disable TSX completely such
1431 * that user land never has a chance to observe the bit. However, if we are late
1432 * loading the microcode, then we must use the functionality to cause
1433 * transactions to automatically abort. This is necessary for user land's sake.
1434 * Once a program sees a cpuid bit, it must not be taken away.
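 *
 * As a hedged sketch of what applying that MSR looks like (the MSR and bit
 * names below follow Intel's documentation and may not match the kernel's
 * exact macro spellings; hide_from_cpuid is a hypothetical flag for the
 * "early enough" condition described above):
 *
 *	uint64_t val = rdmsr(MSR_IA32_TSX_CTRL);
 *
 *	val |= IA32_TSX_CTRL_RTM_DISABLE;
 *	if (hide_from_cpuid)
 *		val |= IA32_TSX_CTRL_CPUID_CLEAR;
 *	wrmsr(MSR_IA32_TSX_CTRL, val);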
1435 *
1436 * We track whether or not we should do this based on what cpuid pass we're in.
1437 * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1438 * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1439 * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1440 * second time after we do the initial microcode update. As a result we need to
1441 * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1442 * suitable microcode on the current CPU (which happens prior to
1443 * cpuid_pass_ucode()).
1444 *
1445 * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1446 * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1447 * unfortunate feature in a number of ways, and taking the opportunity to
1448 * finally be able to turn it off is likely to be of benefit in the future.
1449 *
1450 * SUMMARY
1451 *
1452 * The following table attempts to summarize the mitigations for various issues
1453 * and what's done in various places:
1454 *
1455 * - Spectre v1: Not currently mitigated
1456 * - swapgs: lfences after swapgs paths
1457 * - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1458 * - Meltdown: Kernel Page Table Isolation
1459 * - Spectre v3a: Updated CPU microcode
1460 * - Spectre v4: Not currently mitigated
1461 * - SpectreRSB: SMEP and RSB Stuffing
1462 * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1463 * - MDS: x86_md_clear, requires microcode, disabling SMT
1464 * - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1465 *
1466 * The following table indicates the x86 feature set bits that indicate that a
1467 * given problem has been solved or a notable feature is present:
1468 *
1469 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1470 * - MDS_NO: All forms of MDS
1471 * - TAA_NO: TAA
1472 */
1473
1474 #include <sys/types.h>
1475 #include <sys/archsystm.h>
1476 #include <sys/x86_archext.h>
1477 #include <sys/kmem.h>
1478 #include <sys/systm.h>
1479 #include <sys/cmn_err.h>
1480 #include <sys/sunddi.h>
1481 #include <sys/sunndi.h>
1482 #include <sys/cpuvar.h>
1483 #include <sys/processor.h>
1484 #include <sys/sysmacros.h>
1485 #include <sys/pg.h>
1486 #include <sys/fp.h>
1487 #include <sys/controlregs.h>
1488 #include <sys/bitmap.h>
1489 #include <sys/auxv_386.h>
1490 #include <sys/memnode.h>
1491 #include <sys/pci_cfgspace.h>
1492 #include <sys/comm_page.h>
1493 #include <sys/mach_mmu.h>
1494 #include <sys/ucode.h>
1495 #include <sys/tsc.h>
1496 #include <sys/kobj.h>
1497 #include <sys/asm_misc.h>
1498
1499 #ifdef __xpv
1500 #include <sys/hypervisor.h>
1501 #else
1502 #include <sys/ontrap.h>
1503 #endif
1504
1505 uint_t x86_vendor = X86_VENDOR_IntelClone;
1506 uint_t x86_type = X86_TYPE_OTHER;
1507 uint_t x86_clflush_size = 0;
1508
1509 #if defined(__xpv)
1510 int x86_use_pcid = 0;
1511 int x86_use_invpcid = 0;
1512 #else
1513 int x86_use_pcid = -1;
1514 int x86_use_invpcid = -1;
1515 #endif
1516
1517 typedef enum {
1518 X86_SPECTREV2_RETPOLINE,
1519 X86_SPECTREV2_ENHANCED_IBRS,
1520 X86_SPECTREV2_AUTO_IBRS,
1521 X86_SPECTREV2_DISABLED
1522 } x86_spectrev2_mitigation_t;
1523
1524 uint_t x86_disable_spectrev2 = 0;
1525 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1526 X86_SPECTREV2_RETPOLINE;
1527
1528 /*
1529 * The mitigation status for TAA:
1530 * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1531 * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1532 * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1533 * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1534 * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1535 * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1536 */
1537 typedef enum {
1538 X86_TAA_NOTHING,
1539 X86_TAA_DISABLED,
1540 X86_TAA_MD_CLEAR,
1541 X86_TAA_TSX_FORCE_ABORT,
1542 X86_TAA_TSX_DISABLE,
1543 X86_TAA_HW_MITIGATED
1544 } x86_taa_mitigation_t;
1545
1546 uint_t x86_disable_taa = 0;
1547 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1548
1549 uint_t pentiumpro_bug4046376;
1550
1551 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1552
1553 static char *x86_feature_names[NUM_X86_FEATURES] = {
1554 "lgpg",
1555 "tsc",
1556 "msr",
1557 "mtrr",
1558 "pge",
1559 "de",
1560 "cmov",
1561 "mmx",
1562 "mca",
1563 "pae",
1564 "cv8",
1565 "pat",
1566 "sep",
1567 "sse",
1568 "sse2",
1569 "htt",
1570 "asysc",
1571 "nx",
1572 "sse3",
1573 "cx16",
1574 "cmp",
1575 "tscp",
1576 "mwait",
1577 "sse4a",
1578 "cpuid",
1579 "ssse3",
1580 "sse4_1",
1581 "sse4_2",
1582 "1gpg",
1583 "clfsh",
1584 "64",
1585 "aes",
1586 "pclmulqdq",
1587 "xsave",
1588 "avx",
1589 "vmx",
1590 "svm",
1591 "topoext",
1592 "f16c",
1593 "rdrand",
1594 "x2apic",
1595 "avx2",
1596 "bmi1",
1597 "bmi2",
1598 "fma",
1599 "smep",
1600 "smap",
1601 "adx",
1602 "rdseed",
1603 "mpx",
1604 "avx512f",
1605 "avx512dq",
1606 "avx512pf",
1607 "avx512er",
1608 "avx512cd",
1609 "avx512bw",
1610 "avx512vl",
1611 "avx512fma",
1612 "avx512vbmi",
1613 "avx512_vpopcntdq",
1614 "avx512_4vnniw",
1615 "avx512_4fmaps",
1616 "xsaveopt",
1617 "xsavec",
1618 "xsaves",
1619 "sha",
1620 "umip",
1621 "pku",
1622 "ospke",
1623 "pcid",
1624 "invpcid",
1625 "ibrs",
1626 "ibpb",
1627 "stibp",
1628 "ssbd",
1629 "ssbd_virt",
1630 "rdcl_no",
1631 "ibrs_all",
1632 "rsba",
1633 "ssb_no",
1634 "stibp_all",
1635 "flush_cmd",
1636 "l1d_vmentry_no",
1637 "fsgsbase",
1638 "clflushopt",
1639 "clwb",
1640 "monitorx",
1641 "clzero",
1642 "xop",
1643 "fma4",
1644 "tbm",
1645 "avx512_vnni",
1646 "amd_pcec",
1647 "md_clear",
1648 "mds_no",
1649 "core_thermal",
1650 "pkg_thermal",
1651 "tsx_ctrl",
1652 "taa_no",
1653 "ppin",
1654 "vaes",
1655 "vpclmulqdq",
1656 "lfence_serializing",
1657 "gfni",
1658 "avx512_vp2intersect",
1659 "avx512_bitalg",
1660 "avx512_vbmi2",
1661 "avx512_bf16",
1662 "auto_ibrs"
1663 };
1664
1665 boolean_t
1666 is_x86_feature(void *featureset, uint_t feature)
1667 {
1668 ASSERT(feature < NUM_X86_FEATURES);
1669 return (BT_TEST((ulong_t *)featureset, feature));
1670 }
1671
1672 void
1673 add_x86_feature(void *featureset, uint_t feature)
1674 {
1675 ASSERT(feature < NUM_X86_FEATURES);
1676 BT_SET((ulong_t *)featureset, feature);
1677 }
1678
1679 void
1680 remove_x86_feature(void *featureset, uint_t feature)
1681 {
1682 ASSERT(feature < NUM_X86_FEATURES);
1683 BT_CLEAR((ulong_t *)featureset, feature);
1684 }
1685
1686 boolean_t
1687 compare_x86_featureset(void *setA, void *setB)
1688 {
1689 /*
1690 * We assume that the unused bits of the bitmap are always zero.
1691 */
1692 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1693 return (B_TRUE);
1694 } else {
1695 return (B_FALSE);
1696 }
1697 }
1698
1699 void
1700 print_x86_featureset(void *featureset)
1701 {
1702 uint_t i;
1703
1704 for (i = 0; i < NUM_X86_FEATURES; i++) {
1705 if (is_x86_feature(featureset, i)) {
1706 cmn_err(CE_CONT, "?x86_feature: %s\n",
1707 x86_feature_names[i]);
1708 }
1709 }
1710 }
1711
1712 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1713 static size_t xsave_state_size = 0;
1714 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1715 boolean_t xsave_force_disable = B_FALSE;
1716 extern int disable_smap;
1717
1718 /*
1719 * This is set to platform type we are running on.
1720 */
1721 static int platform_type = -1;
1722
1723 #if !defined(__xpv)
1724 /*
1725 * Variable to patch if hypervisor platform detection needs to be
1726 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1727 */
1728 int enable_platform_detection = 1;
1729 #endif
1730
1731 /*
1732 * monitor/mwait info.
1733 *
1734 * size_actual and buf_actual are the real address and size allocated to get
1735 * proper mwait_buf alignment. buf_actual and size_actual should be passed
1736 * to kmem_free(). Currently kmem_alloc() and mwait happen to both use
1737 * processor cache-line alignment, but this is not guaranteed in the future.
1738 */
1739 struct mwait_info {
1740 size_t mon_min; /* min size to avoid missed wakeups */
1741 size_t mon_max; /* size to avoid false wakeups */
1742 size_t size_actual; /* size actually allocated */
1743 void *buf_actual; /* memory actually allocated */
1744 uint32_t support; /* processor support of monitor/mwait */
1745 };
1746
1747 /*
1748 * xsave/xrestor info.
1749 *
1750 * This structure contains HW feature bits and the size of the xsave save area.
1751 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1752 * (xsave_state) to describe the xsave layout. However, at runtime the
1753 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1754 * xsave_state structure simply represents the legacy layout of the beginning
1755 * of the xsave area.
1756 */
1757 struct xsave_info {
1758 uint32_t xsav_hw_features_low; /* Supported HW features */
1759 uint32_t xsav_hw_features_high; /* Supported HW features */
1760 size_t xsav_max_size; /* max size save area for HW features */
1761 size_t ymm_size; /* AVX: size of ymm save area */
1762 size_t ymm_offset; /* AVX: offset for ymm save area */
1763 size_t bndregs_size; /* MPX: size of bndregs save area */
1764 size_t bndregs_offset; /* MPX: offset for bndregs save area */
1765 size_t bndcsr_size; /* MPX: size of bndcsr save area */
1766 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */
1767 size_t opmask_size; /* AVX512: size of opmask save */
1768 size_t opmask_offset; /* AVX512: offset for opmask save */
1769 size_t zmmlo_size; /* AVX512: size of zmm 256 save */
1770 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */
1771 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */
1772 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */
1773 };
1774
1775
1776 /*
1777 * These constants determine how many of the elements of the
1778 * cpuid we cache in the cpuid_info data structure; the
1779 * remaining elements are accessible via the cpuid instruction.
1780 */
1781
1782 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */
1783 #define NMAX_CPI_EXTD 0x22 /* eax = 0x80000000 .. 0x80000021 */
1784
1785 /*
1786 * See the big theory statement for a more detailed explanation of what some of
1787 * these members mean.
1788 */
1789 struct cpuid_info {
1790 uint_t cpi_pass; /* last pass completed */
1791 /*
1792 * standard function information
1793 */
1794 uint_t cpi_maxeax; /* fn 0: %eax */
1795 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */
1796 uint_t cpi_vendor; /* enum of cpi_vendorstr */
1797
1798 uint_t cpi_family; /* fn 1: extended family */
1799 uint_t cpi_model; /* fn 1: extended model */
1800 uint_t cpi_step; /* fn 1: stepping */
1801 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */
1802 /* AMD: package/socket # */
1803 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */
1804 int cpi_clogid; /* fn 1: %ebx: thread # */
1805 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */
1806 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */
1807 uint_t cpi_ncache; /* fn 2: number of elements */
1808 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1809 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */
1810 uint_t cpi_cache_leaf_size; /* Number of cache elements */
1811 /* Intel fn: 4, AMD fn: 8000001d */
1812 	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1813 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */
1814 struct cpuid_regs cpi_sub7[1]; /* Leaf 7, sub-leaf 1 */
1815 /*
1816 * extended function information
1817 */
1818 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */
1819 char cpi_brandstr[49]; /* fn 0x8000000[234] */
1820 	uint8_t cpi_pabits;		/* fn 0x80000008: %eax[7:0] */
1821 	uint8_t cpi_vabits;		/* fn 0x80000008: %eax[15:8] */
1822 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */
1823 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */
1824
1825 id_t cpi_coreid; /* same coreid => strands share core */
1826 int cpi_pkgcoreid; /* core number within single package */
1827 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */
1828 /* Intel: fn 4: %eax[31-26] */
1829
1830 /*
1831 * These values represent the number of bits that are required to store
1832 * information about the number of cores and threads.
1833 */
1834 uint_t cpi_ncore_bits;
1835 uint_t cpi_nthread_bits;
1836 /*
1837 * supported feature information
1838 */
1839 uint32_t cpi_support[6];
1840 #define STD_EDX_FEATURES 0
1841 #define AMD_EDX_FEATURES 1
1842 #define TM_EDX_FEATURES 2
1843 #define STD_ECX_FEATURES 3
1844 #define AMD_ECX_FEATURES 4
1845 #define STD_EBX_FEATURES 5
1846 /*
1847 * Synthesized information, where known.
1848 */
1849 x86_chiprev_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */
1850 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */
1851 uint32_t cpi_socket; /* Chip package/socket type */
1852 x86_uarchrev_t cpi_uarchrev; /* Microarchitecture and revision */
1853
1854 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */
1855 uint32_t cpi_apicid;
1856 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */
1857 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */
1858 /* Intel: 1 */
1859 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */
1860 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */
1861
1862 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */
1863 };
1864
1865
1866 static struct cpuid_info cpuid_info0;
1867
1868 /*
1869 * These bit fields are defined by the Intel Application Note AP-485
1870 * "Intel Processor Identification and the CPUID Instruction"
1871 */
1872 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1873 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1874 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1875 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1876 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1877 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1878
1879 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx)
1880 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx)
1881 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx)
1882 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx)
1883 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx)
1884 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx)
1885 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx)
1886 #define CPI_FEATURES_7_1_EAX(cpi) ((cpi)->cpi_sub7[0].cp_eax)
1887
1888 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1889 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1890 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1891 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1892
1893 #define CPI_MAXEAX_MAX 0x100 /* sanity control */
1894 #define CPI_XMAXEAX_MAX 0x80000100
1895 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */
1896 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */
1897
1898 /*
1899 * Function 4 (Deterministic Cache Parameters) macros
1900 * Defined by Intel Application Note AP-485
1901 */
1902 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26)
1903 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14)
1904 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9)
1905 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8)
1906 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5)
1907 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0)
1908 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8)
1909
1910 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22)
1911 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12)
1912 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0)
1913
1914 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0)
1915
1916 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0)
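
/*
 * For reference, the deterministic cache parameters leaf encodes the cache
 * size implicitly: it is the product of the ways, partitions, line size, and
 * sets fields above, each plus one. A sketch of the computation, where cp
 * points at a cached function 4 sub-leaf:
 *
 *	size_t size = (CPI_CACHE_WAYS(cp) + 1) *
 *	    (CPI_CACHE_PARTS(cp) + 1) *
 *	    (CPI_CACHE_COH_LN_SZ(cp) + 1) *
 *	    (CPI_CACHE_SETS(cp) + 1);
 */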
1917
1918
1919 /*
1920 * A couple of shorthand macros to identify "later" P6-family chips
1921 * like the Pentium M and Core. First, the "older" P6-based stuff
1922 * (loosely defined as "pre-Pentium-4"):
1923 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1924 */
1925 #define IS_LEGACY_P6(cpi) ( \
1926 cpi->cpi_family == 6 && \
1927 (cpi->cpi_model == 1 || \
1928 cpi->cpi_model == 3 || \
1929 cpi->cpi_model == 5 || \
1930 cpi->cpi_model == 6 || \
1931 cpi->cpi_model == 7 || \
1932 cpi->cpi_model == 8 || \
1933 cpi->cpi_model == 0xA || \
1934 cpi->cpi_model == 0xB) \
1935 )
1936
1937 /* A "new F6" is everything with family 6 that's not the above */
1938 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1939
1940 /* Extended family/model support */
1941 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1942 cpi->cpi_family >= 0xf)
1943
1944 /*
1945 * Info for monitor/mwait idle loop.
1946 *
1947 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1948 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1949 * 2006.
1950 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1951 * Documentation Updates" #33633, Rev 2.05, December 2006.
1952 */
1953 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */
1954 #define	MWAIT_EXTENSIONS	(0x00000002)	/* extension supported */
1955 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */
1956 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1957 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2)
1958 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1)
1959 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1960 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1961 /*
1962 * Number of sub-cstates for a given c-state.
1963 */
1964 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \
1965 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1966
1967 /*
1968 * XSAVE leaf 0xD enumeration
1969 */
1970 #define CPUID_LEAFD_2_YMM_OFFSET 576
1971 #define CPUID_LEAFD_2_YMM_SIZE 256
1972
1973 /*
1974 * Common extended leaf names to cut down on typos.
1975 */
1976 #define CPUID_LEAF_EXT_0 0x80000000
1977 #define CPUID_LEAF_EXT_8 0x80000008
1978 #define CPUID_LEAF_EXT_1d 0x8000001d
1979 #define CPUID_LEAF_EXT_1e 0x8000001e
1980 #define CPUID_LEAF_EXT_21 0x80000021
1981
1982 /*
1983 * Functions we consume from cpuid_subr.c; don't publish these in a header
1984 * file to try and keep people using the expected cpuid_* interfaces.
1985 */
1986 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1987 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1988 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1989 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1990 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
1991 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1992
1993 /*
1994 * Apply various platform-dependent restrictions where the
1995 * underlying platform's limitations mean the CPU can be marked
1996 * as less capable than its cpuid instruction would imply.
1997 */
1998 #if defined(__xpv)
1999 static void
2000 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
2001 {
2002 switch (eax) {
2003 case 1: {
2004 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
2005 0 : CPUID_INTC_EDX_MCA;
2006 cp->cp_edx &=
2007 ~(mcamask |
2008 CPUID_INTC_EDX_PSE |
2009 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2010 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
2011 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
2012 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2013 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
2014 break;
2015 }
2016
2017 case 0x80000001:
2018 cp->cp_edx &=
2019 ~(CPUID_AMD_EDX_PSE |
2020 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2021 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
2022 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
2023 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2024 CPUID_AMD_EDX_TSCP);
2025 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
2026 break;
2027 default:
2028 break;
2029 }
2030
2031 switch (vendor) {
2032 case X86_VENDOR_Intel:
2033 switch (eax) {
2034 case 4:
2035 /*
2036 * Zero out the (ncores-per-chip - 1) field
2037 */
2038 			cp->cp_eax &= 0x03ffffff;
2039 break;
2040 default:
2041 break;
2042 }
2043 break;
2044 case X86_VENDOR_AMD:
2045 case X86_VENDOR_HYGON:
2046 switch (eax) {
2047
2048 case 0x80000001:
2049 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
2050 break;
2051
2052 case CPUID_LEAF_EXT_8:
2053 /*
2054 * Zero out the (ncores-per-chip - 1) field
2055 */
2056 cp->cp_ecx &= 0xffffff00;
2057 break;
2058 default:
2059 break;
2060 }
2061 break;
2062 default:
2063 break;
2064 }
2065 }
2066 #else
2067 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */
2068 #endif
2069
2070 /*
2071 * Some undocumented ways of patching the results of the cpuid
2072 * instruction to permit running Solaris 10 on future cpus that
2073 * we don't currently support. Could be set to non-zero values
2074 * via settings in eeprom.
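 *
 * As a purely illustrative example (the variable is real, the value is
 * arbitrary), an /etc/system entry such as
 *
 *	set cpuid_feature_ecx_exclude=0x1
 *
 * would, as the name suggests, hide the corresponding leaf 1 %ecx feature
 * bit from the rest of the system.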
2075 */
2076
2077 uint32_t cpuid_feature_ecx_include;
2078 uint32_t cpuid_feature_ecx_exclude;
2079 uint32_t cpuid_feature_edx_include;
2080 uint32_t cpuid_feature_edx_exclude;
2081
2082 /*
2083 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2084 */
2085 void
2086 cpuid_alloc_space(cpu_t *cpu)
2087 {
2088 /*
2089 * By convention, cpu0 is the boot cpu, which is set up
2090 * before memory allocation is available. All other cpus get
2091 * their cpuid_info struct allocated here.
2092 */
2093 ASSERT(cpu->cpu_id != 0);
2094 ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2095 cpu->cpu_m.mcpu_cpi =
2096 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2097 }
2098
2099 void
2100 cpuid_free_space(cpu_t *cpu)
2101 {
2102 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2103 int i;
2104
2105 ASSERT(cpi != NULL);
2106 ASSERT(cpi != &cpuid_info0);
2107
2108 /*
2109 * Free up any cache leaf related dynamic storage. The first entry was
2110 * cached from the standard cpuid storage, so we should not free it.
2111 */
2112 for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2113 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2114 if (cpi->cpi_cache_leaf_size > 0)
2115 kmem_free(cpi->cpi_cache_leaves,
2116 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2117
2118 kmem_free(cpi, sizeof (*cpi));
2119 cpu->cpu_m.mcpu_cpi = NULL;
2120 }
2121
2122 #if !defined(__xpv)
2123 /*
2124 * Determine the type of the underlying platform. This is used to customize
2125 * initialization of various subsystems (e.g. TSC). determine_platform() must
2126 * only ever be called once to prevent two processors from seeing different
2127 * values of platform_type. Must be called before cpuid_pass_ident(), the
2128 * earliest consumer to execute; the identification pass will call
2129 * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2130 */
2131 void
2132 determine_platform(void)
2133 {
2134 struct cpuid_regs cp;
2135 uint32_t base;
2136 uint32_t regs[4];
2137 char *hvstr = (char *)regs;
2138
2139 ASSERT(platform_type == -1);
2140
2141 platform_type = HW_NATIVE;
2142
2143 if (!enable_platform_detection)
2144 return;
2145
2146 /*
2147 * If Hypervisor CPUID bit is set, try to determine hypervisor
2148 * vendor signature, and set platform type accordingly.
2149 *
2150 * References:
2151 * http://lkml.org/lkml/2008/10/1/246
2152 * http://kb.vmware.com/kb/1009458
2153 */
2154 cp.cp_eax = 0x1;
2155 (void) __cpuid_insn(&cp);
2156 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2157 cp.cp_eax = 0x40000000;
2158 (void) __cpuid_insn(&cp);
2159 regs[0] = cp.cp_ebx;
2160 regs[1] = cp.cp_ecx;
2161 regs[2] = cp.cp_edx;
2162 regs[3] = 0;
2163 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2164 platform_type = HW_XEN_HVM;
2165 return;
2166 }
2167 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2168 platform_type = HW_VMWARE;
2169 return;
2170 }
2171 if (strcmp(hvstr, HVSIG_KVM) == 0) {
2172 platform_type = HW_KVM;
2173 return;
2174 }
2175 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2176 platform_type = HW_BHYVE;
2177 return;
2178 }
2179 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
2180 platform_type = HW_MICROSOFT;
2181 } else {
2182 /*
2183 * Check older VMware hardware versions. VMware hypervisor is
2184 * detected by performing an IN operation to VMware hypervisor
2185 * port and checking that value returned in %ebx is VMware
2186 * hypervisor magic value.
2187 *
2188 * References: http://kb.vmware.com/kb/1009458
2189 */
2190 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2191 if (regs[1] == VMWARE_HVMAGIC) {
2192 platform_type = HW_VMWARE;
2193 return;
2194 }
2195 }
2196
2197 /*
2198 * Check Xen hypervisor. In a fully virtualized domain,
2199 * Xen's pseudo-cpuid function returns a string representing the
2200 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2201 * supported cpuid function. We need at least a (base + 2) leaf value
2202 * to do what we want to do. Try different base values, since the
2203 * hypervisor might use a different one depending on whether Hyper-V
2204 * emulation is switched on by default or not.
2205 */
2206 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2207 cp.cp_eax = base;
2208 (void) __cpuid_insn(&cp);
2209 regs[0] = cp.cp_ebx;
2210 regs[1] = cp.cp_ecx;
2211 regs[2] = cp.cp_edx;
2212 regs[3] = 0;
2213 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2214 cp.cp_eax >= (base + 2)) {
2215 platform_type &= ~HW_NATIVE;
2216 platform_type |= HW_XEN_HVM;
2217 return;
2218 }
2219 }
2220 }
2221
2222 int
2223 get_hwenv(void)
2224 {
2225 ASSERT(platform_type != -1);
2226 return (platform_type);
2227 }
2228
2229 int
2230 is_controldom(void)
2231 {
2232 return (0);
2233 }
2234
2235 #else
2236
2237 int
2238 get_hwenv(void)
2239 {
2240 return (HW_XEN_PV);
2241 }
2242
2243 int
2244 is_controldom(void)
2245 {
2246 return (DOMAIN_IS_INITDOMAIN(xen_info));
2247 }
2248
2249 #endif /* __xpv */
2250
2251 /*
2252 * Make sure that we have gathered all of the CPUID leaves that we might need to
2253 * determine topology. We assume that the standard leaf 1 has already been done
2254 * and that xmaxeax has already been calculated.
2255 */
2256 static void
2257 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2258 {
2259 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2260
2261 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2262 struct cpuid_regs *cp;
2263
2264 cp = &cpi->cpi_extd[8];
2265 cp->cp_eax = CPUID_LEAF_EXT_8;
2266 (void) __cpuid_insn(cp);
2267 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2268 }
2269
2270 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2271 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2272 struct cpuid_regs *cp;
2273
2274 cp = &cpi->cpi_extd[0x1e];
2275 cp->cp_eax = CPUID_LEAF_EXT_1e;
2276 (void) __cpuid_insn(cp);
2277 }
2278 }
2279
2280 /*
2281 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2282 * it to everything else. If not, and we're on an AMD system where 8000001e is
2283 * valid, then we use that. Otherwise, we fall back to the default value for the
2284 * APIC ID in leaf 1.
2285 */
2286 static uint32_t
2287 cpuid_gather_apicid(struct cpuid_info *cpi)
2288 {
2289 /*
2290 * Leaf B changes based on the arguments to it. Because we don't cache
2291 * it, we need to gather it again.
2292 */
2293 if (cpi->cpi_maxeax >= 0xB) {
2294 struct cpuid_regs regs;
2295 struct cpuid_regs *cp;
2296
2297 		cp = &regs;
2298 cp->cp_eax = 0xB;
2299 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2300 (void) __cpuid_insn(cp);
2301
2302 if (cp->cp_ebx != 0) {
2303 return (cp->cp_edx);
2304 }
2305 }
2306
2307 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2308 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2309 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2310 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2311 return (cpi->cpi_extd[0x1e].cp_eax);
2312 }
2313
2314 return (CPI_APIC_ID(cpi));
2315 }
2316
2317 /*
2318 * For AMD processors, attempt to calculate the number of chips and cores that
2319 * exist. The way that we do this varies based on the generation, because the
2320 * generations themselves have changed dramatically.
2321 *
2322 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2323 * However, with the advent of family 17h (Zen) it actually tells us the number
2324 * of threads, so we need to look at leaf 0x8000001e if available to determine
2325 * its value. Otherwise, for all prior families, the number of enabled cores is
2326 * the same as threads.
2327 *
2328 * If we do not have leaf 0x80000008, then we assume that this processor does
2329 * not have anything. AMD's older CPUID specification says there's no reason to
2330 * fall back to leaf 1.
2331 *
2332 * In some virtualization cases we will not have leaf 8000001e or it will be
2333 * zero. When that happens we assume the number of threads is one.
2334 */
2335 static void
2336 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2337 {
2338 uint_t nthreads, nthread_per_core;
2339
2340 nthreads = nthread_per_core = 1;
2341
2342 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2343 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2344 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2345 nthreads = CPI_CPU_COUNT(cpi);
2346 }
2347
2348 /*
2349 * For us to have threads, and know about it, we have to be at least at
2350 * family 17h and have the cpuid bit that says we have extended
2351 * topology.
2352 */
2353 if (cpi->cpi_family >= 0x17 &&
2354 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2355 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2356 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2357 }
2358
2359 *ncpus = nthreads;
2360 *ncores = nthreads / nthread_per_core;
2361 }
2362
2363 /*
2364 * Seed the initial values for the cores and threads for an Intel based
2365 * processor. These values will be overwritten if we detect that the processor
2366 * supports CPUID leaf 0xb.
2367 */
2368 static void
2369 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2370 {
2371 /*
2372 * Only seed the number of physical cores from the first level leaf 4
2373 * information. The number of threads there indicates how many share the
2374 * L1 cache, which may or may not have anything to do with the number of
2375 * logical CPUs per core.
2376 */
2377 if (cpi->cpi_maxeax >= 4) {
2378 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2379 } else {
2380 *ncores = 1;
2381 }
2382
2383 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2384 *ncpus = CPI_CPU_COUNT(cpi);
2385 } else {
2386 *ncpus = *ncores;
2387 }
2388 }
2389
2390 static boolean_t
2391 cpuid_leafB_getids(cpu_t *cpu)
2392 {
2393 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2394 struct cpuid_regs regs;
2395 struct cpuid_regs *cp;
2396
2397 if (cpi->cpi_maxeax < 0xB)
2398 return (B_FALSE);
2399
2400 	cp = &regs;
2401 cp->cp_eax = 0xB;
2402 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2403
2404 (void) __cpuid_insn(cp);
2405
2406 /*
2407 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2408 * indicates that the extended topology enumeration leaf is
2409 * available.
2410 */
2411 if (cp->cp_ebx != 0) {
2412 uint32_t x2apic_id = 0;
2413 uint_t coreid_shift = 0;
2414 uint_t ncpu_per_core = 1;
2415 uint_t chipid_shift = 0;
2416 uint_t ncpu_per_chip = 1;
2417 uint_t i;
2418 uint_t level;
2419
2420 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2421 cp->cp_eax = 0xB;
2422 cp->cp_ecx = i;
2423
2424 (void) __cpuid_insn(cp);
2425 level = CPI_CPU_LEVEL_TYPE(cp);
2426
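			/*
			 * Per the extended topology enumeration definition, a
			 * level type of 1 describes the SMT (thread) level and
			 * a level type of 2 describes the core level; anything
			 * else is ignored here.
			 */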
2427 if (level == 1) {
2428 x2apic_id = cp->cp_edx;
2429 coreid_shift = BITX(cp->cp_eax, 4, 0);
2430 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2431 } else if (level == 2) {
2432 x2apic_id = cp->cp_edx;
2433 chipid_shift = BITX(cp->cp_eax, 4, 0);
2434 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2435 }
2436 }
2437
2438 /*
2439 * cpi_apicid is taken care of in cpuid_gather_apicid.
2440 */
2441 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2442 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2443 ncpu_per_core;
2444 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2445 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2446 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2447 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2448 cpi->cpi_procnodeid = cpi->cpi_chipid;
2449 cpi->cpi_compunitid = cpi->cpi_coreid;
2450
2451 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2452 cpi->cpi_nthread_bits = coreid_shift;
2453 cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2454 }
2455
2456 return (B_TRUE);
2457 } else {
2458 return (B_FALSE);
2459 }
2460 }
2461
2462 static void
2463 cpuid_intel_getids(cpu_t *cpu, void *feature)
2464 {
2465 uint_t i;
2466 uint_t chipid_shift = 0;
2467 uint_t coreid_shift = 0;
2468 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2469
2470 /*
2471 * There are no compute units or processor nodes currently on Intel.
2472 * Always set these to one.
2473 */
2474 cpi->cpi_procnodes_per_pkg = 1;
2475 cpi->cpi_cores_per_compunit = 1;
2476
2477 /*
2478 * If cpuid Leaf B is present, use that to try and get this information.
2479 * It will be the most accurate for Intel CPUs.
2480 */
2481 if (cpuid_leafB_getids(cpu))
2482 return;
2483
2484 /*
2485 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2486 * and ncore_per_chip. These represent the largest power of two values
2487 * that we need to cover all of the IDs in the system. Therefore, we use
2488 * those values to seed the number of bits needed to cover information
2489 * in the case when leaf B is not available. These values will probably
2490 * be larger than required, but that's OK.
2491 */
2492 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2493 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2494
2495 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2496 chipid_shift++;
2497
2498 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2499 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2500
2501 if (is_x86_feature(feature, X86FSET_CMP)) {
2502 /*
2503 * Multi-core (and possibly multi-threaded)
2504 * processors.
2505 */
2506 uint_t ncpu_per_core = 0;
2507
2508 if (cpi->cpi_ncore_per_chip == 1)
2509 ncpu_per_core = cpi->cpi_ncpu_per_chip;
2510 else if (cpi->cpi_ncore_per_chip > 1)
2511 ncpu_per_core = cpi->cpi_ncpu_per_chip /
2512 cpi->cpi_ncore_per_chip;
2513 /*
2514 * 8bit APIC IDs on dual core Pentiums
2515 * look like this:
2516 *
2517 * +-----------------------+------+------+
2518 * | Physical Package ID | MC | HT |
2519 * +-----------------------+------+------+
2520 * <------- chipid -------->
2521 * <------- coreid --------------->
2522 * <--- clogid -->
2523 * <------>
2524 * pkgcoreid
2525 *
2526 * Where the number of bits necessary to
2527 * represent MC and HT fields together equals
2528 * to the minimum number of bits necessary to
2529 * store the value of cpi->cpi_ncpu_per_chip.
2530 * Of those bits, the MC part uses the number
2531 * of bits necessary to store the value of
2532 * cpi->cpi_ncore_per_chip.
2533 */
2534 for (i = 1; i < ncpu_per_core; i <<= 1)
2535 coreid_shift++;
2536 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2537 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2538 } else if (is_x86_feature(feature, X86FSET_HTT)) {
2539 /*
2540 * Single-core multi-threaded processors.
2541 */
2542 cpi->cpi_coreid = cpi->cpi_chipid;
2543 cpi->cpi_pkgcoreid = 0;
2544 } else {
2545 /*
2546 * Single-core single-thread processors.
2547 */
2548 cpi->cpi_coreid = cpu->cpu_id;
2549 cpi->cpi_pkgcoreid = 0;
2550 }
2551 cpi->cpi_procnodeid = cpi->cpi_chipid;
2552 cpi->cpi_compunitid = cpi->cpi_coreid;
2553 }
2554
2555 /*
2556 * Historically, AMD has had CMP chips with only a single thread per core.
2557 * However, starting in family 17h (Zen), this has changed and they now have
2558 * multiple threads. Our internal core id needs to be a unique value.
2559 *
2560 * To determine the core id of an AMD system, if we're from a family before 17h,
2561 * then we just use the cpu id, as that gives us a good value that will be
2562 * unique for each core. If instead, we're on family 17h or later, then we need
2563 * to do something more complicated. CPUID leaf 0x8000001e can tell us how
2564 * many threads share a core. Based on that, we'll shift the APIC ID.
2565 * We can't use the normal core id in that leaf as it's only unique within the
2566 * socket, which is perfect for cpi_pkgcoreid, but not for us.
2567 */
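/*
 * As a hypothetical example (values invented for illustration): on a family
 * 0x17 part with TOPOEXT where leaf 0x8000001e reports two threads per core,
 * an APIC ID of 0x9 yields a core id of 0x4; on anything older we simply
 * return cpu->cpu_id.
 */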
2568 static id_t
2569 cpuid_amd_get_coreid(cpu_t *cpu)
2570 {
2571 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2572
2573 if (cpi->cpi_family >= 0x17 &&
2574 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2575 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2576 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2577 if (nthreads > 1) {
2578 VERIFY3U(nthreads, ==, 2);
2579 return (cpi->cpi_apicid >> 1);
2580 }
2581 }
2582
2583 return (cpu->cpu_id);
2584 }
2585
2586 /*
2587 * Determining the IDs on AMD is a more challenging task. This is notable because of the
2588 * following two facts:
2589 *
2590 * 1. Before family 0x17 (Zen), there was no support for SMT and there was
2591 * also no way to get an actual unique core id from the system. As such, we
2592 * synthesize this case by using cpu->cpu_id. This scheme does not,
2593 * however, guarantee that sibling cores of a chip will have sequential
2594 * coreids starting at a multiple of the number of cores per chip - that is
2595 * usually the case, but if the APIC IDs have been set up in a different
2596 * order then we need to perform a few more gymnastics for the pkgcoreid.
2597 *
2598 * 2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2599 * called compute units. These compute units share the L1I cache, L2 cache,
2600 * and the FPU. To deal with this, a new topology leaf was added in
2601 * 0x8000001e. However, parts of this leaf have different meanings
2602 * once we get to family 0x17.
2603 */
2604
2605 static void
2606 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2607 {
2608 int i, first_half, coreidsz;
2609 uint32_t nb_caps_reg;
2610 uint_t node2_1;
2611 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2612 struct cpuid_regs *cp;
2613
2614 /*
2615 * Calculate the core id (this comes from hardware in family 0x17 if it
2616 * hasn't been stripped by virtualization). We always set the compute
2617 * unit id to the same value. Also, initialize the default number of
2618 * cores per compute unit and nodes per package. This will be
2619 * overwritten when we know information about a particular family.
2620 */
2621 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2622 cpi->cpi_compunitid = cpi->cpi_coreid;
2623 cpi->cpi_cores_per_compunit = 1;
2624 cpi->cpi_procnodes_per_pkg = 1;
2625
2626 /*
2627 * To construct the logical ID, we need to determine how many APIC IDs
2628 * are dedicated to the cores and threads. This is provided for us in
2629 * 0x80000008. However, if it's not present (say due to virtualization),
2630 * then we assume it's one. This should be present on all 64-bit AMD
2631 * processors. It was added in family 0xf (Hammer).
2632 */
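/*
 * Leaf 0x80000008 %ecx bits 15:12 (AMD's ApicIdCoreIdSize field, as we
 * interpret it here) give the number of low-order APIC ID bits that
 * enumerate logical CPUs within the chip; the mask applied below strips
 * everything else off. For example, a hypothetical coreidsz of 4 would give
 * cpi_clogid = cpi_apicid & 0xf.
 */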
2633 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2634 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2635
2636 /*
2637 * In AMD parlance, a chip is really a node, while illumos
2638 * uses chip as equivalent to socket/package.
2639 */
2640 if (coreidsz == 0) {
2641 /* Use legacy method */
2642 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2643 coreidsz++;
2644 if (coreidsz == 0)
2645 coreidsz = 1;
2646 }
2647 } else {
2648 /* Assume single-core part */
2649 coreidsz = 1;
2650 }
2651 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2652
2653 /*
2654 * The package core ID varies depending on the family. While it may be
2655 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2656 * this value is the core id in the given node. For non-virtualized
2657 * family 17h, we need to take the logical core id and shift off the
2658 * threads like we do when getting the core id. Otherwise, we can use
2659 * the clogid as is. When family 17h is virtualized, the clogid is
2660 * still sufficient: if we don't have valid data in the leaf, then we
2661 * won't think we have SMT, in which case cpi_clogid is all that
2662 * we need.
2663 */
2664 if (cpi->cpi_family >= 0x17 &&
2665 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2666 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2667 cpi->cpi_extd[0x1e].cp_ebx != 0) {
2668 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2669 if (nthreads > 1) {
2670 VERIFY3U(nthreads, ==, 2);
2671 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2672 } else {
2673 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2674 }
2675 } else {
2676 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2677 }
2678
2679 /*
2680 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2681 * (bulldozer) or newer, then we can derive all of this from leaf
2682 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2683 */
2684 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2685 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2686 cp = &cpi->cpi_extd[0x1e];
2687
2688 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2689 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2690
2691 /*
2692 * For Bulldozer-era CPUs, recalculate the compute unit
2693 * information.
2694 */
2695 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2696 cpi->cpi_cores_per_compunit =
2697 BITX(cp->cp_ebx, 15, 8) + 1;
2698 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2699 (cpi->cpi_ncore_per_chip /
2700 cpi->cpi_cores_per_compunit) *
2701 (cpi->cpi_procnodeid /
2702 cpi->cpi_procnodes_per_pkg);
2703 }
2704 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2705 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2706 } else if (cpi->cpi_family == 0x10) {
2707 /*
2708 * See if we are a multi-node processor.
2709 * All processors in the system have the same number of nodes.
2710 */
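/*
 * On these parts the northbridge for node N appears in PCI configuration
 * space at bus 0, device 0x18 + N; the pci_getl_func() calls here and below
 * presumably take bus, device, function, and offset, reading function 3,
 * offset 0xe8 -- the capabilities register referenced in the comments below.
 */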
2711 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8);
2712 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2713 /* Single-node */
2714 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2715 coreidsz);
2716 } else {
2717
2718 /*
2719 * Multi-node revision D (2 nodes per package
2720 * are supported)
2721 */
2722 cpi->cpi_procnodes_per_pkg = 2;
2723
2724 first_half = (cpi->cpi_pkgcoreid <=
2725 (cpi->cpi_ncore_per_chip/2 - 1));
2726
2727 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2728 /* We are BSP */
2729 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2730 } else {
2731
2732 /* We are AP */
2733 /* NodeId[2:1] bits to use for reading F3xe8 */
2734 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2735
2736 nb_caps_reg =
2737 pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2738
2739 /*
2740 * Check IntNodeNum bit (31:30, but bit 31 is
2741 * always 0 on dual-node processors)
2742 */
2743 if (BITX(nb_caps_reg, 30, 30) == 0)
2744 cpi->cpi_procnodeid = node2_1 +
2745 !first_half;
2746 else
2747 cpi->cpi_procnodeid = node2_1 +
2748 first_half;
2749 }
2750 }
2751 } else {
2752 cpi->cpi_procnodeid = 0;
2753 }
2754
2755 cpi->cpi_chipid =
2756 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2757
2758 cpi->cpi_ncore_bits = coreidsz;
2759 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2760 cpi->cpi_ncore_per_chip);
2761 }
2762
2763 static void
2764 spec_uarch_flush_noop(void)
2765 {
2766 }
2767
2768 /*
2769 * When microcode is present that mitigates MDS, this wrmsr will also flush the
2770 * MDS-related micro-architectural state that would normally be flushed by
2771 * calling x86_md_clear().
2772 */
2773 static void
2774 spec_uarch_flush_msr(void)
2775 {
2776 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2777 }
2778
2779 /*
2780 * This function pointer selects the routine that will flush certain
2781 * micro-architectural state on the processor. This flush is used to mitigate
2782 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. The
2783 * pointer can refer to one of three functions:
2784 *
2785 * - A noop, used either because we are vulnerable but do not have microcode
2786 *   available to help deal with a fix, or because we aren't vulnerable at
2787 *   all.
2788 *
2789 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2790 * mitigate MDS is present, also perform the equivalent of the MDS flush;
2791 * however, it only flushes the MDS related micro-architectural state on the
2792 * current hyperthread, it does not do anything for the twin.
2793 *
2794 * - x86_md_clear which will flush the MDS related state. This is done when we
2795 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2796 * (RDCL_NO is set).
2797 */
2798 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2799
2800 static void
2801 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2802 {
2803 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2804
2805 /*
2806 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2807 * has been fixed in hardware, it doesn't cover everything related to
2808 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2809 * need to mitigate this.
2810 */
2811 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2812 is_x86_feature(featureset, X86FSET_MDS_NO)) {
2813 return;
2814 }
2815
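/*
 * x86_md_clear is presumably assembled so that its first instruction is a
 * return; overwriting that byte with a nop lets execution fall through into
 * the VERW-based flush sequence once we know MD_CLEAR microcode support is
 * present.
 */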
2816 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2817 const uint8_t nop = NOP_INSTR;
2818 uint8_t *md = (uint8_t *)x86_md_clear;
2819
2820 *md = nop;
2821 }
2822
2823 membar_producer();
2824 }
2825
2826 static void
2827 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2828 {
2829 boolean_t need_l1d, need_mds;
2830 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2831
2832 /*
2833 * If we're not on Intel or we've mitigated both RDCL and MDS in
2834 * hardware, then there's nothing left for us to do for enabling the
2835 * flush. We can also go ahead and say that SMT exclusion is
2836 * unnecessary.
2837 */
2838 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2839 (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2840 is_x86_feature(featureset, X86FSET_MDS_NO))) {
2841 extern int smt_exclusion;
2842 smt_exclusion = 0;
2843 spec_uarch_flush = spec_uarch_flush_noop;
2844 membar_producer();
2845 return;
2846 }
2847
2848 /*
2849 * The locations where we need to perform an L1D flush are required for
2850 * mitigating both L1TF and MDS. When verw support is present in
2851 * microcode, then the L1D flush will take care of doing that as well.
2852 * However, if we have a system where RDCL_NO is present, but we don't
2853 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2854 * L1D flush.
2855 */
2856 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2857 is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2858 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2859 need_l1d = B_TRUE;
2860 } else {
2861 need_l1d = B_FALSE;
2862 }
2863
2864 if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2865 is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2866 need_mds = B_TRUE;
2867 } else {
2868 need_mds = B_FALSE;
2869 }
2870
2871 if (need_l1d) {
2872 spec_uarch_flush = spec_uarch_flush_msr;
2873 } else if (need_mds) {
2874 spec_uarch_flush = x86_md_clear;
2875 } else {
2876 /*
2877 * We have no hardware mitigations available to us.
2878 */
2879 spec_uarch_flush = spec_uarch_flush_noop;
2880 }
2881 membar_producer();
2882 }
2883
2884 /*
2885 * We default to enabling RSB mitigations.
2886 *
2887 * NOTE: We used to skip RSB mitigations with eIBRS, but developments around
2888 * post-barrier RSB guessing suggest we should enable RSB mitigations always
2889 * unless specifically instructed not to.
2890 *
2891 * AMD indicates that when Automatic IBRS is enabled we do not need to implement
2892 * return stack buffer clearing for VMEXIT as it takes care of it. The manual
2893 * also states that as long as SMEP is enabled and we maintain at least one
2894 * page between the kernel and user space (we have much more of a red zone),
2895 * then we do not need to clear the RSB. We constrain this to only when
2896 * Automatic IBRS is present.
2897 */
2898 static void
2899 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2900 {
2901 const uint8_t ret = RET_INSTR;
2902 uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2903
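/*
 * If the selected mitigation does not require RSB stuffing (Automatic IBRS,
 * or mitigations disabled entirely), patch the first byte of x86_rsb_stuff
 * with a return so that it becomes a no-op; otherwise leave the stuffing
 * sequence in place.
 */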
2904 switch (mit) {
2905 case X86_SPECTREV2_AUTO_IBRS:
2906 case X86_SPECTREV2_DISABLED:
2907 *stuff = ret;
2908 break;
2909 default:
2910 break;
2911 }
2912 }
2913
2914 static void
2915 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2916 {
2917 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2918 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2919 "_r14", "_r15" };
2920 const uint_t nthunks = ARRAY_SIZE(thunks);
2921 const char *type;
2922 uint_t i;
2923
2924 if (mit == x86_spectrev2_mitigation)
2925 return;
2926
2927 switch (mit) {
2928 case X86_SPECTREV2_RETPOLINE:
2929 type = "gen";
2930 break;
2931 case X86_SPECTREV2_AUTO_IBRS:
2932 case X86_SPECTREV2_ENHANCED_IBRS:
2933 case X86_SPECTREV2_DISABLED:
2934 type = "jmp";
2935 break;
2936 default:
2937 panic("asked to update retpoline state with unknown state!");
2938 }
2939
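/*
 * For each register-specific thunk, copy the text of the selected variant
 * (__x86_indirect_thunk_gen_<reg> or __x86_indirect_thunk_jmp_<reg>) over
 * the __x86_indirect_thunk_<reg> symbol that compiled code actually calls.
 */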
2940 for (i = 0; i < nthunks; i++) {
2941 uintptr_t source, dest;
2942 int ssize, dsize;
2943 char sourcebuf[64], destbuf[64];
2944
2945 (void) snprintf(destbuf, sizeof (destbuf),
2946 "__x86_indirect_thunk%s", thunks[i]);
2947 (void) snprintf(sourcebuf, sizeof (sourcebuf),
2948 "__x86_indirect_thunk_%s%s", type, thunks[i]);
2949
2950 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2951 dest = kobj_getelfsym(destbuf, NULL, &dsize);
2952 VERIFY3U(source, !=, 0);
2953 VERIFY3U(dest, !=, 0);
2954 VERIFY3S(dsize, >=, ssize);
2955 bcopy((void *)source, (void *)dest, ssize);
2956 }
2957 }
2958
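/*
 * With enhanced IBRS, IA32_SPEC_CTRL.IBRS only needs to be set once on each
 * logical CPU; the boot CPU does this when the mitigation is selected and
 * every other CPU does it as it comes up (see cpuid_scan_security()).
 */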
2959 static void
2960 cpuid_enable_enhanced_ibrs(void)
2961 {
2962 uint64_t val;
2963
2964 val = rdmsr(MSR_IA32_SPEC_CTRL);
2965 val |= IA32_SPEC_CTRL_IBRS;
2966 wrmsr(MSR_IA32_SPEC_CTRL, val);
2967 }
2968
2969 static void
2970 cpuid_enable_auto_ibrs(void)
2971 {
2972 uint64_t val;
2973
2974 val = rdmsr(MSR_AMD_EFER);
2975 val |= AMD_EFER_AIBRSE;
2976 wrmsr(MSR_AMD_EFER, val);
2977 }
2978
2979 /*
2980 * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2981 * we can disable TSX, we do so.
2982 *
2983 * This determination is done only on the boot CPU, potentially after loading
2984 * updated microcode.
2985 */
2986 static void
2987 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2988 {
2989 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2990
2991 VERIFY(cpu->cpu_id == 0);
2992
2993 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2994 x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2995 return;
2996 }
2997
2998 if (x86_disable_taa) {
2999 x86_taa_mitigation = X86_TAA_DISABLED;
3000 return;
3001 }
3002
3003 /*
3004 * If we do not have the ability to disable TSX, then our only
3005 * mitigation options are in hardware (TAA_NO), or by using our existing
3006 * MDS mitigation as described above. The latter relies upon us having
3007 * configured MDS mitigations correctly! This includes disabling SMT if
3008 * we want cross-CPU-thread protection.
3009 */
3010 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3011 /*
3012 * It's not clear whether any parts will enumerate TAA_NO
3013 * *without* TSX_CTRL, but let's mark it as such if we see this.
3014 */
3015 if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3016 x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3017 return;
3018 }
3019
3020 if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3021 !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3022 x86_taa_mitigation = X86_TAA_MD_CLEAR;
3023 } else {
3024 x86_taa_mitigation = X86_TAA_NOTHING;
3025 }
3026 return;
3027 }
3028
3029 /*
3030 * We have TSX_CTRL, but we can only fully disable TSX if we're early
3031 * enough in boot.
3032 *
3033 * Otherwise, we'll fall back to causing transactions to abort as our
3034 * mitigation. TSX-using code will always take the fallback path.
3035 */
3036 if (cpi->cpi_pass < 4) {
3037 x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3038 } else {
3039 x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3040 }
3041 }
3042
3043 /*
3044 * As mentioned, we should only touch the MSR when we've got suitable
3045 * microcode loaded on this CPU.
3046 */
3047 static void
3048 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3049 {
3050 uint64_t val;
3051
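/*
 * IA32_TSX_CTRL_RTM_DISABLE forces RTM transactions to always abort, while
 * IA32_TSX_CTRL_CPUID_CLEAR additionally hides the RTM and HLE enumeration
 * from CPUID so that well-behaved software will not attempt to use TSX at
 * all.
 */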
3052 switch (taa) {
3053 case X86_TAA_TSX_DISABLE:
3054 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3055 return;
3056 val = rdmsr(MSR_IA32_TSX_CTRL);
3057 val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3058 wrmsr(MSR_IA32_TSX_CTRL, val);
3059 break;
3060 case X86_TAA_TSX_FORCE_ABORT:
3061 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3062 return;
3063 val = rdmsr(MSR_IA32_TSX_CTRL);
3064 val |= IA32_TSX_CTRL_RTM_DISABLE;
3065 wrmsr(MSR_IA32_TSX_CTRL, val);
3066 break;
3067 case X86_TAA_HW_MITIGATED:
3068 case X86_TAA_MD_CLEAR:
3069 case X86_TAA_DISABLED:
3070 case X86_TAA_NOTHING:
3071 break;
3072 }
3073 }
3074
3075 static void
3076 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3077 {
3078 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3079 x86_spectrev2_mitigation_t v2mit;
3080
3081 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3082 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3083 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3084 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3085 add_x86_feature(featureset, X86FSET_IBPB);
3086 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3087 add_x86_feature(featureset, X86FSET_IBRS);
3088 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3089 add_x86_feature(featureset, X86FSET_STIBP);
3090 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3091 add_x86_feature(featureset, X86FSET_STIBP_ALL);
3092 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3093 add_x86_feature(featureset, X86FSET_SSBD);
3094 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3095 add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3096 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3097 add_x86_feature(featureset, X86FSET_SSB_NO);
3098
3099 /*
3100 * Rather than Enhanced IBRS, AMD has a different feature: a
3101 * bit in EFER that can be enabled and will basically do
3102 * the right thing while executing in the kernel.
3103 */
3104 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3105 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3106 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3107 (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3108 add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3109 }
3110
3111 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3112 cpi->cpi_maxeax >= 7) {
3113 struct cpuid_regs *ecp;
3114 ecp = &cpi->cpi_std[7];
3115
3116 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3117 add_x86_feature(featureset, X86FSET_MD_CLEAR);
3118 }
3119
3120 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3121 add_x86_feature(featureset, X86FSET_IBRS);
3122 add_x86_feature(featureset, X86FSET_IBPB);
3123 }
3124
3125 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3126 add_x86_feature(featureset, X86FSET_STIBP);
3127 }
3128
3129 /*
3130 * Don't read the arch caps MSR on xpv where we lack the
3131 * on_trap().
3132 */
3133 #ifndef __xpv
3134 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3135 on_trap_data_t otd;
3136
3137 /*
3138 * Be paranoid and assume we'll get a #GP.
3139 */
3140 if (!on_trap(&otd, OT_DATA_ACCESS)) {
3141 uint64_t reg;
3142
3143 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3144 if (reg & IA32_ARCH_CAP_RDCL_NO) {
3145 add_x86_feature(featureset,
3146 X86FSET_RDCL_NO);
3147 }
3148 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3149 add_x86_feature(featureset,
3150 X86FSET_IBRS_ALL);
3151 }
3152 if (reg & IA32_ARCH_CAP_RSBA) {
3153 add_x86_feature(featureset,
3154 X86FSET_RSBA);
3155 }
3156 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3157 add_x86_feature(featureset,
3158 X86FSET_L1D_VM_NO);
3159 }
3160 if (reg & IA32_ARCH_CAP_SSB_NO) {
3161 add_x86_feature(featureset,
3162 X86FSET_SSB_NO);
3163 }
3164 if (reg & IA32_ARCH_CAP_MDS_NO) {
3165 add_x86_feature(featureset,
3166 X86FSET_MDS_NO);
3167 }
3168 if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3169 add_x86_feature(featureset,
3170 X86FSET_TSX_CTRL);
3171 }
3172 if (reg & IA32_ARCH_CAP_TAA_NO) {
3173 add_x86_feature(featureset,
3174 X86FSET_TAA_NO);
3175 }
3176 }
3177 no_trap();
3178 }
3179 #endif /* !__xpv */
3180
3181 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3182 add_x86_feature(featureset, X86FSET_SSBD);
3183
3184 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3185 add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3186 }
3187
3188 /*
3189 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3190 * will have already run this function and determined what we need to
3191 * do. This gives us a hook for per-HW thread mitigations such as
3192 * enhanced IBRS, or disabling TSX.
3193 */
3194 if (cpu->cpu_id != 0) {
3195 switch (x86_spectrev2_mitigation) {
3196 case X86_SPECTREV2_ENHANCED_IBRS:
3197 cpuid_enable_enhanced_ibrs();
3198 break;
3199 case X86_SPECTREV2_AUTO_IBRS:
3200 cpuid_enable_auto_ibrs();
3201 break;
3202 default:
3203 break;
3204 }
3205
3206 cpuid_apply_tsx(x86_taa_mitigation, featureset);
3207 return;
3208 }
3209
3210 /*
3211 * Go through and initialize various security mechanisms that we should
3212 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3213 * TAA.
3214 */
3215
3216 /*
3217 * By default we've come in with retpolines enabled. Check whether we
3218 * should disable them or enable enhanced or automatic IBRS. RSB
3219 * stuffing is enabled by default. Note, we do not allow the use of AMD
3220 * optimized retpolines as it was disclosed by AMD in March 2022 that
3221 * they were still vulnerable. Prior to that point, we used them.
3222 */
3223 if (x86_disable_spectrev2 != 0) {
3224 v2mit = X86_SPECTREV2_DISABLED;
3225 } else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3226 cpuid_enable_auto_ibrs();
3227 v2mit = X86_SPECTREV2_AUTO_IBRS;
3228 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3229 cpuid_enable_enhanced_ibrs();
3230 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3231 } else {
3232 v2mit = X86_SPECTREV2_RETPOLINE;
3233 }
3234
3235 cpuid_patch_retpolines(v2mit);
3236 cpuid_patch_rsb(v2mit);
3237 x86_spectrev2_mitigation = v2mit;
3238 membar_producer();
3239
3240 /*
3241 * We need to determine what changes are required for mitigating L1TF
3242 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3243 * is required.
3244 *
3245 * If any of these are present, then we need to flush u-arch state at
3246 * various points. For MDS, we need to do so whenever we change to a
3247 * lesser privilege level or we are halting the CPU. For L1TF we need to
3248 * flush the L1D cache at VM entry. When we have microcode that handles
3249 * MDS, the L1D flush also clears the other u-arch state that the
3250 * md_clear does.
3251 */
3252
3253 /*
3254 * Update whether or not we need to be taking explicit action against
3255 * MDS.
3256 */
3257 cpuid_update_md_clear(cpu, featureset);
3258
3259 /*
3260 * Determine whether SMT exclusion is required and whether or not we
3261 * need to perform an l1d flush.
3262 */
3263 cpuid_update_l1d_flush(cpu, featureset);
3264
3265 /*
3266 * Determine what our mitigation strategy should be for TAA and then
3267 * also apply TAA mitigations.
3268 */
3269 cpuid_update_tsx(cpu, featureset);
3270 cpuid_apply_tsx(x86_taa_mitigation, featureset);
3271 }
3272
3273 /*
3274 * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3275 */
3276 void
3277 setup_xfem(void)
3278 {
3279 uint64_t flags = XFEATURE_LEGACY_FP;
3280
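/*
 * XFEATURE_LEGACY_FP (x87 state, bit 0 of XCR0) must always be enabled; the
 * remaining bits are only turned on when the corresponding feature was
 * detected during the cpuid passes.
 */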
3281 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3282
3283 if (is_x86_feature(x86_featureset, X86FSET_SSE))
3284 flags |= XFEATURE_SSE;
3285
3286 if (is_x86_feature(x86_featureset, X86FSET_AVX))
3287 flags |= XFEATURE_AVX;
3288
3289 if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3290 flags |= XFEATURE_AVX512;
3291
3292 set_xcr(XFEATURE_ENABLED_MASK, flags);
3293
3294 xsave_bv_all = flags;
3295 }
3296
3297 static void
3298 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3299 {
3300 struct cpuid_info *cpi;
3301
3302 cpi = cpu->cpu_m.mcpu_cpi;
3303
3304 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3305 cpi->cpi_vendor == X86_VENDOR_HYGON) {
3306 cpuid_gather_amd_topology_leaves(cpu);
3307 }
3308
3309 cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3310
3311 /*
3312 * Before we can calculate the IDs that we should assign to this
3313 * processor, we need to understand how many cores and threads it has.
3314 */
3315 switch (cpi->cpi_vendor) {
3316 case X86_VENDOR_Intel:
3317 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3318 &cpi->cpi_ncore_per_chip);
3319 break;
3320 case X86_VENDOR_AMD:
3321 case X86_VENDOR_HYGON:
3322 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3323 &cpi->cpi_ncore_per_chip);
3324 break;
3325 default:
3326 /*
3327 * If we have some other x86 compatible chip, it's not clear how
3328 * it would behave. The most common case is virtualization
3329 * today, though there are also 64-bit VIA chips. Assume that
3330 * all we can get is the basic Leaf 1 HTT information.
3331 */
3332 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3333 cpi->cpi_ncore_per_chip = 1;
3334 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3335 }
3336 break;
3337 }
3338
3339 /*
3340 * Based on the calculated number of threads and cores, potentially
3341 * assign the HTT and CMT features.
3342 */
3343 if (cpi->cpi_ncore_per_chip > 1) {
3344 add_x86_feature(featureset, X86FSET_CMP);
3345 }
3346
3347 if (cpi->cpi_ncpu_per_chip > 1 &&
3348 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3349 add_x86_feature(featureset, X86FSET_HTT);
3350 }
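/*
 * As a hypothetical example: a package reporting 8 logical CPUs across 4
 * cores gets both X86FSET_CMP and X86FSET_HTT, while a 4-core/4-thread
 * package gets only X86FSET_CMP.
 */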
3351
3352 /*
3353 * Now that this has been set up, we need to go through and calculate all of
3354 * the rest of the parameters that exist. If we think the CPU doesn't
3355 * have either SMT (HTT) or CMP, then we basically go through and fake
3356 * up information in some way. The most likely case for this is
3357 * virtualization where we have a lot of partial topology information.
3358 */
3359 if (!is_x86_feature(featureset, X86FSET_HTT) &&
3360 !is_x86_feature(featureset, X86FSET_CMP)) {
3361 /*
3362 * This is a single core, single-threaded processor.
3363 */
3364 cpi->cpi_procnodes_per_pkg = 1;
3365 cpi->cpi_cores_per_compunit = 1;
3366 cpi->cpi_compunitid = 0;
3367 cpi->cpi_chipid = -1;
3368 cpi->cpi_clogid = 0;
3369 cpi->cpi_coreid = cpu->cpu_id;
3370 cpi->cpi_pkgcoreid = 0;
3371 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3372 cpi->cpi_vendor == X86_VENDOR_HYGON) {
3373 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3374 } else {
3375 cpi->cpi_procnodeid = cpi->cpi_chipid;
3376 }
3377 } else {
3378 switch (cpi->cpi_vendor) {
3379 case X86_VENDOR_Intel:
3380 cpuid_intel_getids(cpu, featureset);
3381 break;
3382 case X86_VENDOR_AMD:
3383 case X86_VENDOR_HYGON:
3384 cpuid_amd_getids(cpu, featureset);
3385 break;
3386 default:
3387 /*
3388 * In this case, it's hard to say what we should do.
3389 * We're going to model them to the OS as single core
3390 * threads. We don't have a good identifier for them, so
3391 * we're just going to use the cpu id all on a single
3392 * chip.
3393 *
3394 * This case has historically been different from the
3395 * case above where we don't have HTT or CMP. While they
3396 * could be combined, we've opted to keep it separate to
3397 * minimize the risk of topology changes in weird cases.
3398 */
3399 cpi->cpi_procnodes_per_pkg = 1;
3400 cpi->cpi_cores_per_compunit = 1;
3401 cpi->cpi_chipid = 0;
3402 cpi->cpi_coreid = cpu->cpu_id;
3403 cpi->cpi_clogid = cpu->cpu_id;
3404 cpi->cpi_pkgcoreid = cpu->cpu_id;
3405 cpi->cpi_procnodeid = cpi->cpi_chipid;
3406 cpi->cpi_compunitid = cpi->cpi_coreid;
3407 break;
3408 }
3409 }
3410 }
3411
3412 /*
3413 * Gather relevant CPU features from leaf 6 which covers thermal information. We
3414 * always gather leaf 6 if it's supported; however, we only look for features on
3415 * Intel systems as AMD does not currently define any of the features we look
3416 * for below.
3417 */
3418 static void
3419 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3420 {
3421 struct cpuid_regs *cp;
3422 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3423
3424 if (cpi->cpi_maxeax < 6) {
3425 return;
3426 }
3427
3428 cp = &cpi->cpi_std[6];
3429 cp->cp_eax = 6;
3430 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3431 (void) __cpuid_insn(cp);
3432 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3433
3434 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3435 return;
3436 }
3437
3438 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3439 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3440 }
3441
3442 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3443 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3444 }
3445 }
3446
3447 /*
3448 * This is used when we discover that we have AVX support in cpuid. This
3449 * proceeds to scan for the rest of the AVX derived features.
3450 */
3451 static void
3452 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3453 {
3454 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3455
3456 /*
3457 * If we don't have AVX, don't bother with most of this.
3458 */
3459 if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3460 return;
3461
3462 add_x86_feature(featureset, X86FSET_AVX);
3463
3464 /*
3465 * Intel says we can't check these without also
3466 * checking AVX.
3467 */
3468 if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3469 add_x86_feature(featureset, X86FSET_F16C);
3470
3471 if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3472 add_x86_feature(featureset, X86FSET_FMA);
3473
3474 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3475 add_x86_feature(featureset, X86FSET_BMI1);
3476
3477 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3478 add_x86_feature(featureset, X86FSET_BMI2);
3479
3480 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3481 add_x86_feature(featureset, X86FSET_AVX2);
3482
3483 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3484 add_x86_feature(featureset, X86FSET_VAES);
3485
3486 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3487 add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3488
3489 /*
3490 * The rest of the AVX features require AVX512. Do not check them unless
3491 * it is present.
3492 */
3493 if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3494 return;
3495 add_x86_feature(featureset, X86FSET_AVX512F);
3496
3497 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3498 add_x86_feature(featureset, X86FSET_AVX512DQ);
3499
3500 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3501 add_x86_feature(featureset, X86FSET_AVX512FMA);
3502
3503 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3504 add_x86_feature(featureset, X86FSET_AVX512PF);
3505
3506 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3507 add_x86_feature(featureset, X86FSET_AVX512ER);
3508
3509 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3510 add_x86_feature(featureset, X86FSET_AVX512CD);
3511
3512 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3513 add_x86_feature(featureset, X86FSET_AVX512BW);
3514
3515 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3516 add_x86_feature(featureset, X86FSET_AVX512VL);
3517
3518 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3519 add_x86_feature(featureset, X86FSET_AVX512VBMI);
3520
3521 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3522 add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3523
3524 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3525 add_x86_feature(featureset, X86FSET_AVX512VNNI);
3526
3527 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3528 add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3529
3530 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3531 add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3532
3533 if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3534 add_x86_feature(featureset, X86FSET_AVX512NNIW);
3535
3536 if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3537 add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3538
3539 /*
3540 * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3541 * we don't need to.
3542 */
3543 if (cpi->cpi_std[7].cp_eax < 1)
3544 return;
3545
3546 if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3547 add_x86_feature(featureset, X86FSET_AVX512_BF16);
3548 }
3549
3550 /*
3551 * PPIN is the protected processor inventory number. On AMD this is an actual
3552 * feature bit. However, on Intel systems we need to read the platform
3553 * information MSR if we're on a specific model.
3554 */
3555 #if !defined(__xpv)
3556 static void
3557 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3558 {
3559 on_trap_data_t otd;
3560 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3561
3562 switch (cpi->cpi_vendor) {
3563 case X86_VENDOR_AMD:
3564 /*
3565 * This leaf will have already been gathered in the topology
3566 * functions.
3567 */
3568 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3569 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3570 add_x86_feature(featureset, X86FSET_PPIN);
3571 }
3572 }
3573 break;
3574 case X86_VENDOR_Intel:
3575 if (cpi->cpi_family != 6)
3576 break;
3577 switch (cpi->cpi_model) {
3578 case INTC_MODEL_IVYBRIDGE_XEON:
3579 case INTC_MODEL_HASWELL_XEON:
3580 case INTC_MODEL_BROADWELL_XEON:
3581 case INTC_MODEL_BROADWELL_XEON_D:
3582 case INTC_MODEL_SKYLAKE_XEON:
3583 case INTC_MODEL_ICELAKE_XEON:
3584 if (!on_trap(&otd, OT_DATA_ACCESS)) {
3585 uint64_t value;
3586
3587 value = rdmsr(MSR_PLATFORM_INFO);
3588 if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3589 add_x86_feature(featureset,
3590 X86FSET_PPIN);
3591 }
3592 }
3593 no_trap();
3594 break;
3595 default:
3596 break;
3597 }
3598 break;
3599 default:
3600 break;
3601 }
3602 }
3603 #endif /* ! __xpv */
3604
3605 static void
3606 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3607 {
3608 uchar_t *featureset = (uchar_t *)arg;
3609
3610 /*
3611 * We don't run on any processor that doesn't have cpuid; we could not
3612 * possibly have arrived here otherwise.
3613 */
3614 add_x86_feature(featureset, X86FSET_CPUID);
3615 }
3616
3617 static void
3618 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3619 {
3620 struct cpuid_info *cpi;
3621 struct cpuid_regs *cp;
3622
3623 /*
3624 * We require that virtual/native detection be complete and that PCI
3625 * config space access has been set up; at present there is no reliable
3626 * way to determine the latter.
3627 */
3628 #if !defined(__xpv)
3629 ASSERT3S(platform_type, !=, -1);
3630 #endif /* !__xpv */
3631
3632 cpi = cpu->cpu_m.mcpu_cpi;
3633 ASSERT(cpi != NULL);
3634
3635 cp = &cpi->cpi_std[0];
3636 cp->cp_eax = 0;
3637 cpi->cpi_maxeax = __cpuid_insn(cp);
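/*
 * Leaf 0 returns the 12-byte vendor string in %ebx, %edx, %ecx order (e.g.
 * "GenuineIntel" or "AuthenticAMD"); reassemble it into cpi_vendorstr.
 */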
3638 {
3639 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3640 *iptr++ = cp->cp_ebx;
3641 *iptr++ = cp->cp_edx;
3642 *iptr++ = cp->cp_ecx;
3643 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3644 }
3645
3646 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3647 x86_vendor = cpi->cpi_vendor; /* for compatibility */
3648
3649 /*
3650 * Limit the range in case of weird hardware
3651 */
3652 if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3653 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3654 if (cpi->cpi_maxeax < 1)
3655 return;
3656
3657 cp = &cpi->cpi_std[1];
3658 cp->cp_eax = 1;
3659 (void) __cpuid_insn(cp);
3660
3661 /*
3662 * Extract identifying constants for easy access.
3663 */
3664 cpi->cpi_model = CPI_MODEL(cpi);
3665 cpi->cpi_family = CPI_FAMILY(cpi);
3666
3667 if (cpi->cpi_family == 0xf)
3668 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3669
3670 /*
3671 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3672 * Intel, and presumably everyone else, uses model == 0xf, as
3673 * one would expect (max value means possible overflow). Sigh.
3674 */
3675
3676 switch (cpi->cpi_vendor) {
3677 case X86_VENDOR_Intel:
3678 if (IS_EXTENDED_MODEL_INTEL(cpi))
3679 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3680 break;
3681 case X86_VENDOR_AMD:
3682 if (CPI_FAMILY(cpi) == 0xf)
3683 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3684 break;
3685 case X86_VENDOR_HYGON:
3686 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3687 break;
3688 default:
3689 if (cpi->cpi_model == 0xf)
3690 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3691 break;
3692 }
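/*
 * As a worked example (hypothetical part): an AMD processor with base
 * family 0xf and extended family 0x8 ends up with cpi_family 0x17, and a
 * base model of 0x1 with extended model 0x7 yields cpi_model 0x71.
 */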
3693
3694 cpi->cpi_step = CPI_STEP(cpi);
3695 cpi->cpi_brandid = CPI_BRANDID(cpi);
3696
3697 /*
3698 * Synthesize chip "revision" and socket type
3699 */
3700 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3701 cpi->cpi_model, cpi->cpi_step);
3702 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3703 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3704 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3705 cpi->cpi_model, cpi->cpi_step);
3706 cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
3707 cpi->cpi_model, cpi->cpi_step);
3708 }
3709
3710 static void
3711 cpuid_pass_basic(cpu_t *cpu, void *arg)
3712 {
3713 uchar_t *featureset = (uchar_t *)arg;
3714 uint32_t mask_ecx, mask_edx;
3715 struct cpuid_info *cpi;
3716 struct cpuid_regs *cp;
3717 int xcpuid;
3718 #if !defined(__xpv)
3719 extern int idle_cpu_prefer_mwait;
3720 #endif
3721
3722 cpi = cpu->cpu_m.mcpu_cpi;
3723 ASSERT(cpi != NULL);
3724
3725 if (cpi->cpi_maxeax < 1)
3726 return;
3727
3728 /*
3729 * This was filled during the identification pass.
3730 */
3731 cp = &cpi->cpi_std[1];
3732
3733 /*
3734 * *default* assumptions:
3735 * - believe %edx feature word
3736 * - ignore %ecx feature word
3737 * - 32-bit virtual and physical addressing
3738 */
3739 mask_edx = 0xffffffff;
3740 mask_ecx = 0;
3741
3742 cpi->cpi_pabits = cpi->cpi_vabits = 32;
3743
3744 switch (cpi->cpi_vendor) {
3745 case X86_VENDOR_Intel:
3746 if (cpi->cpi_family == 5)
3747 x86_type = X86_TYPE_P5;
3748 else if (IS_LEGACY_P6(cpi)) {
3749 x86_type = X86_TYPE_P6;
3750 pentiumpro_bug4046376 = 1;
3751 /*
3752 * Clear the SEP bit when it was set erroneously
3753 */
3754 if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3755 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3756 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3757 x86_type = X86_TYPE_P4;
3758 /*
3759 * We don't currently depend on any of the %ecx
3760 * features until Prescott, so we'll only check
3761 * this from P4 onwards. We might want to revisit
3762 * that idea later.
3763 */
3764 mask_ecx = 0xffffffff;
3765 } else if (cpi->cpi_family > 0xf)
3766 mask_ecx = 0xffffffff;
3767 /*
3768 * We don't support MONITOR/MWAIT if leaf 5 is not available
3769 * to obtain the monitor linesize.
3770 */
3771 if (cpi->cpi_maxeax < 5)
3772 mask_ecx &= ~CPUID_INTC_ECX_MON;
3773 break;
3774 case X86_VENDOR_IntelClone:
3775 default:
3776 break;
3777 case X86_VENDOR_AMD:
3778 #if defined(OPTERON_ERRATUM_108)
3779 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3780 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3781 cpi->cpi_model = 0xc;
3782 } else
3783 #endif
3784 if (cpi->cpi_family == 5) {
3785 /*
3786 * AMD K5 and K6
3787 *
3788 * These CPUs have an incomplete implementation
3789 * of MCA/MCE which we mask away.
3790 */
3791 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3792
3793 /*
3794 * Model 0 uses the wrong (APIC) bit
3795 * to indicate PGE. Fix it here.
3796 */
3797 if (cpi->cpi_model == 0) {
3798 if (cp->cp_edx & 0x200) {
3799 cp->cp_edx &= ~0x200;
3800 cp->cp_edx |= CPUID_INTC_EDX_PGE;
3801 }
3802 }
3803
3804 /*
3805 * Early models had problems w/ MMX; disable.
3806 */
3807 if (cpi->cpi_model < 6)
3808 mask_edx &= ~CPUID_INTC_EDX_MMX;
3809 }
3810
3811 /*
3812 * For newer families, SSE3 and CX16, at least, are valid;
3813 * enable all
3814 */
3815 if (cpi->cpi_family >= 0xf)
3816 mask_ecx = 0xffffffff;
3817 /*
3818 * We don't support MONITOR/MWAIT if leaf 5 is not available
3819 * to obtain the monitor linesize.
3820 */
3821 if (cpi->cpi_maxeax < 5)
3822 mask_ecx &= ~CPUID_INTC_ECX_MON;
3823
3824 #if !defined(__xpv)
3825 /*
3826 * AMD has not historically used MWAIT in the CPU's idle loop.
3827 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3828 * know for certain that in at least family 17h, per AMD, mwait
3829 * is preferred. Families in-between are less certain.
3830 */
3831 if (cpi->cpi_family < 0x17) {
3832 idle_cpu_prefer_mwait = 0;
3833 }
3834 #endif
3835
3836 break;
3837 case X86_VENDOR_HYGON:
3838 /* Enable all for Hygon Dhyana CPU */
3839 mask_ecx = 0xffffffff;
3840 break;
3841 case X86_VENDOR_TM:
3842 /*
3843 * workaround the NT workaround in CMS 4.1
3844 */
3845 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3846 (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3847 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3848 break;
3849 case X86_VENDOR_Centaur:
3850 /*
3851 * workaround the NT workarounds again
3852 */
3853 if (cpi->cpi_family == 6)
3854 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3855 break;
3856 case X86_VENDOR_Cyrix:
3857 /*
3858 * We rely heavily on the probing in locore
3859 * to actually figure out what parts, if any,
3860 * of the Cyrix cpuid instruction to believe.
3861 */
3862 switch (x86_type) {
3863 case X86_TYPE_CYRIX_486:
3864 mask_edx = 0;
3865 break;
3866 case X86_TYPE_CYRIX_6x86:
3867 mask_edx = 0;
3868 break;
3869 case X86_TYPE_CYRIX_6x86L:
3870 mask_edx =
3871 CPUID_INTC_EDX_DE |
3872 CPUID_INTC_EDX_CX8;
3873 break;
3874 case X86_TYPE_CYRIX_6x86MX:
3875 mask_edx =
3876 CPUID_INTC_EDX_DE |
3877 CPUID_INTC_EDX_MSR |
3878 CPUID_INTC_EDX_CX8 |
3879 CPUID_INTC_EDX_PGE |
3880 CPUID_INTC_EDX_CMOV |
3881 CPUID_INTC_EDX_MMX;
3882 break;
3883 case X86_TYPE_CYRIX_GXm:
3884 mask_edx =
3885 CPUID_INTC_EDX_MSR |
3886 CPUID_INTC_EDX_CX8 |
3887 CPUID_INTC_EDX_CMOV |
3888 CPUID_INTC_EDX_MMX;
3889 break;
3890 case X86_TYPE_CYRIX_MediaGX:
3891 break;
3892 case X86_TYPE_CYRIX_MII:
3893 case X86_TYPE_VIA_CYRIX_III:
3894 mask_edx =
3895 CPUID_INTC_EDX_DE |
3896 CPUID_INTC_EDX_TSC |
3897 CPUID_INTC_EDX_MSR |
3898 CPUID_INTC_EDX_CX8 |
3899 CPUID_INTC_EDX_PGE |
3900 CPUID_INTC_EDX_CMOV |
3901 CPUID_INTC_EDX_MMX;
3902 break;
3903 default:
3904 break;
3905 }
3906 break;
3907 }
3908
3909 #if defined(__xpv)
3910 /*
3911 * Do not support MONITOR/MWAIT under a hypervisor
3912 */
3913 mask_ecx &= ~CPUID_INTC_ECX_MON;
3914 /*
3915 * Do not support XSAVE under a hypervisor for now
3916 */
3917 xsave_force_disable = B_TRUE;
3918
3919 #endif /* __xpv */
3920
3921 if (xsave_force_disable) {
3922 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3923 mask_ecx &= ~CPUID_INTC_ECX_AVX;
3924 mask_ecx &= ~CPUID_INTC_ECX_F16C;
3925 mask_ecx &= ~CPUID_INTC_ECX_FMA;
3926 }
3927
3928 /*
3929 * Now we've figured out the masks that determine
3930 * which bits we choose to believe, apply the masks
3931 * to the feature words, then map the kernel's view
3932 * of these feature words into its feature word.
3933 */
3934 cp->cp_edx &= mask_edx;
3935 cp->cp_ecx &= mask_ecx;
3936
3937 /*
3938 * apply any platform restrictions (we don't call this
3939 * immediately after __cpuid_insn here, because we need the
3940 * workarounds applied above first)
3941 */
3942 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3943
3944 /*
3945 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3946 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
3947 * 7 has sub-leaves determined by ecx.
3948 */
3949 if (cpi->cpi_maxeax >= 7) {
3950 struct cpuid_regs *ecp;
3951 ecp = &cpi->cpi_std[7];
3952 ecp->cp_eax = 7;
3953 ecp->cp_ecx = 0;
3954 (void) __cpuid_insn(ecp);
3955
3956 /*
3957 * If XSAVE has been disabled, just ignore all of the
3958 * extended-save-area dependent flags here. Removing most of
3959 * the leaf 7, sub-leaf 0 flags ensures that we don't
3960 * end up looking at additional xsave dependent leaves right
3961 * now.
3962 */
3963 if (xsave_force_disable) {
3964 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3965 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3966 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3967 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3968 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3969 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3970 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3971 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
3972 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
3973 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
3974 }
3975
3976 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3977 add_x86_feature(featureset, X86FSET_SMEP);
3978
3979 /*
3980 * We check disable_smap here in addition to in startup_smap()
3981 * to ensure CPUs that aren't the boot CPU don't accidentally
3982 * include it in the feature set and thus generate a mismatched
3983 * x86 feature set across CPUs.
3984 */
3985 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3986 disable_smap == 0)
3987 add_x86_feature(featureset, X86FSET_SMAP);
3988
3989 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3990 add_x86_feature(featureset, X86FSET_RDSEED);
3991
3992 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3993 add_x86_feature(featureset, X86FSET_ADX);
3994
3995 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3996 add_x86_feature(featureset, X86FSET_FSGSBASE);
3997
3998 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3999 add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4000
4001 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4002 add_x86_feature(featureset, X86FSET_INVPCID);
4003
4004 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4005 add_x86_feature(featureset, X86FSET_UMIP);
4006 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4007 add_x86_feature(featureset, X86FSET_PKU);
4008 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4009 add_x86_feature(featureset, X86FSET_OSPKE);
4010 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4011 add_x86_feature(featureset, X86FSET_GFNI);
4012
4013 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4014 add_x86_feature(featureset, X86FSET_CLWB);
4015
4016 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4017 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4018 add_x86_feature(featureset, X86FSET_MPX);
4019 }
4020
4021 /*
4022 * If we have subleaf 1 available, grab and store that. This is
4023 * used for more AVX and related features.
4024 */
4025 if (ecp->cp_eax >= 1) {
4026 struct cpuid_regs *c71;
4027 c71 = &cpi->cpi_sub7[0];
4028 c71->cp_eax = 7;
4029 c71->cp_ecx = 1;
4030 (void) __cpuid_insn(c71);
4031 }
4032 }
4033
4034 /*
4035 * fold in overrides from the "eeprom" mechanism
4036 */
4037 cp->cp_edx |= cpuid_feature_edx_include;
4038 cp->cp_edx &= ~cpuid_feature_edx_exclude;
4039
4040 cp->cp_ecx |= cpuid_feature_ecx_include;
4041 cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4042
4043 if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4044 add_x86_feature(featureset, X86FSET_LARGEPAGE);
4045 }
4046 if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4047 add_x86_feature(featureset, X86FSET_TSC);
4048 }
4049 if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4050 add_x86_feature(featureset, X86FSET_MSR);
4051 }
4052 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4053 add_x86_feature(featureset, X86FSET_MTRR);
4054 }
4055 if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4056 add_x86_feature(featureset, X86FSET_PGE);
4057 }
4058 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4059 add_x86_feature(featureset, X86FSET_CMOV);
4060 }
4061 if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4062 add_x86_feature(featureset, X86FSET_MMX);
4063 }
4064 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4065 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4066 add_x86_feature(featureset, X86FSET_MCA);
4067 }
4068 if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4069 add_x86_feature(featureset, X86FSET_PAE);
4070 }
4071 if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4072 add_x86_feature(featureset, X86FSET_CX8);
4073 }
4074 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4075 add_x86_feature(featureset, X86FSET_CX16);
4076 }
4077 if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4078 add_x86_feature(featureset, X86FSET_PAT);
4079 }
4080 if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4081 add_x86_feature(featureset, X86FSET_SEP);
4082 }
4083 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4084 /*
4085 * In our implementation, fxsave/fxrstor
4086 * are prerequisites before we'll even
4087 * try and do SSE things.
4088 */
4089 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4090 add_x86_feature(featureset, X86FSET_SSE);
4091 }
4092 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4093 add_x86_feature(featureset, X86FSET_SSE2);
4094 }
4095 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4096 add_x86_feature(featureset, X86FSET_SSE3);
4097 }
4098 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4099 add_x86_feature(featureset, X86FSET_SSSE3);
4100 }
4101 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4102 add_x86_feature(featureset, X86FSET_SSE4_1);
4103 }
4104 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4105 add_x86_feature(featureset, X86FSET_SSE4_2);
4106 }
4107 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4108 add_x86_feature(featureset, X86FSET_AES);
4109 }
4110 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4111 add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4112 }
4113
4114 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4115 add_x86_feature(featureset, X86FSET_SHA);
4116
4117 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4118 add_x86_feature(featureset, X86FSET_XSAVE);
4119
4120 /* We only test AVX & AVX512 when there is XSAVE */
4121 cpuid_basic_avx(cpu, featureset);
4122 }
4123 }
4124
4125 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4126 add_x86_feature(featureset, X86FSET_PCID);
4127 }
4128
4129 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4130 add_x86_feature(featureset, X86FSET_X2APIC);
4131 }
4132 if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4133 add_x86_feature(featureset, X86FSET_DE);
4134 }
4135 #if !defined(__xpv)
4136 if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4137
4138 /*
4139 * We require the CLFLUSH instruction for erratum workaround
4140 * to use MONITOR/MWAIT.
4141 */
4142 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4143 cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4144 add_x86_feature(featureset, X86FSET_MWAIT);
4145 } else {
4146 extern int idle_cpu_assert_cflush_monitor;
4147
4148 /*
4149 * All processors we are aware of which have
4150 * MONITOR/MWAIT also have CLFLUSH.
4151 */
4152 if (idle_cpu_assert_cflush_monitor) {
4153 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4154 (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4155 }
4156 }
4157 }
4158 #endif /* __xpv */
4159
4160 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4161 add_x86_feature(featureset, X86FSET_VMX);
4162 }
4163
4164 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4165 add_x86_feature(featureset, X86FSET_RDRAND);
4166
4167 /*
4168 * We only need this the first time; the rest of the cpus will follow suit.
4169 * We only capture this for the boot cpu.
4170 */
4171 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4172 add_x86_feature(featureset, X86FSET_CLFSH);
4173 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
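/*
 * Leaf 1 %ebx bits 15:8 report the CLFLUSH line size in units of 8 bytes,
 * hence the multiplication above; a typical value of 8 corresponds to a
 * 64-byte line.
 */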
4174 }
4175 if (is_x86_feature(featureset, X86FSET_PAE))
4176 cpi->cpi_pabits = 36;
4177
4178 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4179 struct cpuid_regs r, *ecp;
4180
4181 ecp = &r;
4182 ecp->cp_eax = 0xD;
4183 ecp->cp_ecx = 1;
4184 ecp->cp_edx = ecp->cp_ebx = 0;
4185 (void) __cpuid_insn(ecp);
4186
4187 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4188 add_x86_feature(featureset, X86FSET_XSAVEOPT);
4189 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4190 add_x86_feature(featureset, X86FSET_XSAVEC);
4191 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4192 add_x86_feature(featureset, X86FSET_XSAVES);
4193
4194 /*
4195 * Zen 2 family processors suffer from erratum 1386 that causes
4196 * xsaves to not function correctly in some circumstances. There
4197 * are no supervisor states in Zen 2 and earlier. Practically
4198 * speaking this has no impact for us as we currently do not
4199 * leverage compressed xsave formats. To safeguard against
4200 * issues in the future where we may opt to using it, we remove
4201 * it from the feature set now. While Matisse has a microcode
4202 * update available with a fix, not all Zen 2 CPUs do, so it's
4203 * simpler for the moment to unconditionally remove it.
4204 */
4205 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4206 uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4207 remove_x86_feature(featureset, X86FSET_XSAVES);
4208 }
4209 }
4210
4211 /*
4212 * Work on the "extended" feature information, doing
4213 * some basic initialization to be used in the extended pass.
4214 */
4215 xcpuid = 0;
4216 switch (cpi->cpi_vendor) {
4217 case X86_VENDOR_Intel:
4218 /*
4219 * On KVM we know we will have proper support for extended
4220 * cpuid.
4221 */
4222 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4223 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4224 (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4225 xcpuid++;
4226 break;
4227 case X86_VENDOR_AMD:
4228 if (cpi->cpi_family > 5 ||
4229 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4230 xcpuid++;
4231 break;
4232 case X86_VENDOR_Cyrix:
4233 /*
4234 * Only these Cyrix CPUs are -known- to support
4235 * extended cpuid operations.
4236 */
4237 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4238 x86_type == X86_TYPE_CYRIX_GXm)
4239 xcpuid++;
4240 break;
4241 case X86_VENDOR_HYGON:
4242 case X86_VENDOR_Centaur:
4243 case X86_VENDOR_TM:
4244 default:
4245 xcpuid++;
4246 break;
4247 }
4248
4249 if (xcpuid) {
4250 cp = &cpi->cpi_extd[0];
4251 cp->cp_eax = CPUID_LEAF_EXT_0;
4252 cpi->cpi_xmaxeax = __cpuid_insn(cp);
4253 }
4254
4255 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4256
4257 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4258 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4259
4260 switch (cpi->cpi_vendor) {
4261 case X86_VENDOR_Intel:
4262 case X86_VENDOR_AMD:
4263 case X86_VENDOR_HYGON:
4264 if (cpi->cpi_xmaxeax < 0x80000001)
4265 break;
4266 cp = &cpi->cpi_extd[1];
4267 cp->cp_eax = 0x80000001;
4268 (void) __cpuid_insn(cp);
4269
4270 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4271 cpi->cpi_family == 5 &&
4272 cpi->cpi_model == 6 &&
4273 cpi->cpi_step == 6) {
4274 /*
4275 * K6 model 6 uses bit 10 to indicate SYSC.
4276 * Later models use bit 11. Fix it here.
4277 */
4278 if (cp->cp_edx & 0x400) {
4279 cp->cp_edx &= ~0x400;
4280 cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4281 }
4282 }
4283
4284 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4285
4286 /*
4287 * Compute the additions to the kernel's feature word.
4288 */
4289 if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4290 add_x86_feature(featureset, X86FSET_NX);
4291 }
4292
4293 /*
4294 * Regardless of whether or not we boot 64-bit,
4295 * we should have a way to identify whether
4296 * the CPU is capable of running 64-bit.
4297 */
4298 if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4299 add_x86_feature(featureset, X86FSET_64);
4300 }
4301
4302 /* 1 GB large page - enable only for 64 bit kernel */
4303 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4304 add_x86_feature(featureset, X86FSET_1GPG);
4305 }
4306
4307 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4308 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4309 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4310 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4311 add_x86_feature(featureset, X86FSET_SSE4A);
4312 }
4313
4314 /*
4315 * It's really tricky to support syscall/sysret in
4316 * the i386 kernel; we rely on sysenter/sysexit
4317 * instead. In the amd64 kernel, things are -way-
4318 * better.
4319 */
4320 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4321 add_x86_feature(featureset, X86FSET_ASYSC);
4322 }
4323
4324 /*
4325 * While we're thinking about system calls, note
4326 * that AMD processors don't support sysenter
4327 * in long mode at all, so don't try to program them.
4328 */
4329 if (x86_vendor == X86_VENDOR_AMD ||
4330 x86_vendor == X86_VENDOR_HYGON) {
4331 remove_x86_feature(featureset, X86FSET_SEP);
4332 }
4333
4334 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4335 add_x86_feature(featureset, X86FSET_TSCP);
4336 }
4337
4338 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4339 add_x86_feature(featureset, X86FSET_SVM);
4340 }
4341
4342 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4343 add_x86_feature(featureset, X86FSET_TOPOEXT);
4344 }
4345
4346 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4347 add_x86_feature(featureset, X86FSET_AMD_PCEC);
4348 }
4349
4350 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4351 add_x86_feature(featureset, X86FSET_XOP);
4352 }
4353
4354 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4355 add_x86_feature(featureset, X86FSET_FMA4);
4356 }
4357
4358 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4359 add_x86_feature(featureset, X86FSET_TBM);
4360 }
4361
4362 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4363 add_x86_feature(featureset, X86FSET_MONITORX);
4364 }
4365 break;
4366 default:
4367 break;
4368 }
4369
4370 /*
4371 * Get CPUID data about processor cores and hyperthreads.
4372 */
4373 switch (cpi->cpi_vendor) {
4374 case X86_VENDOR_Intel:
4375 if (cpi->cpi_maxeax >= 4) {
4376 cp = &cpi->cpi_std[4];
4377 cp->cp_eax = 4;
4378 cp->cp_ecx = 0;
4379 (void) __cpuid_insn(cp);
4380 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4381 }
4382 /*FALLTHROUGH*/
4383 case X86_VENDOR_AMD:
4384 case X86_VENDOR_HYGON:
4385 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4386 break;
4387 cp = &cpi->cpi_extd[8];
4388 cp->cp_eax = CPUID_LEAF_EXT_8;
4389 (void) __cpuid_insn(cp);
4390 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4391 cp);
4392
4393 /*
4394 * AMD uses ebx for some extended functions.
4395 */
4396 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4397 cpi->cpi_vendor == X86_VENDOR_HYGON) {
4398 /*
4399 * While we're here, check for the AMD "Error
4400 * Pointer Zero/Restore" feature. This can be
			 * used to set up the FP save handlers
4402 * appropriately.
4403 */
4404 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4405 cpi->cpi_fp_amd_save = 0;
4406 } else {
4407 cpi->cpi_fp_amd_save = 1;
4408 }
4409
4410 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4411 add_x86_feature(featureset,
4412 X86FSET_CLZERO);
4413 }
4414 }
4415
4416 /*
4417 * Virtual and physical address limits from
4418 * cpuid override previously guessed values.
4419 */
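		/*
		 * Purely as an illustration (not a value from any particular
		 * part): an %eax of 0x3030 here would decode via BITX() below
		 * to 48 physical and 48 virtual address bits.
		 */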
4420 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4421 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4422 break;
4423 default:
4424 break;
4425 }
4426
4427 /*
4428 * Get CPUID data about TSC Invariance in Deep C-State.
4429 */
4430 switch (cpi->cpi_vendor) {
4431 case X86_VENDOR_Intel:
4432 case X86_VENDOR_AMD:
4433 case X86_VENDOR_HYGON:
		if (cpi->cpi_xmaxeax >= 0x80000007) {
4435 cp = &cpi->cpi_extd[7];
4436 cp->cp_eax = 0x80000007;
4437 cp->cp_ecx = 0;
4438 (void) __cpuid_insn(cp);
4439 }
4440 break;
4441 default:
4442 break;
4443 }
4444 }
4445
4446 /*
4447 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4448 * run and thus gathered some of its dependent leaves.
4449 */
4450 cpuid_basic_topology(cpu, featureset);
4451 cpuid_basic_thermal(cpu, featureset);
4452 #if !defined(__xpv)
4453 cpuid_basic_ppin(cpu, featureset);
4454 #endif
4455
4456 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4457 cpi->cpi_vendor == X86_VENDOR_HYGON) {
4458 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4459 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4460 /* Special handling for AMD FP not necessary. */
4461 cpi->cpi_fp_amd_save = 0;
4462 } else {
4463 cpi->cpi_fp_amd_save = 1;
4464 }
4465 }
4466
4467 /*
4468 * Check (and potentially set) if lfence is serializing.
4469 * This is useful for accurate rdtsc measurements and AMD retpolines.
4470 */
4471 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4472 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4473 is_x86_feature(featureset, X86FSET_SSE2)) {
4474 /*
4475 * The AMD white paper Software Techniques For Managing
		 * Speculation on AMD Processors details the circumstances
		 * under which lfence instructions are serializing.
4478 *
4479 * On family 0xf and 0x11, it is inherently so. On family 0x10
4480 * and later (excluding 0x11), a bit in the DE_CFG MSR
4481 * determines the lfence behavior. Per that whitepaper, AMD has
4482 * committed to supporting that MSR on all later CPUs.
4483 */
4484 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4485 add_x86_feature(featureset, X86FSET_LFENCE_SER);
4486 } else if (cpi->cpi_family >= 0x10) {
4487 #if !defined(__xpv)
4488 uint64_t val;
4489
4490 /*
4491 * Be careful when attempting to enable the bit, and
4492 * verify that it was actually set in case we are
4493 * running in a hypervisor which is less than faithful
4494 * about its emulation of this feature.
4495 */
4496 on_trap_data_t otd;
4497 if (!on_trap(&otd, OT_DATA_ACCESS)) {
4498 val = rdmsr(MSR_AMD_DE_CFG);
4499 val |= AMD_DE_CFG_LFENCE_DISPATCH;
4500 wrmsr(MSR_AMD_DE_CFG, val);
4501 val = rdmsr(MSR_AMD_DE_CFG);
4502 } else {
4503 val = 0;
4504 }
4505 no_trap();
4506
4507 if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4508 add_x86_feature(featureset, X86FSET_LFENCE_SER);
4509 }
4510 #endif
4511 }
4512 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4513 is_x86_feature(featureset, X86FSET_SSE2)) {
4514 /*
4515 * Documentation and other OSes indicate that lfence is always
4516 * serializing on Intel CPUs.
4517 */
4518 add_x86_feature(featureset, X86FSET_LFENCE_SER);
4519 }
4520
4521
4522 /*
4523 * Check the processor leaves that are used for security features. Grab
4524 * any additional processor-specific leaves that we may not have yet.
4525 */
4526 switch (cpi->cpi_vendor) {
4527 case X86_VENDOR_AMD:
4528 case X86_VENDOR_HYGON:
4529 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4530 cp = &cpi->cpi_extd[7];
4531 cp->cp_eax = CPUID_LEAF_EXT_21;
4532 cp->cp_ecx = 0;
4533 (void) __cpuid_insn(cp);
4534 }
4535 break;
4536 default:
4537 break;
4538 }
4539
4540 cpuid_scan_security(cpu, featureset);
4541 }
4542
4543 /*
4544 * Make copies of the cpuid table entries we depend on, in
4545 * part for ease of parsing now, in part so that we have only
4546 * one place to correct any of it, in part for ease of
4547 * later export to userland, and in part so we can look at
4548 * this stuff in a crash dump.
4549 */
4550
4551 static void
4552 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4553 {
4554 uint_t n, nmax;
4555 int i;
4556 struct cpuid_regs *cp;
4557 uint8_t *dp;
4558 uint32_t *iptr;
4559 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4560
4561 if (cpi->cpi_maxeax < 1)
4562 return;
4563
4564 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4565 nmax = NMAX_CPI_STD;
4566 /*
4567 * (We already handled n == 0 and n == 1 in the basic pass)
4568 */
4569 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4570 /*
4571 * leaves 6 and 7 were handled in the basic pass
4572 */
4573 if (n == 6 || n == 7)
4574 continue;
4575
4576 cp->cp_eax = n;
4577
4578 /*
4579 * CPUID function 4 expects %ecx to be initialized
4580 * with an index which indicates which cache to return
4581 * information about. The OS is expected to call function 4
4582 * with %ecx set to 0, 1, 2, ... until it returns with
4583 * EAX[4:0] set to 0, which indicates there are no more
4584 * caches.
4585 *
4586 * Here, populate cpi_std[4] with the information returned by
4587 * function 4 when %ecx == 0, and do the rest in a later pass
4588 * when dynamic memory allocation becomes available.
4589 *
4590 * Note: we need to explicitly initialize %ecx here, since
4591 * function 4 may have been previously invoked.
4592 */
4593 if (n == 4)
4594 cp->cp_ecx = 0;
4595
4596 (void) __cpuid_insn(cp);
4597 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4598 switch (n) {
4599 case 2:
4600 /*
4601 * "the lower 8 bits of the %eax register
4602 * contain a value that identifies the number
4603 * of times the cpuid [instruction] has to be
4604 * executed to obtain a complete image of the
4605 * processor's caching systems."
4606 *
4607 * How *do* they make this stuff up?
4608 */
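			/*
			 * To illustrate with a made-up value: if %eax were
			 * 0x665b5001, the low byte (0x01) says one invocation
			 * suffices, and the remaining bytes (0x50, 0x5b, 0x66)
			 * are cache/TLB descriptors that get collected below,
			 * provided bit 31 of the register is clear (bit 31 set
			 * marks a register as holding no valid descriptors).
			 */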
4609 cpi->cpi_ncache = sizeof (*cp) *
4610 BITX(cp->cp_eax, 7, 0);
4611 if (cpi->cpi_ncache == 0)
4612 break;
4613 cpi->cpi_ncache--; /* skip count byte */
4614
4615 /*
4616 * Well, for now, rather than attempt to implement
4617 * this slightly dubious algorithm, we just look
4618 * at the first 15 ..
4619 */
4620 if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4621 cpi->cpi_ncache = sizeof (*cp) - 1;
4622
4623 dp = cpi->cpi_cacheinfo;
4624 if (BITX(cp->cp_eax, 31, 31) == 0) {
4625 uint8_t *p = (void *)&cp->cp_eax;
4626 for (i = 1; i < 4; i++)
4627 if (p[i] != 0)
4628 *dp++ = p[i];
4629 }
4630 if (BITX(cp->cp_ebx, 31, 31) == 0) {
4631 uint8_t *p = (void *)&cp->cp_ebx;
4632 for (i = 0; i < 4; i++)
4633 if (p[i] != 0)
4634 *dp++ = p[i];
4635 }
4636 if (BITX(cp->cp_ecx, 31, 31) == 0) {
4637 uint8_t *p = (void *)&cp->cp_ecx;
4638 for (i = 0; i < 4; i++)
4639 if (p[i] != 0)
4640 *dp++ = p[i];
4641 }
4642 if (BITX(cp->cp_edx, 31, 31) == 0) {
4643 uint8_t *p = (void *)&cp->cp_edx;
4644 for (i = 0; i < 4; i++)
4645 if (p[i] != 0)
4646 *dp++ = p[i];
4647 }
4648 break;
4649
4650 case 3: /* Processor serial number, if PSN supported */
4651 break;
4652
4653 case 4: /* Deterministic cache parameters */
4654 break;
4655
4656 case 5: /* Monitor/Mwait parameters */
4657 {
4658 size_t mwait_size;
4659
4660 /*
4661 * check cpi_mwait.support which was set in
4662 * cpuid_pass_basic()
4663 */
4664 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4665 break;
4666
4667 /*
			 * Protect ourselves from an insane mwait line size.
4669 * Workaround for incomplete hardware emulator(s).
4670 */
4671 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4672 if (mwait_size < sizeof (uint32_t) ||
4673 !ISP2(mwait_size)) {
4674 #if DEBUG
4675 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4676 "size %ld", cpu->cpu_id, (long)mwait_size);
4677 #endif
4678 break;
4679 }
4680
4681 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4682 cpi->cpi_mwait.mon_max = mwait_size;
4683 if (MWAIT_EXTENSION(cpi)) {
4684 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4685 if (MWAIT_INT_ENABLE(cpi))
4686 cpi->cpi_mwait.support |=
4687 MWAIT_ECX_INT_ENABLE;
4688 }
4689 break;
4690 }
4691 default:
4692 break;
4693 }
4694 }
4695
4696 /*
4697 * XSAVE enumeration
4698 */
4699 if (cpi->cpi_maxeax >= 0xD) {
4700 struct cpuid_regs regs;
4701 boolean_t cpuid_d_valid = B_TRUE;
4702
		cp = &regs;
4704 cp->cp_eax = 0xD;
4705 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4706
4707 (void) __cpuid_insn(cp);
4708
4709 /*
4710 * Sanity checks for debug
4711 */
4712 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4713 (cp->cp_eax & XFEATURE_SSE) == 0) {
4714 cpuid_d_valid = B_FALSE;
4715 }
4716
4717 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4718 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4719 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4720
4721 /*
4722 * If the hw supports AVX, get the size and offset in the save
4723 * area for the ymm state.
4724 */
4725 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4726 cp->cp_eax = 0xD;
4727 cp->cp_ecx = 2;
4728 cp->cp_edx = cp->cp_ebx = 0;
4729
4730 (void) __cpuid_insn(cp);
4731
4732 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4733 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4734 cpuid_d_valid = B_FALSE;
4735 }
4736
4737 cpi->cpi_xsave.ymm_size = cp->cp_eax;
4738 cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4739 }
4740
4741 /*
4742 * If the hw supports MPX, get the size and offset in the
4743 * save area for BNDREGS and BNDCSR.
4744 */
4745 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4746 cp->cp_eax = 0xD;
4747 cp->cp_ecx = 3;
4748 cp->cp_edx = cp->cp_ebx = 0;
4749
4750 (void) __cpuid_insn(cp);
4751
4752 cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4753 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4754
4755 cp->cp_eax = 0xD;
4756 cp->cp_ecx = 4;
4757 cp->cp_edx = cp->cp_ebx = 0;
4758
4759 (void) __cpuid_insn(cp);
4760
4761 cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4762 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4763 }
4764
4765 /*
4766 * If the hw supports AVX512, get the size and offset in the
4767 * save area for the opmask registers and zmm state.
4768 */
4769 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4770 cp->cp_eax = 0xD;
4771 cp->cp_ecx = 5;
4772 cp->cp_edx = cp->cp_ebx = 0;
4773
4774 (void) __cpuid_insn(cp);
4775
4776 cpi->cpi_xsave.opmask_size = cp->cp_eax;
4777 cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4778
4779 cp->cp_eax = 0xD;
4780 cp->cp_ecx = 6;
4781 cp->cp_edx = cp->cp_ebx = 0;
4782
4783 (void) __cpuid_insn(cp);
4784
4785 cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4786 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4787
4788 cp->cp_eax = 0xD;
4789 cp->cp_ecx = 7;
4790 cp->cp_edx = cp->cp_ebx = 0;
4791
4792 (void) __cpuid_insn(cp);
4793
4794 cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4795 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4796 }
4797
		if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4799 xsave_state_size = 0;
4800 } else if (cpuid_d_valid) {
4801 xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4802 } else {
4803 /* Broken CPUID 0xD, probably in HVM */
4804 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4805 "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4806 ", ymm_size = %d, ymm_offset = %d\n",
4807 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4808 cpi->cpi_xsave.xsav_hw_features_high,
4809 (int)cpi->cpi_xsave.xsav_max_size,
4810 (int)cpi->cpi_xsave.ymm_size,
4811 (int)cpi->cpi_xsave.ymm_offset);
4812
4813 if (xsave_state_size != 0) {
4814 /*
4815 * This must be a non-boot CPU. We cannot
4816 * continue, because boot cpu has already
4817 * enabled XSAVE.
4818 */
4819 ASSERT(cpu->cpu_id != 0);
4820 cmn_err(CE_PANIC, "cpu%d: we have already "
4821 "enabled XSAVE on boot cpu, cannot "
4822 "continue.", cpu->cpu_id);
4823 } else {
4824 /*
4825 * If we reached here on the boot CPU, it's also
4826 * almost certain that we'll reach here on the
4827 * non-boot CPUs. When we're here on a boot CPU
				 * we should disable the feature; on a non-boot
				 * CPU we need to confirm that we already have.
4830 */
4831 if (cpu->cpu_id == 0) {
4832 remove_x86_feature(x86_featureset,
4833 X86FSET_XSAVE);
4834 remove_x86_feature(x86_featureset,
4835 X86FSET_AVX);
4836 remove_x86_feature(x86_featureset,
4837 X86FSET_F16C);
4838 remove_x86_feature(x86_featureset,
4839 X86FSET_BMI1);
4840 remove_x86_feature(x86_featureset,
4841 X86FSET_BMI2);
4842 remove_x86_feature(x86_featureset,
4843 X86FSET_FMA);
4844 remove_x86_feature(x86_featureset,
4845 X86FSET_AVX2);
4846 remove_x86_feature(x86_featureset,
4847 X86FSET_MPX);
4848 remove_x86_feature(x86_featureset,
4849 X86FSET_AVX512F);
4850 remove_x86_feature(x86_featureset,
4851 X86FSET_AVX512DQ);
4852 remove_x86_feature(x86_featureset,
4853 X86FSET_AVX512PF);
4854 remove_x86_feature(x86_featureset,
4855 X86FSET_AVX512ER);
4856 remove_x86_feature(x86_featureset,
4857 X86FSET_AVX512CD);
4858 remove_x86_feature(x86_featureset,
4859 X86FSET_AVX512BW);
4860 remove_x86_feature(x86_featureset,
4861 X86FSET_AVX512VL);
4862 remove_x86_feature(x86_featureset,
4863 X86FSET_AVX512FMA);
4864 remove_x86_feature(x86_featureset,
4865 X86FSET_AVX512VBMI);
4866 remove_x86_feature(x86_featureset,
4867 X86FSET_AVX512VNNI);
4868 remove_x86_feature(x86_featureset,
4869 X86FSET_AVX512VPOPCDQ);
4870 remove_x86_feature(x86_featureset,
4871 X86FSET_AVX512NNIW);
4872 remove_x86_feature(x86_featureset,
4873 X86FSET_AVX512FMAPS);
4874 remove_x86_feature(x86_featureset,
4875 X86FSET_VAES);
4876 remove_x86_feature(x86_featureset,
4877 X86FSET_VPCLMULQDQ);
4878 remove_x86_feature(x86_featureset,
4879 X86FSET_GFNI);
4880 remove_x86_feature(x86_featureset,
4881 X86FSET_AVX512_VP2INT);
4882 remove_x86_feature(x86_featureset,
4883 X86FSET_AVX512_BITALG);
4884 remove_x86_feature(x86_featureset,
4885 X86FSET_AVX512_VBMI2);
4886 remove_x86_feature(x86_featureset,
4887 X86FSET_AVX512_BF16);
4888
4889 xsave_force_disable = B_TRUE;
4890 } else {
4891 VERIFY(is_x86_feature(x86_featureset,
4892 X86FSET_XSAVE) == B_FALSE);
4893 }
4894 }
4895 }
4896 }
4897
4898
4899 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4900 return;
4901
4902 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4903 nmax = NMAX_CPI_EXTD;
4904 /*
	 * Copy the extended properties, fixing them as we go. We start at 2
	 * because a few leaves were already handled in the basic pass; the
	 * rest we simply fetch again here (e.g. 0x8, 0x21).
4908 */
4909 iptr = (void *)cpi->cpi_brandstr;
4910 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4911 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4912 (void) __cpuid_insn(cp);
4913 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4914 cp);
4915 switch (n) {
4916 case 2:
4917 case 3:
4918 case 4:
4919 /*
4920 * Extract the brand string
4921 */
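			/*
			 * Each of these three leaves contributes 16 bytes
			 * (%eax, %ebx, %ecx, %edx in that order), yielding a
			 * 48-byte human-readable name, e.g. (hypothetically)
			 * "AMD EPYC 7402P 24-Core Processor".
			 */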
4922 *iptr++ = cp->cp_eax;
4923 *iptr++ = cp->cp_ebx;
4924 *iptr++ = cp->cp_ecx;
4925 *iptr++ = cp->cp_edx;
4926 break;
4927 case 5:
4928 switch (cpi->cpi_vendor) {
4929 case X86_VENDOR_AMD:
4930 /*
4931 * The Athlon and Duron were the first
4932 * parts to report the sizes of the
4933 * TLB for large pages. Before then,
4934 * we don't trust the data.
4935 */
4936 if (cpi->cpi_family < 6 ||
4937 (cpi->cpi_family == 6 &&
4938 cpi->cpi_model < 1))
4939 cp->cp_eax = 0;
4940 break;
4941 default:
4942 break;
4943 }
4944 break;
4945 case 6:
4946 switch (cpi->cpi_vendor) {
4947 case X86_VENDOR_AMD:
4948 /*
4949 * The Athlon and Duron were the first
4950 * AMD parts with L2 TLB's.
4951 * Before then, don't trust the data.
4952 */
4953 if (cpi->cpi_family < 6 ||
4954 (cpi->cpi_family == 6 &&
4955 cpi->cpi_model < 1))
4956 cp->cp_eax = cp->cp_ebx = 0;
4957 /*
4958 * AMD Duron rev A0 reports L2
4959 * cache size incorrectly as 1K
4960 * when it is really 64K
4961 */
4962 if (cpi->cpi_family == 6 &&
4963 cpi->cpi_model == 3 &&
4964 cpi->cpi_step == 0) {
4965 cp->cp_ecx &= 0xffff;
4966 cp->cp_ecx |= 0x400000;
4967 }
4968 break;
4969 case X86_VENDOR_Cyrix: /* VIA C3 */
4970 /*
4971 * VIA C3 processors are a bit messed
4972 * up w.r.t. encoding cache sizes in %ecx
4973 */
4974 if (cpi->cpi_family != 6)
4975 break;
4976 /*
				 * models 7 and 8 were incorrectly encoded
4978 *
4979 * xxx is model 8 really broken?
4980 */
4981 if (cpi->cpi_model == 7 ||
4982 cpi->cpi_model == 8)
4983 cp->cp_ecx =
4984 BITX(cp->cp_ecx, 31, 24) << 16 |
4985 BITX(cp->cp_ecx, 23, 16) << 12 |
4986 BITX(cp->cp_ecx, 15, 8) << 8 |
4987 BITX(cp->cp_ecx, 7, 0);
4988 /*
4989 * model 9 stepping 1 has wrong associativity
4990 */
4991 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4992 cp->cp_ecx |= 8 << 12;
4993 break;
4994 case X86_VENDOR_Intel:
4995 /*
4996 * Extended L2 Cache features function.
4997 * First appeared on Prescott.
4998 */
4999 default:
5000 break;
5001 }
5002 break;
5003 default:
5004 break;
5005 }
5006 }
5007 }
5008
5009 static const char *
5010 intel_cpubrand(const struct cpuid_info *cpi)
5011 {
5012 int i;
5013
5014 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5015
5016 switch (cpi->cpi_family) {
5017 case 5:
5018 return ("Intel Pentium(r)");
5019 case 6:
5020 switch (cpi->cpi_model) {
5021 uint_t celeron, xeon;
5022 const struct cpuid_regs *cp;
5023 case 0:
5024 case 1:
5025 case 2:
5026 return ("Intel Pentium(r) Pro");
5027 case 3:
5028 case 4:
5029 return ("Intel Pentium(r) II");
5030 case 6:
5031 return ("Intel Celeron(r)");
5032 case 5:
5033 case 7:
5034 celeron = xeon = 0;
5035 cp = &cpi->cpi_std[2]; /* cache info */
5036
5037 for (i = 1; i < 4; i++) {
5038 uint_t tmp;
5039
5040 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5041 if (tmp == 0x40)
5042 celeron++;
5043 if (tmp >= 0x44 && tmp <= 0x45)
5044 xeon++;
5045 }
5046
5047 for (i = 0; i < 2; i++) {
5048 uint_t tmp;
5049
5050 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5051 if (tmp == 0x40)
5052 celeron++;
5053 else if (tmp >= 0x44 && tmp <= 0x45)
5054 xeon++;
5055 }
5056
5057 for (i = 0; i < 4; i++) {
5058 uint_t tmp;
5059
5060 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5061 if (tmp == 0x40)
5062 celeron++;
5063 else if (tmp >= 0x44 && tmp <= 0x45)
5064 xeon++;
5065 }
5066
5067 for (i = 0; i < 4; i++) {
5068 uint_t tmp;
5069
5070 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5071 if (tmp == 0x40)
5072 celeron++;
5073 else if (tmp >= 0x44 && tmp <= 0x45)
5074 xeon++;
5075 }
5076
5077 if (celeron)
5078 return ("Intel Celeron(r)");
5079 if (xeon)
5080 return (cpi->cpi_model == 5 ?
5081 "Intel Pentium(r) II Xeon(tm)" :
5082 "Intel Pentium(r) III Xeon(tm)");
5083 return (cpi->cpi_model == 5 ?
5084 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5085 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5086 default:
5087 break;
5088 }
5089 default:
5090 break;
5091 }
5092
5093 /* BrandID is present if the field is nonzero */
5094 if (cpi->cpi_brandid != 0) {
5095 static const struct {
5096 uint_t bt_bid;
5097 const char *bt_str;
5098 } brand_tbl[] = {
5099 { 0x1, "Intel(r) Celeron(r)" },
5100 { 0x2, "Intel(r) Pentium(r) III" },
5101 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" },
5102 { 0x4, "Intel(r) Pentium(r) III" },
5103 { 0x6, "Mobile Intel(r) Pentium(r) III" },
5104 { 0x7, "Mobile Intel(r) Celeron(r)" },
5105 { 0x8, "Intel(r) Pentium(r) 4" },
5106 { 0x9, "Intel(r) Pentium(r) 4" },
5107 { 0xa, "Intel(r) Celeron(r)" },
5108 { 0xb, "Intel(r) Xeon(tm)" },
5109 { 0xc, "Intel(r) Xeon(tm) MP" },
5110 { 0xe, "Mobile Intel(r) Pentium(r) 4" },
5111 { 0xf, "Mobile Intel(r) Celeron(r)" },
5112 { 0x11, "Mobile Genuine Intel(r)" },
5113 { 0x12, "Intel(r) Celeron(r) M" },
5114 { 0x13, "Mobile Intel(r) Celeron(r)" },
5115 { 0x14, "Intel(r) Celeron(r)" },
5116 { 0x15, "Mobile Genuine Intel(r)" },
5117 { 0x16, "Intel(r) Pentium(r) M" },
5118 { 0x17, "Mobile Intel(r) Celeron(r)" }
5119 };
5120 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5121 uint_t sgn;
5122
5123 sgn = (cpi->cpi_family << 8) |
5124 (cpi->cpi_model << 4) | cpi->cpi_step;
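		/*
		 * For example, family 6, model 0xb, stepping 1 packs into a
		 * signature of 0x6b1, which is the special case checked
		 * against brand id 3 below.
		 */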
5125
5126 for (i = 0; i < btblmax; i++)
5127 if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5128 break;
5129 if (i < btblmax) {
5130 if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5131 return ("Intel(r) Celeron(r)");
5132 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5133 return ("Intel(r) Xeon(tm) MP");
5134 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5135 return ("Intel(r) Xeon(tm)");
5136 return (brand_tbl[i].bt_str);
5137 }
5138 }
5139
5140 return (NULL);
5141 }
5142
5143 static const char *
5144 amd_cpubrand(const struct cpuid_info *cpi)
5145 {
5146 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5147
5148 switch (cpi->cpi_family) {
5149 case 5:
5150 switch (cpi->cpi_model) {
5151 case 0:
5152 case 1:
5153 case 2:
5154 case 3:
5155 case 4:
5156 case 5:
5157 return ("AMD-K5(r)");
5158 case 6:
5159 case 7:
5160 return ("AMD-K6(r)");
5161 case 8:
5162 return ("AMD-K6(r)-2");
5163 case 9:
5164 return ("AMD-K6(r)-III");
5165 default:
5166 return ("AMD (family 5)");
5167 }
5168 case 6:
5169 switch (cpi->cpi_model) {
5170 case 1:
5171 return ("AMD-K7(tm)");
5172 case 0:
5173 case 2:
5174 case 4:
5175 return ("AMD Athlon(tm)");
5176 case 3:
5177 case 7:
5178 return ("AMD Duron(tm)");
5179 case 6:
5180 case 8:
5181 case 10:
5182 /*
5183 * Use the L2 cache size to distinguish
5184 */
5185 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5186 "AMD Athlon(tm)" : "AMD Duron(tm)");
5187 default:
5188 return ("AMD (family 6)");
5189 }
5190 default:
5191 break;
5192 }
5193
5194 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5195 cpi->cpi_brandid != 0) {
5196 switch (BITX(cpi->cpi_brandid, 7, 5)) {
5197 case 3:
5198 return ("AMD Opteron(tm) UP 1xx");
5199 case 4:
5200 return ("AMD Opteron(tm) DP 2xx");
5201 case 5:
5202 return ("AMD Opteron(tm) MP 8xx");
5203 default:
5204 return ("AMD Opteron(tm)");
5205 }
5206 }
5207
5208 return (NULL);
5209 }
5210
5211 static const char *
5212 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5213 {
5214 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5215
5216 switch (type) {
5217 case X86_TYPE_CYRIX_6x86:
5218 return ("Cyrix 6x86");
5219 case X86_TYPE_CYRIX_6x86L:
5220 return ("Cyrix 6x86L");
5221 case X86_TYPE_CYRIX_6x86MX:
5222 return ("Cyrix 6x86MX");
5223 case X86_TYPE_CYRIX_GXm:
5224 return ("Cyrix GXm");
5225 case X86_TYPE_CYRIX_MediaGX:
5226 return ("Cyrix MediaGX");
5227 case X86_TYPE_CYRIX_MII:
5228 return ("Cyrix M2");
5229 case X86_TYPE_VIA_CYRIX_III:
5230 return ("VIA Cyrix M3");
5231 default:
5232 /*
5233 * Have another wild guess ..
5234 */
5235 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5236 return ("Cyrix 5x86");
5237 else if (cpi->cpi_family == 5) {
5238 switch (cpi->cpi_model) {
5239 case 2:
5240 return ("Cyrix 6x86"); /* Cyrix M1 */
5241 case 4:
5242 return ("Cyrix MediaGX");
5243 default:
5244 break;
5245 }
5246 } else if (cpi->cpi_family == 6) {
5247 switch (cpi->cpi_model) {
5248 case 0:
5249 return ("Cyrix 6x86MX"); /* Cyrix M2? */
5250 case 5:
5251 case 6:
5252 case 7:
5253 case 8:
5254 case 9:
5255 return ("VIA C3");
5256 default:
5257 break;
5258 }
5259 }
5260 break;
5261 }
5262 return (NULL);
5263 }
5264
5265 /*
5266 * This only gets called in the case that the CPU extended
 * feature brand string leaves (0x80000002, 0x80000003, 0x80000004)
5268 * aren't available, or contain null bytes for some reason.
5269 */
5270 static void
5271 fabricate_brandstr(struct cpuid_info *cpi)
5272 {
5273 const char *brand = NULL;
5274
5275 switch (cpi->cpi_vendor) {
5276 case X86_VENDOR_Intel:
5277 brand = intel_cpubrand(cpi);
5278 break;
5279 case X86_VENDOR_AMD:
5280 brand = amd_cpubrand(cpi);
5281 break;
5282 case X86_VENDOR_Cyrix:
5283 brand = cyrix_cpubrand(cpi, x86_type);
5284 break;
5285 case X86_VENDOR_NexGen:
5286 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5287 brand = "NexGen Nx586";
5288 break;
5289 case X86_VENDOR_Centaur:
5290 if (cpi->cpi_family == 5)
5291 switch (cpi->cpi_model) {
5292 case 4:
5293 brand = "Centaur C6";
5294 break;
5295 case 8:
5296 brand = "Centaur C2";
5297 break;
5298 case 9:
5299 brand = "Centaur C3";
5300 break;
5301 default:
5302 break;
5303 }
5304 break;
5305 case X86_VENDOR_Rise:
5306 if (cpi->cpi_family == 5 &&
5307 (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5308 brand = "Rise mP6";
5309 break;
5310 case X86_VENDOR_SiS:
5311 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5312 brand = "SiS 55x";
5313 break;
5314 case X86_VENDOR_TM:
5315 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5316 brand = "Transmeta Crusoe TM3x00 or TM5x00";
5317 break;
5318 case X86_VENDOR_NSC:
5319 case X86_VENDOR_UMC:
5320 default:
5321 break;
5322 }
5323 if (brand) {
5324 (void) strcpy((char *)cpi->cpi_brandstr, brand);
5325 return;
5326 }
5327
5328 /*
5329 * If all else fails ...
5330 */
5331 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5332 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5333 cpi->cpi_model, cpi->cpi_step);
5334 }
5335
5336 /*
5337 * This routine is called just after kernel memory allocation
5338 * becomes available on cpu0, and as part of mp_startup() on
5339 * the other cpus.
5340 *
 * Fix up the brand string, and collect any information from cpuid
5342 * that requires dynamically allocated storage to represent.
5343 */
5344
5345 static void
5346 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5347 {
5348 int i, max, shft, level, size;
5349 struct cpuid_regs regs;
5350 struct cpuid_regs *cp;
5351 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5352
5353 /*
5354 * Deterministic cache parameters
5355 *
5356 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5357 * values that are present are currently defined to be the same. This
5358 * means we can use the same logic to parse it as long as we use the
5359 * appropriate leaf to get the data. If you're updating this, make sure
5360 * you're careful about which vendor supports which aspect.
5361 *
5362 * Take this opportunity to detect the number of threads sharing the
5363 * last level cache, and construct a corresponding cache id. The
5364 * respective cpuid_info members are initialized to the default case of
5365 * "no last level cache sharing".
5366 */
5367 cpi->cpi_ncpu_shr_last_cache = 1;
5368 cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5369
5370 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5371 ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5372 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5373 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5374 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5375 uint32_t leaf;
5376
5377 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5378 leaf = 4;
5379 } else {
5380 leaf = CPUID_LEAF_EXT_1d;
5381 }
5382
5383 /*
5384 * Find the # of elements (size) returned by the leaf and along
5385 * the way detect last level cache sharing details.
5386 */
		bzero(&regs, sizeof (regs));
		cp = &regs;
5389 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5390 cp->cp_eax = leaf;
5391 cp->cp_ecx = i;
5392
5393 (void) __cpuid_insn(cp);
5394
5395 if (CPI_CACHE_TYPE(cp) == 0)
5396 break;
5397 level = CPI_CACHE_LVL(cp);
5398 if (level > max) {
5399 max = level;
5400 cpi->cpi_ncpu_shr_last_cache =
5401 CPI_NTHR_SHR_CACHE(cp) + 1;
5402 }
5403 }
5404 cpi->cpi_cache_leaf_size = size = i;
5405
5406 /*
5407 * Allocate the cpi_cache_leaves array. The first element
5408 * references the regs for the corresponding leaf with %ecx set
5409 * to 0. This was gathered in cpuid_pass_extended().
5410 */
5411 if (size > 0) {
5412 cpi->cpi_cache_leaves =
5413 kmem_alloc(size * sizeof (cp), KM_SLEEP);
5414 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5415 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5416 } else {
5417 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5418 }
5419
5420 /*
5421 * Allocate storage to hold the additional regs
5422 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5423 *
5424 * The regs for the leaf, %ecx == 0 has already
5425 * been allocated as indicated above.
5426 */
5427 for (i = 1; i < size; i++) {
5428 cp = cpi->cpi_cache_leaves[i] =
5429 kmem_zalloc(sizeof (regs), KM_SLEEP);
5430 cp->cp_eax = leaf;
5431 cp->cp_ecx = i;
5432
5433 (void) __cpuid_insn(cp);
5434 }
5435 }
5436 /*
5437 * Determine the number of bits needed to represent
5438 * the number of CPUs sharing the last level cache.
5439 *
5440 * Shift off that number of bits from the APIC id to
5441 * derive the cache id.
5442 */
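		/*
		 * Illustrative numbers only: if 8 hardware threads shared the
		 * last level cache, the loop below would compute shft = 3,
		 * and an APIC id of 0x15 would then map to cache id 0x2.
		 */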
5443 shft = 0;
5444 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5445 shft++;
5446 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5447 }
5448
5449 /*
5450 * Now fixup the brand string
5451 */
5452 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5453 fabricate_brandstr(cpi);
5454 } else {
5455
5456 /*
5457 * If we successfully extracted a brand string from the cpuid
5458 * instruction, clean it up by removing leading spaces and
5459 * similar junk.
5460 */
5461 if (cpi->cpi_brandstr[0]) {
5462 size_t maxlen = sizeof (cpi->cpi_brandstr);
5463 char *src, *dst;
5464
5465 dst = src = (char *)cpi->cpi_brandstr;
5466 src[maxlen - 1] = '\0';
5467 /*
5468 * strip leading spaces
5469 */
5470 while (*src == ' ')
5471 src++;
5472 /*
			 * Remove any "Genuine" or "Authentic" prefixes
5474 */
5475 if (strncmp(src, "Genuine ", 8) == 0)
5476 src += 8;
5477 if (strncmp(src, "Authentic ", 10) == 0)
5478 src += 10;
5479
5480 /*
5481 * Now do an in-place copy.
5482 * Map (R) to (r) and (TM) to (tm).
5483 * The era of teletypes is long gone, and there's
5484 * -really- no need to shout.
5485 */
5486 while (*src != '\0') {
5487 if (src[0] == '(') {
5488 if (strncmp(src + 1, "R)", 2) == 0) {
5489 (void) strncpy(dst, "(r)", 3);
5490 src += 3;
5491 dst += 3;
5492 continue;
5493 }
5494 if (strncmp(src + 1, "TM)", 3) == 0) {
5495 (void) strncpy(dst, "(tm)", 4);
5496 src += 4;
5497 dst += 4;
5498 continue;
5499 }
5500 }
5501 *dst++ = *src++;
5502 }
5503 *dst = '\0';
5504
5505 /*
5506 * Finally, remove any trailing spaces
5507 */
5508 while (--dst > cpi->cpi_brandstr)
5509 if (*dst == ' ')
5510 *dst = '\0';
5511 else
5512 break;
5513 } else
5514 fabricate_brandstr(cpi);
5515 }
5516 }
5517
5518 typedef struct {
5519 uint32_t avm_av;
5520 uint32_t avm_feat;
5521 } av_feat_map_t;
5522
5523 /*
 * These arrays are used to map aux vector capabilities that we should add
 * based on x86 features that are present. As a large number depend on kernel
 * features, rather than rechecking and clearing CPUID everywhere, we simply
 * map these. There is an array of these for each hwcap word. Some features
 * aren't tracked in the kernel x86 featureset and that's OK; they simply will
 * not show up here.
5529 */
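/*
 * As an example of how these tables are consumed: if X86FSET_SSE2 is present
 * in x86_featureset, the mapping loop in cpuid_pass_resolve() adds AV_386_SSE2
 * to the first hwcap word handed back to bind_hwcap().
 */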
5530 static const av_feat_map_t x86fset_to_av1[] = {
5531 { AV_386_CX8, X86FSET_CX8 },
5532 { AV_386_SEP, X86FSET_SEP },
5533 { AV_386_AMD_SYSC, X86FSET_ASYSC },
5534 { AV_386_CMOV, X86FSET_CMOV },
5535 { AV_386_FXSR, X86FSET_SSE },
5536 { AV_386_SSE, X86FSET_SSE },
5537 { AV_386_SSE2, X86FSET_SSE2 },
5538 { AV_386_SSE3, X86FSET_SSE3 },
5539 { AV_386_CX16, X86FSET_CX16 },
5540 { AV_386_TSCP, X86FSET_TSCP },
5541 { AV_386_AMD_SSE4A, X86FSET_SSE4A },
5542 { AV_386_SSSE3, X86FSET_SSSE3 },
5543 { AV_386_SSE4_1, X86FSET_SSE4_1 },
5544 { AV_386_SSE4_2, X86FSET_SSE4_2 },
5545 { AV_386_AES, X86FSET_AES },
5546 { AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5547 { AV_386_XSAVE, X86FSET_XSAVE },
5548 { AV_386_AVX, X86FSET_AVX },
5549 { AV_386_VMX, X86FSET_VMX },
5550 { AV_386_AMD_SVM, X86FSET_SVM }
5551 };
5552
5553 static const av_feat_map_t x86fset_to_av2[] = {
5554 { AV_386_2_F16C, X86FSET_F16C },
5555 { AV_386_2_RDRAND, X86FSET_RDRAND },
5556 { AV_386_2_BMI1, X86FSET_BMI1 },
5557 { AV_386_2_BMI2, X86FSET_BMI2 },
5558 { AV_386_2_FMA, X86FSET_FMA },
5559 { AV_386_2_AVX2, X86FSET_AVX2 },
5560 { AV_386_2_ADX, X86FSET_ADX },
5561 { AV_386_2_RDSEED, X86FSET_RDSEED },
5562 { AV_386_2_AVX512F, X86FSET_AVX512F },
5563 { AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5564 { AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5565 { AV_386_2_AVX512PF, X86FSET_AVX512PF },
5566 { AV_386_2_AVX512ER, X86FSET_AVX512ER },
5567 { AV_386_2_AVX512CD, X86FSET_AVX512CD },
5568 { AV_386_2_AVX512BW, X86FSET_AVX512BW },
5569 { AV_386_2_AVX512VL, X86FSET_AVX512VL },
5570 { AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5571 { AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5572 { AV_386_2_SHA, X86FSET_SHA },
5573 { AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5574 { AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5575 { AV_386_2_CLWB, X86FSET_CLWB },
5576 { AV_386_2_MONITORX, X86FSET_MONITORX },
5577 { AV_386_2_CLZERO, X86FSET_CLZERO },
5578 { AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5579 { AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5580 { AV_386_2_VAES, X86FSET_VAES },
5581 { AV_386_2_GFNI, X86FSET_GFNI },
5582 { AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5583 { AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5584 };
5585
5586 static const av_feat_map_t x86fset_to_av3[] = {
5587 { AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5588 { AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5589 };
5590
5591 /*
5592 * This routine is called out of bind_hwcap() much later in the life
5593 * of the kernel (post_startup()). The job of this routine is to resolve
5594 * the hardware feature support and kernel support for those features into
5595 * what we're actually going to tell applications via the aux vector.
5596 *
 * Most of the aux vector is derived from the x86_featureset array, where a
 * given feature indicates that an aux vector flag should be plumbed through.
 * This allows the kernel to use one tracking mechanism for these based on
 * whether or not it has the required hardware support (most often xsave). Most
 * newer features are added there in case we need them in the kernel. Otherwise,
 * features are evaluated by looking at the cpuid bits that remain. If you find
 * yourself wanting to clear out cpuid features for some reason, that should
 * instead be driven by the feature set so we have a consistent view.
5605 */
5606
5607 static void
5608 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5609 {
5610 uint_t *hwcap_out = (uint_t *)arg;
5611 struct cpuid_info *cpi;
5612 uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5613
5614 cpi = cpu->cpu_m.mcpu_cpi;
5615
5616 for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5617 if (is_x86_feature(x86_featureset,
5618 x86fset_to_av1[i].avm_feat)) {
5619 hwcap_flags |= x86fset_to_av1[i].avm_av;
5620 }
5621 }
5622
5623 for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5624 if (is_x86_feature(x86_featureset,
5625 x86fset_to_av2[i].avm_feat)) {
5626 hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5627 }
5628 }
5629
5630 for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5631 if (is_x86_feature(x86_featureset,
5632 x86fset_to_av3[i].avm_feat)) {
5633 hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
5634 }
5635 }
5636
5637 /*
5638 * From here on out we're working through features that don't have
5639 * corresponding kernel feature flags for various reasons that are
5640 * mostly just due to the historical implementation.
5641 */
5642 if (cpi->cpi_maxeax >= 1) {
5643 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5644 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5645
5646 *edx = CPI_FEATURES_EDX(cpi);
5647 *ecx = CPI_FEATURES_ECX(cpi);
5648
5649 /*
5650 * [no explicit support required beyond x87 fp context]
5651 */
5652 if (!fpu_exists)
5653 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5654
5655 /*
5656 * Now map the supported feature vector to things that we
5657 * think userland will care about.
5658 */
5659 if (*ecx & CPUID_INTC_ECX_MOVBE)
5660 hwcap_flags |= AV_386_MOVBE;
5661
5662 if (*ecx & CPUID_INTC_ECX_POPCNT)
5663 hwcap_flags |= AV_386_POPCNT;
5664 if (*edx & CPUID_INTC_EDX_FPU)
5665 hwcap_flags |= AV_386_FPU;
5666 if (*edx & CPUID_INTC_EDX_MMX)
5667 hwcap_flags |= AV_386_MMX;
5668 if (*edx & CPUID_INTC_EDX_TSC)
5669 hwcap_flags |= AV_386_TSC;
5670 }
5671
5672 /*
5673 * Check a few miscellaneous features.
5674 */
5675 if (cpi->cpi_xmaxeax < 0x80000001)
5676 goto resolve_done;
5677
5678 switch (cpi->cpi_vendor) {
5679 uint32_t *edx, *ecx;
5680
5681 case X86_VENDOR_Intel:
5682 /*
		 * Seems like Intel duplicated what was necessary
		 * here to make the initial crop of 64-bit OSes work.
5685 * Hopefully, those are the only "extended" bits
5686 * they'll add.
5687 */
5688 /*FALLTHROUGH*/
5689
5690 case X86_VENDOR_AMD:
5691 case X86_VENDOR_HYGON:
5692 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5693 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5694
5695 *edx = CPI_FEATURES_XTD_EDX(cpi);
5696 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5697
5698 /*
5699 * [no explicit support required beyond
5700 * x87 fp context and exception handlers]
5701 */
5702 if (!fpu_exists)
5703 *edx &= ~(CPUID_AMD_EDX_MMXamd |
5704 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5705
5706 /*
5707 * Now map the supported feature vector to
5708 * things that we think userland will care about.
5709 */
5710 if (*edx & CPUID_AMD_EDX_MMXamd)
5711 hwcap_flags |= AV_386_AMD_MMX;
5712 if (*edx & CPUID_AMD_EDX_3DNow)
5713 hwcap_flags |= AV_386_AMD_3DNow;
5714 if (*edx & CPUID_AMD_EDX_3DNowx)
5715 hwcap_flags |= AV_386_AMD_3DNowx;
5716
5717 switch (cpi->cpi_vendor) {
5718 case X86_VENDOR_AMD:
5719 case X86_VENDOR_HYGON:
5720 if (*ecx & CPUID_AMD_ECX_AHF64)
5721 hwcap_flags |= AV_386_AHF;
5722 if (*ecx & CPUID_AMD_ECX_LZCNT)
5723 hwcap_flags |= AV_386_AMD_LZCNT;
5724 break;
5725
5726 case X86_VENDOR_Intel:
5727 if (*ecx & CPUID_AMD_ECX_LZCNT)
5728 hwcap_flags |= AV_386_AMD_LZCNT;
5729 /*
5730 * Aarrgh.
5731 * Intel uses a different bit in the same word.
5732 */
5733 if (*ecx & CPUID_INTC_ECX_AHF64)
5734 hwcap_flags |= AV_386_AHF;
5735 break;
5736 default:
5737 break;
5738 }
5739 break;
5740
5741 default:
5742 break;
5743 }
5744
5745 resolve_done:
5746 if (hwcap_out != NULL) {
5747 hwcap_out[0] = hwcap_flags;
5748 hwcap_out[1] = hwcap_flags_2;
5749 hwcap_out[2] = hwcap_flags_3;
5750 }
5751 }
5752
5753
5754 /*
5755 * Simulate the cpuid instruction using the data we previously
5756 * captured about this CPU. We try our best to return the truth
5757 * about the hardware, independently of kernel support.
5758 */
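/*
 * A minimal usage sketch (values hypothetical): to read the cached copy of
 * the first brand-string leaf on the current CPU once the dynamic pass has
 * run, a caller could do
 *
 *	struct cpuid_regs cr = { 0 };
 *	cr.cp_eax = 0x80000002;
 *	(void) cpuid_insn(NULL, &cr);
 *
 * Requests for leaves outside the cached std/extd ranges simply fall through
 * to __cpuid_insn() below.
 */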
5759 uint32_t
5760 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5761 {
5762 struct cpuid_info *cpi;
5763 struct cpuid_regs *xcp;
5764
5765 if (cpu == NULL)
5766 cpu = CPU;
5767 cpi = cpu->cpu_m.mcpu_cpi;
5768
5769 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5770
5771 /*
5772 * CPUID data is cached in two separate places: cpi_std for standard
	 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5774 */
5775 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5776 xcp = &cpi->cpi_std[cp->cp_eax];
5777 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5778 cp->cp_eax <= cpi->cpi_xmaxeax &&
5779 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5780 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5781 } else {
5782 /*
5783 * The caller is asking for data from an input parameter which
5784 * the kernel has not cached. In this case we go fetch from
5785 * the hardware and return the data directly to the user.
5786 */
5787 return (__cpuid_insn(cp));
5788 }
5789
5790 cp->cp_eax = xcp->cp_eax;
5791 cp->cp_ebx = xcp->cp_ebx;
5792 cp->cp_ecx = xcp->cp_ecx;
5793 cp->cp_edx = xcp->cp_edx;
5794 return (cp->cp_eax);
5795 }
5796
5797 boolean_t
5798 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
5799 {
5800 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5801 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5802 }
5803
5804 int
5805 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5806 {
5807 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5808
5809 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5810 }
5811
5812 int
5813 cpuid_is_cmt(cpu_t *cpu)
5814 {
5815 if (cpu == NULL)
5816 cpu = CPU;
5817
5818 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5819
5820 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5821 }
5822
5823 /*
5824 * AMD and Intel both implement the 64-bit variant of the syscall
5825 * instruction (syscallq), so if there's -any- support for syscall,
5826 * cpuid currently says "yes, we support this".
5827 *
5828 * However, Intel decided to -not- implement the 32-bit variant of the
5829 * syscall instruction, so we provide a predicate to allow our caller
5830 * to test that subtlety here.
5831 *
5832 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5833 * even in the case where the hardware would in fact support it.
5834 */
5835 /*ARGSUSED*/
5836 int
5837 cpuid_syscall32_insn(cpu_t *cpu)
5838 {
5839 ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
5840
5841 #if !defined(__xpv)
5842 if (cpu == NULL)
5843 cpu = CPU;
5844
5845 /*CSTYLED*/
5846 {
5847 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5848
5849 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5850 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5851 cpi->cpi_xmaxeax >= 0x80000001 &&
5852 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5853 return (1);
5854 }
5855 #endif
5856 return (0);
5857 }
5858
5859 int
5860 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5861 {
5862 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5863
5864 static const char fmt[] =
5865 "x86 (%s %X family %d model %d step %d clock %d MHz)";
5866 static const char fmt_ht[] =
5867 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5868
5869 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5870
5871 if (cpuid_is_cmt(cpu))
5872 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5873 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5874 cpi->cpi_family, cpi->cpi_model,
5875 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5876 return (snprintf(s, n, fmt,
5877 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5878 cpi->cpi_family, cpi->cpi_model,
5879 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5880 }
5881
5882 const char *
5883 cpuid_getvendorstr(cpu_t *cpu)
5884 {
5885 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5886 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5887 }
5888
5889 uint_t
5890 cpuid_getvendor(cpu_t *cpu)
5891 {
5892 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5893 return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5894 }
5895
5896 uint_t
5897 cpuid_getfamily(cpu_t *cpu)
5898 {
5899 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5900 return (cpu->cpu_m.mcpu_cpi->cpi_family);
5901 }
5902
5903 uint_t
5904 cpuid_getmodel(cpu_t *cpu)
5905 {
5906 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5907 return (cpu->cpu_m.mcpu_cpi->cpi_model);
5908 }
5909
5910 uint_t
5911 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5912 {
5913 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5914 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5915 }
5916
5917 uint_t
5918 cpuid_get_ncore_per_chip(cpu_t *cpu)
5919 {
5920 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5921 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5922 }
5923
5924 uint_t
5925 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5926 {
5927 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5928 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5929 }
5930
5931 id_t
5932 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5933 {
5934 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5935 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5936 }
5937
5938 uint_t
5939 cpuid_getstep(cpu_t *cpu)
5940 {
5941 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5942 return (cpu->cpu_m.mcpu_cpi->cpi_step);
5943 }
5944
5945 uint_t
5946 cpuid_getsig(struct cpu *cpu)
5947 {
5948 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5949 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5950 }
5951
5952 uint32_t
5953 cpuid_getchiprev(struct cpu *cpu)
5954 {
5955 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5956 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5957 }
5958
5959 const char *
5960 cpuid_getchiprevstr(struct cpu *cpu)
5961 {
5962 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5963 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5964 }
5965
5966 uint32_t
5967 cpuid_getsockettype(struct cpu *cpu)
5968 {
5969 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5970 return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5971 }
5972
5973 const char *
5974 cpuid_getsocketstr(cpu_t *cpu)
5975 {
5976 static const char *socketstr = NULL;
5977 struct cpuid_info *cpi;
5978
5979 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5980 cpi = cpu->cpu_m.mcpu_cpi;
5981
5982 /* Assume that socket types are the same across the system */
5983 if (socketstr == NULL)
5984 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5985 cpi->cpi_model, cpi->cpi_step);
5986
5987
5988 return (socketstr);
5989 }
5990
5991 x86_uarchrev_t
5992 cpuid_getuarchrev(cpu_t *cpu)
5993 {
5994 return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
5995 }
5996
5997 int
5998 cpuid_get_chipid(cpu_t *cpu)
5999 {
6000 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6001
6002 if (cpuid_is_cmt(cpu))
6003 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6004 return (cpu->cpu_id);
6005 }
6006
6007 id_t
6008 cpuid_get_coreid(cpu_t *cpu)
6009 {
6010 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6011 return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6012 }
6013
6014 int
6015 cpuid_get_pkgcoreid(cpu_t *cpu)
6016 {
6017 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6018 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6019 }
6020
6021 int
6022 cpuid_get_clogid(cpu_t *cpu)
6023 {
6024 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6025 return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6026 }
6027
6028 int
6029 cpuid_get_cacheid(cpu_t *cpu)
6030 {
6031 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6032 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6033 }
6034
6035 uint_t
6036 cpuid_get_procnodeid(cpu_t *cpu)
6037 {
6038 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6039 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6040 }
6041
6042 uint_t
6043 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6044 {
6045 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6046 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6047 }
6048
6049 uint_t
6050 cpuid_get_compunitid(cpu_t *cpu)
6051 {
6052 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6053 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6054 }
6055
6056 uint_t
6057 cpuid_get_cores_per_compunit(cpu_t *cpu)
6058 {
6059 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6060 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6061 }
6062
6063 uint32_t
6064 cpuid_get_apicid(cpu_t *cpu)
6065 {
6066 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6067 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6068 return (UINT32_MAX);
6069 } else {
6070 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6071 }
6072 }
6073
6074 void
6075 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6076 {
6077 struct cpuid_info *cpi;
6078
6079 if (cpu == NULL)
6080 cpu = CPU;
6081 cpi = cpu->cpu_m.mcpu_cpi;
6082
6083 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6084
6085 if (pabits)
6086 *pabits = cpi->cpi_pabits;
6087 if (vabits)
6088 *vabits = cpi->cpi_vabits;
6089 }
6090
6091 size_t
6092 cpuid_get_xsave_size(void)
6093 {
6094 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6095 sizeof (struct xsave_state)));
6096 }
6097
6098 /*
6099 * Export information about known offsets to the kernel. We only care about
6100 * things we have actually enabled support for in %xcr0.
6101 */
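/*
 * A minimal usage sketch (variable names are just for illustration): a caller
 * that wants to know where the AVX ymm state lives within the xsave area
 * could do
 *
 *	size_t ymm_size, ymm_off;
 *	cpuid_get_xsave_info(XFEATURE_AVX, &ymm_size, &ymm_off);
 *
 * which simply reflects what CPUID leaf 0xD, index 2 reported when it was
 * enumerated in cpuid_pass_extended().
 */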
6102 void
6103 cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
6104 {
6105 size_t size, off;
6106
6107 VERIFY3U(bit & xsave_bv_all, !=, 0);
6108
6109 if (sizep == NULL)
6110 sizep = &size;
6111 if (offp == NULL)
6112 offp = &off;
6113
6114 switch (bit) {
6115 case XFEATURE_LEGACY_FP:
6116 case XFEATURE_SSE:
6117 *sizep = sizeof (struct fxsave_state);
6118 *offp = 0;
6119 break;
6120 case XFEATURE_AVX:
6121 *sizep = cpuid_info0.cpi_xsave.ymm_size;
6122 *offp = cpuid_info0.cpi_xsave.ymm_offset;
6123 break;
6124 case XFEATURE_AVX512_OPMASK:
6125 *sizep = cpuid_info0.cpi_xsave.opmask_size;
6126 *offp = cpuid_info0.cpi_xsave.opmask_offset;
6127 break;
6128 case XFEATURE_AVX512_ZMM:
6129 *sizep = cpuid_info0.cpi_xsave.zmmlo_size;
6130 *offp = cpuid_info0.cpi_xsave.zmmlo_offset;
6131 break;
6132 case XFEATURE_AVX512_HI_ZMM:
6133 *sizep = cpuid_info0.cpi_xsave.zmmhi_size;
6134 *offp = cpuid_info0.cpi_xsave.zmmhi_offset;
6135 break;
6136 default:
6137 panic("asked for unsupported xsave feature: 0x%lx", bit);
6138 }
6139 }
6140
6141 /*
6142 * Return true if the CPUs on this system require 'pointer clearing' for the
6143 * floating point error pointer exception handling. In the past, this has been
6144 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6145 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6146 * feature bit and is reflected in the cpi_fp_amd_save member.
6147 */
6148 boolean_t
6149 cpuid_need_fp_excp_handling(void)
6150 {
6151 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6152 cpuid_info0.cpi_fp_amd_save != 0);
6153 }
6154
6155 /*
6156 * Returns the number of data TLB entries for a corresponding
6157 * pagesize. If it can't be computed, or isn't known, the
6158 * routine returns zero. If you ask about an architecturally
6159 * impossible pagesize, the routine will panic (so that the
6160 * hat implementor knows that things are inconsistent.)
6161 */
6162 uint_t
6163 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6164 {
6165 struct cpuid_info *cpi;
6166 uint_t dtlb_nent = 0;
6167
6168 if (cpu == NULL)
6169 cpu = CPU;
6170 cpi = cpu->cpu_m.mcpu_cpi;
6171
6172 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6173
6174 /*
6175 * Check the L2 TLB info
6176 */
6177 if (cpi->cpi_xmaxeax >= 0x80000006) {
6178 struct cpuid_regs *cp = &cpi->cpi_extd[6];
6179
6180 switch (pagesize) {
6181
6182 case 4 * 1024:
6183 /*
6184 * All zero in the top 16 bits of the register
6185 * indicates a unified TLB. Size is in low 16 bits.
6186 */
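			/*
			 * As a hypothetical example, an %ebx of 0x42004200
			 * has a nonzero upper half, so the 4K data TLB entry
			 * count would be BITX(%ebx, 27, 16) == 0x200, i.e.
			 * 512 entries.
			 */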
6187 if ((cp->cp_ebx & 0xffff0000) == 0)
6188 dtlb_nent = cp->cp_ebx & 0x0000ffff;
6189 else
6190 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6191 break;
6192
6193 case 2 * 1024 * 1024:
6194 if ((cp->cp_eax & 0xffff0000) == 0)
6195 dtlb_nent = cp->cp_eax & 0x0000ffff;
6196 else
6197 dtlb_nent = BITX(cp->cp_eax, 27, 16);
6198 break;
6199
6200 default:
6201 panic("unknown L2 pagesize");
6202 /*NOTREACHED*/
6203 }
6204 }
6205
6206 if (dtlb_nent != 0)
6207 return (dtlb_nent);
6208
6209 /*
6210 * No L2 TLB support for this size, try L1.
6211 */
6212 if (cpi->cpi_xmaxeax >= 0x80000005) {
6213 struct cpuid_regs *cp = &cpi->cpi_extd[5];
6214
6215 switch (pagesize) {
6216 case 4 * 1024:
6217 dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6218 break;
6219 case 2 * 1024 * 1024:
6220 dtlb_nent = BITX(cp->cp_eax, 23, 16);
6221 break;
6222 default:
6223 panic("unknown L1 d-TLB pagesize");
6224 /*NOTREACHED*/
6225 }
6226 }
6227
6228 return (dtlb_nent);
6229 }
6230
6231 /*
6232 * Return 0 if the erratum is not present or not applicable, positive
6233 * if it is, and negative if the status of the erratum is unknown.
6234 *
6235 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6236 * Processors" #25759, Rev 3.57, August 2005
6237 */
6238 int
6239 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6240 {
6241 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6242 uint_t eax;
6243
6244 /*
6245 * Bail out if this CPU isn't an AMD CPU, or if it's
6246 * a legacy (32-bit) AMD CPU.
6247 */
6248 if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6249 cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6250 cpi->cpi_family == 6) {
6251 return (0);
6252 }
6253
6254 eax = cpi->cpi_std[1].cp_eax;
6255
6256 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50)
6257 #define SH_B3(eax) (eax == 0xf51)
6258 #define B(eax) (SH_B0(eax) || SH_B3(eax))
6259
6260 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58)
6261
6262 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6263 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6264 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2)
6265 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6266
6267 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6268 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0)
6269 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0)
6270 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6271
6272 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6273 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */
6274 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0)
6275 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71)
6276 #define BH_E4(eax) (eax == 0x20fb1)
6277 #define SH_E5(eax) (eax == 0x20f42)
6278 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2)
6279 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32)
6280 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6281 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6282 DH_E6(eax) || JH_E6(eax))
6283
6284 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6285 #define DR_B0(eax) (eax == 0x100f20)
6286 #define DR_B1(eax) (eax == 0x100f21)
6287 #define DR_BA(eax) (eax == 0x100f2a)
6288 #define DR_B2(eax) (eax == 0x100f22)
6289 #define DR_B3(eax) (eax == 0x100f23)
6290 #define RB_C0(eax) (eax == 0x100f40)
6291
6292 switch (erratum) {
6293 case 1:
6294 return (cpi->cpi_family < 0x10);
6295 case 51: /* what does the asterisk mean? */
6296 return (B(eax) || SH_C0(eax) || CG(eax));
6297 case 52:
6298 return (B(eax));
6299 case 57:
6300 return (cpi->cpi_family <= 0x11);
6301 case 58:
6302 return (B(eax));
6303 case 60:
6304 return (cpi->cpi_family <= 0x11);
6305 case 61:
6306 case 62:
6307 case 63:
6308 case 64:
6309 case 65:
6310 case 66:
6311 case 68:
6312 case 69:
6313 case 70:
6314 case 71:
6315 return (B(eax));
6316 case 72:
6317 return (SH_B0(eax));
6318 case 74:
6319 return (B(eax));
6320 case 75:
6321 return (cpi->cpi_family < 0x10);
6322 case 76:
6323 return (B(eax));
6324 case 77:
6325 return (cpi->cpi_family <= 0x11);
6326 case 78:
6327 return (B(eax) || SH_C0(eax));
6328 case 79:
6329 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6330 case 80:
6331 case 81:
6332 case 82:
6333 return (B(eax));
6334 case 83:
6335 return (B(eax) || SH_C0(eax) || CG(eax));
6336 case 85:
6337 return (cpi->cpi_family < 0x10);
6338 case 86:
6339 return (SH_C0(eax) || CG(eax));
6340 case 88:
6341 return (B(eax) || SH_C0(eax));
6342 case 89:
6343 return (cpi->cpi_family < 0x10);
6344 case 90:
6345 return (B(eax) || SH_C0(eax) || CG(eax));
6346 case 91:
6347 case 92:
6348 return (B(eax) || SH_C0(eax));
6349 case 93:
6350 return (SH_C0(eax));
6351 case 94:
6352 return (B(eax) || SH_C0(eax) || CG(eax));
6353 case 95:
6354 return (B(eax) || SH_C0(eax));
6355 case 96:
6356 return (B(eax) || SH_C0(eax) || CG(eax));
6357 case 97:
6358 case 98:
6359 return (SH_C0(eax) || CG(eax));
6360 case 99:
6361 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6362 case 100:
6363 return (B(eax) || SH_C0(eax));
6364 case 101:
6365 case 103:
6366 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6367 case 104:
6368 return (SH_C0(eax) || CG(eax) || D0(eax));
6369 case 105:
6370 case 106:
6371 case 107:
6372 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6373 case 108:
6374 return (DH_CG(eax));
6375 case 109:
6376 return (SH_C0(eax) || CG(eax) || D0(eax));
6377 case 110:
6378 return (D0(eax) || EX(eax));
6379 case 111:
6380 return (CG(eax));
6381 case 112:
6382 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6383 case 113:
6384 return (eax == 0x20fc0);
6385 case 114:
6386 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6387 case 115:
6388 return (SH_E0(eax) || JH_E1(eax));
6389 case 116:
6390 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6391 case 117:
6392 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6393 case 118:
6394 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6395 JH_E6(eax));
6396 case 121:
6397 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6398 case 122:
6399 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6400 case 123:
6401 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6402 case 131:
6403 return (cpi->cpi_family < 0x10);
6404 case 6336786:
6405
6406 /*
6407 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6408 * if this is a K8 family or newer processor. We're testing for
6409 * this 'erratum' to determine whether or not we have a constant
6410 * TSC.
6411 *
6412 * Our current fix for this is to disable the C1-Clock ramping.
6413 * However, this doesn't work on newer processor families nor
6414 * does it work when virtualized as those devices don't exist.
6415 */
6416 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6417 return (0);
6418 }
6419
6420 if (CPI_FAMILY(cpi) == 0xf) {
6421 struct cpuid_regs regs;
6422 regs.cp_eax = 0x80000007;
		(void) __cpuid_insn(&regs);
6424 return (!(regs.cp_edx & 0x100));
6425 }
6426 return (0);
6427 case 147:
6428 /*
6429 * This erratum (K8 #147) is not present on family 10 and newer.
6430 */
6431 if (cpi->cpi_family >= 0x10) {
6432 return (0);
6433 }
6434 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6435 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6436
6437 case 6671130:
6438 /*
6439 * check for processors (pre-Shanghai) that do not provide
6440 * optimal management of 1gb ptes in its tlb.
6441 */
6442 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6443
6444 case 298:
6445 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6446 DR_B2(eax) || RB_C0(eax));
6447
6448 case 721:
6449 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6450
6451 default:
6452 return (-1);
6453
6454 }
6455 }
6456
6457 /*
6458 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6459 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6460 */
6461 int
6462 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6463 {
6464 struct cpuid_info *cpi;
6465 uint_t osvwid;
6466 static int osvwfeature = -1;
6467 uint64_t osvwlength;
6468
6469
6470 cpi = cpu->cpu_m.mcpu_cpi;
6471
6472 /* confirm OSVW supported */
6473 if (osvwfeature == -1) {
6474 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6475 } else {
6476 /* assert that osvw feature setting is consistent on all cpus */
6477 ASSERT(osvwfeature ==
6478 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6479 }
6480 if (!osvwfeature)
6481 return (-1);
6482
6483 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6484
6485 switch (erratum) {
6486 case 298: /* osvwid is 0 */
6487 osvwid = 0;
6488 if (osvwlength <= (uint64_t)osvwid) {
6489 /* osvwid 0 is unknown */
6490 return (-1);
6491 }
6492
6493 /*
6494 * Check the OSVW STATUS MSR to determine the state
6495 * of the erratum where:
6496 * 0 - fixed by HW
6497 * 1 - BIOS has applied the workaround when BIOS
6498 * workaround is available. (Or for other errata,
6499 * OS workaround is required.)
6500 * For a value of 1, caller will confirm that the
6501 * erratum 298 workaround has indeed been applied by BIOS.
6502 *
6503 * A 1 may be set in cpus that have a HW fix
6504 * in a mixed cpu system. Regarding erratum 298:
6505 * In a multiprocessor platform, the workaround above
6506 * should be applied to all processors regardless of
6507 * silicon revision when an affected processor is
6508 * present.
6509 */
6510
6511 return (rdmsr(MSR_AMD_OSVW_STATUS +
6512 (osvwid / OSVW_ID_CNT_PER_MSR)) &
6513 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6514
6515 default:
6516 return (-1);
6517 }
6518 }
6519
6520 static const char assoc_str[] = "associativity";
6521 static const char line_str[] = "line-size";
6522 static const char size_str[] = "size";
6523
6524 static void
6525 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6526 uint32_t val)
6527 {
6528 char buf[128];
6529
6530 /*
6531 * ndi_prop_update_int() is used because it is desirable for
6532 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6533 */
6534 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6535 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6536 }
6537
6538 /*
6539 * Intel-style cache/tlb description
6540 *
6541 * Standard cpuid level 2 gives a randomly ordered
6542 * selection of tags that index into a table that describes
6543 * cache and tlb properties.
6544 */
6545
6546 static const char l1_icache_str[] = "l1-icache";
6547 static const char l1_dcache_str[] = "l1-dcache";
6548 static const char l2_cache_str[] = "l2-cache";
6549 static const char l3_cache_str[] = "l3-cache";
6550 static const char itlb4k_str[] = "itlb-4K";
6551 static const char dtlb4k_str[] = "dtlb-4K";
6552 static const char itlb2M_str[] = "itlb-2M";
6553 static const char itlb4M_str[] = "itlb-4M";
6554 static const char dtlb4M_str[] = "dtlb-4M";
6555 static const char dtlb24_str[] = "dtlb0-2M-4M";
6556 static const char itlb424_str[] = "itlb-4K-2M-4M";
6557 static const char itlb24_str[] = "itlb-2M-4M";
6558 static const char dtlb44_str[] = "dtlb-4K-4M";
6559 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6560 static const char sl2_cache_str[] = "sectored-l2-cache";
6561 static const char itrace_str[] = "itrace-cache";
6562 static const char sl3_cache_str[] = "sectored-l3-cache";
6563 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6564
6565 static const struct cachetab {
6566 uint8_t ct_code;
6567 uint8_t ct_assoc;
6568 uint16_t ct_line_size;
6569 size_t ct_size;
6570 const char *ct_label;
6571 } intel_ctab[] = {
6572 /*
6573 * maintain descending order!
6574 *
6575 * Codes ignored - Reason
6576 * ----------------------
6577 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6578 * f0H/f1H - Currently we do not interpret prefetch size by design
6579 */
6580 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6581 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6582 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6583 { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6584 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6585 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6586 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6587 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6588 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6589 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6590 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6591 { 0xd0, 4, 64, 512*1024, l3_cache_str},
6592 { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6593 { 0xc0, 4, 0, 8, dtlb44_str },
6594 { 0xba, 4, 0, 64, dtlb4k_str },
6595 { 0xb4, 4, 0, 256, dtlb4k_str },
6596 { 0xb3, 4, 0, 128, dtlb4k_str },
6597 { 0xb2, 4, 0, 64, itlb4k_str },
6598 { 0xb0, 4, 0, 128, itlb4k_str },
6599 { 0x87, 8, 64, 1024*1024, l2_cache_str},
6600 { 0x86, 4, 64, 512*1024, l2_cache_str},
6601 { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6602 { 0x84, 8, 32, 1024*1024, l2_cache_str},
6603 { 0x83, 8, 32, 512*1024, l2_cache_str},
6604 { 0x82, 8, 32, 256*1024, l2_cache_str},
6605 { 0x80, 8, 64, 512*1024, l2_cache_str},
6606 { 0x7f, 2, 64, 512*1024, l2_cache_str},
6607 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6608 { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6609 { 0x7b, 8, 64, 512*1024, sl2_cache_str},
6610 { 0x7a, 8, 64, 256*1024, sl2_cache_str},
6611 { 0x79, 8, 64, 128*1024, sl2_cache_str},
6612 { 0x78, 8, 64, 1024*1024, l2_cache_str},
6613 { 0x73, 8, 0, 64*1024, itrace_str},
6614 { 0x72, 8, 0, 32*1024, itrace_str},
6615 { 0x71, 8, 0, 16*1024, itrace_str},
6616 { 0x70, 8, 0, 12*1024, itrace_str},
6617 { 0x68, 4, 64, 32*1024, sl1_dcache_str},
6618 { 0x67, 4, 64, 16*1024, sl1_dcache_str},
6619 { 0x66, 4, 64, 8*1024, sl1_dcache_str},
6620 { 0x60, 8, 64, 16*1024, sl1_dcache_str},
6621 { 0x5d, 0, 0, 256, dtlb44_str},
6622 { 0x5c, 0, 0, 128, dtlb44_str},
6623 { 0x5b, 0, 0, 64, dtlb44_str},
6624 { 0x5a, 4, 0, 32, dtlb24_str},
6625 { 0x59, 0, 0, 16, dtlb4k_str},
6626 { 0x57, 4, 0, 16, dtlb4k_str},
6627 { 0x56, 4, 0, 16, dtlb4M_str},
6628 { 0x55, 0, 0, 7, itlb24_str},
6629 { 0x52, 0, 0, 256, itlb424_str},
6630 { 0x51, 0, 0, 128, itlb424_str},
6631 { 0x50, 0, 0, 64, itlb424_str},
6632 { 0x4f, 0, 0, 32, itlb4k_str},
6633 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6634 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6635 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6636 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6637 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6638 { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6639 { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6640 { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6641 { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6642 { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6643 { 0x44, 4, 32, 1024*1024, l2_cache_str},
6644 { 0x43, 4, 32, 512*1024, l2_cache_str},
6645 { 0x42, 4, 32, 256*1024, l2_cache_str},
6646 { 0x41, 4, 32, 128*1024, l2_cache_str},
6647 { 0x3e, 4, 64, 512*1024, sl2_cache_str},
6648 { 0x3d, 6, 64, 384*1024, sl2_cache_str},
6649 { 0x3c, 4, 64, 256*1024, sl2_cache_str},
6650 { 0x3b, 2, 64, 128*1024, sl2_cache_str},
6651 { 0x3a, 6, 64, 192*1024, sl2_cache_str},
6652 { 0x39, 4, 64, 128*1024, sl2_cache_str},
6653 { 0x30, 8, 64, 32*1024, l1_icache_str},
6654 { 0x2c, 8, 64, 32*1024, l1_dcache_str},
6655 { 0x29, 8, 64, 4096*1024, sl3_cache_str},
6656 { 0x25, 8, 64, 2048*1024, sl3_cache_str},
6657 { 0x23, 8, 64, 1024*1024, sl3_cache_str},
6658 { 0x22, 4, 64, 512*1024, sl3_cache_str},
6659 { 0x0e, 6, 64, 24*1024, l1_dcache_str},
6660 { 0x0d, 4, 32, 16*1024, l1_dcache_str},
6661 { 0x0c, 4, 32, 16*1024, l1_dcache_str},
6662 { 0x0b, 4, 0, 4, itlb4M_str},
6663 { 0x0a, 2, 32, 8*1024, l1_dcache_str},
6664 { 0x08, 4, 32, 16*1024, l1_icache_str},
6665 { 0x06, 4, 32, 8*1024, l1_icache_str},
6666 { 0x05, 4, 0, 32, dtlb4M_str},
6667 { 0x04, 4, 0, 8, dtlb4M_str},
6668 { 0x03, 4, 0, 64, dtlb4k_str},
6669 { 0x02, 4, 0, 2, itlb4M_str},
6670 { 0x01, 4, 0, 32, itlb4k_str},
6671 { 0 }
6672 };
6673
6674 static const struct cachetab cyrix_ctab[] = {
6675 { 0x70, 4, 0, 32, "tlb-4K" },
6676 { 0x80, 4, 16, 16*1024, "l1-cache" },
6677 { 0 }
6678 };
6679
6680 /*
6681 * Search a cache table for a matching entry
6682 */
6683 static const struct cachetab *
6684 find_cacheent(const struct cachetab *ct, uint_t code)
6685 {
6686 if (code != 0) {
6687 for (; ct->ct_code != 0; ct++)
6688 if (ct->ct_code <= code)
6689 break;
6690 if (ct->ct_code == code)
6691 return (ct);
6692 }
6693 return (NULL);
6694 }
6695
6696 /*
6697 * Populate cachetab entry with L2 or L3 cache-information using
6698 * cpuid function 4. This function is called from intel_walk_cacheinfo()
6699 * when descriptor 0x49 is encountered. It returns 0 if no such cache
6700 * information is found.
6701 */
6702 static int
6703 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6704 {
6705 uint32_t level, i;
6706 int ret = 0;
6707
6708 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6709 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6710
6711 if (level == 2 || level == 3) {
6712 ct->ct_assoc =
6713 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6714 ct->ct_line_size =
6715 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6716 ct->ct_size = ct->ct_assoc *
6717 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6718 ct->ct_line_size *
6719 (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6720
6721 if (level == 2) {
6722 ct->ct_label = l2_cache_str;
6723 } else if (level == 3) {
6724 ct->ct_label = l3_cache_str;
6725 }
6726 ret = 1;
6727 }
6728 }
6729
6730 return (ret);
6731 }
6732
6733 /*
6734 * Walk the cacheinfo descriptor, applying 'func' to every valid element
6735 * The walk is terminated if the walker returns non-zero.
6736 */
6737 static void
6738 intel_walk_cacheinfo(struct cpuid_info *cpi,
6739 void *arg, int (*func)(void *, const struct cachetab *))
6740 {
6741 const struct cachetab *ct;
6742 struct cachetab des_49_ct, des_b1_ct;
6743 uint8_t *dp;
6744 int i;
6745
6746 if ((dp = cpi->cpi_cacheinfo) == NULL)
6747 return;
6748 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6749 /*
6750 * For overloaded descriptor 0x49 we use cpuid function 4
6751 * if supported by the current processor, to create
6752 * cache information.
6753 * For overloaded descriptor 0xb1 we use X86_PAE flag
6754 * to disambiguate the cache information.
6755 */
6756 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6757 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6758 ct = &des_49_ct;
6759 } else if (*dp == 0xb1) {
6760 des_b1_ct.ct_code = 0xb1;
6761 des_b1_ct.ct_assoc = 4;
6762 des_b1_ct.ct_line_size = 0;
6763 if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6764 des_b1_ct.ct_size = 8;
6765 des_b1_ct.ct_label = itlb2M_str;
6766 } else {
6767 des_b1_ct.ct_size = 4;
6768 des_b1_ct.ct_label = itlb4M_str;
6769 }
6770 ct = &des_b1_ct;
6771 } else {
6772 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6773 continue;
6774 }
6775 }
6776
6777 if (func(arg, ct) != 0) {
6778 break;
6779 }
6780 }
6781 }
6782
6783 /*
6784 * (Like the Intel one, except for Cyrix CPUs)
6785 */
6786 static void
6787 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6788 void *arg, int (*func)(void *, const struct cachetab *))
6789 {
6790 const struct cachetab *ct;
6791 uint8_t *dp;
6792 int i;
6793
6794 if ((dp = cpi->cpi_cacheinfo) == NULL)
6795 return;
6796 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6797 /*
6798 * Search Cyrix-specific descriptor table first ..
6799 */
6800 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6801 if (func(arg, ct) != 0)
6802 break;
6803 continue;
6804 }
6805 /*
6806 * .. else fall back to the Intel one
6807 */
6808 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6809 if (func(arg, ct) != 0)
6810 break;
6811 continue;
6812 }
6813 }
6814 }
6815
6816 /*
6817 * A cacheinfo walker that adds associativity, line-size, and size properties
6818 * to the devinfo node it is passed as an argument.
6819 */
6820 static int
6821 add_cacheent_props(void *arg, const struct cachetab *ct)
6822 {
6823 dev_info_t *devi = arg;
6824
6825 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6826 if (ct->ct_line_size != 0)
6827 add_cache_prop(devi, ct->ct_label, line_str,
6828 ct->ct_line_size);
6829 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6830 return (0);
6831 }
6832
6833
6834 static const char fully_assoc[] = "fully-associative?";
6835
6836 /*
6837 * AMD style cache/tlb description
6838 *
6839 * Extended functions 5 and 6 directly describe properties of
6840 * tlbs and various cache levels.
6841 */
6842 static void
6843 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6844 {
6845 switch (assoc) {
6846 case 0: /* reserved; ignore */
6847 break;
6848 default:
6849 add_cache_prop(devi, label, assoc_str, assoc);
6850 break;
6851 case 0xff:
6852 add_cache_prop(devi, label, fully_assoc, 1);
6853 break;
6854 }
6855 }
6856
6857 static void
6858 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6859 {
6860 if (size == 0)
6861 return;
6862 add_cache_prop(devi, label, size_str, size);
6863 add_amd_assoc(devi, label, assoc);
6864 }
6865
6866 static void
6867 add_amd_cache(dev_info_t *devi, const char *label,
6868 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6869 {
6870 if (size == 0 || line_size == 0)
6871 return;
6872 add_amd_assoc(devi, label, assoc);
6873 /*
6874 * Most AMD parts have a sectored cache. Multiple cache lines are
6875 * associated with each tag. A sector consists of all cache lines
6876 * associated with a tag. For example, the AMD K6-III has a sector
6877 * size of 2 cache lines per tag.
6878 */
6879 if (lines_per_tag != 0)
6880 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6881 add_cache_prop(devi, label, line_str, line_size);
6882 add_cache_prop(devi, label, size_str, size * 1024);
6883 }
6884
6885 static void
6886 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6887 {
6888 switch (assoc) {
6889 case 0: /* off */
6890 break;
6891 case 1:
6892 case 2:
6893 case 4:
6894 add_cache_prop(devi, label, assoc_str, assoc);
6895 break;
6896 case 6:
6897 add_cache_prop(devi, label, assoc_str, 8);
6898 break;
6899 case 8:
6900 add_cache_prop(devi, label, assoc_str, 16);
6901 break;
6902 case 0xf:
6903 add_cache_prop(devi, label, fully_assoc, 1);
6904 break;
6905 default: /* reserved; ignore */
6906 break;
6907 }
6908 }
6909
6910 static void
6911 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6912 {
6913 if (size == 0 || assoc == 0)
6914 return;
6915 add_amd_l2_assoc(devi, label, assoc);
6916 add_cache_prop(devi, label, size_str, size);
6917 }
6918
6919 static void
6920 add_amd_l2_cache(dev_info_t *devi, const char *label,
6921 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6922 {
6923 if (size == 0 || assoc == 0 || line_size == 0)
6924 return;
6925 add_amd_l2_assoc(devi, label, assoc);
6926 if (lines_per_tag != 0)
6927 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6928 add_cache_prop(devi, label, line_str, line_size);
6929 add_cache_prop(devi, label, size_str, size * 1024);
6930 }
6931
6932 static void
6933 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6934 {
6935 struct cpuid_regs *cp;
6936
6937 if (cpi->cpi_xmaxeax < 0x80000005)
6938 return;
6939 cp = &cpi->cpi_extd[5];
6940
6941 /*
6942 * 4M/2M L1 TLB configuration
6943 *
6944 * We report the size for 2M pages because AMD uses two
6945 * TLB entries for one 4M page.
6946 */
6947 add_amd_tlb(devi, "dtlb-2M",
6948 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6949 add_amd_tlb(devi, "itlb-2M",
6950 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6951
6952 /*
6953 * 4K L1 TLB configuration
6954 */
6955
6956 switch (cpi->cpi_vendor) {
6957 uint_t nentries;
6958 case X86_VENDOR_TM:
6959 if (cpi->cpi_family >= 5) {
6960 /*
6961 * Crusoe processors have 256 TLB entries, but
			 * the cpuid data format constrains them to
			 * reporting only 255 of them.
6964 */
6965 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6966 nentries = 256;
6967 /*
6968 * Crusoe processors also have a unified TLB
6969 */
6970 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6971 nentries);
6972 break;
6973 }
6974 /*FALLTHROUGH*/
6975 default:
6976 add_amd_tlb(devi, itlb4k_str,
6977 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6978 add_amd_tlb(devi, dtlb4k_str,
6979 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6980 break;
6981 }
6982
6983 /*
6984 * data L1 cache configuration
6985 */
6986
6987 add_amd_cache(devi, l1_dcache_str,
6988 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6989 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6990
6991 /*
6992 * code L1 cache configuration
6993 */
6994
6995 add_amd_cache(devi, l1_icache_str,
6996 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6997 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6998
6999 if (cpi->cpi_xmaxeax < 0x80000006)
7000 return;
7001 cp = &cpi->cpi_extd[6];
7002
7003 /* Check for a unified L2 TLB for large pages */
7004
7005 if (BITX(cp->cp_eax, 31, 16) == 0)
7006 add_amd_l2_tlb(devi, "l2-tlb-2M",
7007 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7008 else {
7009 add_amd_l2_tlb(devi, "l2-dtlb-2M",
7010 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7011 add_amd_l2_tlb(devi, "l2-itlb-2M",
7012 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7013 }
7014
7015 /* Check for a unified L2 TLB for 4K pages */
7016
7017 if (BITX(cp->cp_ebx, 31, 16) == 0) {
7018 add_amd_l2_tlb(devi, "l2-tlb-4K",
7019 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7020 } else {
7021 add_amd_l2_tlb(devi, "l2-dtlb-4K",
7022 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7023 add_amd_l2_tlb(devi, "l2-itlb-4K",
7024 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7025 }
7026
7027 add_amd_l2_cache(devi, l2_cache_str,
7028 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
7029 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
7030 }
7031
7032 /*
 * There are two basic ways that the x86 world describes its cache
7034 * and tlb architecture - Intel's way and AMD's way.
7035 *
7036 * Return which flavor of cache architecture we should use
7037 */
7038 static int
7039 x86_which_cacheinfo(struct cpuid_info *cpi)
7040 {
7041 switch (cpi->cpi_vendor) {
7042 case X86_VENDOR_Intel:
7043 if (cpi->cpi_maxeax >= 2)
7044 return (X86_VENDOR_Intel);
7045 break;
7046 case X86_VENDOR_AMD:
7047 /*
7048 * The K5 model 1 was the first part from AMD that reported
7049 * cache sizes via extended cpuid functions.
7050 */
7051 if (cpi->cpi_family > 5 ||
7052 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7053 return (X86_VENDOR_AMD);
7054 break;
7055 case X86_VENDOR_HYGON:
7056 return (X86_VENDOR_AMD);
7057 case X86_VENDOR_TM:
7058 if (cpi->cpi_family >= 5)
7059 return (X86_VENDOR_AMD);
7060 /*FALLTHROUGH*/
7061 default:
7062 /*
7063 * If they have extended CPU data for 0x80000005
7064 * then we assume they have AMD-format cache
7065 * information.
7066 *
7067 * If not, and the vendor happens to be Cyrix,
		 * then try our Cyrix-specific handler.
7069 *
7070 * If we're not Cyrix, then assume we're using Intel's
7071 * table-driven format instead.
7072 */
7073 if (cpi->cpi_xmaxeax >= 0x80000005)
7074 return (X86_VENDOR_AMD);
7075 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7076 return (X86_VENDOR_Cyrix);
7077 else if (cpi->cpi_maxeax >= 2)
7078 return (X86_VENDOR_Intel);
7079 break;
7080 }
7081 return (-1);
7082 }
7083
7084 void
7085 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7086 struct cpuid_info *cpi)
7087 {
7088 dev_info_t *cpu_devi;
7089 int create;
7090
7091 cpu_devi = (dev_info_t *)dip;
7092
7093 /* device_type */
7094 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7095 "device_type", "cpu");
7096
7097 /* reg */
7098 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7099 "reg", cpu_id);
7100
7101 /* cpu-mhz, and clock-frequency */
7102 if (cpu_freq > 0) {
7103 long long mul;
7104
7105 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7106 "cpu-mhz", cpu_freq);
7107 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7108 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7109 "clock-frequency", (int)mul);
7110 }
7111
7112 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7113
7114 /* vendor-id */
7115 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7116 "vendor-id", cpi->cpi_vendorstr);
7117
7118 if (cpi->cpi_maxeax == 0) {
7119 return;
7120 }
7121
7122 /*
7123 * family, model, and step
7124 */
7125 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7126 "family", CPI_FAMILY(cpi));
7127 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7128 "cpu-model", CPI_MODEL(cpi));
7129 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7130 "stepping-id", CPI_STEP(cpi));
7131
7132 /* type */
7133 switch (cpi->cpi_vendor) {
7134 case X86_VENDOR_Intel:
7135 create = 1;
7136 break;
7137 default:
7138 create = 0;
7139 break;
7140 }
7141 if (create)
7142 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7143 "type", CPI_TYPE(cpi));
7144
7145 /* ext-family */
7146 switch (cpi->cpi_vendor) {
7147 case X86_VENDOR_Intel:
7148 case X86_VENDOR_AMD:
7149 create = cpi->cpi_family >= 0xf;
7150 break;
7151 case X86_VENDOR_HYGON:
7152 create = 1;
7153 break;
7154 default:
7155 create = 0;
7156 break;
7157 }
7158 if (create)
7159 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7160 "ext-family", CPI_FAMILY_XTD(cpi));
7161
7162 /* ext-model */
7163 switch (cpi->cpi_vendor) {
7164 case X86_VENDOR_Intel:
7165 create = IS_EXTENDED_MODEL_INTEL(cpi);
7166 break;
7167 case X86_VENDOR_AMD:
7168 create = CPI_FAMILY(cpi) == 0xf;
7169 break;
7170 case X86_VENDOR_HYGON:
7171 create = 1;
7172 break;
7173 default:
7174 create = 0;
7175 break;
7176 }
7177 if (create)
7178 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7179 "ext-model", CPI_MODEL_XTD(cpi));
7180
7181 /* generation */
7182 switch (cpi->cpi_vendor) {
7183 case X86_VENDOR_AMD:
7184 case X86_VENDOR_HYGON:
7185 /*
7186 * AMD K5 model 1 was the first part to support this
7187 */
7188 create = cpi->cpi_xmaxeax >= 0x80000001;
7189 break;
7190 default:
7191 create = 0;
7192 break;
7193 }
7194 if (create)
7195 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7196 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7197
7198 /* brand-id */
7199 switch (cpi->cpi_vendor) {
7200 case X86_VENDOR_Intel:
7201 /*
		 * brand id first appeared on Pentium III Xeon model 8 and
		 * Celeron model 8 processors, and on Opteron
7204 */
7205 create = cpi->cpi_family > 6 ||
7206 (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7207 break;
7208 case X86_VENDOR_AMD:
7209 create = cpi->cpi_family >= 0xf;
7210 break;
7211 case X86_VENDOR_HYGON:
7212 create = 1;
7213 break;
7214 default:
7215 create = 0;
7216 break;
7217 }
7218 if (create && cpi->cpi_brandid != 0) {
7219 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7220 "brand-id", cpi->cpi_brandid);
7221 }
7222
7223 /* chunks, and apic-id */
7224 switch (cpi->cpi_vendor) {
7225 /*
7226 * first available on Pentium IV and Opteron (K8)
7227 */
7228 case X86_VENDOR_Intel:
7229 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7230 break;
7231 case X86_VENDOR_AMD:
7232 create = cpi->cpi_family >= 0xf;
7233 break;
7234 case X86_VENDOR_HYGON:
7235 create = 1;
7236 break;
7237 default:
7238 create = 0;
7239 break;
7240 }
7241 if (create) {
7242 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7243 "chunks", CPI_CHUNKS(cpi));
7244 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7245 "apic-id", cpi->cpi_apicid);
7246 if (cpi->cpi_chipid >= 0) {
7247 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7248 "chip#", cpi->cpi_chipid);
7249 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7250 "clog#", cpi->cpi_clogid);
7251 }
7252 }
7253
7254 /* cpuid-features */
7255 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7256 "cpuid-features", CPI_FEATURES_EDX(cpi));
7257
7258
7259 /* cpuid-features-ecx */
7260 switch (cpi->cpi_vendor) {
7261 case X86_VENDOR_Intel:
7262 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7263 break;
7264 case X86_VENDOR_AMD:
7265 create = cpi->cpi_family >= 0xf;
7266 break;
7267 case X86_VENDOR_HYGON:
7268 create = 1;
7269 break;
7270 default:
7271 create = 0;
7272 break;
7273 }
7274 if (create)
7275 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7276 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7277
7278 /* ext-cpuid-features */
7279 switch (cpi->cpi_vendor) {
7280 case X86_VENDOR_Intel:
7281 case X86_VENDOR_AMD:
7282 case X86_VENDOR_HYGON:
7283 case X86_VENDOR_Cyrix:
7284 case X86_VENDOR_TM:
7285 case X86_VENDOR_Centaur:
7286 create = cpi->cpi_xmaxeax >= 0x80000001;
7287 break;
7288 default:
7289 create = 0;
7290 break;
7291 }
7292 if (create) {
7293 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7294 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7295 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7296 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7297 }
7298
7299 /*
7300 * Brand String first appeared in Intel Pentium IV, AMD K5
	 * model 1, and Cyrix GXm. On earlier models we try to
	 * simulate something similar, so this string should always
	 * say -something- about the processor, however lame.
7304 */
7305 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7306 "brand-string", cpi->cpi_brandstr);
7307
7308 /*
7309 * Finally, cache and tlb information
7310 */
7311 switch (x86_which_cacheinfo(cpi)) {
7312 case X86_VENDOR_Intel:
7313 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7314 break;
7315 case X86_VENDOR_Cyrix:
7316 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7317 break;
7318 case X86_VENDOR_AMD:
7319 amd_cache_info(cpi, cpu_devi);
7320 break;
7321 default:
7322 break;
7323 }
7324 }
7325
7326 struct l2info {
7327 int *l2i_csz;
7328 int *l2i_lsz;
7329 int *l2i_assoc;
7330 int l2i_ret;
7331 };
7332
7333 /*
7334 * A cacheinfo walker that fetches the size, line-size and associativity
7335 * of the L2 cache
7336 */
7337 static int
7338 intel_l2cinfo(void *arg, const struct cachetab *ct)
7339 {
7340 struct l2info *l2i = arg;
7341 int *ip;
7342
7343 if (ct->ct_label != l2_cache_str &&
7344 ct->ct_label != sl2_cache_str)
7345 return (0); /* not an L2 -- keep walking */
7346
7347 if ((ip = l2i->l2i_csz) != NULL)
7348 *ip = ct->ct_size;
7349 if ((ip = l2i->l2i_lsz) != NULL)
7350 *ip = ct->ct_line_size;
7351 if ((ip = l2i->l2i_assoc) != NULL)
7352 *ip = ct->ct_assoc;
7353 l2i->l2i_ret = ct->ct_size;
7354 return (1); /* was an L2 -- terminate walk */
7355 }
7356
7357 /*
7358 * AMD L2/L3 Cache and TLB Associativity Field Definition:
7359 *
7360 * Unlike the associativity for the L1 cache and tlb where the 8 bit
7361 * value is the associativity, the associativity for the L2 cache and
7362 * tlb is encoded in the following table. The 4 bit L2 value serves as
7363 * an index into the amd_afd[] array to determine the associativity.
7364 * -1 is undefined. 0 is fully associative.
7365 */
7366
7367 static int amd_afd[] =
7368 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
7369
7370 static void
7371 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7372 {
7373 struct cpuid_regs *cp;
7374 uint_t size, assoc;
7375 int i;
7376 int *ip;
7377
7378 if (cpi->cpi_xmaxeax < 0x80000006)
7379 return;
7380 cp = &cpi->cpi_extd[6];
7381
7382 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7383 (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7384 uint_t cachesz = size * 1024;
7385 assoc = amd_afd[i];
7386
7387 ASSERT(assoc != -1);
7388
7389 if ((ip = l2i->l2i_csz) != NULL)
7390 *ip = cachesz;
7391 if ((ip = l2i->l2i_lsz) != NULL)
7392 *ip = BITX(cp->cp_ecx, 7, 0);
7393 if ((ip = l2i->l2i_assoc) != NULL)
7394 *ip = assoc;
7395 l2i->l2i_ret = cachesz;
7396 }
7397 }
7398
7399 int
7400 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7401 {
7402 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7403 struct l2info __l2info, *l2i = &__l2info;
7404
7405 l2i->l2i_csz = csz;
7406 l2i->l2i_lsz = lsz;
7407 l2i->l2i_assoc = assoc;
7408 l2i->l2i_ret = -1;
7409
7410 switch (x86_which_cacheinfo(cpi)) {
7411 case X86_VENDOR_Intel:
7412 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7413 break;
7414 case X86_VENDOR_Cyrix:
7415 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7416 break;
7417 case X86_VENDOR_AMD:
7418 amd_l2cacheinfo(cpi, l2i);
7419 break;
7420 default:
7421 break;
7422 }
7423 return (l2i->l2i_ret);
7424 }
7425
7426 #if !defined(__xpv)
7427
7428 uint32_t *
7429 cpuid_mwait_alloc(cpu_t *cpu)
7430 {
7431 uint32_t *ret;
7432 size_t mwait_size;
7433
7434 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7435
7436 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7437 if (mwait_size == 0)
7438 return (NULL);
7439
7440 /*
7441 * kmem_alloc() returns cache line size aligned data for mwait_size
7442 * allocations. mwait_size is currently cache line sized. Neither
	 * of these implementation details is guaranteed to be true in the
7444 * future.
7445 *
7446 * First try allocating mwait_size as kmem_alloc() currently returns
7447 * correctly aligned memory. If kmem_alloc() does not return
	 * mwait_size aligned memory, then allocate twice mwait_size and
	 * round the resulting pointer up to a mwait_size boundary.
7449 *
7450 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7451 * decide to free this memory.
7452 */
7453 ret = kmem_zalloc(mwait_size, KM_SLEEP);
7454 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7455 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7456 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7457 *ret = MWAIT_RUNNING;
7458 return (ret);
7459 } else {
7460 kmem_free(ret, mwait_size);
7461 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7462 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7463 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7464 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7465 *ret = MWAIT_RUNNING;
7466 return (ret);
7467 }
7468 }
7469
7470 void
7471 cpuid_mwait_free(cpu_t *cpu)
7472 {
7473 if (cpu->cpu_m.mcpu_cpi == NULL) {
7474 return;
7475 }
7476
7477 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7478 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7479 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7480 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7481 }
7482
7483 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7484 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7485 }
7486
7487 void
7488 patch_tsc_read(int flag)
7489 {
7490 size_t cnt;
7491
7492 switch (flag) {
7493 case TSC_NONE:
7494 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7495 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7496 break;
7497 case TSC_RDTSC_LFENCE:
7498 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7499 (void) memcpy((void *)tsc_read,
7500 (void *)&_tsc_lfence_start, cnt);
7501 break;
7502 case TSC_TSCP:
7503 cnt = &_tscp_end - &_tscp_start;
7504 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7505 break;
7506 default:
7507 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7508 cmn_err(CE_PANIC, "Unrecogized TSC type: %d", flag);
7509 break;
7510 }
7511 tsc_type = flag;
7512 }
7513
7514 int
7515 cpuid_deep_cstates_supported(void)
7516 {
7517 struct cpuid_info *cpi;
7518 struct cpuid_regs regs;
7519
7520 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7521 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7522
7523 cpi = CPU->cpu_m.mcpu_cpi;
7524
7525 switch (cpi->cpi_vendor) {
7526 case X86_VENDOR_Intel:
7527 if (cpi->cpi_xmaxeax < 0x80000007)
7528 return (0);
7529
7530 /*
7531 * Does TSC run at a constant rate in all C-states?
7532 */
7533 regs.cp_eax = 0x80000007;
		(void) __cpuid_insn(&regs);
7535 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7536
7537 default:
7538 return (0);
7539 }
7540 }
7541
7542 #endif /* !__xpv */
7543
7544 void
7545 post_startup_cpu_fixups(void)
7546 {
7547 #ifndef __xpv
7548 /*
7549 * Some AMD processors support C1E state. Entering this state will
7550 * cause the local APIC timer to stop, which we can't deal with at
7551 * this time.
7552 */
7553 if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7554 on_trap_data_t otd;
7555 uint64_t reg;
7556
7557 if (!on_trap(&otd, OT_DATA_ACCESS)) {
7558 reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7559 /* Disable C1E state if it is enabled by BIOS */
7560 if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7561 AMD_ACTONCMPHALT_MASK) {
7562 reg &= ~(AMD_ACTONCMPHALT_MASK <<
7563 AMD_ACTONCMPHALT_SHIFT);
7564 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7565 }
7566 }
7567 no_trap();
7568 }
7569 #endif /* !__xpv */
7570 }
7571
7572 void
7573 enable_pcid(void)
7574 {
7575 if (x86_use_pcid == -1)
7576 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7577
7578 if (x86_use_invpcid == -1) {
7579 x86_use_invpcid = is_x86_feature(x86_featureset,
7580 X86FSET_INVPCID);
7581 }
7582
7583 if (!x86_use_pcid)
7584 return;
7585
7586 /*
	 * Intel says that on setting PCIDE, it immediately starts using the PCID
7588 * bits; better make sure there's nothing there.
7589 */
7590 ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7591
7592 setcr4(getcr4() | CR4_PCIDE);
7593 }
7594
7595 /*
7596 * Setup necessary registers to enable XSAVE feature on this processor.
7597 * This function needs to be called early enough, so that no xsave/xrstor
7598 * ops will execute on the processor before the MSRs are properly set up.
7599 *
7600 * Current implementation has the following assumption:
7601 * - cpuid_pass_basic() is done, so that X86 features are known.
7602 * - fpu_probe() is done, so that fp_save_mech is chosen.
7603 */
7604 void
7605 xsave_setup_msr(cpu_t *cpu)
7606 {
7607 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7608 ASSERT(fp_save_mech == FP_XSAVE);
7609 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7610
7611 /* Enable OSXSAVE in CR4. */
7612 setcr4(getcr4() | CR4_OSXSAVE);
7613 /*
7614 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
	 * the correct value.
7616 */
7617 cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7618 setup_xfem();
7619 }
7620
7621 /*
7622 * Starting with the Westmere processor the local
7623 * APIC timer will continue running in all C-states,
7624 * including the deepest C-states.
7625 */
7626 int
7627 cpuid_arat_supported(void)
7628 {
7629 struct cpuid_info *cpi;
7630 struct cpuid_regs regs;
7631
7632 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7633 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7634
7635 cpi = CPU->cpu_m.mcpu_cpi;
7636
7637 switch (cpi->cpi_vendor) {
7638 case X86_VENDOR_Intel:
7639 /*
7640 * Always-running Local APIC Timer is
7641 * indicated by CPUID.6.EAX[2].
7642 */
7643 if (cpi->cpi_maxeax >= 6) {
7644 regs.cp_eax = 6;
			(void) cpuid_insn(NULL, &regs);
7646 return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7647 } else {
7648 return (0);
7649 }
7650 default:
7651 return (0);
7652 }
7653 }
7654
7655 /*
7656 * Check support for Intel ENERGY_PERF_BIAS feature
7657 */
7658 int
7659 cpuid_iepb_supported(struct cpu *cp)
7660 {
7661 struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7662 struct cpuid_regs regs;
7663
7664 ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7665 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7666
7667 if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7668 return (0);
7669 }
7670
7671 /*
7672 * Intel ENERGY_PERF_BIAS MSR is indicated by
7673 * capability bit CPUID.6.ECX.3
7674 */
7675 if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7676 return (0);
7677
7678 regs.cp_eax = 0x6;
	(void) cpuid_insn(NULL, &regs);
7680 return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7681 }
7682
7683 /*
7684 * Check support for TSC deadline timer
7685 *
7686 * TSC deadline timer provides a superior software programming
7687 * model over local APIC timer that eliminates "time drifts".
7688 * Instead of specifying a relative time, software specifies an
7689 * absolute time as the target at which the processor should
7690 * generate a timer event.
7691 */
7692 int
7693 cpuid_deadline_tsc_supported(void)
7694 {
7695 struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7696 struct cpuid_regs regs;
7697
7698 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7699 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7700
7701 switch (cpi->cpi_vendor) {
7702 case X86_VENDOR_Intel:
7703 if (cpi->cpi_maxeax >= 1) {
7704 regs.cp_eax = 1;
			(void) cpuid_insn(NULL, &regs);
7706 return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7707 } else {
7708 return (0);
7709 }
7710 default:
7711 return (0);
7712 }
7713 }
7714
7715 #if !defined(__xpv)
7716 /*
 * Patch in versions of bcopy for high-performance Intel Nehalem (Nhm)
 * processors and later...
7719 */
7720 void
7721 patch_memops(uint_t vendor)
7722 {
7723 size_t cnt, i;
7724 caddr_t to, from;
7725
7726 if ((vendor == X86_VENDOR_Intel) &&
7727 is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7728 cnt = &bcopy_patch_end - &bcopy_patch_start;
7729 to = &bcopy_ck_size;
7730 from = &bcopy_patch_start;
7731 for (i = 0; i < cnt; i++) {
7732 *to++ = *from++;
7733 }
7734 }
7735 }
7736 #endif /* !__xpv */
7737
7738 /*
7739 * We're being asked to tell the system how many bits are required to represent
7740 * the various thread and strand IDs. While it's tempting to derive this based
7741 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7742 * correct. Instead, this needs to be based on the number of bits that the APIC
7743 * allows for these different configurations. We only update these to a larger
7744 * value if we find one.
7745 */
7746 void
7747 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7748 {
7749 struct cpuid_info *cpi;
7750
7751 VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7752 cpi = cpu->cpu_m.mcpu_cpi;
7753
7754 if (cpi->cpi_ncore_bits > *core_nbits) {
7755 *core_nbits = cpi->cpi_ncore_bits;
7756 }
7757
7758 if (cpi->cpi_nthread_bits > *strand_nbits) {
7759 *strand_nbits = cpi->cpi_nthread_bits;
7760 }
7761 }
7762
7763 void
7764 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7765 {
7766 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7767 struct cpuid_regs cp;
7768
7769 /*
7770 * Reread the CPUID portions that we need for various security
7771 * information.
7772 */
7773 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7774 /*
7775 * Check if we now have leaf 7 available to us.
7776 */
7777 if (cpi->cpi_maxeax < 7) {
7778 bzero(&cp, sizeof (cp));
7779 cp.cp_eax = 0;
7780 cpi->cpi_maxeax = __cpuid_insn(&cp);
7781 if (cpi->cpi_maxeax < 7)
7782 return;
7783 }
7784
7785 bzero(&cp, sizeof (cp));
7786 cp.cp_eax = 7;
7787 cp.cp_ecx = 0;
7788 (void) __cpuid_insn(&cp);
7789 cpi->cpi_std[7] = cp;
7790 } else if (cpi->cpi_vendor == X86_VENDOR_AMD ||
7791 cpi->cpi_vendor == X86_VENDOR_HYGON) {
7792 /* No xcpuid support */
7793 if (cpi->cpi_family < 5 ||
7794 (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7795 return;
7796
7797 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7798 bzero(&cp, sizeof (cp));
7799 cp.cp_eax = CPUID_LEAF_EXT_0;
7800 cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7801 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7802 return;
7803 }
7804 }
7805
7806 /*
7807 * Most AMD features are in leaf 8. Automatic IBRS was added in
7808 * leaf 0x21. So we also check that.
7809 */
7810 bzero(&cp, sizeof (cp));
7811 cp.cp_eax = CPUID_LEAF_EXT_8;
7812 (void) __cpuid_insn(&cp);
7813 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7814 cpi->cpi_extd[8] = cp;
7815
7816 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21) {
7817 return;
7818 }
7819
7820 bzero(&cp, sizeof (cp));
7821 cp.cp_eax = CPUID_LEAF_EXT_21;
7822 (void) __cpuid_insn(&cp);
7823 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
7824 cpi->cpi_extd[0x21] = cp;
7825 } else {
7826 /*
7827 * Nothing to do here. Return an empty set which has already
7828 * been zeroed for us.
7829 */
7830 return;
7831 }
7832 cpuid_scan_security(cpu, fset);
7833 }
7834
7835 /* ARGSUSED */
7836 static int
7837 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7838 {
7839 uchar_t *fset;
7840 boolean_t first_pass = (boolean_t)arg1;
7841
7842 fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7843 if (first_pass && CPU->cpu_id != 0)
7844 return (0);
7845 if (!first_pass && CPU->cpu_id == 0)
7846 return (0);
7847 cpuid_pass_ucode(CPU, fset);
7848
7849 return (0);
7850 }
7851
7852 /*
7853 * After a microcode update where the version has changed, then we need to
7854 * rescan CPUID. To do this we check every CPU to make sure that they have the
7855 * same microcode. Then we perform a cross call to all such CPUs. It's the
7856 * caller's job to make sure that no one else can end up doing an update while
7857 * this is going on.
7858 *
7859 * We assume that the system is microcode capable if we're called.
7860 */
7861 void
7862 cpuid_post_ucodeadm(void)
7863 {
7864 uint32_t rev;
7865 int i;
7866 struct cpu *cpu;
7867 cpuset_t cpuset;
7868 void *argdata;
7869 uchar_t *f0;
7870
7871 argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7872
7873 mutex_enter(&cpu_lock);
7874 cpu = cpu_get(0);
7875 rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7876 CPUSET_ONLY(cpuset, 0);
7877 for (i = 1; i < max_ncpus; i++) {
7878 if ((cpu = cpu_get(i)) == NULL)
7879 continue;
7880
7881 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7882 panic("post microcode update CPU %d has differing "
7883 "microcode revision (%u) from CPU 0 (%u)",
7884 i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7885 }
7886 CPUSET_ADD(cpuset, i);
7887 }
7888
7889 /*
7890 * We do the cross calls in two passes. The first pass is only for the
7891 * boot CPU. The second pass is for all of the other CPUs. This allows
7892 * the boot CPU to go through and change behavior related to patching or
7893 * whether or not Enhanced IBRS needs to be enabled and then allow all
7894 * other CPUs to follow suit.
7895 */
7896 kpreempt_disable();
7897 xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7898 cpuid_post_ucodeadm_xc);
7899 xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7900 cpuid_post_ucodeadm_xc);
7901 kpreempt_enable();
7902
7903 /*
7904 * OK, now look at each CPU and see if their feature sets are equal.
7905 */
7906 f0 = argdata;
7907 for (i = 1; i < max_ncpus; i++) {
7908 uchar_t *fset;
7909 if (!CPU_IN_SET(cpuset, i))
7910 continue;
7911
7912 fset = (uchar_t *)((uintptr_t)argdata +
7913 sizeof (x86_featureset) * i);
7914
7915 if (!compare_x86_featureset(f0, fset)) {
7916 panic("Post microcode update CPU %d has "
7917 "differing security feature (%p) set from CPU 0 "
7918 "(%p), not appending to feature set", i,
7919 (void *)fset, (void *)f0);
7920 }
7921 }
7922
7923 mutex_exit(&cpu_lock);
7924
7925 for (i = 0; i < NUM_X86_FEATURES; i++) {
7926 cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7927 x86_feature_names[i]);
7928 if (is_x86_feature(f0, i)) {
7929 add_x86_feature(x86_featureset, i);
7930 }
7931 }
7932 kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7933 }
7934
7935 typedef void (*cpuid_pass_f)(cpu_t *, void *);
7936
7937 typedef struct cpuid_pass_def {
7938 cpuid_pass_t cpd_pass;
7939 cpuid_pass_f cpd_func;
7940 } cpuid_pass_def_t;
7941
7942 /*
7943 * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
7944 * normal sense and should not appear here.
7945 */
7946 static const cpuid_pass_def_t cpuid_pass_defs[] = {
7947 { CPUID_PASS_PRELUDE, cpuid_pass_prelude },
7948 { CPUID_PASS_IDENT, cpuid_pass_ident },
7949 { CPUID_PASS_BASIC, cpuid_pass_basic },
7950 { CPUID_PASS_EXTENDED, cpuid_pass_extended },
7951 { CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
7952 { CPUID_PASS_RESOLVE, cpuid_pass_resolve },
7953 };
7954
7955 void
7956 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
7957 {
7958 VERIFY3S(pass, !=, CPUID_PASS_NONE);
7959
7960 if (cp == NULL)
7961 cp = CPU;
7962
7963 /*
7964 * Space statically allocated for BSP, ensure pointer is set
7965 */
7966 if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
7967 cp->cpu_m.mcpu_cpi = &cpuid_info0;
7968
7969 ASSERT(cpuid_checkpass(cp, pass - 1));
7970
7971 for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
7972 if (cpuid_pass_defs[i].cpd_pass == pass) {
7973 cpuid_pass_defs[i].cpd_func(cp, arg);
7974 cp->cpu_m.mcpu_cpi->cpi_pass = pass;
7975 return;
7976 }
7977 }
7978
7979 panic("unable to execute invalid cpuid pass %d on cpu%d\n",
7980 pass, cp->cpu_id);
7981 }
7982
7983 /*
7984 * Extract the processor family from a chiprev. Processor families are not the
7985 * same as cpuid families; see comments above and in x86_archext.h.
7986 */
7987 x86_processor_family_t
7988 chiprev_family(const x86_chiprev_t cr)
7989 {
7990 return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
7991 }
7992
7993 /*
7994 * A chiprev matches its template if the vendor and family are identical and the
7995 * revision of the chiprev matches one of the bits set in the template. Callers
7996 * may bitwise-OR together chiprevs of the same vendor and family to form the
7997 * template, or use the _ANY variant. It is not possible to match chiprevs of
7998 * multiple vendors or processor families with a single call. Note that this
7999 * function operates on processor families, not cpuid families.
8000 */
8001 boolean_t
8002 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
8003 {
8004 return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
8005 _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
8006 (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
8007 }
8008
8009 /*
8010 * A chiprev is at least min if the vendor and family are identical and the
8011 * revision of the chiprev is at least as recent as that of min. Processor
8012 * families are considered unordered and cannot be compared using this function.
8013 * Note that this function operates on processor families, not cpuid families.
8014 * Use of the _ANY chiprev variant with this function is not useful; it will
8015 * always return B_FALSE if the _ANY variant is supplied as the minimum
8016 * revision. To determine only whether a chiprev is of a given processor
8017 * family, test the return value of chiprev_family() instead.
8018 */
8019 boolean_t
8020 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
8021 {
8022 return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
8023 _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
8024 _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
8025 }
8026
8027 /*
8028 * The uarch functions operate in a manner similar to the chiprev functions
8029 * above. While it is tempting to allow these to operate on microarchitectures
8030 * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
8031 * than ZEN2), we elect not to do so because a manufacturer may supply
8032 * processors of multiple different microarchitecture families each of which may
8033 * be internally ordered but unordered with respect to those of other families.
8034 */
8035 x86_uarch_t
8036 uarchrev_uarch(const x86_uarchrev_t ur)
8037 {
8038 return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
8039 }
8040
8041 boolean_t
8042 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
8043 {
8044 return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
8045 _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
8046 (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
8047 }
8048
8049 boolean_t
8050 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
8051 {
8052 return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
8053 _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
8054 _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
8055 }