1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 * Copyright 2020 Joyent, Inc.
27 * Copyright 2023 Oxide Computer Company
28 * Copyright 2024 MNX Cloud, Inc.
29 */
30 /*
31 * Copyright (c) 2010, Intel Corporation.
32 * All rights reserved.
33 */
34 /*
35 * Portions Copyright 2009 Advanced Micro Devices, Inc.
36 */
37
38 /*
39 * CPU Identification logic
40 *
41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42 * with the identification of CPUs, their features, and their topologies. More
43 * specifically, this file helps drive the following:
44 *
45 * 1. Enumeration of features of the processor which are used by the kernel to
46 * determine what features to enable or disable. These may be instruction set
47 * enhancements or features that we use.
48 *
49 * 2. Enumeration of instruction set architecture (ISA) additions that userland
50 * will be told about through the auxiliary vector.
51 *
52 * 3. Understanding the physical topology of the CPU such as the number of
 * caches, how many cores it has, whether or not it supports simultaneous
 * multi-threading (SMT), etc.
55 *
56 * ------------------------
57 * CPUID History and Basics
58 * ------------------------
59 *
60 * The cpuid instruction was added by Intel roughly around the time that the
 * original Pentium was introduced. The purpose of cpuid was to provide, in a
 * programmatic fashion, information about the CPU that previously had to be
 * guessed at. For example, an important part of cpuid is that we can know what
 * extensions to the ISA exist. Using an invalid opcode raises a #UD, so cpuid
 * allows a program (whether a user program or the kernel) to determine what
 * exists without crashing or getting a SIGILL. Of course,
67 * this was also during the era of the clones and the AMD Am5x86. The vendor
68 * name shows up first in cpuid for a reason.
69 *
 * cpuid information is organized around an index called a 'leaf'. Each leaf
 * puts unique values into the registers %eax, %ebx, %ecx, and %edx, and each
 * leaf has its own meaning. The leaves are broken down into different regions:
73 *
74 * [ 0, 7fffffff ] This region is called the 'basic'
75 * region. This region is generally defined
76 * by Intel, though some of the original
77 * portions have different meanings based
78 * on the manufacturer. These days, Intel
79 * adds most new features to this region.
80 * AMD adds non-Intel compatible
81 * information in the third, extended
82 * region. Intel uses this for everything
83 * including ISA extensions, CPU
84 * features, cache information, topology,
85 * and more.
86 *
87 * There is a hole carved out of this
88 * region which is reserved for
89 * hypervisors.
90 *
91 * [ 40000000, 4fffffff ] This region, which is found in the
92 * middle of the previous region, is
93 * explicitly promised to never be used by
94 * CPUs. Instead, it is used by hypervisors
95 * to communicate information about
96 * themselves to the operating system. The
97 * values and details are unique for each
98 * hypervisor.
99 *
100 * [ 80000000, ffffffff ] This region is called the 'extended'
101 * region. Some of the low leaves mirror
102 * parts of the basic leaves. This region
103 * has generally been used by AMD for
104 * various extensions. For example, AMD-
105 * specific information about caches,
106 * features, and topology are found in this
107 * region.
108 *
 * To read a leaf, you place the desired leaf into %eax, zero %ebx, %ecx, and
 * %edx, and then issue the cpuid instruction. At the first leaf in each of the
 * regions, one of the primary things returned is the maximum valid leaf in
 * that region. This allows for discovery of which range of CPUID is valid.
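 *
 * As a rough illustration only (this is not the interface that this file
 * actually uses), reading a leaf from C might look like the following, where
 * cpuid_raw() is a hypothetical helper built on GNU C inline assembly:
 *
 *     static inline void
 *     cpuid_raw(uint32_t leaf, uint32_t subleaf, uint32_t regs[4])
 *     {
 *             __asm__ __volatile__("cpuid"
 *                 : "=a" (regs[0]), "=b" (regs[1]),
 *                   "=c" (regs[2]), "=d" (regs[3])
 *                 : "a" (leaf), "c" (subleaf));
 *     }
 *
 * With that helper, cpuid_raw(0, 0, r) leaves the maximum basic leaf in r[0],
 * and cpuid_raw(0x80000000, 0, r) leaves the maximum extended leaf in r[0].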
113 *
114 * The CPUs have potentially surprising behavior when using an invalid leaf or
115 * unimplemented leaf. If the requested leaf is within the valid basic or
116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117 * set to zero. However, if you specify a leaf that is outside of a valid range,
118 * then instead it will be filled with the last valid _basic_ leaf. For example,
119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120 * an invalid extended leaf will return the information for leaf 3.
121 *
122 * Some leaves are broken down into sub-leaves. This means that the value
123 * depends on both the leaf asked for in %eax and a secondary register. For
124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125 * additional information. Or when getting topology information in leaf 0xb, the
126 * initial value in %ecx changes which level of the topology that you are
127 * getting information about.
128 *
 * cpuid values are always kept to 32 bits regardless of whether or not the
 * program is in 64-bit mode. When executing in 64-bit mode, the upper 32 bits
 * of each register are always set to zero so that the values are the same
 * regardless of execution mode.
133 *
134 * ----------------------
135 * Identifying Processors
136 * ----------------------
137 *
138 * We can identify a processor in two steps. The first step looks at cpuid leaf
139 * 0. Leaf 0 contains the processor's vendor information. This is done by
 * putting a 12 character string in %ebx, %edx, and %ecx (read in that
 * order). On AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
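 *
 * As a minimal sketch, reusing the hypothetical cpuid_raw() helper from
 * above, the vendor string could be assembled by copying %ebx, %edx, and
 * %ecx in that order:
 *
 *     char vendor[13];
 *     uint32_t r[4];
 *
 *     cpuid_raw(0, 0, r);
 *     bcopy(&r[1], &vendor[0], 4);
 *     bcopy(&r[3], &vendor[4], 4);
 *     bcopy(&r[2], &vendor[8], 4);
 *     vendor[12] = '\0';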
142 *
143 * From there, a processor is identified by a combination of three different
144 * values:
145 *
146 * 1. Family
147 * 2. Model
148 * 3. Stepping
149 *
150 * Each vendor uses the family and model to uniquely identify a processor. The
151 * way that family and model are changed depends on the vendor. For example,
 * Intel has been using family 0x6 for almost all of their processors since the
153 * Pentium Pro/Pentium II era, often called the P6. The model is used to
154 * identify the exact processor. Different models are often used for the client
155 * (consumer) and server parts. Even though each processor often has major
156 * architectural differences, they still are considered the same family by
157 * Intel.
158 *
 * On the other hand, each major AMD architecture generally has its own family.
 * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within each
 * family, the model is used to help identify specific processors. As AMD's
162 * product lines have expanded, they have started putting a mixed bag of
163 * processors into the same family, with each processor under a single
164 * identifying banner (e.g., Milan, Cezanne) using a range of model numbers. We
165 * refer to each such collection as a processor family, distinct from cpuid
166 * family. Importantly, each processor family has a BIOS and Kernel Developer's
167 * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168 * defines the processor family's non-architectural features. In general, we'll
169 * use "family" here to mean the family number reported by the cpuid instruction
170 * and distinguish the processor family from it where appropriate.
171 *
172 * The stepping is used to refer to a revision of a specific microprocessor. The
173 * term comes from equipment used to produce masks that are used to create
174 * integrated circuits.
175 *
 * The information is present in leaf 1, %eax. In technical documentation you
 * will see the terms extended model and extended family. The original family,
 * model, and stepping fields were each 4 bits wide. If the base family is 0xf,
 * then one is to consult the extended family and extended model fields, which
 * occupy previously reserved bits: the extended family is added to the base
 * family, and the extended model forms the upper bits of the model.
182 *
183 * When we process this information, we store the full family, model, and
184 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185 * cpi_step, respectively. Whenever you are performing comparisons with the
186 * family, model, and stepping, you should use these members and not the raw
187 * values from cpuid. If you must use the raw values from cpuid directly, you
188 * must make sure that you add the extended model and family to the base model
189 * and family.
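 *
 * As a sketch of the standard widening computation (vendor documentation is
 * authoritative; note that Intel also applies the extended model when the
 * base family is 0x6, while AMD does so only for family 0xf), with eax
 * holding the value read from leaf 1:
 *
 *     uint_t base_family = (eax >> 8) & 0xf;
 *     uint_t base_model = (eax >> 4) & 0xf;
 *     uint_t stepping = eax & 0xf;
 *     uint_t ext_family = (eax >> 20) & 0xff;
 *     uint_t ext_model = (eax >> 16) & 0xf;
 *     uint_t family = base_family, model = base_model;
 *
 *     if (base_family == 0xf)
 *             family += ext_family;
 *     if (base_family == 0xf || base_family == 0x6)
 *             model |= ext_model << 4;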
190 *
191 * In general, we do not use information about the family, model, and stepping
192 * to determine whether or not a feature is present; that is generally driven by
 * specific leaves. However, when something we care about on the processor is
 * not considered 'architectural', meaning that it is specific to a set of
 * processors and not promised in the architecture model to be consistent from
 * generation to generation, then we will fall back on this information. The
 * most common cases where this comes up are when we have to work around errata
 * in the processor, are dealing with processor-specific features such as CPU
 * performance counters, or we want to provide additional information for things
 * such as fault management.
201 *
202 * While processors also do have a brand string, which is the name that people
 * are familiar with when buying the processor, it is not meant for
204 * programmatic consumption. That is what the family, model, and stepping are
205 * for.
206 *
207 * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208 * and stepping(s) that refer to a single or very closely related set of silicon
209 * implementations; while there are sometimes more specific ways to learn of the
210 * presence or absence of a particular erratum or workaround, one may generally
211 * assume that all processors of the same chiprev have the same errata and we
212 * have chosen to represent them this way precisely because that is how AMD
213 * groups them in their revision guides (errata documentation). The processor
214 * family (x86_processor_family_t) may be extracted from the chiprev if that
215 * level of detail is not needed. Processor families are considered unordered
216 * but revisions within a family may be compared for either an exact match or at
217 * least as recent as a reference revision. See the chiprev_xxx() functions
218 * below.
219 *
220 * Similarly, each processor family implements a particular microarchitecture,
221 * which itself may have multiple revisions. In general, non-architectural
222 * features are specific to a processor family, but some may exist across
223 * families containing cores that implement the same microarchitectural revision
224 * (and, such cores share common bugs, too). We provide utility routines
225 * analogous to those for extracting and comparing chiprevs for
226 * microarchitectures as well; see the uarch_xxx() functions.
227 *
228 * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229 * present used and available only for AMD and AMD-like processors.
230 *
231 * ------------
232 * CPUID Passes
233 * ------------
234 *
235 * As part of performing feature detection, we break this into several different
236 * passes. There used to be a pass 0 that was done from assembly in locore.s to
237 * support processors that have a missing or broken cpuid instruction (notably
238 * certain Cyrix processors) but those were all 32-bit processors which are no
239 * longer supported. Passes are no longer numbered explicitly to make it easier
240 * to break them up or move them around as needed; however, they still have a
241 * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242 * x86_archext.h. The external interface to execute a cpuid pass or determine
243 * whether a pass has been completed consists of cpuid_execpass() and
244 * cpuid_checkpass() respectively. The passes now, in that execution order,
245 * are as follows:
246 *
247 * PRELUDE This pass does not have any dependencies on system
248 * setup; in particular, unlike all subsequent passes it is
249 * guaranteed not to require PCI config space access. It
250 * sets the flag indicating that the processor we are
251 * running on supports the cpuid instruction, which all
252 * 64-bit processors do. This would also be the place to
253 * add any other basic state that is required later on and
254 * can be learned without dependencies.
255 *
256 * IDENT Determine which vendor manufactured the CPU, the family,
257 * model, and stepping information, and compute basic
258 * identifying tags from those values. This is done first
259 * so that machine-dependent code can control the features
260 * the cpuid instruction will report during subsequent
261 * passes if needed, and so that any intervening
262 * machine-dependent code that needs basic identity will
263 * have it available. This includes synthesised
264 * identifiers such as chiprev and uarchrev as well as the
265 * values obtained directly from cpuid. Prior to executing
 * this pass, machine-dependent boot code is responsible for
267 * ensuring that the PCI configuration space access
268 * functions have been set up and, if necessary, that
269 * determine_platform() has been called.
270 *
271 * BASIC This is the primary pass and is responsible for doing a
272 * large number of different things:
273 *
 * 1. Gathering a large number of feature flags to
 * determine which features the CPU supports and which
 * require other work in the OS to enable. Features
 * detected this way are added to the x86_featureset,
 * which can be queried to determine what we should do.
 * This includes processing all of the basic and extended
 * CPU features that we care about.
282 *
283 * 2. Determining the CPU's topology. This includes
284 * information about how many cores and threads are present
285 * in the package. It also is responsible for figuring out
286 * which logical CPUs are potentially part of the same core
287 * and what other resources they might share. For more
288 * information see the 'Topology' section.
289 *
 * 3. Determining the set of CPU security-specific features
 * that we need to worry about and determining the
 * appropriate set of workarounds.
293 *
 * This pass on the boot CPU occurs before KMDB is started.
295 *
 * EXTENDED This pass is done after startup(). Here, we check
297 * other miscellaneous features. Most of this is gathering
298 * additional basic and extended features that we'll use in
299 * later passes or for debugging support.
300 *
 * DYNAMIC This pass occurs after the kernel memory allocator
 * has been fully initialized. This gathers information
 * for which we might need dynamic memory available for
 * our use. This includes several varying width leaves that
 * have cache information and the processor's brand string.
306 *
 * RESOLVE The final normal pass is performed after the
308 * kernel has brought most everything online. This is
309 * invoked from post_startup(). In this pass, we go through
310 * the set of features that we have enabled and turn that
311 * into the hardware auxiliary vector features that
312 * userland receives. This is used by userland, primarily
313 * by the run-time link-editor (RTLD), though userland
314 * software could also refer to it directly.
315 *
316 * The function that performs a pass is currently assumed to be infallible, and
 * all existing implementations are. This simplifies callers by allowing
318 * cpuid_execpass() to return void. Similarly, implementers do not need to check
319 * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320 * Both of these assumptions can be relaxed if needed by future developments.
321 * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322 * error to attempt to execute a pass before all previous passes have been
323 * completed on the specified CPU, or to request cpuid information before the
324 * pass that captures it has been executed. These conditions can be tested
325 * using cpuid_checkpass().
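 *
 * As an illustrative sketch only (the enumerator name shown is assumed from
 * the pass names above), a consumer of data captured by the BASIC pass might
 * guard itself like this:
 *
 *     ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));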
326 *
327 * The Microcode Pass
328 *
329 * After a microcode update, we do a selective rescan of the cpuid leaves to
330 * determine what features have changed. Microcode updates can provide more
 * details about security-related features to deal with issues like Spectre and
 * L1TF. On occasion, vendors have violated their contract and removed bits.
 * However, we don't try to detect that because that puts us in a situation that
 * we really can't deal with. As such, the only things we rescan today are
 * security-related features. See cpuid_pass_ucode(). This pass may be run in a
 * different sequence on APs and therefore is not part of the sequential order;
 * it is invoked directly instead of by cpuid_execpass() and its completion
338 * status cannot be checked by cpuid_checkpass(). This could be integrated with
339 * a more complex dependency mechanism if warranted by future developments.
340 *
341 * All of the passes are run on all CPUs. However, for the most part we only
342 * care about what the boot CPU says about this information and use the other
343 * CPUs as a rough guide to sanity check that we have the same feature set.
344 *
 * We do not support running multiple logical CPUs with different, let alone
 * disjoint, feature sets.
347 *
348 * ------------------
349 * Processor Topology
350 * ------------------
351 *
352 * One of the important things that we need to do is to understand the topology
353 * of the underlying processor. When we say topology in this case, we're trying
354 * to understand the relationship between the logical CPUs that the operating
355 * system sees and the underlying physical layout. Different logical CPUs may
356 * share different resources which can have important consequences for the
357 * performance of the system. For example, they may share caches, execution
358 * units, and more.
359 *
360 * The topology of the processor changes from generation to generation and
361 * vendor to vendor. Along with that, different vendors use different
362 * terminology, and the operating system itself uses occasionally overlapping
363 * terminology. It's important to understand what this topology looks like so
364 * one can understand the different things that we try to calculate and
365 * determine.
366 *
367 * To get started, let's talk about a little bit of terminology that we've used
368 * so far, is used throughout this file, and is fairly generic across multiple
369 * vendors:
370 *
371 * CPU
372 * A central processing unit (CPU) refers to a logical and/or virtual
373 * entity that the operating system can execute instructions on. The
374 * underlying resources for this CPU may be shared between multiple
375 * entities; however, to the operating system it is a discrete unit.
376 *
377 * PROCESSOR and PACKAGE
378 *
379 * Generally, when we use the term 'processor' on its own, we are referring
380 * to the physical entity that one buys and plugs into a board. However,
381 * because processor has been overloaded and one might see it used to mean
382 * multiple different levels, we will instead use the term 'package' for
383 * the rest of this file. The term package comes from the electrical
384 * engineering side and refers to the physical entity that encloses the
 * electronics inside. Strictly speaking, the package can contain more than
 * just the CPU; for example, on many processors it may also have what's
 * called an 'integrated graphics processing unit (GPU)'. Because the
388 * package can encapsulate multiple units, it is the largest physical unit
389 * that we refer to.
390 *
391 * SOCKET
392 *
 * A socket refers to a unit on a system board (generally the motherboard)
394 * that can receive a package. A single package, or processor, is plugged
 * into a single socket. A system may have multiple sockets. Oftentimes,
 * the term socket is used interchangeably with package and refers to the
 * electrical component that has been plugged in, and not the receptacle itself.
398 *
399 * CORE
400 *
401 * A core refers to the physical instantiation of a CPU, generally, with a
402 * full set of hardware resources available to it. A package may contain
403 * multiple cores inside of it or it may just have a single one. A
404 * processor with more than one core is often referred to as 'multi-core'.
405 * In illumos, we will use the feature X86FSET_CMP to refer to a system
406 * that has 'multi-core' processors.
407 *
408 * A core may expose a single logical CPU to the operating system, or it
409 * may expose multiple CPUs, which we call threads, defined below.
410 *
411 * Some resources may still be shared by cores in the same package. For
412 * example, many processors will share the level 3 cache between cores.
413 * Some AMD generations share hardware resources between cores. For more
414 * information on that see the section 'AMD Topology'.
415 *
416 * THREAD and STRAND
417 *
 * In this file, generally a thread refers to a hardware resource and not
419 * the operating system's logical abstraction. A thread is always exposed
420 * as an independent logical CPU to the operating system. A thread belongs
421 * to a specific core. A core may have more than one thread. When that is
422 * the case, the threads that are part of the same core are often referred
423 * to as 'siblings'.
424 *
425 * When multiple threads exist, this is generally referred to as
426 * simultaneous multi-threading (SMT). When Intel introduced this in their
427 * processors they called it hyper-threading (HT). When multiple threads
428 * are active in a core, they split the resources of the core. For example,
429 * two threads may share the same set of hardware execution units.
430 *
431 * The operating system often uses the term 'strand' to refer to a thread.
432 * This helps disambiguate it from the software concept.
433 *
434 * CHIP
435 *
436 * Unfortunately, the term 'chip' is dramatically overloaded. At its most
437 * base meaning, it is used to refer to a single integrated circuit, which
438 * may or may not be the only thing in the package. In illumos, when you
439 * see the term 'chip' it is almost always referring to the same thing as
440 * the 'package'. However, many vendors may use chip to refer to one of
441 * many integrated circuits that have been placed in the package. As an
442 * example, see the subsequent definition.
443 *
444 * To try and keep things consistent, we will only use chip when referring
445 * to the entire integrated circuit package, with the exception of the
446 * definition of multi-chip module (because it is in the name) and use the
447 * term 'die' when we want the more general, potential sub-component
448 * definition.
449 *
450 * DIE
451 *
452 * A die refers to an integrated circuit. Inside of the package there may
453 * be a single die or multiple dies. This is sometimes called a 'chip' in
454 * vendor's parlance, but in this file, we use the term die to refer to a
455 * subcomponent.
456 *
457 * MULTI-CHIP MODULE
458 *
459 * A multi-chip module (MCM) refers to putting multiple distinct chips that
460 * are connected together in the same package. When a multi-chip design is
461 * used, generally each chip is manufactured independently and then joined
462 * together in the package. For example, on AMD's Zen microarchitecture
463 * (family 0x17), the package contains several dies (the second meaning of
464 * chip from above) that are connected together.
465 *
466 * CACHE
467 *
468 * A cache is a part of the processor that maintains copies of recently
469 * accessed memory. Caches are split into levels and then into types.
470 * Commonly there are one to three levels, called level one, two, and
471 * three. The lower the level, the smaller it is, the closer it is to the
472 * execution units of the CPU, and the faster it is to access. The layout
473 * and design of the cache come in many different flavors, consult other
474 * resources for a discussion of those.
475 *
476 * Caches are generally split into two types, the instruction and data
477 * cache. The caches contain what their names suggest, the instruction
478 * cache has executable program text, while the data cache has all other
479 * memory that the processor accesses. As of this writing, data is kept
480 * coherent between all of the caches on x86, so if one modifies program
481 * text before it is executed, that will be in the data cache, and the
482 * instruction cache will be synchronized with that change when the
483 * processor actually executes those instructions. This coherency also
484 * covers the fact that data could show up in multiple caches.
485 *
486 * Generally, the lowest level caches are specific to a core. However, the
 * last level cache is shared between some number of cores. The number of
488 * CPUs sharing this last level cache is important. This has implications
489 * for the choices that the scheduler makes, as accessing memory that might
490 * be in a remote cache after thread migration can be quite expensive.
491 *
492 * Sometimes, the word cache is abbreviated with a '$', because in US
493 * English the word cache is pronounced the same as cash. So L1D$ refers to
494 * the L1 data cache, and L2$ would be the L2 cache. This will not be used
495 * in the rest of this theory statement for clarity.
496 *
497 * MEMORY CONTROLLER
498 *
499 * The memory controller is a component that provides access to DRAM. Each
500 * memory controller can access a set number of DRAM channels. Each channel
501 * can have a number of DIMMs (sticks of memory) associated with it. A
502 * given package may have more than one memory controller. The association
503 * of the memory controller to a group of cores is important as it is
504 * cheaper to access memory on the controller that you are associated with.
505 *
506 * NUMA
507 *
 * NUMA, or non-uniform memory access, describes a way that systems are
 * built. On x86, any processor core can address all of the memory in the
 * system. However, when using multiple sockets or possibly within a
511 * multi-chip module, some of that memory is physically closer and some of
512 * it is further. Memory that is further away is more expensive to access.
513 * Consider the following image of multiple sockets with memory:
514 *
515 * +--------+ +--------+
516 * | DIMM A | +----------+ +----------+ | DIMM D |
517 * +--------+-+ | | | | +-+------+-+
518 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
519 * +--------+-+ | | | | +-+------+-+
520 * | DIMM C | +----------+ +----------+ | DIMM F |
521 * +--------+ +--------+
522 *
523 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
524 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to
525 * access DIMMs A-C and more expensive to access D-F as it has to go
526 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
527 * D-F are cheaper than A-C. While the socket form is the most common, when
528 * using multi-chip modules, this can also sometimes occur. For another
529 * example of this that's more involved, see the AMD topology section.
530 *
531 *
532 * Intel Topology
533 * --------------
534 *
 * Most Intel processors since Nehalem (as of this writing the current gen
536 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
537 * the package is a single monolithic die. MCMs currently aren't used. Most
538 * parts have three levels of caches, with the L3 cache being shared between
539 * all of the cores on the package. The L1/L2 cache is generally specific to
540 * an individual core. The following image shows at a simplified level what
541 * this looks like. The memory controller is commonly part of something called
 * the 'Uncore', which used to be a separate physical chip that was not a part
 * of the package, but is now part of the same chip.
544 *
545 * +-----------------------------------------------------------------------+
546 * | Package |
547 * | +-------------------+ +-------------------+ +-------------------+ |
548 * | | Core | | Core | | Core | |
549 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
550 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | |
551 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | |
552 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | |
553 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | |
554 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
555 * | | +--------------+ | | +--------------+ | | +--------------+ | |
556 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | |
557 * | | +--------------+ | | +--------------+ | | +--------------+ | |
558 * | +-------------------+ +-------------------+ +-------------------+ |
559 * | +-------------------------------------------------------------------+ |
560 * | | Shared L3 Cache | |
561 * | +-------------------------------------------------------------------+ |
562 * | +-------------------------------------------------------------------+ |
563 * | | Memory Controller | |
564 * | +-------------------------------------------------------------------+ |
565 * +-----------------------------------------------------------------------+
566 *
 * A side effect of this current architecture is that what we care about from a
 * scheduling and topology perspective is simplified. In general, we care about
569 * understanding which logical CPUs are part of the same core and socket.
570 *
571 * To determine the relationship between threads and cores, Intel initially used
572 * the identifier in the advanced programmable interrupt controller (APIC). They
573 * also added cpuid leaf 4 to give additional information about the number of
574 * threads and CPUs in the processor. With the addition of x2apic (which
 * increased the width of the APIC ID, and thus the number of addressable
 * logical CPUs, from 8 bits to 32 bits), an
576 * additional cpuid topology leaf 0xB was added.
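 *
 * As a hedged sketch of how leaf 0xB is commonly consumed (field layout as
 * documented by Intel; cpuid_raw() is the hypothetical helper sketched
 * earlier, not anything this file uses), one walks the %ecx sub-leaves until
 * the level type in %ecx[15:8] reads zero, recording the APIC ID shift widths
 * for the SMT (type 1) and core (type 2) levels:
 *
 *     uint32_t r[4];
 *     uint_t level, type, smt_shift = 0, core_shift = 0;
 *
 *     for (level = 0; ; level++) {
 *             cpuid_raw(0xb, level, r);
 *             type = (r[2] >> 8) & 0xff;
 *             if (type == 0)
 *                     break;
 *             if (type == 1)
 *                     smt_shift = r[0] & 0x1f;
 *             else if (type == 2)
 *                     core_shift = r[0] & 0x1f;
 *     }
 *
 * Shifting a logical CPU's x2APIC ID (returned in %edx of each sub-leaf)
 * right by these widths yields identifiers for its core and package,
 * respectively.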
577 *
578 * AMD Topology
579 * ------------
580 *
581 * When discussing AMD topology, we want to break this into three distinct
582 * generations of topology. There's the basic topology that has been used in
583 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
584 * with family 0x15 (Bulldozer), and there's the topology that was introduced
585 * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
 * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
587 * additional terminology that's worth talking about.
588 *
589 * Until the introduction of family 0x17 (Zen), AMD did not implement something
590 * that they considered SMT. Whether or not the AMD processors have SMT
591 * influences many things including scheduling and reliability, availability,
592 * and serviceability (RAS) features.
593 *
594 * NODE
595 *
596 * AMD uses the term node to refer to a die that contains a number of cores
597 * and I/O resources. Depending on the processor family and model, more
598 * than one node can be present in the package. When there is more than one
599 * node this indicates a multi-chip module. Usually each node has its own
600 * access to memory and I/O devices. This is important and generally
601 * different from the corresponding Intel Nehalem-Skylake+ processors. As a
602 * result, we track this relationship in the operating system.
603 *
604 * In processors with an L3 cache, the L3 cache is generally shared across
605 * the entire node, though the way this is carved up varies from generation
606 * to generation.
607 *
608 * BULLDOZER
609 *
610 * Starting with the Bulldozer family (0x15) and continuing until the
611 * introduction of the Zen microarchitecture, AMD introduced the idea of a
612 * compute unit. In a compute unit, two traditional cores share a number of
613 * hardware resources. Critically, they share the FPU, L1 instruction
614 * cache, and the L2 cache. Several compute units were then combined inside
615 * of a single node. Because the integer execution units, L1 data cache,
616 * and some other resources were not shared between the cores, AMD never
617 * considered this to be SMT.
618 *
619 * ZEN
620 *
 * The Zen family (0x17) uses a multi-chip module (MCM) design; the module
622 * is called Zeppelin. These modules are similar to the idea of nodes used
623 * previously. Each of these nodes has two DRAM channels which all of the
624 * cores in the node can access uniformly. These nodes are linked together
625 * in the package, creating a NUMA environment.
626 *
627 * The Zeppelin die itself contains two different 'core complexes'. Each
628 * core complex consists of four cores which each have two threads, for a
629 * total of 8 logical CPUs per complex. Unlike other generations,
630 * where all the logical CPUs in a given node share the L3 cache, here each
631 * core complex has its own shared L3 cache.
632 *
633 * A further thing that we need to consider is that in some configurations,
634 * particularly with the Threadripper line of processors, not every die
635 * actually has its memory controllers wired up to actual memory channels.
636 * This means that some cores have memory attached to them and others
637 * don't.
638 *
639 * To put Zen in perspective, consider the following images:
640 *
641 * +--------------------------------------------------------+
642 * | Core Complex |
643 * | +-------------------+ +-------------------+ +---+ |
644 * | | Core +----+ | | Core +----+ | | | |
645 * | | +--------+ | L2 | | | +--------+ | L2 | | | | |
646 * | | | Thread | +----+ | | | Thread | +----+ | | | |
647 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | |
648 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | |
649 * | | +--------+ +--+ | | +--------+ +--+ | | | |
650 * | +-------------------+ +-------------------+ | C | |
651 * | +-------------------+ +-------------------+ | a | |
652 * | | Core +----+ | | Core +----+ | | c | |
653 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | |
654 * | | | Thread | +----+ | | | Thread | +----+ | | e | |
655 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | |
656 * | | | Thread | |L1| | | | Thread | |L1| | | | |
657 * | | +--------+ +--+ | | +--------+ +--+ | | | |
658 * | +-------------------+ +-------------------+ +---+ |
659 * | |
660 * +--------------------------------------------------------+
661 *
662 * This first image represents a single Zen core complex that consists of four
663 * cores.
664 *
665 *
666 * +--------------------------------------------------------+
667 * | Zeppelin Die |
668 * | +--------------------------------------------------+ |
669 * | | I/O Units (PCIe, SATA, USB, etc.) | |
670 * | +--------------------------------------------------+ |
671 * | HH |
672 * | +-----------+ HH +-----------+ |
673 * | | | HH | | |
674 * | | Core |==========| Core | |
675 * | | Complex |==========| Complex | |
676 * | | | HH | | |
677 * | +-----------+ HH +-----------+ |
678 * | HH |
679 * | +--------------------------------------------------+ |
680 * | | Memory Controller | |
681 * | +--------------------------------------------------+ |
682 * | |
683 * +--------------------------------------------------------+
684 *
 * This image represents a single Zeppelin Die. Note how both core complexes
 * are connected to the same memory controller and I/O units. While each core
687 * complex has its own L3 cache as seen in the first image, they both have
688 * uniform access to memory.
689 *
690 *
691 * PP PP
692 * PP PP
693 * +----------PP---------------------PP---------+
694 * | PP PP |
695 * | +-----------+ +-----------+ |
696 * | | | | | |
697 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
698 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
699 * | | | | | |
700 * | +-----------+ooo ...+-----------+ |
701 * | HH ooo ... HH |
702 * | HH oo.. HH |
703 * | HH ..oo HH |
704 * | HH ... ooo HH |
705 * | +-----------+... ooo+-----------+ |
706 * | | | | | |
707 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
708 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
709 * | | | | | |
710 * | +-----------+ +-----------+ |
711 * | PP PP |
712 * +----------PP---------------------PP---------+
713 * PP PP
714 * PP PP
715 *
716 * This image represents a single Zen package. In this example, it has four
717 * Zeppelin dies, though some configurations only have a single one. In this
718 * example, each die is directly connected to the next. Also, each die is
719 * represented as being connected to memory by the 'M' character and connected
720 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
721 * die is made up of two core complexes, we have multiple different NUMA
722 * domains that we care about for these systems.
723 *
724 * ZEN 2
725 *
 * Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
 * each Zeppelin die contained its own I/O and memory controllers, in Zen 2
 * those have been moved out into a separate, central I/O die. The actual
 * core complex looks pretty similar, but now the die itself looks much
 * simpler:
730 *
731 * +--------------------------------------------------------+
732 * | Zen 2 Core Complex Die HH |
733 * | HH |
734 * | +-----------+ HH +-----------+ |
735 * | | | HH | | |
736 * | | Core |==========| Core | |
737 * | | Complex |==========| Complex | |
738 * | | | HH | | |
739 * | +-----------+ HH +-----------+ |
740 * | HH |
741 * | HH |
742 * +--------------------------------------------------------+
743 *
744 * From here, when we add the central I/O die, this changes things a bit.
745 * Each die is connected to the I/O die, rather than trying to interconnect
746 * them directly. The following image takes the same Zen 1 image that we
747 * had earlier and shows what it looks like with the I/O die instead:
748 *
749 * PP PP
750 * PP PP
751 * +---------------------PP----PP---------------------+
752 * | PP PP |
753 * | +-----------+ PP PP +-----------+ |
754 * | | | PP PP | | |
755 * | | Zen 2 | +-PP----PP-+ | Zen 2 | |
756 * | | Die _| | PP PP | |_ Die | |
757 * | | |o|oooo| |oooo|o| | |
758 * | +-----------+ | | +-----------+ |
759 * | | I/O | |
760 * MMMMMMMMMMMMMMMMMMMMMMMMMM Die MMMMMMMMMMMMMMMMMMMMMMMMMM
761 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM
762 * | | | |
763 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM
764 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM
765 * | | | |
766 * | +-----------+ | | +-----------+ |
767 * | | |o|oooo| PP PP |oooo|o| | |
768 * | | Zen 2 -| +-PP----PP-+ |- Zen 2 | |
769 * | | Die | PP PP | Die | |
770 * | | | PP PP | | |
771 * | +-----------+ PP PP +-----------+ |
772 * | PP PP |
773 * +---------------------PP----PP---------------------+
774 * PP PP
775 * PP PP
776 *
777 * The above has four core complex dies installed, though the Zen 2 EPYC
 * and Threadripper parts allow for up to eight, while the Ryzen parts
779 * generally only have one to two. The more notable difference here is how
780 * everything communicates. Note that memory and PCIe come out of the
781 * central die. This changes the way that one die accesses a resource. It
 * basically always has to go to the I/O die, whereas in Zen 1 it may have
783 * satisfied it locally. In general, this ends up being a better strategy
784 * for most things, though it is possible to still treat everything in four
785 * distinct NUMA domains with each Zen 2 die slightly closer to some memory
786 * and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
787 * now there is only one 'node' present.
788 *
789 * ZEN 3
790 *
791 * From an architectural perspective, Zen 3 is a much smaller change from
792 * Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
793 * its microarchitectural changes. The biggest thing for us is how the die
794 * changes. In Zen 1 and Zen 2, each core complex still had its own L3
795 * cache. However, in Zen 3, the L3 is now shared between the entire core
796 * complex die and is no longer partitioned between each core complex. This
797 * means that all cores on the die can share the same L3 cache. Otherwise,
798 * the general layout of the overall package with various core complexes
799 * and an I/O die stays the same. Here's what the Core Complex Die looks
800 * like in a bit more detail:
801 *
802 * +-------------------------------------------------+
803 * | Zen 3 Core Complex Die |
804 * | +-------------------+ +-------------------+ |
805 * | | Core +----+ | | Core +----+ | |
806 * | | +--------+ | L2 | | | +--------+ | L2 | | |
807 * | | | Thread | +----+ | | | Thread | +----+ | |
808 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
809 * | | | Thread | |L1| | | | Thread | |L1| | |
810 * | | +--------+ +--+ | | +--------+ +--+ | |
811 * | +-------------------+ +-------------------+ |
812 * | +-------------------+ +-------------------+ |
813 * | | Core +----+ | | Core +----+ | |
814 * | | +--------+ | L2 | | | +--------+ | L2 | | |
815 * | | | Thread | +----+ | | | Thread | +----+ | |
816 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
817 * | | | Thread | |L1| | | | Thread | |L1| | |
818 * | | +--------+ +--+ | | +--------+ +--+ | |
819 * | +-------------------+ +-------------------+ |
820 * | |
821 * | +--------------------------------------------+ |
822 * | | L3 Cache | |
823 * | +--------------------------------------------+ |
824 * | |
825 * | +-------------------+ +-------------------+ |
826 * | | Core +----+ | | Core +----+ | |
827 * | | +--------+ | L2 | | | +--------+ | L2 | | |
828 * | | | Thread | +----+ | | | Thread | +----+ | |
829 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
830 * | | | Thread | |L1| | | | Thread | |L1| | |
831 * | | +--------+ +--+ | | +--------+ +--+ | |
832 * | +-------------------+ +-------------------+ |
833 * | +-------------------+ +-------------------+ |
834 * | | Core +----+ | | Core +----+ | |
835 * | | +--------+ | L2 | | | +--------+ | L2 | | |
836 * | | | Thread | +----+ | | | Thread | +----+ | |
837 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
838 * | | | Thread | |L1| | | | Thread | |L1| | |
839 * | | +--------+ +--+ | | +--------+ +--+ | |
840 * | +-------------------+ +-------------------+ |
841 * +-------------------------------------------------+
842 *
843 * While it is not pictured, there are connections from the die to the
844 * broader data fabric and additional functional blocks to support that
845 * communication and coherency.
846 *
847 * CPUID LEAVES
848 *
849 * There are a few different CPUID leaves that we can use to try and understand
850 * the actual state of the world. As part of the introduction of family 0xf, AMD
851 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
852 * processors that are in the system. Because families before Zen didn't have
853 * SMT, this was always the number of cores that were in the system. However, it
854 * should always be thought of as the number of logical threads to be consistent
855 * between generations. In addition we also get the size of the APIC ID that is
856 * used to represent the number of logical processors. This is important for
857 * deriving topology information.
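 *
 * As a hedged sketch (field positions as I understand AMD's documentation;
 * cpuid_raw() is the hypothetical helper from earlier and the locals are
 * purely illustrative), both of the values described above come out of %ecx
 * of leaf 0x80000008:
 *
 *     cpuid_raw(0x80000008, 0, r);
 *     nthreads = (r[2] & 0xff) + 1;
 *     apic_id_width = (r[2] >> 12) & 0xf;
 *
 * When the APIC ID width field reads as zero, the width must instead be
 * derived from the thread count.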
858 *
859 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
860 * bit between Bulldozer and later families, but it is quite useful in
861 * determining the topology information. Because this information has changed
862 * across family generations, it's worth calling out what these mean
863 * explicitly. The registers have the following meanings:
864 *
865 * %eax The APIC ID. The entire register is defined to have a 32-bit
866 * APIC ID, even though on systems without x2apic support, it will
867 * be limited to 8 bits.
868 *
869 * %ebx On Bulldozer-era systems this contains information about the
870 * number of cores that are in a compute unit (cores that share
871 * resources). It also contains a per-package compute unit ID that
872 * identifies which compute unit the logical CPU is a part of.
873 *
874 * On Zen-era systems this instead contains the number of threads
875 * per core and the ID of the core that the logical CPU is a part
876 * of. Note, this ID is unique only to the package, it is not
877 * globally unique across the entire system.
878 *
879 * %ecx This contains the number of nodes that exist in the package. It
880 * also contains an ID that identifies which node the logical CPU
881 * is a part of.
882 *
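 * As a hedged sketch of the Zen-era layout described above (Bulldozer-era
 * parts report the compute unit ID and cores per compute unit in %ebx
 * instead; cpuid_raw() is the hypothetical helper from earlier and the
 * locals are illustrative):
 *
 *     cpuid_raw(0x8000001e, 0, r);
 *     apicid = r[0];
 *     coreid = r[1] & 0xff;
 *     threads_per_core = ((r[1] >> 8) & 0xff) + 1;
 *     nodeid = r[2] & 0xff;
 *     nodes_per_pkg = ((r[2] >> 8) & 0x7) + 1;
 *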
883 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
884 * cache layout to determine which logical CPUs are sharing which caches.
885 *
886 * illumos Topology
887 * ----------------
888 *
889 * Based on the above we synthesize the information into several different
890 * variables that we store in the 'struct cpuid_info'. We'll go into the details
891 * of what each member is supposed to represent and their uniqueness. In
892 * general, there are two levels of uniqueness that we care about. We care about
893 * an ID that is globally unique. That means that it will be unique across all
894 * entities in the system. For example, the default logical CPU ID is globally
895 * unique. On the other hand, there is some information that we only care about
896 * being unique within the context of a single package / socket. Here are the
897 * variables that we keep track of and their meaning.
898 *
 * Several of the values that represent an identifier, with the exception
 * of cpi_apicid, are allowed to be synthetic.
901 *
902 *
903 * cpi_apicid
904 *
 * This is the value of the CPU's APIC ID. This should be the full 32-bit
906 * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
907 * APIC ID. This value is globally unique between all logical CPUs across
908 * all packages. This is usually required by the APIC.
909 *
910 * cpi_chipid
911 *
912 * This value indicates the ID of the package that the logical CPU is a
913 * part of. This value is allowed to be synthetic. It is usually derived by
914 * taking the CPU's APIC ID and determining how many bits are used to
915 * represent CPU cores in the package. All logical CPUs that are part of
916 * the same package must have the same value.
917 *
918 * cpi_coreid
919 *
920 * This represents the ID of a CPU core. Two logical CPUs should only have
921 * the same cpi_coreid value if they are part of the same core. These
922 * values may be synthetic. On systems that support SMT, this value is
 * usually derived from the APIC ID; otherwise it is often synthetic and
924 * just set to the value of the cpu_id in the cpu_t.
925 *
926 * cpi_pkgcoreid
927 *
928 * This is similar to the cpi_coreid in that logical CPUs that are part of
929 * the same core should have the same ID. The main difference is that these
930 * values are only required to be unique to a given socket.
931 *
932 * cpi_clogid
933 *
934 * This represents the logical ID of a logical CPU. This value should be
935 * unique within a given socket for each logical CPU. This is allowed to be
 * synthetic, though it is usually based off of the CPU's APIC ID. The
 * broader system expects that logical CPUs that are part of the same
 * core have contiguous numbers. For example, if there were two threads per
 * core, then the two IDs divided by two should be the same, and the first
 * ID modulo two should be zero while the second should be one. Thus, IDs 4
 * and 5 indicate two logical CPUs that are part of the same core, while
 * IDs 5 and 6 represent two logical CPUs that are part of different cores.
943 *
944 * While it is common for the cpi_coreid and the cpi_clogid to be derived
945 * from the same source, strictly speaking, they don't have to be and the
946 * two values should be considered logically independent. One should not
947 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
948 * some kind of relationship. While this is tempting, we've seen cases on
949 * AMD family 0xf where the system's cpu id is not related to its APIC ID.
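 *
 * Returning to the contiguity expectation above, with two threads per
 * core the expected arithmetic for the example IDs is roughly:
 *
 *     4 / 2 == 5 / 2, 4 % 2 == 0, 5 % 2 == 1 (siblings of one core)
 *     5 / 2 != 6 / 2 (threads of different cores)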
950 *
951 * cpi_ncpu_per_chip
952 *
953 * This value indicates the total number of logical CPUs that exist in the
954 * physical package. Critically, this is not the number of logical CPUs
955 * that exist for just the single core.
956 *
957 * This value should be the same for all logical CPUs in the same package.
958 *
959 * cpi_ncore_per_chip
960 *
961 * This value indicates the total number of physical CPU cores that exist
962 * in the package. The system compares this value with cpi_ncpu_per_chip to
963 * determine if simultaneous multi-threading (SMT) is enabled. When
964 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
965 * the X86FSET_HTT feature is not set. If this value is greater than one,
 * then we consider the processor to have the feature X86FSET_CMP, which
 * indicates that there is support for more than one core (see the sketch
 * below).
968 *
969 * This value should be the same for all logical CPUs in the same package.
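 *
 * As a small sketch of the relationship just described (illustrative local
 * names, not the exact code in this file):
 *
 *     smt_present = cpi->cpi_ncpu_per_chip > cpi->cpi_ncore_per_chip;
 *     multi_core = cpi->cpi_ncore_per_chip > 1;
 *
 * where smt_present corresponds to X86FSET_HTT and multi_core to
 * X86FSET_CMP.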
970 *
971 * cpi_procnodes_per_pkg
972 *
 * This value indicates the number of 'nodes' that exist in the package.
 * When the processor is actually a multi-chip module, this represents the
 * number of constituent dies (nodes) in the package. Currently, on Intel
 * based systems this member is always set to 1.
977 *
978 * This value should be the same for all logical CPUs in the same package.
979 *
980 * cpi_procnodeid
981 *
982 * This value indicates the ID of the node that the logical CPU is a part
983 * of. All logical CPUs that are in the same node must have the same value
984 * here. This value must be unique across all of the packages in the
985 * system. On Intel based systems, this is currently set to the value in
986 * cpi_chipid because there is only one node.
987 *
988 * cpi_cores_per_compunit
989 *
990 * This value indicates the number of cores that are part of a compute
991 * unit. See the AMD topology section for this. This member only has real
992 * meaning currently for AMD Bulldozer family processors. For all other
993 * processors, this should currently be set to 1.
994 *
995 * cpi_compunitid
996 *
997 * This indicates the compute unit that the logical CPU belongs to. For
998 * processors without AMD Bulldozer-style compute units this should be set
999 * to the value of cpi_coreid.
1000 *
1001 * cpi_ncpu_shr_last_cache
1002 *
1003 * This indicates the number of logical CPUs that are sharing the same last
1004 * level cache. This value should be the same for all CPUs that are sharing
1005 * that cache. The last cache refers to the cache that is closest to memory
1006 * and furthest away from the CPU.
1007 *
1008 * cpi_last_lvl_cacheid
1009 *
1010 * This indicates the ID of the last cache that the logical CPU uses. This
1011 * cache is often shared between multiple logical CPUs and is the cache
1012 * that is closest to memory and furthest away from the CPU. This value
1013 * should be the same for a group of logical CPUs only if they actually
1014 * share the same last level cache. IDs should not overlap between
1015 * packages.
1016 *
1017 * cpi_ncore_bits
1018 *
1019 * This indicates the number of bits that are required to represent all of
1020 * the cores in the system. As cores are derived based on their APIC IDs,
1021 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1022 * this value to be larger than the actual number of IDs that are present
1023 * in the system. This is used to size tables by the CMI framework. It is
1024 * only filled in for Intel and AMD CPUs.
1025 *
1026 * cpi_nthread_bits
1027 *
1028 * This indicates the number of bits required to represent all of the IDs
1029 * that cover the logical CPUs that exist on a given core. It's OK for this
1030 * value to be larger than the actual number of IDs that are present in the
1031 * system. This is used to size tables by the CMI framework. It is
1032 * only filled in for Intel and AMD CPUs.
1033 *
1034 * -----------
1035 * Hypervisors
1036 * -----------
1037 *
1038 * If trying to manage the differences between vendors wasn't bad enough, it can
1039 * get worse thanks to our friend hardware virtualization. Hypervisors are given
1040 * the ability to interpose on all cpuid instructions and change them to suit
1041 * their purposes. In general, this is necessary as the hypervisor wants to be
1042 * able to present a more uniform set of features or not necessarily give the
1043 * guest operating system kernel knowledge of all features so it can be
1044 * more easily migrated between systems.
1045 *
 * When it comes to trying to determine topology information, this can be a
 * double-edged sword. When a hypervisor doesn't actually implement a cpuid
 * leaf, it'll often return all zeros. Because of that, you'll often see various
 * checks scattered about verifying that fields are non-zero before we assume
 * we can use them.
1051 *
1052 * When it comes to topology information, the hypervisor is often incentivized
1053 * to lie to you about topology. This is because it doesn't always actually
1054 * guarantee that topology at all. The topology path we take in the system
1055 * depends on how the CPU advertises itself. If it advertises itself as an Intel
 * or AMD CPU, then we basically do our normal path. However, when the
 * hypervisor doesn't advertise an actual vendor, we usually end up enumerating
 * multiple one-core CPUs that often appear to be on different sockets. The
 * actual behavior depends greatly on what the hypervisor actually exposes to
 * us.
1060 *
1061 * --------------------
1062 * Exposing Information
1063 * --------------------
1064 *
1065 * We expose CPUID information in three different forms in the system.
1066 *
1067 * The first is through the x86_featureset variable. This is used in conjunction
1068 * with the is_x86_feature() function. This is queried by x86-specific functions
1069 * to determine which features are or aren't present in the system and to make
1070 * decisions based upon them. For example, users of this include everything from
1071 * parts of the system dedicated to reliability, availability, and
1072 * serviceability (RAS), to making decisions about how to handle security
1073 * mitigations, to various x86-specific drivers. General purpose or
1074 * architecture independent drivers should never be calling this function.
1075 *
1076 * The second means is through the auxiliary vector. The auxiliary vector is a
1077 * series of tagged data that the kernel passes down to a user program when it
1078 * begins executing. This information is used to indicate to programs what
1079 * instruction set extensions are present. For example, information about the
1080 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1081 * since user programs cannot make use of it. However, things like the AVX
1082 * instruction sets are. Programs use this information to make run-time
1083 * decisions about what features they should use. As an example, the run-time
1084 * link-editor (rtld) can relocate different functions depending on the hardware
1085 * support available.
1086 *
1087 * The final form is through a series of accessor functions that all have the
1088 * form cpuid_get*. This is used by a number of different subsystems in the
1089 * kernel to determine more detailed information about what we're running on,
1090 * topology information, etc. Some of these subsystems include processor groups
 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1092 * microcode, and performance monitoring. These functions all ASSERT that the
1093 * CPU they're being called on has reached a certain cpuid pass. If the passes
1094 * are rearranged, then this needs to be adjusted.
1095 *
1096 * -----------------------------------------------
1097 * Speculative Execution CPU Side Channel Security
1098 * -----------------------------------------------
1099 *
1100 * With the advent of the Spectre and Meltdown attacks which exploit speculative
1101 * execution in the CPU to create side channels there have been a number of
1102 * different attacks and corresponding issues that the operating system needs to
1103 * mitigate against. The following list covers some of the common, but not
1104 * exhaustive, issues that we know about and for which we have either done
1105 * some work or still need to do more work in the system to mitigate:
1106 *
1107 * - Spectre v1
1108 * - swapgs (Spectre v1 variant)
1109 * - Spectre v2
1110 * - Meltdown (Spectre v3)
1111 * - Rogue Register Read (Spectre v3a)
1112 * - Speculative Store Bypass (Spectre v4)
1113 * - ret2spec, SpectreRSB
1114 * - L1 Terminal Fault (L1TF)
1115 * - Microarchitectural Data Sampling (MDS)
1116 * - Register File Data Sampling (RFDS)
1117 *
1118 * Each of these requires different sets of mitigations and has different attack
1119 * surfaces. For the most part, this discussion is about protecting the kernel
1120 * from non-kernel executing environments such as user processes and hardware
1121 * virtual machines. Unfortunately, there are a number of user vs. user
1122 * scenarios that exist with these. The rest of this section will describe the
1123 * overall approach that the system has taken to address these as well as their
1124 * shortcomings. Unfortunately, not all of the above have been handled today.
1125 *
1126 * SPECTRE v2, ret2spec, SpectreRSB
1127 *
1128 * The second variant of the spectre attack focuses on performing branch target
1129 * injection. This generally impacts indirect call instructions in the system.
1130 * There are four different ways to mitigate this issue that are commonly
1131 * described today:
1132 *
1133 * 1. Using Indirect Branch Restricted Speculation (IBRS).
1134 * 2. Using Retpolines and RSB Stuffing
1135 * 3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1136 * 4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1137 *
1138 * IBRS uses a feature added to microcode to restrict speculation, among other
1139 * things. This form of mitigation has not been used as it has been generally
1140 * seen as too expensive and requires reactivation upon various transitions in
1141 * the system.
1142 *
1143 * As a less impactful alternative to IBRS, retpolines were developed by
1144 * Google. These basically require one to replace indirect calls with a specific
1145 * trampoline that will cause speculation to fail and break the attack.
1146 * Retpolines require compiler support. We always build with retpolines in the
1147 * external thunk mode. This means that a traditional indirect call is replaced
1148 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1149 * of this is that all indirect function calls are performed through a register.
1150 *
1151 * We have to use a common external location for the thunk and not inline it
1152 * into the callsite so that we have a single place to patch these functions.
1153 * As it turns out, we currently have two different forms of retpolines that
1154 * exist in the system:
1155 *
1156 * 1. A full retpoline
1157 * 2. A no-op version
1158 *
1159 * The first one is used in the general case. Historically, there was an
1160 * AMD-specific optimized retpoline variant that was based around using a
1161 * serializing lfence instruction; however, in March 2022 it was announced that
1162 * this was actually still vulnerable to Spectre v2 and therefore we no longer
1163 * use it and it is no longer available in the system.
1164 *
1165 * The third mitigation listed above (eIBRS) is the most curious. It turns out
1166 * that the way retpolines are implemented relies on how speculation is
1167 * performed on a 'ret' instruction. Intel has continued to optimize this
1168 * process (which is partly why we need to have return stack buffer stuffing,
1169 * but more on that in a bit) and in processors starting with Cascade Lake
1170 * on the server side, it's dangerous to rely on retpolines. Instead, a new
1171 * mechanism has been introduced called Enhanced IBRS (eIBRS).
1172 *
1173 * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1174 * physical core. However, if this is the case, we don't want to use retpolines
1175 * any more. Therefore if eIBRS is present, we end up turning each retpoline
1176 * function (called a thunk) into a jmp instruction. This means that we're still
1177 * paying the cost of an extra jump to the external thunk, but it gives us
1178 * flexibility and the ability to have a single kernel image that works across a
1179 * wide variety of systems and hardware features.
1180 *
1181 * Unfortunately, this alone is insufficient. First, Skylake systems have
1182 * additional speculation for the Return Stack Buffer (RSB), which is used to
1183 * predict returns from call instructions and which retpolines take advantage
1184 * of. However, this problem is not just limited to Skylake and is actually more
1185 * pernicious. The SpectreRSB paper introduces several more problems that arise
1186 * when dealing with this. The RSB can be poisoned just like the indirect branch
1187 * predictor. This means that one needs to clear the RSB when transitioning
1188 * between two different privilege domains. Some examples include:
1189 *
1190 * - Switching between two different user processes
1191 * - Going between user land and the kernel
1192 * - Returning to the kernel from a hardware virtual machine
1193 *
1194 * Mitigating this involves combining a couple of different things. The first is
1195 * SMEP (supervisor mode execution protection) which was introduced in Ivy
1196 * Bridge. When an RSB entry refers to a user address and we're executing in the
1197 * kernel, speculation through it will be stopped when SMEP is enabled. This
1198 * protects against a number of the different cases that we would normally be
1199 * worried about such as when we enter the kernel from user land.
1200 *
1201 * To prevent against additional manipulation of the RSB from other contexts
1202 * such as a non-root VMX context attacking the kernel we first look to
1203 * enhanced IBRS. When eIBRS is present and enabled, then there should be
1204 * nothing else that we need to do to protect the kernel at this time.
1205 *
1206 * Unfortunately, eIBRS or not, we need to manually overwrite the contents of
1207 * the return stack buffer. We do this through the x86_rsb_stuff() function.
1208 * Currently this is employed on context switch and vmx_exit. The
1209 * x86_rsb_stuff() function is disabled only when mitigations in general are.
1210 *
1211 * If SMEP is not present, then we would have to stuff the RSB every time we
1212 * transitioned from user mode to the kernel, which isn't very practical right
1213 * now.
1214 *
1215 * To fully protect user to user and vmx to vmx attacks from these classes of
1216 * issues, we would also need to allow them to opt into performing an Indirect
1217 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1218 *
1219 * The fourth form of mitigation here is specific to AMD and is called Automated
1220 * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1221 * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1222 * (extended feature enable register) MSR. This bit basically says that IBRS
1223 * acts as though it is always active when executing at CPL0 and when executing
1224 * in the 'host' context when SEV-SNP is enabled.
1225 *
1226 * When this is active, AMD states that the RSB is cleared on VMEXIT and
1227 * therefore stuffing it there is unnecessary. While this handles RSB stuffing
1228 * attacks from SVM to the kernel, we must still consider the remaining cases
1229 * that exist, just like above. While traditionally AMD employed a 32 entry RSB
1230 * allowing the traditional technique to work, this is not true on all CPUs.
1231 * While a write to IBRS would clear the RSB if the processor supports more than
1232 * 32 entries (but not otherwise), AMD states that as long as at least a single
1233 * 4 KiB unmapped guard page is present between user and kernel address spaces
1234 * and SMEP is enabled, then there is no need to clear the RSB at all.
1235 *
1236 * By default, the system will enable RSB stuffing and the required variant of
1237 * retpolines and store that information in the x86_spectrev2_mitigation value.
1238 * This will be evaluated after a microcode update as well, though it is
1239 * expected that microcode updates will not take away features. This may mean
1240 * that a late loaded microcode may not end up in the optimal configuration
1241 * (though this should be rare).
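*
* The shape of that decision is roughly the following sketch (illustrative
* only; the real selection logic, including the x86_disable_spectrev2 knob
* and re-evaluation after microcode updates, lives elsewhere in this file and
* uses the x86_spectrev2_mitigation_t values declared below):
*
*	if (is_x86_feature(fset, X86FSET_AUTO_IBRS))
*		v2mit = X86_SPECTREV2_AUTO_IBRS;
*	else if (is_x86_feature(fset, X86FSET_IBRS_ALL))
*		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
*	else
*		v2mit = X86_SPECTREV2_RETPOLINE;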
1242 *
1243 * Currently we do not build kmdb with retpolines or perform any additional side
1244 * channel security mitigations for it. One complication with kmdb is that it
1245 * requires its own retpoline thunks and it would need to adjust itself based on
1246 * what the kernel does. The threat model of kmdb is more limited and therefore
1247 * it may make more sense to investigate using prediction barriers as the whole
1248 * system is only executing a single instruction at a time while in kmdb.
1249 *
1250 * SPECTRE v1, v4
1251 *
1252 * The v1 and v4 variants of spectre are not currently mitigated in the
1253 * system and require other classes of changes to occur in the code.
1254 *
1255 * SPECTRE v1 (SWAPGS VARIANT)
1256 *
1257 * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1258 * can generally affect any branch-dependent code. The swapgs issue is one
1259 * variant of this. If we are coming in from userspace, we can have code like
1260 * this:
1261 *
1262 * cmpw $KCS_SEL, REGOFF_CS(%rsp)
1263 * je 1f
1264 * movq $0, REGOFF_SAVFP(%rsp)
1265 * swapgs
1266 * 1:
1267 * movq %gs:CPU_THREAD, %rax
1268 *
1269 * If an attacker can cause a mis-speculation of the branch here, we could skip
1270 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1271 * load. If subsequent code can act as the usual Spectre cache gadget, this
1272 * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1273 * any use of the %gs override.
1274 *
1275 * The other case is also an issue: if we're coming into a trap from kernel
1276 * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1277 * using it. AMD systems are not vulnerable to this version, as a swapgs is
1278 * serializing with respect to subsequent uses. But as AMD /does/ need the other
1279 * case, and the fix is the same in both cases (an lfence at the branch target
1280 * 1: in this example), we'll just do it unconditionally.
1281 *
1282 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1283 * harder for user-space to actually set a useful %gsbase value: although it's
1284 * not entirely clear, it might still be feasible via lwp_setprivate(), so we
1285 * mitigate anyway.
1286 *
1287 * MELTDOWN
1288 *
1289 * Meltdown, or Spectre v3, allowed a user process to read any data in its
1290 * address space regardless of whether or not the page tables in question
1291 * allowed the user to read it. The solution to Meltdown
1292 * is kernel page table isolation. In this world, there are two page tables that
1293 * are used for a process, one in user land and one in the kernel. To implement
1294 * this we use per-CPU page tables and switch between the user and kernel
1295 * variants when entering and exiting the kernel. For more information about
1296 * this process and how the trampolines work, please see the big theory
1297 * statements and additional comments in:
1298 *
1299 * - uts/i86pc/ml/kpti_trampolines.s
1300 * - uts/i86pc/vm/hat_i86.c
1301 *
1302 * While Meltdown only impacted Intel systems, and newer Intel systems have
1303 * Meltdown (a.k.a. Rogue Data Cache Load) fixed in hardware, we always have
1304 * kernel page table isolation enabled. While this may at first seem weird, an
1305 * important thing to remember is that you can't speculatively read an address
1306 * if it's never in your page table at all. Having user processes without kernel
1307 * pages present provides us with an important layer of defense in the kernel
1308 * against any other side channel attacks that exist and have yet to be
1309 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1310 * default, no matter the x86 system.
1311 *
1312 * L1 TERMINAL FAULT
1313 *
1314 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1315 * execution uses page table entries. Effectively, it is two different problems.
1316 * The first is that it ignores the not present bit in the page table entries
1317 * when performing speculative execution. This means that something can
1318 * speculatively read the listed physical address if it's present in the L1
1319 * cache under certain conditions (see Intel's documentation for the full set of
1320 * conditions). Secondly, this can be used to bypass hardware virtualization
1321 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1322 * instructions.
1323 *
1324 * For the non-hardware virtualized case, this is relatively easy to deal with.
1325 * We must make sure that all unmapped pages have an address of zero. This means
1326 * that they could read the first 4k of physical memory; however, we never use
1327 * that first page in the operating system and always skip putting it in our
1328 * memory map, even if firmware tells us we can use it in our memory map. While
1329 * other systems try to put extra metadata in the address and reserved bits,
1330 * which led to this being problematic in those cases, we do not.
1331 *
1332 * For hardware virtual machines things are more complicated. Because they can
1333 * construct their own page tables, it isn't hard for them to perform this
1334 * attack against any physical address. The one wrinkle is that this physical
1335 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1336 * to flush the L1 data cache. We wrap this up in the function
1337 * spec_uarch_flush(). This function is also used in the mitigation of
1338 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1339 * hypervisors such as KVM or bhyve are responsible for performing this before
1340 * entering the guest.
1341 *
1342 * Because this attack takes place in the L1 cache, there's another wrinkle
1343 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1344 * designs. This means that when a thread enters a hardware virtualized context
1345 * and flushes the L1 data cache, the other thread on the processor may then go
1346 * ahead and put new data in it that can be potentially attacked. While one
1347 * solution is to disable SMT on the system, another option that is available is
1348 * to use a feature for hardware virtualization called 'SMT exclusion'. This
1349 * goes through and makes sure that if an HVM is being scheduled on one thread,
1350 * then whatever is on the other thread is from the same hardware virtual machine.
1351 * If an interrupt comes in or the guest exits to the broader system, then the
1352 * other SMT thread will be kicked out.
1353 *
1354 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1355 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1356 * perform L1TF related mitigations.
1357 *
1358 * MICROARCHITECTURAL DATA SAMPLING
1359 *
1360 * Microarchitectural data sampling (MDS) is a combination of four discrete,
1361 * related vulnerabilities affecting various parts of the CPU's
1362 * microarchitectural implementation around load, store, and fill buffers.
1363 * Specifically it is made up of the following subcomponents:
1364 *
1365 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1366 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1367 * 3. Microarchitectural Load Port Data Sampling (MLPDS)
1368 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1369 *
1370 * To begin addressing these, Intel has introduced another feature in microcode
1371 * called MD_CLEAR. This changes the verw instruction so that, when executed in
1372 * a particular way, it flushes the state of the affected parts. The L1TF L1D
1373 * flush mechanism is also updated when this microcode is present to flush this
1374 * state.
1375 *
1376 * Primarily we need to flush this state whenever we transition from the kernel
1377 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1378 * little bit different. Here the structures are statically sized when a logical
1379 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1380 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1381 * mwait, or another ACPI method. To perform these flushes, we call
1382 * x86_md_clear() at all of these transition points.
1383 *
1384 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1385 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1386 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1387 * a no-op.
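*
* Taken together, the selection reduces to roughly the following sketch. This
* is illustrative only: spec_uarch_flush and x86_md_clear() are the names used
* in this file, but the helpers for the two extremes shown here,
* spec_uarch_flush_noop() and spec_uarch_flush_l1d(), are assumptions. The
* first branch is the fully-mitigated case, the second is RDCL_NO without
* MDS_NO (verw-based clearing suffices), and the last is the L1TF-vulnerable
* case where the L1D flush (which with the updated microcode also clears the
* MDS state) is required:
*
*	if (is_x86_feature(fset, X86FSET_RDCL_NO) &&
*	    is_x86_feature(fset, X86FSET_MDS_NO)) {
*		spec_uarch_flush = spec_uarch_flush_noop;
*	} else if (is_x86_feature(fset, X86FSET_RDCL_NO)) {
*		spec_uarch_flush = x86_md_clear;
*	} else {
*		spec_uarch_flush = spec_uarch_flush_l1d;
*	}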
1388 *
1389 * Unfortunately, with this issue hyperthreading rears its ugly head. In
1390 * particular, everything we've discussed above is only valid for a single
1391 * thread executing on a core. In the case where you have hyper-threading
1392 * present, this attack can be performed between threads. The theoretical fix
1393 * for this is to ensure that both threads are always in the same security
1394 * domain. This means that they are executing in the same ring and mutually
1395 * trust each other. Practically speaking, this would mean that a system call
1396 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1397 * Rather than implement this, we recommend that one disables hyper-threading
1398 * through the use of psradm -aS.
1399 *
1400 * TSX ASYNCHRONOUS ABORT
1401 *
1402 * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1403 * behaves like MDS, but leverages Intel's transactional instructions as another
1404 * vector. Effectively, when a transaction hits one of these cases (unmapped
1405 * page, various cache snoop activity, etc.) then the same data can be exposed
1406 * as in the case of MDS. This means that you can attack your twin.
1407 *
1408 * Intel has described that there are two different ways that we can mitigate
1409 * this problem on affected processors:
1410 *
1411 * 1) We can use the same techniques used to deal with MDS. Flushing the
1412 * microarchitectural buffers and disabling hyperthreading will mitigate
1413 * this in the same way.
1414 *
1415 * 2) Using microcode to disable TSX.
1416 *
1417 * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1418 * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1419 * That's OK as we're already doing all such mitigations. On the other hand,
1420 * processors with MDS_NO are all supposed to receive microcode updates that
1421 * enumerate support for disabling TSX. In general, we'd rather use this method
1422 * when available as it doesn't require disabling hyperthreading to be
1423 * effective. Currently we are basically relying on microcode for processors
1424 * that enumerate MDS_NO.
1425 *
1426 * Another MDS-variant in a few select Intel Atom CPUs is Register File Data
1427 * Sampling: RFDS. This allows an attacker to sample values that were in any
1428 * of integer, floating point, or vector registers. This was discovered by
1429 * Intel during internal validation work. The existence of the RFDS_NO
1430 * capability, or the LACK of a RFDS_CLEAR capability, means we do not have to
1431 * act. Intel has said some CPU models immune to RFDS MAY NOT enumerate
1432 * RFDS_NO. If RFDS_NO is not set, but RFDS_CLEAR is, we must set x86_md_clear,
1433 * and make sure it's using VERW. Unlike MDS, RFDS can't be helped by the
1434 * MSR that L1D uses.
1435 *
1436 * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1437 * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1438 * different powers. The first allows us to cause all transactions to
1439 * immediately abort. The second gives us a means of disabling TSX completely,
1440 * which includes removing it from cpuid. If we have support for this in
1441 * microcode during the first cpuid pass, then we'll disable TSX completely such
1442 * that user land never has a chance to observe the bit. However, if we are late
1443 * loading the microcode, then we must use the functionality to cause
1444 * transactions to automatically abort. This is necessary for user land's sake.
1445 * Once a program sees a cpuid bit, it must not be taken away.
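*
* In terms of the MSR interface, those two powers look roughly like the
* following sketch. The constant names here are written from Intel's
* description of the TSX_CTRL MSR (one bit forces transactions to abort, a
* second hides the TSX cpuid bits) and should be treated as assumptions; the
* kernel's actual application of this lives in cpuid_apply_tsx():
*
*	if (can_still_hide_the_cpuid_bits)
*		wrmsr(MSR_IA32_TSX_CTRL,
*		    IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_CPUID_CLEAR);
*	else
*		wrmsr(MSR_IA32_TSX_CTRL, IA32_TSX_CTRL_RTM_DISABLE);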
1446 *
1447 * We track whether or not we should do this based on what cpuid pass we're in.
1448 * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1449 * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1450 * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1451 * second time after we do the initial microcode update. As a result we need to
1452 * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1453 * suitable microcode on the current CPU (which happens prior to
1454 * cpuid_pass_ucode()).
1455 *
1456 * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1457 * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1458 * unfortunate feature in a number of ways, and taking the opportunity to
1459 * finally be able to turn it off is likely to be of benefit in the future.
1460 *
1461 * SUMMARY
1462 *
1463 * The following table attempts to summarize the mitigations for various issues
1464 * and what's done in various places:
1465 *
1466 * - Spectre v1: Not currently mitigated
1467 * - swapgs: lfences after swapgs paths
1468 * - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1469 * - Meltdown: Kernel Page Table Isolation
1470 * - Spectre v3a: Updated CPU microcode
1471 * - Spectre v4: Not currently mitigated
1472 * - SpectreRSB: SMEP and RSB Stuffing
1473 * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1474 * - MDS: x86_md_clear, requires microcode, disabling SMT
1475 * - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1476 * - RFDS: microcode with x86_md_clear if RFDS_CLEAR set and RFDS_NO not.
1477 *
1478 * The following table indicates the x86 feature set bits that indicate that a
1479 * given problem has been solved or a notable feature is present:
1480 *
1481 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1482 * - MDS_NO: All forms of MDS
1483 * - TAA_NO: TAA
1484 * - RFDS_NO: RFDS
1485 */
1486
1487 #include <sys/types.h>
1488 #include <sys/archsystm.h>
1489 #include <sys/x86_archext.h>
1490 #include <sys/kmem.h>
1491 #include <sys/systm.h>
1492 #include <sys/cmn_err.h>
1493 #include <sys/sunddi.h>
1494 #include <sys/sunndi.h>
1495 #include <sys/cpuvar.h>
1496 #include <sys/processor.h>
1497 #include <sys/sysmacros.h>
1498 #include <sys/pg.h>
1499 #include <sys/fp.h>
1500 #include <sys/controlregs.h>
1501 #include <sys/bitmap.h>
1502 #include <sys/auxv_386.h>
1503 #include <sys/memnode.h>
1504 #include <sys/pci_cfgspace.h>
1505 #include <sys/comm_page.h>
1506 #include <sys/mach_mmu.h>
1507 #include <sys/ucode.h>
1508 #include <sys/tsc.h>
1509 #include <sys/kobj.h>
1510 #include <sys/asm_misc.h>
1512
1513 #ifdef __xpv
1514 #include <sys/hypervisor.h>
1515 #else
1516 #include <sys/ontrap.h>
1517 #endif
1518
1519 uint_t x86_vendor = X86_VENDOR_IntelClone;
1520 uint_t x86_type = X86_TYPE_OTHER;
1521 uint_t x86_clflush_size = 0;
1522
1523 #if defined(__xpv)
1524 int x86_use_pcid = 0;
1525 int x86_use_invpcid = 0;
1526 #else
1527 int x86_use_pcid = -1;
1528 int x86_use_invpcid = -1;
1529 #endif
1530
1531 typedef enum {
1532 X86_SPECTREV2_RETPOLINE,
1533 X86_SPECTREV2_ENHANCED_IBRS,
1534 X86_SPECTREV2_AUTO_IBRS,
1535 X86_SPECTREV2_DISABLED
1536 } x86_spectrev2_mitigation_t;
1537
1538 uint_t x86_disable_spectrev2 = 0;
1539 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1540 X86_SPECTREV2_RETPOLINE;
1541
1542 /*
1543 * The mitigation status for TAA:
1544 * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1545 * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1546 * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1547 * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1548 * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1549 * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1550 */
1551 typedef enum {
1552 X86_TAA_NOTHING,
1553 X86_TAA_DISABLED,
1554 X86_TAA_MD_CLEAR,
1555 X86_TAA_TSX_FORCE_ABORT,
1556 X86_TAA_TSX_DISABLE,
1557 X86_TAA_HW_MITIGATED
1558 } x86_taa_mitigation_t;
1559
1560 uint_t x86_disable_taa = 0;
1561 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1562
1563 uint_t pentiumpro_bug4046376;
1564
1565 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1566
1567 static char *x86_feature_names[NUM_X86_FEATURES] = {
1568 "lgpg",
1569 "tsc",
1570 "msr",
1571 "mtrr",
1572 "pge",
1573 "de",
1574 "cmov",
1575 "mmx",
1576 "mca",
1577 "pae",
1578 "cv8",
1579 "pat",
1580 "sep",
1581 "sse",
1582 "sse2",
1583 "htt",
1584 "asysc",
1585 "nx",
1586 "sse3",
1587 "cx16",
1588 "cmp",
1589 "tscp",
1590 "mwait",
1591 "sse4a",
1592 "cpuid",
1593 "ssse3",
1594 "sse4_1",
1595 "sse4_2",
1596 "1gpg",
1597 "clfsh",
1598 "64",
1599 "aes",
1600 "pclmulqdq",
1601 "xsave",
1602 "avx",
1603 "vmx",
1604 "svm",
1605 "topoext",
1606 "f16c",
1607 "rdrand",
1608 "x2apic",
1609 "avx2",
1610 "bmi1",
1611 "bmi2",
1612 "fma",
1613 "smep",
1614 "smap",
1615 "adx",
1616 "rdseed",
1617 "mpx",
1618 "avx512f",
1619 "avx512dq",
1620 "avx512pf",
1621 "avx512er",
1622 "avx512cd",
1623 "avx512bw",
1624 "avx512vl",
1625 "avx512fma",
1626 "avx512vbmi",
1627 "avx512_vpopcntdq",
1628 "avx512_4vnniw",
1629 "avx512_4fmaps",
1630 "xsaveopt",
1631 "xsavec",
1632 "xsaves",
1633 "sha",
1634 "umip",
1635 "pku",
1636 "ospke",
1637 "pcid",
1638 "invpcid",
1639 "ibrs",
1640 "ibpb",
1641 "stibp",
1642 "ssbd",
1643 "ssbd_virt",
1644 "rdcl_no",
1645 "ibrs_all",
1646 "rsba",
1647 "ssb_no",
1648 "stibp_all",
1649 "flush_cmd",
1650 "l1d_vmentry_no",
1651 "fsgsbase",
1652 "clflushopt",
1653 "clwb",
1654 "monitorx",
1655 "clzero",
1656 "xop",
1657 "fma4",
1658 "tbm",
1659 "avx512_vnni",
1660 "amd_pcec",
1661 "md_clear",
1662 "mds_no",
1663 "core_thermal",
1664 "pkg_thermal",
1665 "tsx_ctrl",
1666 "taa_no",
1667 "ppin",
1668 "vaes",
1669 "vpclmulqdq",
1670 "lfence_serializing",
1671 "gfni",
1672 "avx512_vp2intersect",
1673 "avx512_bitalg",
1674 "avx512_vbmi2",
1675 "avx512_bf16",
1676 "auto_ibrs",
1677 "rfds_no",
1678 "rfds_clear",
1679 "pbrsb_no"
1680 };
1681
1682 boolean_t
1683 is_x86_feature(void *featureset, uint_t feature)
1684 {
1685 ASSERT(feature < NUM_X86_FEATURES);
1686 return (BT_TEST((ulong_t *)featureset, feature));
1687 }
1688
1689 void
1690 add_x86_feature(void *featureset, uint_t feature)
1691 {
1692 ASSERT(feature < NUM_X86_FEATURES);
1693 BT_SET((ulong_t *)featureset, feature);
1694 }
1695
1696 void
1697 remove_x86_feature(void *featureset, uint_t feature)
1698 {
1699 ASSERT(feature < NUM_X86_FEATURES);
1700 BT_CLEAR((ulong_t *)featureset, feature);
1701 }
1702
1703 boolean_t
1704 compare_x86_featureset(void *setA, void *setB)
1705 {
1706 /*
1707 * We assume that the unused bits of the bitmap are always zero.
1708 */
1709 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1710 return (B_TRUE);
1711 } else {
1712 return (B_FALSE);
1713 }
1714 }
1715
1716 void
1717 print_x86_featureset(void *featureset)
1718 {
1719 uint_t i;
1720
1721 for (i = 0; i < NUM_X86_FEATURES; i++) {
1722 if (is_x86_feature(featureset, i)) {
1723 cmn_err(CE_CONT, "?x86_feature: %s\n",
1724 x86_feature_names[i]);
1725 }
1726 }
1727 }
1728
1729 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1730 static size_t xsave_state_size = 0;
1731 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1732 boolean_t xsave_force_disable = B_FALSE;
1733 extern int disable_smap;
1734
1735 /*
1736 * This is set to platform type we are running on.
1737 */
1738 static int platform_type = -1;
1739
1740 #if !defined(__xpv)
1741 /*
1742 * Variable to patch if hypervisor platform detection needs to be
1743 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1744 */
1745 int enable_platform_detection = 1;
1746 #endif
1747
1748 /*
1749 * monitor/mwait info.
1750 *
1751 * size_actual and buf_actual are the real address and size allocated to get
1752 * proper mwait_buf alignment. buf_actual and size_actual should be passed
1753 * to kmem_free(). Currently kmem_alloc() and mwait happen to both use
1754 * processor cache-line alignment, but this is not guaranteed in the future.
1755 */
1756 struct mwait_info {
1757 size_t mon_min; /* min size to avoid missed wakeups */
1758 size_t mon_max; /* size to avoid false wakeups */
1759 size_t size_actual; /* size actually allocated */
1760 void *buf_actual; /* memory actually allocated */
1761 uint32_t support; /* processor support of monitor/mwait */
1762 };
1763
1764 /*
1765 * xsave/xrstor info.
1766 *
1767 * This structure contains HW feature bits and the size of the xsave save area.
1768 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1769 * (xsave_state) to describe the xsave layout. However, at runtime the
1770 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1771 * xsave_state structure simply represents the legacy layout of the beginning
1772 * of the xsave area.
1773 */
1774 struct xsave_info {
1775 uint32_t xsav_hw_features_low; /* Supported HW features */
1776 uint32_t xsav_hw_features_high; /* Supported HW features */
1777 size_t xsav_max_size; /* max size save area for HW features */
1778 size_t ymm_size; /* AVX: size of ymm save area */
1779 size_t ymm_offset; /* AVX: offset for ymm save area */
1780 size_t bndregs_size; /* MPX: size of bndregs save area */
1781 size_t bndregs_offset; /* MPX: offset for bndregs save area */
1782 size_t bndcsr_size; /* MPX: size of bndcsr save area */
1783 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */
1784 size_t opmask_size; /* AVX512: size of opmask save */
1785 size_t opmask_offset; /* AVX512: offset for opmask save */
1786 size_t zmmlo_size; /* AVX512: size of zmm 256 save */
1787 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */
1788 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */
1789 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */
1790 };
1791
1792
1793 /*
1794 * These constants determine how many of the elements of the
1795 * cpuid we cache in the cpuid_info data structure; the
1796 * remaining elements are accessible via the cpuid instruction.
1797 */
1798
1799 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */
1800 #define NMAX_CPI_EXTD 0x22 /* eax = 0x80000000 .. 0x80000021 */
1801 #define NMAX_CPI_TOPO 0x10 /* Sanity check on leaf 8X26, 1F */
1802
1803 /*
1804 * See the big theory statement for a more detailed explanation of what some of
1805 * these members mean.
1806 */
1807 struct cpuid_info {
1808 uint_t cpi_pass; /* last pass completed */
1809 /*
1810 * standard function information
1811 */
1812 uint_t cpi_maxeax; /* fn 0: %eax */
1813 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */
1814 uint_t cpi_vendor; /* enum of cpi_vendorstr */
1815
1816 uint_t cpi_family; /* fn 1: extended family */
1817 uint_t cpi_model; /* fn 1: extended model */
1818 uint_t cpi_step; /* fn 1: stepping */
1819 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */
1820 /* AMD: package/socket # */
1821 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */
1822 int cpi_clogid; /* fn 1: %ebx: thread # */
1823 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */
1824 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */
1825 uint_t cpi_ncache; /* fn 2: number of elements */
1826 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1827 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */
1828 uint_t cpi_cache_leaf_size; /* Number of cache elements */
1829 /* Intel fn: 4, AMD fn: 8000001d */
1830 struct cpuid_regs **cpi_cache_leaves; /* Actual leaves from above */
1831 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */
1832 struct cpuid_regs cpi_sub7[1]; /* Leaf 7, sub-leaf 1 */
1833 /*
1834 * extended function information
1835 */
1836 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */
1837 char cpi_brandstr[49]; /* fn 0x8000000[234] */
1838 uint8_t cpi_pabits; /* fn 0x80000006: %eax */
1839 uint8_t cpi_vabits; /* fn 0x80000006: %eax */
1840 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */
1841 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */
1842
1843 id_t cpi_coreid; /* same coreid => strands share core */
1844 int cpi_pkgcoreid; /* core number within single package */
1845 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */
1846 /* Intel: fn 4: %eax[31-26] */
1847
1848 /*
1849 * These values represent the number of bits that are required to store
1850 * information about the number of cores and threads.
1851 */
1852 uint_t cpi_ncore_bits;
1853 uint_t cpi_nthread_bits;
1854 /*
1855 * supported feature information
1856 */
1857 uint32_t cpi_support[6];
1858 #define STD_EDX_FEATURES 0
1859 #define AMD_EDX_FEATURES 1
1860 #define TM_EDX_FEATURES 2
1861 #define STD_ECX_FEATURES 3
1862 #define AMD_ECX_FEATURES 4
1863 #define STD_EBX_FEATURES 5
1864 /*
1865 * Synthesized information, where known.
1866 */
1867 x86_chiprev_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */
1868 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */
1869 uint32_t cpi_socket; /* Chip package/socket type */
1870 x86_uarchrev_t cpi_uarchrev; /* Microarchitecture and revision */
1871
1872 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */
1873 uint32_t cpi_apicid;
1874 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */
1875 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */
1876 /* Intel: 1 */
1877 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */
1878 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */
1879
1880 struct xsave_info cpi_xsave; /* fn D: xsave/xrstor info */
1881
1882 /*
1883 * AMD and Intel extended topology information. Leaf 8X26 (AMD) and
1884 * eventually leaf 0x1F (Intel).
1885 */
1886 uint_t cpi_topo_nleaves;
1887 struct cpuid_regs cpi_topo[NMAX_CPI_TOPO];
1888 };
1889
1890
1891 static struct cpuid_info cpuid_info0;
1892
1893 /*
1894 * These bit fields are defined by the Intel Application Note AP-485
1895 * "Intel Processor Identification and the CPUID Instruction"
1896 */
1897 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1898 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1899 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1900 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1901 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1902 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1903
1904 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx)
1905 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx)
1906 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx)
1907 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx)
1908 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx)
1909 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx)
1910 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx)
1911 #define CPI_FEATURES_7_1_EAX(cpi) ((cpi)->cpi_sub7[0].cp_eax)
1912
1913 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1914 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1915 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1916 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1917
1918 #define CPI_MAXEAX_MAX 0x100 /* sanity control */
1919 #define CPI_XMAXEAX_MAX 0x80000100
1920 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */
1921 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */
1922
1923 /*
1924 * Function 4 (Deterministic Cache Parameters) macros
1925 * Defined by Intel Application Note AP-485
1926 */
1927 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26)
1928 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14)
1929 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9)
1930 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8)
1931 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5)
1932 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0)
1933 #define CPI_CACHE_TYPE_DONE 0
1934 #define CPI_CACHE_TYPE_DATA 1
1935 #define CPI_CACHE_TYPE_INSTR 2
1936 #define CPI_CACHE_TYPE_UNIFIED 3
1937 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8)
1938
1939 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22)
1940 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12)
1941 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0)
1942
1943 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0)
1944
1945 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0)
1946
1947
1948 /*
1949 * A couple of shorthand macros to identify "later" P6-family chips
1950 * like the Pentium M and Core. First, the "older" P6-based stuff
1951 * (loosely defined as "pre-Pentium-4"):
1952 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1953 */
1954 #define IS_LEGACY_P6(cpi) ( \
1955 cpi->cpi_family == 6 && \
1956 (cpi->cpi_model == 1 || \
1957 cpi->cpi_model == 3 || \
1958 cpi->cpi_model == 5 || \
1959 cpi->cpi_model == 6 || \
1960 cpi->cpi_model == 7 || \
1961 cpi->cpi_model == 8 || \
1962 cpi->cpi_model == 0xA || \
1963 cpi->cpi_model == 0xB) \
1964 )
1965
1966 /* A "new F6" is everything with family 6 that's not the above */
1967 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1968
1969 /* Extended family/model support */
1970 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1971 cpi->cpi_family >= 0xf)
1972
1973 /*
1974 * Info for monitor/mwait idle loop.
1975 *
1976 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1977 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1978 * 2006.
1979 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1980 * Documentation Updates" #33633, Rev 2.05, December 2006.
1981 */
1982 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */
1983 #define MWAIT_EXTENSIONS (0x00000002) /* extension supported */
1984 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */
1985 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1986 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2)
1987 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1)
1988 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1989 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1990 /*
1991 * Number of sub-cstates for a given c-state.
1992 */
1993 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \
1994 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1995
1996 /*
1997 * XSAVE leaf 0xD enumeration
1998 */
1999 #define CPUID_LEAFD_2_YMM_OFFSET 576
2000 #define CPUID_LEAFD_2_YMM_SIZE 256
2001
2002 /*
2003 * Common extended leaf names to cut down on typos.
2004 */
2005 #define CPUID_LEAF_EXT_0 0x80000000
2006 #define CPUID_LEAF_EXT_8 0x80000008
2007 #define CPUID_LEAF_EXT_1d 0x8000001d
2008 #define CPUID_LEAF_EXT_1e 0x8000001e
2009 #define CPUID_LEAF_EXT_21 0x80000021
2010 #define CPUID_LEAF_EXT_26 0x80000026
2011
2012 /*
2013 * Functions we consume from cpuid_subr.c; don't publish these in a header
2014 * file to try and keep people using the expected cpuid_* interfaces.
2015 */
2016 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
2017 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
2018 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
2019 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
2020 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
2021 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
2022
2023 /*
2024 * Apply various platform-dependent restrictions where the
2025 * underlying platform means the CPU can be marked
2026 * as less capable than its cpuid instruction would imply.
2027 */
2028 #if defined(__xpv)
2029 static void
2030 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
2031 {
2032 switch (eax) {
2033 case 1: {
2034 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
2035 0 : CPUID_INTC_EDX_MCA;
2036 cp->cp_edx &=
2037 ~(mcamask |
2038 CPUID_INTC_EDX_PSE |
2039 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2040 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
2041 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
2042 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2043 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
2044 break;
2045 }
2046
2047 case 0x80000001:
2048 cp->cp_edx &=
2049 ~(CPUID_AMD_EDX_PSE |
2050 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2051 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
2052 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
2053 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2054 CPUID_AMD_EDX_TSCP);
2055 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
2056 break;
2057 default:
2058 break;
2059 }
2060
2061 switch (vendor) {
2062 case X86_VENDOR_Intel:
2063 switch (eax) {
2064 case 4:
2065 /*
2066 * Zero out the (ncores-per-chip - 1) field
2067 */
2068 cp->cp_eax &= 0x03ffffff;
2069 break;
2070 default:
2071 break;
2072 }
2073 break;
2074 case X86_VENDOR_AMD:
2075 case X86_VENDOR_HYGON:
2076 switch (eax) {
2077
2078 case 0x80000001:
2079 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
2080 break;
2081
2082 case CPUID_LEAF_EXT_8:
2083 /*
2084 * Zero out the (ncores-per-chip - 1) field
2085 */
2086 cp->cp_ecx &= 0xffffff00;
2087 break;
2088 default:
2089 break;
2090 }
2091 break;
2092 default:
2093 break;
2094 }
2095 }
2096 #else
2097 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */
2098 #endif
2099
2100 /*
2101 * Some undocumented ways of patching the results of the cpuid
2102 * instruction to permit running Solaris 10 on future cpus that
2103 * we don't currently support. Could be set to non-zero values
2104 * via settings in eeprom.
2105 */
2106
2107 uint32_t cpuid_feature_ecx_include;
2108 uint32_t cpuid_feature_ecx_exclude;
2109 uint32_t cpuid_feature_edx_include;
2110 uint32_t cpuid_feature_edx_exclude;
2111
2112 /*
2113 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2114 */
2115 void
2116 cpuid_alloc_space(cpu_t *cpu)
2117 {
2118 /*
2119 * By convention, cpu0 is the boot cpu, which is set up
2120 * before memory allocation is available. All other cpus get
2121 * their cpuid_info struct allocated here.
2122 */
2123 ASSERT(cpu->cpu_id != 0);
2124 ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2125 cpu->cpu_m.mcpu_cpi =
2126 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2127 }
2128
2129 void
2130 cpuid_free_space(cpu_t *cpu)
2131 {
2132 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2133 int i;
2134
2135 ASSERT(cpi != NULL);
2136 ASSERT(cpi != &cpuid_info0);
2137
2138 /*
2139 * Free up any cache leaf related dynamic storage. The first entry was
2140 * cached from the standard cpuid storage, so we should not free it.
2141 */
2142 for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2143 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2144 if (cpi->cpi_cache_leaf_size > 0)
2145 kmem_free(cpi->cpi_cache_leaves,
2146 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2147
2148 kmem_free(cpi, sizeof (*cpi));
2149 cpu->cpu_m.mcpu_cpi = NULL;
2150 }
2151
2152 #if !defined(__xpv)
2153 /*
2154 * Determine the type of the underlying platform. This is used to customize
2155 * initialization of various subsystems (e.g. TSC). determine_platform() must
2156 * only ever be called once to prevent two processors from seeing different
2157 * values of platform_type. Must be called before cpuid_pass_ident(), the
2158 * earliest consumer to execute; the identification pass will call
2159 * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2160 */
2161 void
2162 determine_platform(void)
2163 {
2164 struct cpuid_regs cp;
2165 uint32_t base;
2166 uint32_t regs[4];
2167 char *hvstr = (char *)regs;
2168
2169 ASSERT(platform_type == -1);
2170
2171 platform_type = HW_NATIVE;
2172
2173 if (!enable_platform_detection)
2174 return;
2175
2176 /*
2177 * If Hypervisor CPUID bit is set, try to determine hypervisor
2178 * vendor signature, and set platform type accordingly.
2179 *
2180 * References:
2181 * http://lkml.org/lkml/2008/10/1/246
2182 * http://kb.vmware.com/kb/1009458
2183 */
2184 cp.cp_eax = 0x1;
2185 (void) __cpuid_insn(&cp);
2186 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2187 cp.cp_eax = 0x40000000;
2188 (void) __cpuid_insn(&cp);
2189 regs[0] = cp.cp_ebx;
2190 regs[1] = cp.cp_ecx;
2191 regs[2] = cp.cp_edx;
2192 regs[3] = 0;
2193 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2194 platform_type = HW_XEN_HVM;
2195 return;
2196 }
2197 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2198 platform_type = HW_VMWARE;
2199 return;
2200 }
2201 if (strcmp(hvstr, HVSIG_KVM) == 0) {
2202 platform_type = HW_KVM;
2203 return;
2204 }
2205 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2206 platform_type = HW_BHYVE;
2207 return;
2208 }
2209 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) {
2210 platform_type = HW_MICROSOFT;
2211 return;
2212 }
2213 if (strcmp(hvstr, HVSIG_QEMU_TCG) == 0) {
2214 platform_type = HW_QEMU_TCG;
2215 return;
2216 }
2217 } else {
2218 /*
2219 * Check older VMware hardware versions. The VMware hypervisor is
2220 * detected by performing an IN operation to the VMware hypervisor
2221 * port and checking that the value returned in %ebx is the VMware
2222 * hypervisor magic value.
2223 *
2224 * References: http://kb.vmware.com/kb/1009458
2225 */
2226 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2227 if (regs[1] == VMWARE_HVMAGIC) {
2228 platform_type = HW_VMWARE;
2229 return;
2230 }
2231 }
2232
2233 /*
2234 * Check Xen hypervisor. In a fully virtualized domain,
2235 * Xen's pseudo-cpuid function returns a string representing the
2236 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2237 * supported cpuid function. We need at least a (base + 2) leaf value
2238 * to do what we want to do. Try different base values, since the
2239 * hypervisor might use a different one depending on whether Hyper-V
2240 * emulation is switched on by default or not.
2241 */
2242 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2243 cp.cp_eax = base;
2244 (void) __cpuid_insn(&cp);
2245 regs[0] = cp.cp_ebx;
2246 regs[1] = cp.cp_ecx;
2247 regs[2] = cp.cp_edx;
2248 regs[3] = 0;
2249 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2250 cp.cp_eax >= (base + 2)) {
2251 platform_type &= ~HW_NATIVE;
2252 platform_type |= HW_XEN_HVM;
2253 return;
2254 }
2255 }
2256 }
2257
2258 int
2259 get_hwenv(void)
2260 {
2261 ASSERT(platform_type != -1);
2262 return (platform_type);
2263 }
2264
2265 int
2266 is_controldom(void)
2267 {
2268 return (0);
2269 }
2270
2271 #else
2272
2273 int
2274 get_hwenv(void)
2275 {
2276 return (HW_XEN_PV);
2277 }
2278
2279 int
2280 is_controldom(void)
2281 {
2282 return (DOMAIN_IS_INITDOMAIN(xen_info));
2283 }
2284
2285 #endif /* __xpv */
2286
2287 /*
2288 * Gather the extended topology information. This should be the same for both
2289 * AMD leaf 8X26 and Intel leaf 0x1F (though the data interpretation varies).
2290 */
2291 static void
2292 cpuid_gather_ext_topo_leaf(struct cpuid_info *cpi, uint32_t leaf)
2293 {
2294 uint_t i;
2295
2296 for (i = 0; i < ARRAY_SIZE(cpi->cpi_topo); i++) {
2297 struct cpuid_regs *regs = &cpi->cpi_topo[i];
2298
2299 bzero(regs, sizeof (struct cpuid_regs));
2300 regs->cp_eax = leaf;
2301 regs->cp_ecx = i;
2302
2303 (void) __cpuid_insn(regs);
2304 if (CPUID_AMD_8X26_ECX_TYPE(regs->cp_ecx) ==
2305 CPUID_AMD_8X26_TYPE_DONE) {
2306 break;
2307 }
2308 }
2309
2310 cpi->cpi_topo_nleaves = i;
2311 }
2312
2313 /*
2314 * Make sure that we have gathered all of the CPUID leaves that we might need to
2315 * determine topology. We assume that the standard leaf 1 has already been done
2316 * and that xmaxeax has already been calculated.
2317 */
2318 static void
2319 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2320 {
2321 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2322
2323 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2324 struct cpuid_regs *cp;
2325
2326 cp = &cpi->cpi_extd[8];
2327 cp->cp_eax = CPUID_LEAF_EXT_8;
2328 (void) __cpuid_insn(cp);
2329 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2330 }
2331
2332 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2333 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2334 struct cpuid_regs *cp;
2335
2336 cp = &cpi->cpi_extd[0x1e];
2337 cp->cp_eax = CPUID_LEAF_EXT_1e;
2338 (void) __cpuid_insn(cp);
2339 }
2340
2341 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_26) {
2342 cpuid_gather_ext_topo_leaf(cpi, CPUID_LEAF_EXT_26);
2343 }
2344 }
2345
2346 /*
2347 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2348 * it to everything else. If not, and we're on an AMD system where 8000001e is
2349 * valid, then we use that. Otherwise, we fall back to the default value for the
2350 * APIC ID in leaf 1.
2351 */
2352 static uint32_t
2353 cpuid_gather_apicid(struct cpuid_info *cpi)
2354 {
2355 /*
2356 * Leaf B changes based on the arguments to it. Because we don't cache
2357 * it, we need to gather it again.
2358 */
2359 if (cpi->cpi_maxeax >= 0xB) {
2360 struct cpuid_regs regs;
2361 struct cpuid_regs *cp;
2362
2363 cp = &regs;
2364 cp->cp_eax = 0xB;
2365 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2366 (void) __cpuid_insn(cp);
2367
2368 if (cp->cp_ebx != 0) {
2369 return (cp->cp_edx);
2370 }
2371 }
2372
2373 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2374 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2375 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2376 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2377 return (cpi->cpi_extd[0x1e].cp_eax);
2378 }
2379
2380 return (CPI_APIC_ID(cpi));
2381 }
2382
2383 /*
2384 * For AMD processors, attempt to calculate the number of chips and cores that
2385 * exist. The way that we do this varies based on the generation, because the
2386 * generations themselves have changed dramatically.
2387 *
2388 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2389 * However, with the advent of family 17h (Zen) it actually tells us the number
2390 * of threads, so we need to look at leaf 0x8000001e if available to determine
2391 * its value. Otherwise, for all prior families, the number of enabled cores is
2392 * the same as threads.
2393 *
2394 * If we do not have leaf 0x80000008, then we assume that this processor does
2395 * not have anything. AMD's older CPUID specification says there's no reason to
2396 * fall back to leaf 1.
2397 *
2398 * In some virtualization cases we will not have leaf 8000001e or it will be
2399 * zero. When that happens we assume the number of threads is one.
2400 */
2401 static void
2402 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2403 {
2404 uint_t nthreads, nthread_per_core;
2405
2406 nthreads = nthread_per_core = 1;
2407
2408 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2409 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2410 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2411 nthreads = CPI_CPU_COUNT(cpi);
2412 }
2413
2414 /*
2415 * For us to have threads, and know about it, we have to be at least at
2416 * family 17h and have the cpuid bit that says we have extended
2417 * topology.
2418 */
2419 if (cpi->cpi_family >= 0x17 &&
2420 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2421 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2422 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2423 }
2424
2425 *ncpus = nthreads;
2426 *ncores = nthreads / nthread_per_core;
2427 }
2428
2429 /*
2430 * Seed the initial values for the cores and threads for an Intel based
2431 * processor. These values will be overwritten if we detect that the processor
2432 * supports CPUID leaf 0xb.
2433 */
2434 static void
2435 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2436 {
2437 /*
2438 * Only seed the number of physical cores from the first level leaf 4
2439 * information. The number of threads there indicates how many share the
2440 * L1 cache, which may or may not have anything to do with the number of
2441 * logical CPUs per core.
2442 */
2443 if (cpi->cpi_maxeax >= 4) {
2444 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2445 } else {
2446 *ncores = 1;
2447 }
2448
2449 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2450 *ncpus = CPI_CPU_COUNT(cpi);
2451 } else {
2452 *ncpus = *ncores;
2453 }
2454 }
2455
2456 static boolean_t
2457 cpuid_leafB_getids(cpu_t *cpu)
2458 {
2459 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2460 struct cpuid_regs regs;
2461 struct cpuid_regs *cp;
2462
2463 if (cpi->cpi_maxeax < 0xB)
2464 return (B_FALSE);
2465
2466 cp = &regs;
2467 cp->cp_eax = 0xB;
2468 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2469
2470 (void) __cpuid_insn(cp);
2471
2472 /*
2473 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2474 * indicates that the extended topology enumeration leaf is
2475 * available.
2476 */
2477 if (cp->cp_ebx != 0) {
2478 uint32_t x2apic_id = 0;
2479 uint_t coreid_shift = 0;
2480 uint_t ncpu_per_core = 1;
2481 uint_t chipid_shift = 0;
2482 uint_t ncpu_per_chip = 1;
2483 uint_t i;
2484 uint_t level;
2485
2486 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2487 cp->cp_eax = 0xB;
2488 cp->cp_ecx = i;
2489
2490 (void) __cpuid_insn(cp);
2491 level = CPI_CPU_LEVEL_TYPE(cp);
2492
2493 if (level == 1) {
2494 x2apic_id = cp->cp_edx;
2495 coreid_shift = BITX(cp->cp_eax, 4, 0);
2496 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2497 } else if (level == 2) {
2498 x2apic_id = cp->cp_edx;
2499 chipid_shift = BITX(cp->cp_eax, 4, 0);
2500 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2501 }
2502 }
2503
2504 /*
2505 * cpi_apicid is taken care of in cpuid_gather_apicid.
2506 */
2507 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2508 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2509 ncpu_per_core;
2510 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2511 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2512 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2513 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2514 cpi->cpi_procnodeid = cpi->cpi_chipid;
2515 cpi->cpi_compunitid = cpi->cpi_coreid;
2516
2517 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2518 cpi->cpi_nthread_bits = coreid_shift;
2519 cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2520 }
2521
2522 return (B_TRUE);
2523 } else {
2524 return (B_FALSE);
2525 }
2526 }
2527
2528 static void
2529 cpuid_intel_getids(cpu_t *cpu, void *feature)
2530 {
2531 uint_t i;
2532 uint_t chipid_shift = 0;
2533 uint_t coreid_shift = 0;
2534 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2535
2536 /*
2537 * There are no compute units or processor nodes currently on Intel.
2538 * Always set these to one.
2539 */
2540 cpi->cpi_procnodes_per_pkg = 1;
2541 cpi->cpi_cores_per_compunit = 1;
2542
2543 /*
2544 * If cpuid Leaf B is present, use that to try and get this information.
2545 * It will be the most accurate for Intel CPUs.
2546 */
2547 if (cpuid_leafB_getids(cpu))
2548 return;
2549
2550 /*
2551 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2552 * and ncore_per_chip. These represent the largest power of two values
2553 * that we need to cover all of the IDs in the system. Therefore, we use
2554 * those values to seed the number of bits needed to cover information
2555 * in the case when leaf B is not available. These values will probably
2556 * be larger than required, but that's OK.
2557 */
2558 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2559 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2560
2561 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2562 chipid_shift++;
2563
2564 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2565 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2566
2567 if (is_x86_feature(feature, X86FSET_CMP)) {
2568 /*
2569 * Multi-core (and possibly multi-threaded)
2570 * processors.
2571 */
2572 uint_t ncpu_per_core = 0;
2573
2574 if (cpi->cpi_ncore_per_chip == 1)
2575 ncpu_per_core = cpi->cpi_ncpu_per_chip;
2576 else if (cpi->cpi_ncore_per_chip > 1)
2577 ncpu_per_core = cpi->cpi_ncpu_per_chip /
2578 cpi->cpi_ncore_per_chip;
2579 /*
2580 * 8bit APIC IDs on dual core Pentiums
2581 * look like this:
2582 *
2583 * +-----------------------+------+------+
2584 * | Physical Package ID | MC | HT |
2585 * +-----------------------+------+------+
2586 * <------- chipid -------->
2587 * <------- coreid --------------->
2588 * <--- clogid -->
2589 * <------>
2590 * pkgcoreid
2591 *
2592 * Where the number of bits necessary to
2593 * represent MC and HT fields together equals
2594 * the minimum number of bits necessary to
2595 * store the value of cpi->cpi_ncpu_per_chip.
2596 * Of those bits, the MC part uses the number
2597 * of bits necessary to store the value of
2598 * cpi->cpi_ncore_per_chip.
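*
* For instance (hypothetical part), with cpi_ncpu_per_chip == 4 and
* cpi_ncore_per_chip == 2, ncpu_per_core is 2, so coreid_shift becomes 1
* (and chipid_shift above is 2); an APIC ID of 6 then yields chipid 1,
* clogid 2, coreid 3, and pkgcoreid 1.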
2599 */
2600 for (i = 1; i < ncpu_per_core; i <<= 1)
2601 coreid_shift++;
2602 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2603 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2604 } else if (is_x86_feature(feature, X86FSET_HTT)) {
2605 /*
2606 * Single-core multi-threaded processors.
2607 */
2608 cpi->cpi_coreid = cpi->cpi_chipid;
2609 cpi->cpi_pkgcoreid = 0;
2610 } else {
2611 /*
2612 * Single-core single-thread processors.
2613 */
2614 cpi->cpi_coreid = cpu->cpu_id;
2615 cpi->cpi_pkgcoreid = 0;
2616 }
2617 cpi->cpi_procnodeid = cpi->cpi_chipid;
2618 cpi->cpi_compunitid = cpi->cpi_coreid;
2619 }
2620
2621 /*
2622 * Historically, AMD has had CMP chips with only a single thread per core.
2623 * However, starting in family 17h (Zen), this has changed and they now have
2624 * multiple threads. Our internal core id needs to be a unique value.
2625 *
2626 * To determine the core id of an AMD system, if we're from a family before 17h,
2627 * then we just use the cpu id, as that gives us a good value that will be
2628 * unique for each core. If instead, we're on family 17h or later, then we need
2629 * to do something more complicated. CPUID leaf 0x8000001e can tell us
2630 * how many threads share a core. Based on that, we'll shift the APIC ID.
2631 * We can't use the normal core id in that leaf as it's only unique within the
2632 * socket, which is perfect for cpi_pkgcoreid, but not for us.
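*
* For example (illustrative only), a family 17h part with two threads per
* core shifts the APIC ID right by one, so sibling threads with APIC IDs
* 6 and 7 both resolve to core id 3.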
2633 */
2634 static id_t
2635 cpuid_amd_get_coreid(cpu_t *cpu)
2636 {
2637 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2638
2639 if (cpi->cpi_family >= 0x17 &&
2640 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2641 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2642 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2643 if (nthreads > 1) {
2644 VERIFY3U(nthreads, ==, 2);
2645 return (cpi->cpi_apicid >> 1);
2646 }
2647 }
2648
2649 return (cpu->cpu_id);
2650 }
2651
2652 /*
2653 * Determining IDs on AMD is a more challenging task. This is notable because of the
2654 * following two facts:
2655 *
2656 * 1. Before family 0x17 (Zen), there was no support for SMT and there was
2657 * also no way to get an actual unique core id from the system. As such, we
2658 * synthesize this case by using cpu->cpu_id. This scheme does not,
2659 * however, guarantee that sibling cores of a chip will have sequential
2660 * coreids starting at a multiple of the number of cores per chip - that is
2661 * usually the case, but if the APIC IDs have been set up in a different
2662 * order then we need to perform a few more gymnastics for the pkgcoreid.
2663 *
2664 * 2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2665 * called compute units. These compute units share the L1I cache, L2 cache,
2666 * and the FPU. To deal with this, a new topology leaf was added in
2667 * 0x8000001e. However, parts of this leaf have different meanings
2668 * once we get to family 0x17.
2669 */
2670
2671 static void
2672 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2673 {
2674 int i, first_half, coreidsz;
2675 uint32_t nb_caps_reg;
2676 uint_t node2_1;
2677 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2678 struct cpuid_regs *cp;
2679
2680 /*
2681 * Calculate the core id (this comes from hardware in family 0x17 if it
2682 * hasn't been stripped by virtualization). We always set the compute
2683 * unit id to the same value. Also, initialize the default number of
2684 * cores per compute unit and nodes per package. This will be
2685 * overwritten when we know information about a particular family.
2686 */
2687 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2688 cpi->cpi_compunitid = cpi->cpi_coreid;
2689 cpi->cpi_cores_per_compunit = 1;
2690 cpi->cpi_procnodes_per_pkg = 1;
2691
2692 /*
2693 * To construct the logical ID, we need to determine how many APIC IDs
2694 * are dedicated to the cores and threads. This is provided for us in
2695 * 0x80000008. However, if it's not present (say due to virtualization),
2696 * then we assume it's one. This should be present on all 64-bit AMD
2697 * processors. It was added in family 0xf (Hammer).
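*
* As an example (hypothetical values), a coreidsz of 4 means the low four
* APIC ID bits identify the core and thread, so an APIC ID of 0x1a yields
* a clogid of 0xa below.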
2698 */
2699 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2700 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2701
2702 /*
2703 * In AMD parlance chip is really a node while illumos
2704 * uses chip as equivalent to socket/package.
2705 */
2706 if (coreidsz == 0) {
2707 /* Use legacy method */
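/*
* For instance (hypothetical part), a chip with six cores needs
* three bits here, so coreidsz becomes 3.
*/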
2708 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2709 coreidsz++;
2710 if (coreidsz == 0)
2711 coreidsz = 1;
2712 }
2713 } else {
2714 /* Assume single-core part */
2715 coreidsz = 1;
2716 }
2717 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2718
2719 /*
2720 * The package core ID varies depending on the family. While it may be
2721 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2722 * this value is the core id in the given node. For non-virtualized
2723 * family 17h, we need to take the logical core id and shift off the
2724 * threads like we do when getting the core id. Otherwise, we can use
2725 * the clogid as is. When family 17h is virtualized, the leaf may lack
2726 * valid data; in that case we won't think we have SMT, so the
2727 * cpi_clogid is again sufficient.
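*
* For example (illustrative), on a non-virtualized family 17h part with
* two threads per core, a clogid of 5 becomes pkgcoreid 2 (5 >> 1).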
2729 */
2730 if (cpi->cpi_family >= 0x17 &&
2731 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2732 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2733 cpi->cpi_extd[0x1e].cp_ebx != 0) {
2734 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2735 if (nthreads > 1) {
2736 VERIFY3U(nthreads, ==, 2);
2737 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2738 } else {
2739 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2740 }
2741 } else {
2742 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2743 }
2744
2745 /*
2746 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2747 * (bulldozer) or newer, then we can derive all of this from leaf
2748 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2749 */
2750 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2751 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2752 cp = &cpi->cpi_extd[0x1e];
2753
2754 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2755 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2756
2757 /*
2758 * For Bulldozer-era CPUs, recalculate the compute unit
2759 * information.
2760 */
2761 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2762 cpi->cpi_cores_per_compunit =
2763 BITX(cp->cp_ebx, 15, 8) + 1;
2764 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2765 (cpi->cpi_ncore_per_chip /
2766 cpi->cpi_cores_per_compunit) *
2767 (cpi->cpi_procnodeid /
2768 cpi->cpi_procnodes_per_pkg);
2769 }
2770 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2771 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2772 } else if (cpi->cpi_family == 0x10) {
2773 /*
2774 * See if we are a multi-node processor.
2775 * All processors in the system have the same number of nodes.
2776 */
2777 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8);
2778 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2779 /* Single-node */
2780 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2781 coreidsz);
2782 } else {
2783
2784 /*
2785 * Multi-node revision D (2 nodes per package
2786 * are supported)
2787 */
2788 cpi->cpi_procnodes_per_pkg = 2;
2789
2790 first_half = (cpi->cpi_pkgcoreid <=
2791 (cpi->cpi_ncore_per_chip/2 - 1));
2792
2793 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2794 /* We are BSP */
2795 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2796 } else {
2797
2798 /* We are AP */
2799 /* NodeId[2:1] bits to use for reading F3xe8 */
2800 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2801
2802 nb_caps_reg =
2803 pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2804
2805 /*
2806 * Check IntNodeNum bit (31:30, but bit 31 is
2807 * always 0 on dual-node processors)
2808 */
2809 if (BITX(nb_caps_reg, 30, 30) == 0)
2810 cpi->cpi_procnodeid = node2_1 +
2811 !first_half;
2812 else
2813 cpi->cpi_procnodeid = node2_1 +
2814 first_half;
2815 }
2816 }
2817 } else {
2818 cpi->cpi_procnodeid = 0;
2819 }
2820
2821 cpi->cpi_chipid =
2822 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2823
2824 cpi->cpi_ncore_bits = coreidsz;
2825 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2826 cpi->cpi_ncore_per_chip);
2827 }
2828
2829 static void
2830 spec_uarch_flush_noop(void)
2831 {
2832 }
2833
2834 /*
2835 * When microcode is present that mitigates MDS, this wrmsr will also flush the
2836 * MDS-related micro-architectural state that would normally be flushed by
2837 * calling x86_md_clear().
2838 */
2839 static void
2840 spec_uarch_flush_msr(void)
2841 {
2842 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2843 }
2844
2845 /*
2846 * This function pointer refers to a routine that will flush certain
2847 * micro-architectural state on the processor. This flush is used to mitigate
2848 * three different classes of Intel CPU vulnerabilities: L1TF, MDS, and RFDS.
2849 * The pointer can refer to one of three functions:
2850 *
2851 * - A noop, used either because we are vulnerable but do not have
2852 * microcode available to help deal with a fix, or because we aren't
2853 * vulnerable.
2854 *
2855 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2856 * mitigate MDS is present, also perform the equivalent of the MDS flush;
2857 * however, it only flushes the MDS related micro-architectural state on the
2858 * current hyperthread; it does not do anything for the twin.
2859 *
2860 * - x86_md_clear which will flush the MDS related state. This is done when we
2861 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2862 * (RDCL_NO is set); or if the CPU is vulnerable to RFDS and indicates VERW
2863 * can clear it (RFDS_CLEAR is set).
2864 */
2865 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2866
2867 static void
2868 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2869 {
2870 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2871
2872 /* Non-Intel doesn't concern us here. */
2873 if (cpi->cpi_vendor != X86_VENDOR_Intel)
2874 return;
2875
2876 /*
2877 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2878 * has been fixed in hardware, it doesn't cover everything related to
2879 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2880 * need to mitigate this.
2881 *
2882 * We must ALSO check whether RFDS_NO is absent and RFDS_CLEAR is set,
2883 * to cover the cases where only RFDS needs to be mitigated.
2884 */
2885
2886 if ((!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2887 is_x86_feature(featureset, X86FSET_MD_CLEAR)) ||
2888 (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2889 is_x86_feature(featureset, X86FSET_RFDS_CLEAR))) {
2890 const uint8_t nop = NOP_INSTR;
2891 uint8_t *md = (uint8_t *)x86_md_clear;
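/*
* Patching a nop over the first byte of x86_md_clear() below enables
* its verw-based flush sequence (see the spec_uarch_flush discussion
* above).
*/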
2892
2893 *md = nop;
2894 }
2895
2896 membar_producer();
2897 }
2898
2899 static void
2900 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2901 {
2902 boolean_t need_l1d, need_mds, need_rfds;
2903 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2904
2905 /*
2906 * If we're not on Intel or we've mitigated all of RDCL, MDS, and RFDS
2907 * in hardware, then there's nothing left for us to do for enabling
2908 * the flush. We can also go ahead and say that SMT exclusion is
2909 * unnecessary.
2910 */
2911 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2912 (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2913 is_x86_feature(featureset, X86FSET_MDS_NO) &&
2914 is_x86_feature(featureset, X86FSET_RFDS_NO))) {
2915 extern int smt_exclusion;
2916 smt_exclusion = 0;
2917 spec_uarch_flush = spec_uarch_flush_noop;
2918 membar_producer();
2919 return;
2920 }
2921
2922 /*
2923 * The locations where we need to perform an L1D flush are required both
2924 * for mitigating L1TF and MDS. When verw support is present in
2925 * microcode, then the L1D flush will take care of doing that as well.
2926 * However, if we have a system where RDCL_NO is present, but we don't
2927 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2928 * L1D flush.
2929 */
2930 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2931 is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2932 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2933 need_l1d = B_TRUE;
2934 } else {
2935 need_l1d = B_FALSE;
2936 }
2937
2938 if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2939 is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2940 need_mds = B_TRUE;
2941 } else {
2942 need_mds = B_FALSE;
2943 }
2944
2945 if (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2946 is_x86_feature(featureset, X86FSET_RFDS_CLEAR)) {
2947 need_rfds = B_TRUE;
2948 } else {
2949 need_rfds = B_FALSE;
2950 }
2951
2952 if (need_l1d) {
2953 /*
2954 * As of Feb, 2024, no CPU needs L1D *and* RFDS mitigation
2955 * together. If the following VERIFY trips, we need to add
2956 * further fixes here.
2957 */
2958 VERIFY(!need_rfds);
2959 spec_uarch_flush = spec_uarch_flush_msr;
2960 } else if (need_mds || need_rfds) {
2961 spec_uarch_flush = x86_md_clear;
2962 } else {
2963 /*
2964 * We have no hardware mitigations available to us.
2965 */
2966 spec_uarch_flush = spec_uarch_flush_noop;
2967 }
2968 membar_producer();
2969 }
2970
2971 /*
2972 * We default to enabling Return Stack Buffer (RSB) mitigations.
2973 *
2974 * We used to skip RSB mitigations with Intel eIBRS, but developments around
2975 * post-barrier RSB (PBRSB) guessing suggest we should enable Intel RSB
2976 * mitigations always unless explicitly bypassed, or unless hardware indicates
2977 * the bug has been fixed. Intel also says that machines without eIBRS do not
2978 * have VMEXIT problems with PBRSB. Basically, if we're Intel and have eIBRS,
2979 * we must stuff the RSB in both context switches AND in VMEXIT, unless the
2980 * hardware says the PBRSB bug is fixed. If we're Intel but without eIBRS
2981 * (i.e. using retpolines), we must stuff the RSB in context switches, but we
2982 * do not have to for VMEXIT.
2983 *
2984 * See (pardon broken URL) https://www.intel.com/content/www/us/en/developer \
2985 * /articles/technical/software-security-guidance/advisory-guidance
2986 * /post-barrier-return-stack-buffer-predictions.html
2987 *
2988 * AMD indicates that when Automatic IBRS is enabled we do not need to implement
2989 * return stack buffer clearing for VMEXIT as it takes care of it. The manual
2990 * also states that as long as SMEP is enabled and we maintain at least one
2991 * page between the kernel and user space (we have much more of a red zone),
2992 * then we do not need to clear the RSB. We constrain this to only when
2993 * Automatic IBRS is present.
2994 */
2995 static void
2996 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit, bool intel_pbrsb_no)
2997 {
2998 const uint8_t ret = RET_INSTR;
2999 uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
3000 uint8_t *vmx_stuff = (uint8_t *)x86_rsb_stuff_vmexit;
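/*
* In the cases below, a stuffing sequence is disabled by patching a ret
* over its first instruction, so callers return immediately without
* doing any stuffing.
*/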
3001
3002 switch (mit) {
3003 case X86_SPECTREV2_AUTO_IBRS:
3004 case X86_SPECTREV2_DISABLED:
3005 /* Don't bother with any RSB stuffing! */
3006 *stuff = ret;
3007 *vmx_stuff = ret;
3008 break;
3009 case X86_SPECTREV2_RETPOLINE:
3010 /*
3011 * The Intel document on Post-Barrier RSB says that processors
3012 * without eIBRS do not have PBRSB problems upon VMEXIT.
3013 */
3014 VERIFY(!intel_pbrsb_no);
3015 VERIFY3U(*stuff, !=, ret);
3016 *vmx_stuff = ret;
3017 break;
3018 default:
3019 /*
3020 * eIBRS is all that's left. If the CPU claims PBRSB is fixed,
3021 * don't use the RSB mitigation in either case.
3022 */
3023 if (intel_pbrsb_no) {
3024 /* CPU claims PBRSB problems are fixed. */
3025 *stuff = ret;
3026 *vmx_stuff = ret;
3027 }
3028 VERIFY3U(*stuff, ==, *vmx_stuff);
3029 break;
3030 }
3031 }
3032
3033 static void
3034 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
3035 {
3036 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
3037 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
3038 "_r14", "_r15" };
3039 const uint_t nthunks = ARRAY_SIZE(thunks);
3040 const char *type;
3041 uint_t i;
3042
3043 if (mit == x86_spectrev2_mitigation)
3044 return;
3045
3046 switch (mit) {
3047 case X86_SPECTREV2_RETPOLINE:
3048 type = "gen";
3049 break;
3050 case X86_SPECTREV2_AUTO_IBRS:
3051 case X86_SPECTREV2_ENHANCED_IBRS:
3052 case X86_SPECTREV2_DISABLED:
3053 type = "jmp";
3054 break;
3055 default:
3056 panic("asked to update retpoline state with unknown state!");
3057 }
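/*
* Copy the selected thunk implementation
* (__x86_indirect_thunk_<type>_<reg>) over each of the generic
* __x86_indirect_thunk_<reg> entry points below.
*/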
3058
3059 for (i = 0; i < nthunks; i++) {
3060 uintptr_t source, dest;
3061 int ssize, dsize;
3062 char sourcebuf[64], destbuf[64];
3063
3064 (void) snprintf(destbuf, sizeof (destbuf),
3065 "__x86_indirect_thunk%s", thunks[i]);
3066 (void) snprintf(sourcebuf, sizeof (sourcebuf),
3067 "__x86_indirect_thunk_%s%s", type, thunks[i]);
3068
3069 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
3070 dest = kobj_getelfsym(destbuf, NULL, &dsize);
3071 VERIFY3U(source, !=, 0);
3072 VERIFY3U(dest, !=, 0);
3073 VERIFY3S(dsize, >=, ssize);
3074 bcopy((void *)source, (void *)dest, ssize);
3075 }
3076 }
3077
3078 static void
3079 cpuid_enable_enhanced_ibrs(void)
3080 {
3081 uint64_t val;
3082
3083 val = rdmsr(MSR_IA32_SPEC_CTRL);
3084 val |= IA32_SPEC_CTRL_IBRS;
3085 wrmsr(MSR_IA32_SPEC_CTRL, val);
3086 }
3087
3088 static void
3089 cpuid_enable_auto_ibrs(void)
3090 {
3091 uint64_t val;
3092
3093 val = rdmsr(MSR_AMD_EFER);
3094 val |= AMD_EFER_AIBRSE;
3095 wrmsr(MSR_AMD_EFER, val);
3096 }
3097
3098 /*
3099 * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
3100 * we can disable TSX, we do so.
3101 *
3102 * This determination is done only on the boot CPU, potentially after loading
3103 * updated microcode.
3104 */
3105 static void
3106 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
3107 {
3108 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3109
3110 VERIFY(cpu->cpu_id == 0);
3111
3112 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3113 x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3114 return;
3115 }
3116
3117 if (x86_disable_taa) {
3118 x86_taa_mitigation = X86_TAA_DISABLED;
3119 return;
3120 }
3121
3122 /*
3123 * If we do not have the ability to disable TSX, then our only
3124 * mitigation options are in hardware (TAA_NO), or by using our existing
3125 * MDS mitigation as described above. The latter relies upon us having
3126 * configured MDS mitigations correctly! This includes disabling SMT if
3127 * we want cross-CPU-thread protection.
3128 */
3129 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3130 /*
3131 * It's not clear whether any parts will enumerate TAA_NO
3132 * *without* TSX_CTRL, but let's mark it as such if we see this.
3133 */
3134 if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3135 x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3136 return;
3137 }
3138
3139 if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3140 !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3141 x86_taa_mitigation = X86_TAA_MD_CLEAR;
3142 } else {
3143 x86_taa_mitigation = X86_TAA_NOTHING;
3144 }
3145 return;
3146 }
3147
3148 /*
3149 * We have TSX_CTRL, but we can only fully disable TSX if we're early
3150 * enough in boot.
3151 *
3152 * Otherwise, we'll fall back to causing transactions to abort as our
3153 * mitigation. TSX-using code will always take the fallback path.
3154 */
3155 if (cpi->cpi_pass < 4) {
3156 x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3157 } else {
3158 x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3159 }
3160 }
3161
3162 /*
3163 * As mentioned, we should only touch the MSR when we've got suitable
3164 * microcode loaded on this CPU.
3165 */
3166 static void
3167 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3168 {
3169 uint64_t val;
3170
3171 switch (taa) {
3172 case X86_TAA_TSX_DISABLE:
3173 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3174 return;
3175 val = rdmsr(MSR_IA32_TSX_CTRL);
3176 val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3177 wrmsr(MSR_IA32_TSX_CTRL, val);
3178 break;
3179 case X86_TAA_TSX_FORCE_ABORT:
3180 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3181 return;
3182 val = rdmsr(MSR_IA32_TSX_CTRL);
3183 val |= IA32_TSX_CTRL_RTM_DISABLE;
3184 wrmsr(MSR_IA32_TSX_CTRL, val);
3185 break;
3186 case X86_TAA_HW_MITIGATED:
3187 case X86_TAA_MD_CLEAR:
3188 case X86_TAA_DISABLED:
3189 case X86_TAA_NOTHING:
3190 break;
3191 }
3192 }
3193
3194 static void
3195 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3196 {
3197 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3198 x86_spectrev2_mitigation_t v2mit;
3199
3200 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3201 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3202 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3203 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3204 add_x86_feature(featureset, X86FSET_IBPB);
3205 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3206 add_x86_feature(featureset, X86FSET_IBRS);
3207 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3208 add_x86_feature(featureset, X86FSET_STIBP);
3209 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3210 add_x86_feature(featureset, X86FSET_STIBP_ALL);
3211 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3212 add_x86_feature(featureset, X86FSET_SSBD);
3213 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3214 add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3215 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3216 add_x86_feature(featureset, X86FSET_SSB_NO);
3217
3218 /*
3219 * Rather than Enhanced IBRS, AMD has a different feature: a bit
3220 * in EFER that can be enabled and will basically do the right
3221 * thing while executing in the kernel.
3222 */
3223 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3224 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3225 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3226 (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3227 add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3228 }
3229
3230 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3231 cpi->cpi_maxeax >= 7) {
3232 struct cpuid_regs *ecp;
3233 ecp = &cpi->cpi_std[7];
3234
3235 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3236 add_x86_feature(featureset, X86FSET_MD_CLEAR);
3237 }
3238
3239 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3240 add_x86_feature(featureset, X86FSET_IBRS);
3241 add_x86_feature(featureset, X86FSET_IBPB);
3242 }
3243
3244 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3245 add_x86_feature(featureset, X86FSET_STIBP);
3246 }
3247
3248 /*
3249 * Don't read the arch caps MSR on xpv where we lack the
3250 * on_trap().
3251 */
3252 #ifndef __xpv
3253 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3254 on_trap_data_t otd;
3255
3256 /*
3257 * Be paranoid and assume we'll get a #GP.
3258 */
3259 if (!on_trap(&otd, OT_DATA_ACCESS)) {
3260 uint64_t reg;
3261
3262 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3263 if (reg & IA32_ARCH_CAP_RDCL_NO) {
3264 add_x86_feature(featureset,
3265 X86FSET_RDCL_NO);
3266 }
3267 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3268 add_x86_feature(featureset,
3269 X86FSET_IBRS_ALL);
3270 }
3271 if (reg & IA32_ARCH_CAP_RSBA) {
3272 add_x86_feature(featureset,
3273 X86FSET_RSBA);
3274 }
3275 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3276 add_x86_feature(featureset,
3277 X86FSET_L1D_VM_NO);
3278 }
3279 if (reg & IA32_ARCH_CAP_SSB_NO) {
3280 add_x86_feature(featureset,
3281 X86FSET_SSB_NO);
3282 }
3283 if (reg & IA32_ARCH_CAP_MDS_NO) {
3284 add_x86_feature(featureset,
3285 X86FSET_MDS_NO);
3286 }
3287 if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3288 add_x86_feature(featureset,
3289 X86FSET_TSX_CTRL);
3290 }
3291 if (reg & IA32_ARCH_CAP_TAA_NO) {
3292 add_x86_feature(featureset,
3293 X86FSET_TAA_NO);
3294 }
3295 if (reg & IA32_ARCH_CAP_RFDS_NO) {
3296 add_x86_feature(featureset,
3297 X86FSET_RFDS_NO);
3298 }
3299 if (reg & IA32_ARCH_CAP_RFDS_CLEAR) {
3300 add_x86_feature(featureset,
3301 X86FSET_RFDS_CLEAR);
3302 }
3303 if (reg & IA32_ARCH_CAP_PBRSB_NO) {
3304 add_x86_feature(featureset,
3305 X86FSET_PBRSB_NO);
3306 }
3307 }
3308 no_trap();
3309 }
3310 #endif /* !__xpv */
3311
3312 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3313 add_x86_feature(featureset, X86FSET_SSBD);
3314
3315 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3316 add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3317 }
3318
3319 /*
3320 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3321 * will have already run this function and determined what we need to
3322 * do. This gives us a hook for per-HW thread mitigations such as
3323 * enhanced IBRS, or disabling TSX.
3324 */
3325 if (cpu->cpu_id != 0) {
3326 switch (x86_spectrev2_mitigation) {
3327 case X86_SPECTREV2_ENHANCED_IBRS:
3328 cpuid_enable_enhanced_ibrs();
3329 break;
3330 case X86_SPECTREV2_AUTO_IBRS:
3331 cpuid_enable_auto_ibrs();
3332 break;
3333 default:
3334 break;
3335 }
3336
3337 cpuid_apply_tsx(x86_taa_mitigation, featureset);
3338 return;
3339 }
3340
3341 /*
3342 * Go through and initialize various security mechanisms that we should
3343 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3344 * TAA.
3345 */
3346
3347 /*
3348 * By default we've come in with retpolines enabled. Check whether we
3349 * should disable them or enable enhanced or automatic IBRS. RSB
3350 * stuffing is enabled by default. Note, we do not allow the use of AMD
3351 * optimized retpolines as it was disclosed by AMD in March 2022 that
3352 * they were still vulnerable. Prior to that point, we used them.
3353 */
3354 if (x86_disable_spectrev2 != 0) {
3355 v2mit = X86_SPECTREV2_DISABLED;
3356 } else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3357 cpuid_enable_auto_ibrs();
3358 v2mit = X86_SPECTREV2_AUTO_IBRS;
3359 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3360 cpuid_enable_enhanced_ibrs();
3361 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3362 } else {
3363 v2mit = X86_SPECTREV2_RETPOLINE;
3364 }
3365
3366 cpuid_patch_retpolines(v2mit);
3367 x86_spectrev2_mitigation = v2mit;
3368 membar_producer();
3369
3370 /*
3371 * Return-stack buffer clearing may need a software sequence. Discover
3372 * and patch as appropriate, after setting the SPECTREv2 global
3373 * mitigation level.
3374 */
3375 cpuid_patch_rsb(v2mit, is_x86_feature(featureset, X86FSET_PBRSB_NO));
3376 membar_producer();
3377
3378 /*
3379 * We need to determine what changes are required for mitigating L1TF
3380 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3381 * is required.
3382 *
3383 * If any of these are present, then we need to flush u-arch state at
3384 * various points. For MDS, we need to do so whenever we change to a
3385 * lesser privilege level or we are halting the CPU. For L1TF we need to
3386 * flush the L1D cache at VM entry. When we have microcode that handles
3387 * MDS, the L1D flush also clears the other u-arch state that the
3388 * md_clear does.
3389 */
3390
3391 /*
3392 * Update whether or not we need to be taking explicit action against
3393 * MDS or RFDS.
3394 */
3395 cpuid_update_md_clear(cpu, featureset);
3396
3397 /*
3398 * Determine whether SMT exclusion is required and whether or not we
3399 * need to perform an l1d flush.
3400 */
3401 cpuid_update_l1d_flush(cpu, featureset);
3402
3403 /*
3404 * Determine what our mitigation strategy should be for TAA and then
3405 * also apply TAA mitigations.
3406 */
3407 cpuid_update_tsx(cpu, featureset);
3408 cpuid_apply_tsx(x86_taa_mitigation, featureset);
3409 }
3410
3411 /*
3412 * Setup XFeature_Enabled_Mask register. Required by xsave feature.
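* For example, on a CPU with SSE and AVX but no AVX512, this programs
* XCR0 (via set_xcr) with the x87, SSE, and AVX bits.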
3413 */
3414 void
3415 setup_xfem(void)
3416 {
3417 uint64_t flags = XFEATURE_LEGACY_FP;
3418
3419 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3420
3421 if (is_x86_feature(x86_featureset, X86FSET_SSE))
3422 flags |= XFEATURE_SSE;
3423
3424 if (is_x86_feature(x86_featureset, X86FSET_AVX))
3425 flags |= XFEATURE_AVX;
3426
3427 if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3428 flags |= XFEATURE_AVX512;
3429
3430 set_xcr(XFEATURE_ENABLED_MASK, flags);
3431
3432 xsave_bv_all = flags;
3433 }
3434
3435 static void
3436 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3437 {
3438 struct cpuid_info *cpi;
3439
3440 cpi = cpu->cpu_m.mcpu_cpi;
3441
3442 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3443 cpi->cpi_vendor == X86_VENDOR_HYGON) {
3444 cpuid_gather_amd_topology_leaves(cpu);
3445 }
3446
3447 cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3448
3449 /*
3450 * Before we can calculate the IDs that we should assign to this
3451 * processor, we need to understand how many cores and threads it has.
3452 */
3453 switch (cpi->cpi_vendor) {
3454 case X86_VENDOR_Intel:
3455 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3456 &cpi->cpi_ncore_per_chip);
3457 break;
3458 case X86_VENDOR_AMD:
3459 case X86_VENDOR_HYGON:
3460 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3461 &cpi->cpi_ncore_per_chip);
3462 break;
3463 default:
3464 /*
3465 * If we have some other x86 compatible chip, it's not clear how
3466 * it would behave. The most common case is virtualization
3467 * today, though there are also 64-bit VIA chips. Assume that
3468 * all we can get is the basic Leaf 1 HTT information.
3469 */
3470 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3471 cpi->cpi_ncore_per_chip = 1;
3472 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3473 }
3474 break;
3475 }
3476
3477 /*
3478 * Based on the calculated number of threads and cores, potentially
3479 * assign the HTT and CMT features.
3480 */
3481 if (cpi->cpi_ncore_per_chip > 1) {
3482 add_x86_feature(featureset, X86FSET_CMP);
3483 }
3484
3485 if (cpi->cpi_ncpu_per_chip > 1 &&
3486 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3487 add_x86_feature(featureset, X86FSET_HTT);
3488 }
3489
3490 /*
3491 * Now that this has been set up, we need to go through and calculate all of
3492 * the rest of the parameters that exist. If we think the CPU doesn't
3493 * have either SMT (HTT) or CMP, then we basically go through and fake
3494 * up information in some way. The most likely case for this is
3495 * virtualization where we have a lot of partial topology information.
3496 */
3497 if (!is_x86_feature(featureset, X86FSET_HTT) &&
3498 !is_x86_feature(featureset, X86FSET_CMP)) {
3499 /*
3500 * This is a single core, single-threaded processor.
3501 */
3502 cpi->cpi_procnodes_per_pkg = 1;
3503 cpi->cpi_cores_per_compunit = 1;
3504 cpi->cpi_compunitid = 0;
3505 cpi->cpi_chipid = -1;
3506 cpi->cpi_clogid = 0;
3507 cpi->cpi_coreid = cpu->cpu_id;
3508 cpi->cpi_pkgcoreid = 0;
3509 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3510 cpi->cpi_vendor == X86_VENDOR_HYGON) {
3511 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3512 } else {
3513 cpi->cpi_procnodeid = cpi->cpi_chipid;
3514 }
3515 } else {
3516 switch (cpi->cpi_vendor) {
3517 case X86_VENDOR_Intel:
3518 cpuid_intel_getids(cpu, featureset);
3519 break;
3520 case X86_VENDOR_AMD:
3521 case X86_VENDOR_HYGON:
3522 cpuid_amd_getids(cpu, featureset);
3523 break;
3524 default:
3525 /*
3526 * In this case, it's hard to say what we should do.
3527 * We're going to model them to the OS as single core
3528 * threads. We don't have a good identifier for them, so
3529 * we're just going to use the cpu id all on a single
3530 * chip.
3531 *
3532 * This case has historically been different from the
3533 * case above where we don't have HTT or CMP. While they
3534 * could be combined, we've opted to keep it separate to
3535 * minimize the risk of topology changes in weird cases.
3536 */
3537 cpi->cpi_procnodes_per_pkg = 1;
3538 cpi->cpi_cores_per_compunit = 1;
3539 cpi->cpi_chipid = 0;
3540 cpi->cpi_coreid = cpu->cpu_id;
3541 cpi->cpi_clogid = cpu->cpu_id;
3542 cpi->cpi_pkgcoreid = cpu->cpu_id;
3543 cpi->cpi_procnodeid = cpi->cpi_chipid;
3544 cpi->cpi_compunitid = cpi->cpi_coreid;
3545 break;
3546 }
3547 }
3548 }
3549
3550 /*
3551 * Gather relevant CPU features from leaf 6 which covers thermal information. We
3552 * always gather leaf 6 if it's supported; however, we only look for features on
3553 * Intel systems as AMD does not currently define any of the features we look
3554 * for below.
3555 */
3556 static void
3557 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3558 {
3559 struct cpuid_regs *cp;
3560 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3561
3562 if (cpi->cpi_maxeax < 6) {
3563 return;
3564 }
3565
3566 cp = &cpi->cpi_std[6];
3567 cp->cp_eax = 6;
3568 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3569 (void) __cpuid_insn(cp);
3570 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3571
3572 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3573 return;
3574 }
3575
3576 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3577 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3578 }
3579
3580 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3581 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3582 }
3583 }
3584
3585 /*
3586 * This is used when we discover that we have AVX support in cpuid. It then
3587 * proceeds to scan for the rest of the AVX derived features.
3588 */
3589 static void
3590 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3591 {
3592 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3593
3594 /*
3595 * If we don't have AVX, don't bother with most of this.
3596 */
3597 if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3598 return;
3599
3600 add_x86_feature(featureset, X86FSET_AVX);
3601
3602 /*
3603 * Intel says we can't check these without also
3604 * checking AVX.
3605 */
3606 if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3607 add_x86_feature(featureset, X86FSET_F16C);
3608
3609 if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3610 add_x86_feature(featureset, X86FSET_FMA);
3611
3612 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3613 add_x86_feature(featureset, X86FSET_BMI1);
3614
3615 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3616 add_x86_feature(featureset, X86FSET_BMI2);
3617
3618 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3619 add_x86_feature(featureset, X86FSET_AVX2);
3620
3621 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3622 add_x86_feature(featureset, X86FSET_VAES);
3623
3624 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3625 add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3626
3627 /*
3628 * The rest of the AVX features require AVX512. Do not check them unless
3629 * it is present.
3630 */
3631 if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3632 return;
3633 add_x86_feature(featureset, X86FSET_AVX512F);
3634
3635 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3636 add_x86_feature(featureset, X86FSET_AVX512DQ);
3637
3638 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3639 add_x86_feature(featureset, X86FSET_AVX512FMA);
3640
3641 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3642 add_x86_feature(featureset, X86FSET_AVX512PF);
3643
3644 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3645 add_x86_feature(featureset, X86FSET_AVX512ER);
3646
3647 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3648 add_x86_feature(featureset, X86FSET_AVX512CD);
3649
3650 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3651 add_x86_feature(featureset, X86FSET_AVX512BW);
3652
3653 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3654 add_x86_feature(featureset, X86FSET_AVX512VL);
3655
3656 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3657 add_x86_feature(featureset, X86FSET_AVX512VBMI);
3658
3659 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3660 add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3661
3662 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3663 add_x86_feature(featureset, X86FSET_AVX512VNNI);
3664
3665 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3666 add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3667
3668 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3669 add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3670
3671 if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3672 add_x86_feature(featureset, X86FSET_AVX512NNIW);
3673
3674 if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3675 add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3676
3677 /*
3678 * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3679 * we don't need to.
3680 */
3681 if (cpi->cpi_std[7].cp_eax < 1)
3682 return;
3683
3684 if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3685 add_x86_feature(featureset, X86FSET_AVX512_BF16);
3686 }
3687
3688 /*
3689 * PPIN is the protected processor inventory number. On AMD this is an actual
3690 * feature bit. However, on Intel systems we need to read the platform
3691 * information MSR if we're on a specific model.
3692 */
3693 #if !defined(__xpv)
3694 static void
3695 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3696 {
3697 on_trap_data_t otd;
3698 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3699
3700 switch (cpi->cpi_vendor) {
3701 case X86_VENDOR_AMD:
3702 /*
3703 * This leaf will have already been gathered in the topology
3704 * functions.
3705 */
3706 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3707 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3708 add_x86_feature(featureset, X86FSET_PPIN);
3709 }
3710 }
3711 break;
3712 case X86_VENDOR_Intel:
3713 if (cpi->cpi_family != 6)
3714 break;
3715 switch (cpi->cpi_model) {
3716 case INTC_MODEL_IVYBRIDGE_XEON:
3717 case INTC_MODEL_HASWELL_XEON:
3718 case INTC_MODEL_BROADWELL_XEON:
3719 case INTC_MODEL_BROADWELL_XEON_D:
3720 case INTC_MODEL_SKYLAKE_XEON:
3721 case INTC_MODEL_ICELAKE_XEON:
3722 if (!on_trap(&otd, OT_DATA_ACCESS)) {
3723 uint64_t value;
3724
3725 value = rdmsr(MSR_PLATFORM_INFO);
3726 if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3727 add_x86_feature(featureset,
3728 X86FSET_PPIN);
3729 }
3730 }
3731 no_trap();
3732 break;
3733 default:
3734 break;
3735 }
3736 break;
3737 default:
3738 break;
3739 }
3740 }
3741 #endif /* ! __xpv */
3742
3743 static void
3744 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3745 {
3746 uchar_t *featureset = (uchar_t *)arg;
3747
3748 /*
3749 * We don't run on any processor that doesn't have cpuid, and could not
3750 * possibly have arrived here.
3751 */
3752 add_x86_feature(featureset, X86FSET_CPUID);
3753 }
3754
3755 static void
3756 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3757 {
3758 struct cpuid_info *cpi;
3759 struct cpuid_regs *cp;
3760
3761 /*
3762 * We require that virtual/native detection be complete and that PCI
3763 * config space access has been set up; at present there is no reliable
3764 * way to determine the latter.
3765 */
3766 #if !defined(__xpv)
3767 ASSERT3S(platform_type, !=, -1);
3768 #endif /* !__xpv */
3769
3770 cpi = cpu->cpu_m.mcpu_cpi;
3771 ASSERT(cpi != NULL);
3772
3773 cp = &cpi->cpi_std[0];
3774 cp->cp_eax = 0;
3775 cpi->cpi_maxeax = __cpuid_insn(cp);
3776 {
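/*
* The 12-byte vendor string is returned in %ebx, %edx, %ecx order,
* e.g. "GenuineIntel" or "AuthenticAMD".
*/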
3777 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3778 *iptr++ = cp->cp_ebx;
3779 *iptr++ = cp->cp_edx;
3780 *iptr++ = cp->cp_ecx;
3781 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3782 }
3783
3784 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3785 x86_vendor = cpi->cpi_vendor; /* for compatibility */
3786
3787 /*
3788 * Limit the range in case of weird hardware
3789 */
3790 if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3791 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3792 if (cpi->cpi_maxeax < 1)
3793 return;
3794
3795 cp = &cpi->cpi_std[1];
3796 cp->cp_eax = 1;
3797 (void) __cpuid_insn(cp);
3798
3799 /*
3800 * Extract identifying constants for easy access.
3801 */
3802 cpi->cpi_model = CPI_MODEL(cpi);
3803 cpi->cpi_family = CPI_FAMILY(cpi);
3804
3805 if (cpi->cpi_family == 0xf)
3806 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3807
3808 /*
3809 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3810 * Intel, and presumably everyone else, uses model == 0xf, as
3811 * one would expect (max value means possible overflow). Sigh.
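*
* For example, an Intel part with base model 0xa and extended model 0x5
* ends up with cpi_model 0x5a after the adjustment below.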
3812 */
3813
3814 switch (cpi->cpi_vendor) {
3815 case X86_VENDOR_Intel:
3816 if (IS_EXTENDED_MODEL_INTEL(cpi))
3817 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3818 break;
3819 case X86_VENDOR_AMD:
3820 if (CPI_FAMILY(cpi) == 0xf)
3821 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3822 break;
3823 case X86_VENDOR_HYGON:
3824 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3825 break;
3826 default:
3827 if (cpi->cpi_model == 0xf)
3828 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3829 break;
3830 }
3831
3832 cpi->cpi_step = CPI_STEP(cpi);
3833 cpi->cpi_brandid = CPI_BRANDID(cpi);
3834
3835 /*
3836 * Synthesize chip "revision" and socket type
3837 */
3838 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3839 cpi->cpi_model, cpi->cpi_step);
3840 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3841 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3842 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3843 cpi->cpi_model, cpi->cpi_step);
3844 cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
3845 cpi->cpi_model, cpi->cpi_step);
3846 }
3847
3848 static void
3849 cpuid_pass_basic(cpu_t *cpu, void *arg)
3850 {
3851 uchar_t *featureset = (uchar_t *)arg;
3852 uint32_t mask_ecx, mask_edx;
3853 struct cpuid_info *cpi;
3854 struct cpuid_regs *cp;
3855 int xcpuid;
3856 #if !defined(__xpv)
3857 extern int idle_cpu_prefer_mwait;
3858 #endif
3859
3860 cpi = cpu->cpu_m.mcpu_cpi;
3861 ASSERT(cpi != NULL);
3862
3863 if (cpi->cpi_maxeax < 1)
3864 return;
3865
3866 /*
3867 * This was filled during the identification pass.
3868 */
3869 cp = &cpi->cpi_std[1];
3870
3871 /*
3872 * *default* assumptions:
3873 * - believe %edx feature word
3874 * - ignore %ecx feature word
3875 * - 32-bit virtual and physical addressing
3876 */
3877 mask_edx = 0xffffffff;
3878 mask_ecx = 0;
3879
3880 cpi->cpi_pabits = cpi->cpi_vabits = 32;
3881
3882 switch (cpi->cpi_vendor) {
3883 case X86_VENDOR_Intel:
3884 if (cpi->cpi_family == 5)
3885 x86_type = X86_TYPE_P5;
3886 else if (IS_LEGACY_P6(cpi)) {
3887 x86_type = X86_TYPE_P6;
3888 pentiumpro_bug4046376 = 1;
3889 /*
3890 * Clear the SEP bit when it was set erroneously
3891 */
3892 if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3893 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3894 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3895 x86_type = X86_TYPE_P4;
3896 /*
3897 * We don't currently depend on any of the %ecx
3898 * features until Prescott, so we'll only check
3899 * this from P4 onwards. We might want to revisit
3900 * that idea later.
3901 */
3902 mask_ecx = 0xffffffff;
3903 } else if (cpi->cpi_family > 0xf)
3904 mask_ecx = 0xffffffff;
3905 /*
3906 * We don't support MONITOR/MWAIT if leaf 5 is not available
3907 * to obtain the monitor linesize.
3908 */
3909 if (cpi->cpi_maxeax < 5)
3910 mask_ecx &= ~CPUID_INTC_ECX_MON;
3911 break;
3912 case X86_VENDOR_IntelClone:
3913 default:
3914 break;
3915 case X86_VENDOR_AMD:
3916 #if defined(OPTERON_ERRATUM_108)
3917 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3918 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3919 cpi->cpi_model = 0xc;
3920 } else
3921 #endif
3922 if (cpi->cpi_family == 5) {
3923 /*
3924 * AMD K5 and K6
3925 *
3926 * These CPUs have an incomplete implementation
3927 * of MCA/MCE which we mask away.
3928 */
3929 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3930
3931 /*
3932 * Model 0 uses the wrong (APIC) bit
3933 * to indicate PGE. Fix it here.
3934 */
3935 if (cpi->cpi_model == 0) {
3936 if (cp->cp_edx & 0x200) {
3937 cp->cp_edx &= ~0x200;
3938 cp->cp_edx |= CPUID_INTC_EDX_PGE;
3939 }
3940 }
3941
3942 /*
3943 * Early models had problems w/ MMX; disable.
3944 */
3945 if (cpi->cpi_model < 6)
3946 mask_edx &= ~CPUID_INTC_EDX_MMX;
3947 }
3948
3949 /*
3950 * For newer families, SSE3 and CX16, at least, are valid;
3951 * enable all
3952 */
3953 if (cpi->cpi_family >= 0xf)
3954 mask_ecx = 0xffffffff;
3955 /*
3956 * We don't support MONITOR/MWAIT if leaf 5 is not available
3957 * to obtain the monitor linesize.
3958 */
3959 if (cpi->cpi_maxeax < 5)
3960 mask_ecx &= ~CPUID_INTC_ECX_MON;
3961
3962 #if !defined(__xpv)
3963 /*
3964 * AMD has not historically used MWAIT in the CPU's idle loop.
3965 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3966 * know for certain that in at least family 17h, per AMD, mwait
3967 * is preferred. Families in-between are less certain.
3968 */
3969 if (cpi->cpi_family < 0x17) {
3970 idle_cpu_prefer_mwait = 0;
3971 }
3972 #endif
3973
3974 break;
3975 case X86_VENDOR_HYGON:
3976 /* Enable all for Hygon Dhyana CPU */
3977 mask_ecx = 0xffffffff;
3978 break;
3979 case X86_VENDOR_TM:
3980 /*
3981 * workaround the NT workaround in CMS 4.1
3982 */
3983 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3984 (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3985 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3986 break;
3987 case X86_VENDOR_Centaur:
3988 /*
3989 * workaround the NT workarounds again
3990 */
3991 if (cpi->cpi_family == 6)
3992 cp->cp_edx |= CPUID_INTC_EDX_CX8;
3993 break;
3994 case X86_VENDOR_Cyrix:
3995 /*
3996 * We rely heavily on the probing in locore
3997 * to actually figure out what parts, if any,
3998 * of the Cyrix cpuid instruction to believe.
3999 */
4000 switch (x86_type) {
4001 case X86_TYPE_CYRIX_486:
4002 mask_edx = 0;
4003 break;
4004 case X86_TYPE_CYRIX_6x86:
4005 mask_edx = 0;
4006 break;
4007 case X86_TYPE_CYRIX_6x86L:
4008 mask_edx =
4009 CPUID_INTC_EDX_DE |
4010 CPUID_INTC_EDX_CX8;
4011 break;
4012 case X86_TYPE_CYRIX_6x86MX:
4013 mask_edx =
4014 CPUID_INTC_EDX_DE |
4015 CPUID_INTC_EDX_MSR |
4016 CPUID_INTC_EDX_CX8 |
4017 CPUID_INTC_EDX_PGE |
4018 CPUID_INTC_EDX_CMOV |
4019 CPUID_INTC_EDX_MMX;
4020 break;
4021 case X86_TYPE_CYRIX_GXm:
4022 mask_edx =
4023 CPUID_INTC_EDX_MSR |
4024 CPUID_INTC_EDX_CX8 |
4025 CPUID_INTC_EDX_CMOV |
4026 CPUID_INTC_EDX_MMX;
4027 break;
4028 case X86_TYPE_CYRIX_MediaGX:
4029 break;
4030 case X86_TYPE_CYRIX_MII:
4031 case X86_TYPE_VIA_CYRIX_III:
4032 mask_edx =
4033 CPUID_INTC_EDX_DE |
4034 CPUID_INTC_EDX_TSC |
4035 CPUID_INTC_EDX_MSR |
4036 CPUID_INTC_EDX_CX8 |
4037 CPUID_INTC_EDX_PGE |
4038 CPUID_INTC_EDX_CMOV |
4039 CPUID_INTC_EDX_MMX;
4040 break;
4041 default:
4042 break;
4043 }
4044 break;
4045 }
4046
4047 #if defined(__xpv)
4048 /*
4049 * Do not support MONITOR/MWAIT under a hypervisor
4050 */
4051 mask_ecx &= ~CPUID_INTC_ECX_MON;
4052 /*
4053 * Do not support XSAVE under a hypervisor for now
4054 */
4055 xsave_force_disable = B_TRUE;
4056
4057 #endif /* __xpv */
4058
4059 if (xsave_force_disable) {
4060 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
4061 mask_ecx &= ~CPUID_INTC_ECX_AVX;
4062 mask_ecx &= ~CPUID_INTC_ECX_F16C;
4063 mask_ecx &= ~CPUID_INTC_ECX_FMA;
4064 }
4065
4066 /*
4067 * Now we've figured out the masks that determine
4068 * which bits we choose to believe, apply the masks
4069 * to the feature words, then map the kernel's view
4070 * of these feature words into its feature word.
4071 */
4072 cp->cp_edx &= mask_edx;
4073 cp->cp_ecx &= mask_ecx;
4074
4075 /*
4076 * apply any platform restrictions (we don't call this
4077 * immediately after __cpuid_insn here, because we need the
4078 * workarounds applied above first)
4079 */
4080 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
4081
4082 /*
4083 * In addition to ecx and edx, Intel and AMD are storing a bunch of
4084 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
4085 * 7 has sub-leaves determined by ecx.
4086 */
4087 if (cpi->cpi_maxeax >= 7) {
4088 struct cpuid_regs *ecp;
4089 ecp = &cpi->cpi_std[7];
4090 ecp->cp_eax = 7;
4091 ecp->cp_ecx = 0;
4092 (void) __cpuid_insn(ecp);
4093
4094 /*
4095 * If XSAVE has been disabled, just ignore all of the
4096 * extended-save-area dependent flags here. Removing most of
4097 * the leaf 7, sub-leaf 0 flags ensures that we don't end up
4098 * looking at additional xsave dependent leaves right
4099 * now.
4100 */
4101 if (xsave_force_disable) {
4102 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4103 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4104 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4105 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
4106 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4107 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4108 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4109 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
4110 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
4111 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
4112 }
4113
4114 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
4115 add_x86_feature(featureset, X86FSET_SMEP);
4116
4117 /*
4118 * We check disable_smap here in addition to in startup_smap()
4119 * to ensure CPUs that aren't the boot CPU don't accidentally
4120 * include it in the feature set and thus generate a mismatched
4121 * x86 feature set across CPUs.
4122 */
4123 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
4124 disable_smap == 0)
4125 add_x86_feature(featureset, X86FSET_SMAP);
4126
4127 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
4128 add_x86_feature(featureset, X86FSET_RDSEED);
4129
4130 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
4131 add_x86_feature(featureset, X86FSET_ADX);
4132
4133 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
4134 add_x86_feature(featureset, X86FSET_FSGSBASE);
4135
4136 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
4137 add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4138
4139 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4140 add_x86_feature(featureset, X86FSET_INVPCID);
4141
4142 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4143 add_x86_feature(featureset, X86FSET_UMIP);
4144 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4145 add_x86_feature(featureset, X86FSET_PKU);
4146 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4147 add_x86_feature(featureset, X86FSET_OSPKE);
4148 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4149 add_x86_feature(featureset, X86FSET_GFNI);
4150
4151 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4152 add_x86_feature(featureset, X86FSET_CLWB);
4153
4154 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4155 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4156 add_x86_feature(featureset, X86FSET_MPX);
4157 }
4158
4159 /*
4160 * If we have subleaf 1 available, grab and store that. This is
4161 * used for more AVX and related features.
4162 */
4163 if (ecp->cp_eax >= 1) {
4164 struct cpuid_regs *c71;
4165 c71 = &cpi->cpi_sub7[0];
4166 c71->cp_eax = 7;
4167 c71->cp_ecx = 1;
4168 (void) __cpuid_insn(c71);
4169 }
4170 }
4171
4172 /*
4173 * fold in overrides from the "eeprom" mechanism
4174 */
4175 cp->cp_edx |= cpuid_feature_edx_include;
4176 cp->cp_edx &= ~cpuid_feature_edx_exclude;
4177
4178 cp->cp_ecx |= cpuid_feature_ecx_include;
4179 cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4180
4181 if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4182 add_x86_feature(featureset, X86FSET_LARGEPAGE);
4183 }
4184 if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4185 add_x86_feature(featureset, X86FSET_TSC);
4186 }
4187 if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4188 add_x86_feature(featureset, X86FSET_MSR);
4189 }
4190 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4191 add_x86_feature(featureset, X86FSET_MTRR);
4192 }
4193 if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4194 add_x86_feature(featureset, X86FSET_PGE);
4195 }
4196 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4197 add_x86_feature(featureset, X86FSET_CMOV);
4198 }
4199 if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4200 add_x86_feature(featureset, X86FSET_MMX);
4201 }
4202 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4203 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4204 add_x86_feature(featureset, X86FSET_MCA);
4205 }
4206 if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4207 add_x86_feature(featureset, X86FSET_PAE);
4208 }
4209 if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4210 add_x86_feature(featureset, X86FSET_CX8);
4211 }
4212 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4213 add_x86_feature(featureset, X86FSET_CX16);
4214 }
4215 if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4216 add_x86_feature(featureset, X86FSET_PAT);
4217 }
4218 if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4219 add_x86_feature(featureset, X86FSET_SEP);
4220 }
4221 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4222 /*
4223 * In our implementation, fxsave/fxrstor
4224 * are prerequisites before we'll even
4225 * try and do SSE things.
4226 */
4227 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4228 add_x86_feature(featureset, X86FSET_SSE);
4229 }
4230 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4231 add_x86_feature(featureset, X86FSET_SSE2);
4232 }
4233 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4234 add_x86_feature(featureset, X86FSET_SSE3);
4235 }
4236 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4237 add_x86_feature(featureset, X86FSET_SSSE3);
4238 }
4239 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4240 add_x86_feature(featureset, X86FSET_SSE4_1);
4241 }
4242 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4243 add_x86_feature(featureset, X86FSET_SSE4_2);
4244 }
4245 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4246 add_x86_feature(featureset, X86FSET_AES);
4247 }
4248 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4249 add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4250 }
4251
4252 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4253 add_x86_feature(featureset, X86FSET_SHA);
4254
4255 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4256 add_x86_feature(featureset, X86FSET_XSAVE);
4257
4258 /* We only test AVX & AVX512 when there is XSAVE */
4259 cpuid_basic_avx(cpu, featureset);
4260 }
4261 }
4262
4263 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4264 add_x86_feature(featureset, X86FSET_PCID);
4265 }
4266
4267 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4268 add_x86_feature(featureset, X86FSET_X2APIC);
4269 }
4270 if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4271 add_x86_feature(featureset, X86FSET_DE);
4272 }
4273 #if !defined(__xpv)
4274 if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4275
4276 /*
4277 * We require the CLFLUSH instruction for erratum workaround
4278 * to use MONITOR/MWAIT.
4279 */
4280 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4281 cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4282 add_x86_feature(featureset, X86FSET_MWAIT);
4283 } else {
4284 extern int idle_cpu_assert_cflush_monitor;
4285
4286 /*
4287 * All processors we are aware of which have
4288 * MONITOR/MWAIT also have CLFLUSH.
4289 */
4290 if (idle_cpu_assert_cflush_monitor) {
4291 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4292 (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4293 }
4294 }
4295 }
4296 #endif /* __xpv */
4297
4298 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4299 add_x86_feature(featureset, X86FSET_VMX);
4300 }
4301
4302 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4303 add_x86_feature(featureset, X86FSET_RDRAND);
4304
4305 /*
4306 * Only needed the first time; the rest of the CPUs will follow suit.
4307 * We only capture this for the boot CPU.
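*
* For example, a CLFLUSH line-size field (%ebx[15:8]) of 8 corresponds
* to a 64-byte flush size (8 * 8 bytes).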
4308 */
4309 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4310 add_x86_feature(featureset, X86FSET_CLFSH);
4311 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4312 }
4313 if (is_x86_feature(featureset, X86FSET_PAE))
4314 cpi->cpi_pabits = 36;
4315
4316 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4317 struct cpuid_regs r, *ecp;
4318
4319 ecp = &r;
4320 ecp->cp_eax = 0xD;
4321 ecp->cp_ecx = 1;
4322 ecp->cp_edx = ecp->cp_ebx = 0;
4323 (void) __cpuid_insn(ecp);
4324
4325 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4326 add_x86_feature(featureset, X86FSET_XSAVEOPT);
4327 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4328 add_x86_feature(featureset, X86FSET_XSAVEC);
4329 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4330 add_x86_feature(featureset, X86FSET_XSAVES);
4331
4332 /*
4333 * Zen 2 family processors suffer from erratum 1386 that causes
4334 * xsaves to not function correctly in some circumstances. There
4335 * are no supervisor states in Zen 2 and earlier. Practically
4336 * speaking this has no impact for us as we currently do not
4337 * leverage compressed xsave formats. To safeguard against
4338 * issues in the future where we may opt to using it, we remove
4339 * it from the feature set now. While Matisse has a microcode
4340 * update available with a fix, not all Zen 2 CPUs do, so it's
4341 * simpler for the moment to unconditionally remove it.
4342 */
4343 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4344 uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4345 remove_x86_feature(featureset, X86FSET_XSAVES);
4346 }
4347 }
4348
4349 /*
4350 * Work on the "extended" feature information, doing
4351 * some basic initialization to be used in the extended pass.
4352 */
4353 xcpuid = 0;
4354 switch (cpi->cpi_vendor) {
4355 case X86_VENDOR_Intel:
4356 /*
4357 * On KVM we know we will have proper support for extended
4358 * cpuid.
4359 */
4360 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4361 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4362 (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4363 xcpuid++;
4364 break;
4365 case X86_VENDOR_AMD:
4366 if (cpi->cpi_family > 5 ||
4367 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4368 xcpuid++;
4369 break;
4370 case X86_VENDOR_Cyrix:
4371 /*
4372 * Only these Cyrix CPUs are -known- to support
4373 * extended cpuid operations.
4374 */
4375 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4376 x86_type == X86_TYPE_CYRIX_GXm)
4377 xcpuid++;
4378 break;
4379 case X86_VENDOR_HYGON:
4380 case X86_VENDOR_Centaur:
4381 case X86_VENDOR_TM:
4382 default:
4383 xcpuid++;
4384 break;
4385 }
4386
4387 if (xcpuid) {
4388 cp = &cpi->cpi_extd[0];
4389 cp->cp_eax = CPUID_LEAF_EXT_0;
4390 cpi->cpi_xmaxeax = __cpuid_insn(cp);
4391 }
4392
4393 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4394
4395 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4396 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4397
4398 switch (cpi->cpi_vendor) {
4399 case X86_VENDOR_Intel:
4400 case X86_VENDOR_AMD:
4401 case X86_VENDOR_HYGON:
4402 if (cpi->cpi_xmaxeax < 0x80000001)
4403 break;
4404 cp = &cpi->cpi_extd[1];
4405 cp->cp_eax = 0x80000001;
4406 (void) __cpuid_insn(cp);
4407
4408 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4409 cpi->cpi_family == 5 &&
4410 cpi->cpi_model == 6 &&
4411 cpi->cpi_step == 6) {
4412 /*
4413 * K6 model 6 uses bit 10 to indicate SYSC.
4414 * Later models use bit 11. Fix it here.
4415 */
4416 if (cp->cp_edx & 0x400) {
4417 cp->cp_edx &= ~0x400;
4418 cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4419 }
4420 }
4421
4422 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4423
4424 /*
4425 * Compute the additions to the kernel's feature word.
4426 */
4427 if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4428 add_x86_feature(featureset, X86FSET_NX);
4429 }
4430
4431 /*
4432 * Regardless of whether or not we boot 64-bit,
4433 * we should have a way to identify whether
4434 * the CPU is capable of running 64-bit.
4435 */
4436 if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4437 add_x86_feature(featureset, X86FSET_64);
4438 }
4439
4440 /* 1 GB large page - enable only for 64 bit kernel */
4441 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4442 add_x86_feature(featureset, X86FSET_1GPG);
4443 }
4444
4445 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4446 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4447 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4448 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4449 add_x86_feature(featureset, X86FSET_SSE4A);
4450 }
4451
4452 /*
4453 * It's really tricky to support syscall/sysret in
4454 * the i386 kernel; we rely on sysenter/sysexit
4455 * instead. In the amd64 kernel, things are -way-
4456 * better.
4457 */
4458 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4459 add_x86_feature(featureset, X86FSET_ASYSC);
4460 }
4461
4462 /*
4463 * While we're thinking about system calls, note
4464 * that AMD processors don't support sysenter
4465 * in long mode at all, so don't try to program them.
4466 */
4467 if (x86_vendor == X86_VENDOR_AMD ||
4468 x86_vendor == X86_VENDOR_HYGON) {
4469 remove_x86_feature(featureset, X86FSET_SEP);
4470 }
4471
4472 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4473 add_x86_feature(featureset, X86FSET_TSCP);
4474 }
4475
4476 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4477 add_x86_feature(featureset, X86FSET_SVM);
4478 }
4479
4480 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4481 add_x86_feature(featureset, X86FSET_TOPOEXT);
4482 }
4483
4484 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4485 add_x86_feature(featureset, X86FSET_AMD_PCEC);
4486 }
4487
4488 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4489 add_x86_feature(featureset, X86FSET_XOP);
4490 }
4491
4492 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4493 add_x86_feature(featureset, X86FSET_FMA4);
4494 }
4495
4496 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4497 add_x86_feature(featureset, X86FSET_TBM);
4498 }
4499
4500 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4501 add_x86_feature(featureset, X86FSET_MONITORX);
4502 }
4503 break;
4504 default:
4505 break;
4506 }
4507
4508 /*
4509 * Get CPUID data about processor cores and hyperthreads.
4510 */
4511 switch (cpi->cpi_vendor) {
4512 case X86_VENDOR_Intel:
4513 if (cpi->cpi_maxeax >= 4) {
4514 cp = &cpi->cpi_std[4];
4515 cp->cp_eax = 4;
4516 cp->cp_ecx = 0;
4517 (void) __cpuid_insn(cp);
4518 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4519 }
4520 /*FALLTHROUGH*/
4521 case X86_VENDOR_AMD:
4522 case X86_VENDOR_HYGON:
4523 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4524 break;
4525 cp = &cpi->cpi_extd[8];
4526 cp->cp_eax = CPUID_LEAF_EXT_8;
4527 (void) __cpuid_insn(cp);
4528 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4529 cp);
4530
4531 /*
4532 * AMD uses ebx for some extended functions.
4533 */
4534 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4535 cpi->cpi_vendor == X86_VENDOR_HYGON) {
4536 /*
4537 * While we're here, check for the AMD "Error
4538 * Pointer Zero/Restore" feature. This can be
4539 * used to set up the FP save handlers
4540 * appropriately.
4541 */
4542 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4543 cpi->cpi_fp_amd_save = 0;
4544 } else {
4545 cpi->cpi_fp_amd_save = 1;
4546 }
4547
4548 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4549 add_x86_feature(featureset,
4550 X86FSET_CLZERO);
4551 }
4552 }
4553
4554 /*
4555 * Virtual and physical address limits from
4556 * cpuid override previously guessed values.
4557 */
4558 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4559 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4560 break;
4561 default:
4562 break;
4563 }
4564
4565 /*
4566 * Get CPUID data about TSC Invariance in Deep C-State.
4567 */
4568 switch (cpi->cpi_vendor) {
4569 case X86_VENDOR_Intel:
4570 case X86_VENDOR_AMD:
4571 case X86_VENDOR_HYGON:
4572 if (cpi->cpi_maxeax >= 7) {
4573 cp = &cpi->cpi_extd[7];
4574 cp->cp_eax = 0x80000007;
4575 cp->cp_ecx = 0;
4576 (void) __cpuid_insn(cp);
4577 }
4578 break;
4579 default:
4580 break;
4581 }
4582 }
4583
4584 /*
4585 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4586 * run and thus gathered some of its dependent leaves.
4587 */
4588 cpuid_basic_topology(cpu, featureset);
4589 cpuid_basic_thermal(cpu, featureset);
4590 #if !defined(__xpv)
4591 cpuid_basic_ppin(cpu, featureset);
4592 #endif
4593
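/*
 * Default to needing the AMD floating point error-pointer workaround
 * unless leaf 0x80000008 %ebx explicitly advertises "Error Pointer
 * Zero" (CPUID_AMD_EBX_ERR_PTR_ZERO).
 */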
4594 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4595 cpi->cpi_vendor == X86_VENDOR_HYGON) {
4596 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4597 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4598 /* Special handling for AMD FP not necessary. */
4599 cpi->cpi_fp_amd_save = 0;
4600 } else {
4601 cpi->cpi_fp_amd_save = 1;
4602 }
4603 }
4604
4605 /*
4606 * Check (and potentially set) if lfence is serializing.
4607 * This is useful for accurate rdtsc measurements and AMD retpolines.
4608 */
4609 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4610 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4611 is_x86_feature(featureset, X86FSET_SSE2)) {
4612 /*
4613 * The AMD white paper Software Techniques For Managing
4614 * Speculation on AMD Processors details circumstances for when
4615 * lfence instructions are serializing.
4616 *
4617 * On family 0xf and 0x11, it is inherently so. On family 0x10
4618 * and later (excluding 0x11), a bit in the DE_CFG MSR
4619 * determines the lfence behavior. Per that whitepaper, AMD has
4620 * committed to supporting that MSR on all later CPUs.
4621 */
4622 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4623 add_x86_feature(featureset, X86FSET_LFENCE_SER);
4624 } else if (cpi->cpi_family >= 0x10) {
4625 #if !defined(__xpv)
4626 uint64_t val;
4627
4628 /*
4629 * Be careful when attempting to enable the bit, and
4630 * verify that it was actually set in case we are
4631 * running in a hypervisor which is less than faithful
4632 * about its emulation of this feature.
4633 */
4634 on_trap_data_t otd;
4635 if (!on_trap(&otd, OT_DATA_ACCESS)) {
4636 val = rdmsr(MSR_AMD_DE_CFG);
4637 val |= AMD_DE_CFG_LFENCE_DISPATCH;
4638 wrmsr(MSR_AMD_DE_CFG, val);
4639 val = rdmsr(MSR_AMD_DE_CFG);
4640 } else {
4641 val = 0;
4642 }
4643 no_trap();
4644
4645 if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4646 add_x86_feature(featureset, X86FSET_LFENCE_SER);
4647 }
4648 #endif
4649 }
4650 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4651 is_x86_feature(featureset, X86FSET_SSE2)) {
4652 /*
4653 * Documentation and other OSes indicate that lfence is always
4654 * serializing on Intel CPUs.
4655 */
4656 add_x86_feature(featureset, X86FSET_LFENCE_SER);
4657 }
4658
4659
4660 /*
4661 * Check the processor leaves that are used for security features. Grab
4662 * any additional processor-specific leaves that we may not have yet.
4663 */
4664 switch (cpi->cpi_vendor) {
4665 case X86_VENDOR_AMD:
4666 case X86_VENDOR_HYGON:
4667 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4668 cp = &cpi->cpi_extd[0x21];
4669 cp->cp_eax = CPUID_LEAF_EXT_21;
4670 cp->cp_ecx = 0;
4671 (void) __cpuid_insn(cp);
4672 }
4673 break;
4674 default:
4675 break;
4676 }
4677
4678 cpuid_scan_security(cpu, featureset);
4679 }
4680
4681 /*
4682 * Make copies of the cpuid table entries we depend on, in
4683 * part for ease of parsing now, in part so that we have only
4684 * one place to correct any of it, in part for ease of
4685 * later export to userland, and in part so we can look at
4686 * this stuff in a crash dump.
4687 */
4688
4689 static void
4690 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4691 {
4692 uint_t n, nmax;
4693 int i;
4694 struct cpuid_regs *cp;
4695 uint8_t *dp;
4696 uint32_t *iptr;
4697 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4698
4699 if (cpi->cpi_maxeax < 1)
4700 return;
4701
4702 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4703 nmax = NMAX_CPI_STD;
4704 /*
4705 * (We already handled n == 0 and n == 1 in the basic pass)
4706 */
4707 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4708 /*
4709 * leaves 6 and 7 were handled in the basic pass
4710 */
4711 if (n == 6 || n == 7)
4712 continue;
4713
4714 cp->cp_eax = n;
4715
4716 /*
4717 * CPUID function 4 expects %ecx to be initialized
4718 * with an index which indicates which cache to return
4719 * information about. The OS is expected to call function 4
4720 * with %ecx set to 0, 1, 2, ... until it returns with
4721 * EAX[4:0] set to 0, which indicates there are no more
4722 * caches.
4723 *
4724 * Here, populate cpi_std[4] with the information returned by
4725 * function 4 when %ecx == 0, and do the rest in a later pass
4726 * when dynamic memory allocation becomes available.
4727 *
4728 * Note: we need to explicitly initialize %ecx here, since
4729 * function 4 may have been previously invoked.
4730 */
4731 if (n == 4)
4732 cp->cp_ecx = 0;
4733
4734 (void) __cpuid_insn(cp);
4735 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4736 switch (n) {
4737 case 2:
4738 /*
4739 * "the lower 8 bits of the %eax register
4740 * contain a value that identifies the number
4741 * of times the cpuid [instruction] has to be
4742 * executed to obtain a complete image of the
4743 * processor's caching systems."
4744 *
4745 * How *do* they make this stuff up?
4746 */
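/*
 * Each cpuid invocation returns sizeof (*cp) == 16 descriptor
 * bytes, so multiplying by the iteration count bounds the total
 * number of cache descriptor bytes; the count byte itself is
 * skipped just below.
 */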
4747 cpi->cpi_ncache = sizeof (*cp) *
4748 BITX(cp->cp_eax, 7, 0);
4749 if (cpi->cpi_ncache == 0)
4750 break;
4751 cpi->cpi_ncache--; /* skip count byte */
4752
4753 /*
4754 * Well, for now, rather than attempt to implement
4755 * this slightly dubious algorithm, we just look
4756 * at the first 15 ..
4757 */
4758 if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4759 cpi->cpi_ncache = sizeof (*cp) - 1;
4760
4761 dp = cpi->cpi_cacheinfo;
4762 if (BITX(cp->cp_eax, 31, 31) == 0) {
4763 uint8_t *p = (void *)&cp->cp_eax;
4764 for (i = 1; i < 4; i++)
4765 if (p[i] != 0)
4766 *dp++ = p[i];
4767 }
4768 if (BITX(cp->cp_ebx, 31, 31) == 0) {
4769 uint8_t *p = (void *)&cp->cp_ebx;
4770 for (i = 0; i < 4; i++)
4771 if (p[i] != 0)
4772 *dp++ = p[i];
4773 }
4774 if (BITX(cp->cp_ecx, 31, 31) == 0) {
4775 uint8_t *p = (void *)&cp->cp_ecx;
4776 for (i = 0; i < 4; i++)
4777 if (p[i] != 0)
4778 *dp++ = p[i];
4779 }
4780 if (BITX(cp->cp_edx, 31, 31) == 0) {
4781 uint8_t *p = (void *)&cp->cp_edx;
4782 for (i = 0; i < 4; i++)
4783 if (p[i] != 0)
4784 *dp++ = p[i];
4785 }
4786 break;
4787
4788 case 3: /* Processor serial number, if PSN supported */
4789 break;
4790
4791 case 4: /* Deterministic cache parameters */
4792 break;
4793
4794 case 5: /* Monitor/Mwait parameters */
4795 {
4796 size_t mwait_size;
4797
4798 /*
4799 * check cpi_mwait.support which was set in
4800 * cpuid_pass_basic()
4801 */
4802 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4803 break;
4804
4805 /*
4806 * Protect ourselves from an insane mwait line size.
4807 * Workaround for incomplete hardware emulator(s).
4808 */
4809 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4810 if (mwait_size < sizeof (uint32_t) ||
4811 !ISP2(mwait_size)) {
4812 #if DEBUG
4813 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4814 "size %ld", cpu->cpu_id, (long)mwait_size);
4815 #endif
4816 break;
4817 }
4818
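/*
 * Leaf 5 reports the smallest monitor-line size in %eax[15:0] and
 * the largest in %ebx[15:0]; these become the bounds we hand to
 * mwait consumers.
 */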
4819 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4820 cpi->cpi_mwait.mon_max = mwait_size;
4821 if (MWAIT_EXTENSION(cpi)) {
4822 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4823 if (MWAIT_INT_ENABLE(cpi))
4824 cpi->cpi_mwait.support |=
4825 MWAIT_ECX_INT_ENABLE;
4826 }
4827 break;
4828 }
4829 default:
4830 break;
4831 }
4832 }
4833
4834 /*
4835 * XSAVE enumeration
4836 */
4837 if (cpi->cpi_maxeax >= 0xD) {
4838 struct cpuid_regs regs;
4839 boolean_t cpuid_d_valid = B_TRUE;
4840
4841 cp = &regs;
4842 cp->cp_eax = 0xD;
4843 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4844
4845 (void) __cpuid_insn(cp);
4846
4847 /*
4848 * Sanity checks for debug
4849 */
4850 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4851 (cp->cp_eax & XFEATURE_SSE) == 0) {
4852 cpuid_d_valid = B_FALSE;
4853 }
4854
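/*
 * Sub-leaf 0 of leaf 0xD reports the supported XSAVE feature mask
 * in %edx:%eax and, in %ecx, the save-area size required by every
 * feature the hardware supports; later sub-leaves give the size and
 * offset of each individual state component.
 */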
4855 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4856 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4857 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4858
4859 /*
4860 * If the hw supports AVX, get the size and offset in the save
4861 * area for the ymm state.
4862 */
4863 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4864 cp->cp_eax = 0xD;
4865 cp->cp_ecx = 2;
4866 cp->cp_edx = cp->cp_ebx = 0;
4867
4868 (void) __cpuid_insn(cp);
4869
4870 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4871 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4872 cpuid_d_valid = B_FALSE;
4873 }
4874
4875 cpi->cpi_xsave.ymm_size = cp->cp_eax;
4876 cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4877 }
4878
4879 /*
4880 * If the hw supports MPX, get the size and offset in the
4881 * save area for BNDREGS and BNDCSR.
4882 */
4883 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4884 cp->cp_eax = 0xD;
4885 cp->cp_ecx = 3;
4886 cp->cp_edx = cp->cp_ebx = 0;
4887
4888 (void) __cpuid_insn(cp);
4889
4890 cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4891 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4892
4893 cp->cp_eax = 0xD;
4894 cp->cp_ecx = 4;
4895 cp->cp_edx = cp->cp_ebx = 0;
4896
4897 (void) __cpuid_insn(cp);
4898
4899 cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4900 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4901 }
4902
4903 /*
4904 * If the hw supports AVX512, get the size and offset in the
4905 * save area for the opmask registers and zmm state.
4906 */
4907 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4908 cp->cp_eax = 0xD;
4909 cp->cp_ecx = 5;
4910 cp->cp_edx = cp->cp_ebx = 0;
4911
4912 (void) __cpuid_insn(cp);
4913
4914 cpi->cpi_xsave.opmask_size = cp->cp_eax;
4915 cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4916
4917 cp->cp_eax = 0xD;
4918 cp->cp_ecx = 6;
4919 cp->cp_edx = cp->cp_ebx = 0;
4920
4921 (void) __cpuid_insn(cp);
4922
4923 cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4924 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4925
4926 cp->cp_eax = 0xD;
4927 cp->cp_ecx = 7;
4928 cp->cp_edx = cp->cp_ebx = 0;
4929
4930 (void) __cpuid_insn(cp);
4931
4932 cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4933 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4934 }
4935
4936 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4937 xsave_state_size = 0;
4938 } else if (cpuid_d_valid) {
4939 xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4940 } else {
4941 /* Broken CPUID 0xD, probably in HVM */
4942 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4943 "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4944 ", ymm_size = %d, ymm_offset = %d\n",
4945 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4946 cpi->cpi_xsave.xsav_hw_features_high,
4947 (int)cpi->cpi_xsave.xsav_max_size,
4948 (int)cpi->cpi_xsave.ymm_size,
4949 (int)cpi->cpi_xsave.ymm_offset);
4950
4951 if (xsave_state_size != 0) {
4952 /*
4953 * This must be a non-boot CPU. We cannot
4954 * continue, because boot cpu has already
4955 * enabled XSAVE.
4956 */
4957 ASSERT(cpu->cpu_id != 0);
4958 cmn_err(CE_PANIC, "cpu%d: we have already "
4959 "enabled XSAVE on boot cpu, cannot "
4960 "continue.", cpu->cpu_id);
4961 } else {
4962 /*
4963 * If we reached here on the boot CPU, it's also
4964 * almost certain that we'll reach here on the
4965 * non-boot CPUs. When we're here on a boot CPU
4966 * we should disable the feature, on a non-boot
4967 * CPU we need to confirm that we have.
4968 */
4969 if (cpu->cpu_id == 0) {
4970 remove_x86_feature(x86_featureset,
4971 X86FSET_XSAVE);
4972 remove_x86_feature(x86_featureset,
4973 X86FSET_AVX);
4974 remove_x86_feature(x86_featureset,
4975 X86FSET_F16C);
4976 remove_x86_feature(x86_featureset,
4977 X86FSET_BMI1);
4978 remove_x86_feature(x86_featureset,
4979 X86FSET_BMI2);
4980 remove_x86_feature(x86_featureset,
4981 X86FSET_FMA);
4982 remove_x86_feature(x86_featureset,
4983 X86FSET_AVX2);
4984 remove_x86_feature(x86_featureset,
4985 X86FSET_MPX);
4986 remove_x86_feature(x86_featureset,
4987 X86FSET_AVX512F);
4988 remove_x86_feature(x86_featureset,
4989 X86FSET_AVX512DQ);
4990 remove_x86_feature(x86_featureset,
4991 X86FSET_AVX512PF);
4992 remove_x86_feature(x86_featureset,
4993 X86FSET_AVX512ER);
4994 remove_x86_feature(x86_featureset,
4995 X86FSET_AVX512CD);
4996 remove_x86_feature(x86_featureset,
4997 X86FSET_AVX512BW);
4998 remove_x86_feature(x86_featureset,
4999 X86FSET_AVX512VL);
5000 remove_x86_feature(x86_featureset,
5001 X86FSET_AVX512FMA);
5002 remove_x86_feature(x86_featureset,
5003 X86FSET_AVX512VBMI);
5004 remove_x86_feature(x86_featureset,
5005 X86FSET_AVX512VNNI);
5006 remove_x86_feature(x86_featureset,
5007 X86FSET_AVX512VPOPCDQ);
5008 remove_x86_feature(x86_featureset,
5009 X86FSET_AVX512NNIW);
5010 remove_x86_feature(x86_featureset,
5011 X86FSET_AVX512FMAPS);
5012 remove_x86_feature(x86_featureset,
5013 X86FSET_VAES);
5014 remove_x86_feature(x86_featureset,
5015 X86FSET_VPCLMULQDQ);
5016 remove_x86_feature(x86_featureset,
5017 X86FSET_GFNI);
5018 remove_x86_feature(x86_featureset,
5019 X86FSET_AVX512_VP2INT);
5020 remove_x86_feature(x86_featureset,
5021 X86FSET_AVX512_BITALG);
5022 remove_x86_feature(x86_featureset,
5023 X86FSET_AVX512_VBMI2);
5024 remove_x86_feature(x86_featureset,
5025 X86FSET_AVX512_BF16);
5026
5027 xsave_force_disable = B_TRUE;
5028 } else {
5029 VERIFY(is_x86_feature(x86_featureset,
5030 X86FSET_XSAVE) == B_FALSE);
5031 }
5032 }
5033 }
5034 }
5035
5036
5037 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
5038 return;
5039
5040 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
5041 nmax = NMAX_CPI_EXTD;
5042 /*
5043 * Copy the extended properties, fixing them as we go. While we start at
5044 * 2 because we've already handled a few cases in the basic pass, the
5045 * rest we let ourselves just grab again (e.g. 0x8, 0x21).
5046 */
5047 iptr = (void *)cpi->cpi_brandstr;
5048 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
5049 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
5050 (void) __cpuid_insn(cp);
5051 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
5052 cp);
5053 switch (n) {
5054 case 2:
5055 case 3:
5056 case 4:
5057 /*
5058 * Extract the brand string
5059 */
5060 *iptr++ = cp->cp_eax;
5061 *iptr++ = cp->cp_ebx;
5062 *iptr++ = cp->cp_ecx;
5063 *iptr++ = cp->cp_edx;
5064 break;
5065 case 5:
5066 switch (cpi->cpi_vendor) {
5067 case X86_VENDOR_AMD:
5068 /*
5069 * The Athlon and Duron were the first
5070 * parts to report the sizes of the
5071 * TLB for large pages. Before then,
5072 * we don't trust the data.
5073 */
5074 if (cpi->cpi_family < 6 ||
5075 (cpi->cpi_family == 6 &&
5076 cpi->cpi_model < 1))
5077 cp->cp_eax = 0;
5078 break;
5079 default:
5080 break;
5081 }
5082 break;
5083 case 6:
5084 switch (cpi->cpi_vendor) {
5085 case X86_VENDOR_AMD:
5086 /*
5087 * The Athlon and Duron were the first
5088 * AMD parts with L2 TLB's.
5089 * Before then, don't trust the data.
5090 */
5091 if (cpi->cpi_family < 6 ||
5092 (cpi->cpi_family == 6 &&
5093 cpi->cpi_model < 1))
5094 cp->cp_eax = cp->cp_ebx = 0;
5095 /*
5096 * AMD Duron rev A0 reports L2
5097 * cache size incorrectly as 1K
5098 * when it is really 64K
5099 */
5100 if (cpi->cpi_family == 6 &&
5101 cpi->cpi_model == 3 &&
5102 cpi->cpi_step == 0) {
5103 cp->cp_ecx &= 0xffff;
5104 cp->cp_ecx |= 0x400000;
5105 }
5106 break;
5107 case X86_VENDOR_Cyrix: /* VIA C3 */
5108 /*
5109 * VIA C3 processors are a bit messed
5110 * up w.r.t. encoding cache sizes in %ecx
5111 */
5112 if (cpi->cpi_family != 6)
5113 break;
5114 /*
5115 * model 7 and 8 were incorrectly encoded
5116 *
5117 * xxx is model 8 really broken?
5118 */
5119 if (cpi->cpi_model == 7 ||
5120 cpi->cpi_model == 8)
5121 cp->cp_ecx =
5122 BITX(cp->cp_ecx, 31, 24) << 16 |
5123 BITX(cp->cp_ecx, 23, 16) << 12 |
5124 BITX(cp->cp_ecx, 15, 8) << 8 |
5125 BITX(cp->cp_ecx, 7, 0);
5126 /*
5127 * model 9 stepping 1 has wrong associativity
5128 */
5129 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
5130 cp->cp_ecx |= 8 << 12;
5131 break;
5132 case X86_VENDOR_Intel:
5133 /*
5134 * Extended L2 Cache features function.
5135 * First appeared on Prescott.
5136 */
5137 default:
5138 break;
5139 }
5140 break;
5141 default:
5142 break;
5143 }
5144 }
5145 }
5146
5147 static const char *
5148 intel_cpubrand(const struct cpuid_info *cpi)
5149 {
5150 int i;
5151
5152 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5153
5154 switch (cpi->cpi_family) {
5155 case 5:
5156 return ("Intel Pentium(r)");
5157 case 6:
5158 switch (cpi->cpi_model) {
5159 uint_t celeron, xeon;
5160 const struct cpuid_regs *cp;
5161 case 0:
5162 case 1:
5163 case 2:
5164 return ("Intel Pentium(r) Pro");
5165 case 3:
5166 case 4:
5167 return ("Intel Pentium(r) II");
5168 case 6:
5169 return ("Intel Celeron(r)");
5170 case 5:
5171 case 7:
5172 celeron = xeon = 0;
5173 cp = &cpi->cpi_std[2]; /* cache info */
5174
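/*
 * Scan the leaf 2 cache descriptors: 0x40 indicates no L2 cache
 * (a Celeron), while 0x44/0x45 indicate the 1MB/2MB L2 caches
 * found on Xeon parts.
 */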
5175 for (i = 1; i < 4; i++) {
5176 uint_t tmp;
5177
5178 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5179 if (tmp == 0x40)
5180 celeron++;
5181 if (tmp >= 0x44 && tmp <= 0x45)
5182 xeon++;
5183 }
5184
5185 for (i = 0; i < 2; i++) {
5186 uint_t tmp;
5187
5188 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5189 if (tmp == 0x40)
5190 celeron++;
5191 else if (tmp >= 0x44 && tmp <= 0x45)
5192 xeon++;
5193 }
5194
5195 for (i = 0; i < 4; i++) {
5196 uint_t tmp;
5197
5198 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5199 if (tmp == 0x40)
5200 celeron++;
5201 else if (tmp >= 0x44 && tmp <= 0x45)
5202 xeon++;
5203 }
5204
5205 for (i = 0; i < 4; i++) {
5206 uint_t tmp;
5207
5208 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5209 if (tmp == 0x40)
5210 celeron++;
5211 else if (tmp >= 0x44 && tmp <= 0x45)
5212 xeon++;
5213 }
5214
5215 if (celeron)
5216 return ("Intel Celeron(r)");
5217 if (xeon)
5218 return (cpi->cpi_model == 5 ?
5219 "Intel Pentium(r) II Xeon(tm)" :
5220 "Intel Pentium(r) III Xeon(tm)");
5221 return (cpi->cpi_model == 5 ?
5222 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5223 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5224 default:
5225 break;
5226 }
5227 default:
5228 break;
5229 }
5230
5231 /* BrandID is present if the field is nonzero */
5232 if (cpi->cpi_brandid != 0) {
5233 static const struct {
5234 uint_t bt_bid;
5235 const char *bt_str;
5236 } brand_tbl[] = {
5237 { 0x1, "Intel(r) Celeron(r)" },
5238 { 0x2, "Intel(r) Pentium(r) III" },
5239 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" },
5240 { 0x4, "Intel(r) Pentium(r) III" },
5241 { 0x6, "Mobile Intel(r) Pentium(r) III" },
5242 { 0x7, "Mobile Intel(r) Celeron(r)" },
5243 { 0x8, "Intel(r) Pentium(r) 4" },
5244 { 0x9, "Intel(r) Pentium(r) 4" },
5245 { 0xa, "Intel(r) Celeron(r)" },
5246 { 0xb, "Intel(r) Xeon(tm)" },
5247 { 0xc, "Intel(r) Xeon(tm) MP" },
5248 { 0xe, "Mobile Intel(r) Pentium(r) 4" },
5249 { 0xf, "Mobile Intel(r) Celeron(r)" },
5250 { 0x11, "Mobile Genuine Intel(r)" },
5251 { 0x12, "Intel(r) Celeron(r) M" },
5252 { 0x13, "Mobile Intel(r) Celeron(r)" },
5253 { 0x14, "Intel(r) Celeron(r)" },
5254 { 0x15, "Mobile Genuine Intel(r)" },
5255 { 0x16, "Intel(r) Pentium(r) M" },
5256 { 0x17, "Mobile Intel(r) Celeron(r)" }
5257 };
5258 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5259 uint_t sgn;
5260
5261 sgn = (cpi->cpi_family << 8) |
5262 (cpi->cpi_model << 4) | cpi->cpi_step;
5263
5264 for (i = 0; i < btblmax; i++)
5265 if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5266 break;
5267 if (i < btblmax) {
5268 if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5269 return ("Intel(r) Celeron(r)");
5270 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5271 return ("Intel(r) Xeon(tm) MP");
5272 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5273 return ("Intel(r) Xeon(tm)");
5274 return (brand_tbl[i].bt_str);
5275 }
5276 }
5277
5278 return (NULL);
5279 }
5280
5281 static const char *
5282 amd_cpubrand(const struct cpuid_info *cpi)
5283 {
5284 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5285
5286 switch (cpi->cpi_family) {
5287 case 5:
5288 switch (cpi->cpi_model) {
5289 case 0:
5290 case 1:
5291 case 2:
5292 case 3:
5293 case 4:
5294 case 5:
5295 return ("AMD-K5(r)");
5296 case 6:
5297 case 7:
5298 return ("AMD-K6(r)");
5299 case 8:
5300 return ("AMD-K6(r)-2");
5301 case 9:
5302 return ("AMD-K6(r)-III");
5303 default:
5304 return ("AMD (family 5)");
5305 }
5306 case 6:
5307 switch (cpi->cpi_model) {
5308 case 1:
5309 return ("AMD-K7(tm)");
5310 case 0:
5311 case 2:
5312 case 4:
5313 return ("AMD Athlon(tm)");
5314 case 3:
5315 case 7:
5316 return ("AMD Duron(tm)");
5317 case 6:
5318 case 8:
5319 case 10:
5320 /*
5321 * Use the L2 cache size to distinguish
5322 */
5323 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5324 "AMD Athlon(tm)" : "AMD Duron(tm)");
5325 default:
5326 return ("AMD (family 6)");
5327 }
5328 default:
5329 break;
5330 }
5331
5332 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5333 cpi->cpi_brandid != 0) {
5334 switch (BITX(cpi->cpi_brandid, 7, 5)) {
5335 case 3:
5336 return ("AMD Opteron(tm) UP 1xx");
5337 case 4:
5338 return ("AMD Opteron(tm) DP 2xx");
5339 case 5:
5340 return ("AMD Opteron(tm) MP 8xx");
5341 default:
5342 return ("AMD Opteron(tm)");
5343 }
5344 }
5345
5346 return (NULL);
5347 }
5348
5349 static const char *
5350 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5351 {
5352 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5353
5354 switch (type) {
5355 case X86_TYPE_CYRIX_6x86:
5356 return ("Cyrix 6x86");
5357 case X86_TYPE_CYRIX_6x86L:
5358 return ("Cyrix 6x86L");
5359 case X86_TYPE_CYRIX_6x86MX:
5360 return ("Cyrix 6x86MX");
5361 case X86_TYPE_CYRIX_GXm:
5362 return ("Cyrix GXm");
5363 case X86_TYPE_CYRIX_MediaGX:
5364 return ("Cyrix MediaGX");
5365 case X86_TYPE_CYRIX_MII:
5366 return ("Cyrix M2");
5367 case X86_TYPE_VIA_CYRIX_III:
5368 return ("VIA Cyrix M3");
5369 default:
5370 /*
5371 * Have another wild guess ..
5372 */
5373 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5374 return ("Cyrix 5x86");
5375 else if (cpi->cpi_family == 5) {
5376 switch (cpi->cpi_model) {
5377 case 2:
5378 return ("Cyrix 6x86"); /* Cyrix M1 */
5379 case 4:
5380 return ("Cyrix MediaGX");
5381 default:
5382 break;
5383 }
5384 } else if (cpi->cpi_family == 6) {
5385 switch (cpi->cpi_model) {
5386 case 0:
5387 return ("Cyrix 6x86MX"); /* Cyrix M2? */
5388 case 5:
5389 case 6:
5390 case 7:
5391 case 8:
5392 case 9:
5393 return ("VIA C3");
5394 default:
5395 break;
5396 }
5397 }
5398 break;
5399 }
5400 return (NULL);
5401 }
5402
5403 /*
5404 * This only gets called in the case that the CPU extended
5405 * feature brand string leaves (0x80000002, 0x80000003, 0x80000004)
5406 * aren't available, or contain null bytes for some reason.
5407 */
5408 static void
5409 fabricate_brandstr(struct cpuid_info *cpi)
5410 {
5411 const char *brand = NULL;
5412
5413 switch (cpi->cpi_vendor) {
5414 case X86_VENDOR_Intel:
5415 brand = intel_cpubrand(cpi);
5416 break;
5417 case X86_VENDOR_AMD:
5418 brand = amd_cpubrand(cpi);
5419 break;
5420 case X86_VENDOR_Cyrix:
5421 brand = cyrix_cpubrand(cpi, x86_type);
5422 break;
5423 case X86_VENDOR_NexGen:
5424 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5425 brand = "NexGen Nx586";
5426 break;
5427 case X86_VENDOR_Centaur:
5428 if (cpi->cpi_family == 5)
5429 switch (cpi->cpi_model) {
5430 case 4:
5431 brand = "Centaur C6";
5432 break;
5433 case 8:
5434 brand = "Centaur C2";
5435 break;
5436 case 9:
5437 brand = "Centaur C3";
5438 break;
5439 default:
5440 break;
5441 }
5442 break;
5443 case X86_VENDOR_Rise:
5444 if (cpi->cpi_family == 5 &&
5445 (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5446 brand = "Rise mP6";
5447 break;
5448 case X86_VENDOR_SiS:
5449 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5450 brand = "SiS 55x";
5451 break;
5452 case X86_VENDOR_TM:
5453 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5454 brand = "Transmeta Crusoe TM3x00 or TM5x00";
5455 break;
5456 case X86_VENDOR_NSC:
5457 case X86_VENDOR_UMC:
5458 default:
5459 break;
5460 }
5461 if (brand) {
5462 (void) strcpy((char *)cpi->cpi_brandstr, brand);
5463 return;
5464 }
5465
5466 /*
5467 * If all else fails ...
5468 */
5469 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5470 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5471 cpi->cpi_model, cpi->cpi_step);
5472 }
5473
5474 /*
5475 * This routine is called just after kernel memory allocation
5476 * becomes available on cpu0, and as part of mp_startup() on
5477 * the other cpus.
5478 *
5479 * Fixup the brand string, and collect any information from cpuid
5480 * that requires dynamically allocated storage to represent.
5481 */
5482
5483 static void
5484 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5485 {
5486 int i, max, shft, level, size;
5487 struct cpuid_regs regs;
5488 struct cpuid_regs *cp;
5489 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5490
5491 /*
5492 * Deterministic cache parameters
5493 *
5494 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5495 * values that are present are currently defined to be the same. This
5496 * means we can use the same logic to parse it as long as we use the
5497 * appropriate leaf to get the data. If you're updating this, make sure
5498 * you're careful about which vendor supports which aspect.
5499 *
5500 * Take this opportunity to detect the number of threads sharing the
5501 * last level cache, and construct a corresponding cache id. The
5502 * respective cpuid_info members are initialized to the default case of
5503 * "no last level cache sharing".
5504 */
5505 cpi->cpi_ncpu_shr_last_cache = 1;
5506 cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5507
5508 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5509 ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5510 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5511 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5512 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5513 uint32_t leaf;
5514
5515 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5516 leaf = 4;
5517 } else {
5518 leaf = CPUID_LEAF_EXT_1d;
5519 }
5520
5521 /*
5522 * Find the # of elements (size) returned by the leaf and along
5523 * the way detect last level cache sharing details.
5524 */
5525 bzero(&regs, sizeof (regs));
5526 cp = &regs;
5527 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5528 cp->cp_eax = leaf;
5529 cp->cp_ecx = i;
5530
5531 (void) __cpuid_insn(cp);
5532
5533 if (CPI_CACHE_TYPE(cp) == 0)
5534 break;
5535 level = CPI_CACHE_LVL(cp);
5536 if (level > max) {
5537 max = level;
5538 cpi->cpi_ncpu_shr_last_cache =
5539 CPI_NTHR_SHR_CACHE(cp) + 1;
5540 }
5541 }
5542 cpi->cpi_cache_leaf_size = size = i;
5543
5544 /*
5545 * Allocate the cpi_cache_leaves array. The first element
5546 * references the regs for the corresponding leaf with %ecx set
5547 * to 0. This was gathered in cpuid_pass_extended().
5548 */
5549 if (size > 0) {
5550 cpi->cpi_cache_leaves =
5551 kmem_alloc(size * sizeof (cp), KM_SLEEP);
5552 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5553 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5554 } else {
5555 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5556 }
5557
5558 /*
5559 * Allocate storage to hold the additional regs
5560 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5561 *
5562 * The regs for the leaf, %ecx == 0 has already
5563 * been allocated as indicated above.
5564 */
5565 for (i = 1; i < size; i++) {
5566 cp = cpi->cpi_cache_leaves[i] =
5567 kmem_zalloc(sizeof (regs), KM_SLEEP);
5568 cp->cp_eax = leaf;
5569 cp->cp_ecx = i;
5570
5571 (void) __cpuid_insn(cp);
5572 }
5573 }
5574 /*
5575 * Determine the number of bits needed to represent
5576 * the number of CPUs sharing the last level cache.
5577 *
5578 * Shift off that number of bits from the APIC id to
5579 * derive the cache id.
5580 */
5581 shft = 0;
5582 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5583 shft++;
5584 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5585 }
5586
5587 /*
5588 * Now fixup the brand string
5589 */
5590 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5591 fabricate_brandstr(cpi);
5592 } else {
5593
5594 /*
5595 * If we successfully extracted a brand string from the cpuid
5596 * instruction, clean it up by removing leading spaces and
5597 * similar junk.
5598 */
5599 if (cpi->cpi_brandstr[0]) {
5600 size_t maxlen = sizeof (cpi->cpi_brandstr);
5601 char *src, *dst;
5602
5603 dst = src = (char *)cpi->cpi_brandstr;
5604 src[maxlen - 1] = '\0';
5605 /*
5606 * strip leading spaces
5607 */
5608 while (*src == ' ')
5609 src++;
5610 /*
5611 * Remove any 'Genuine' or "Authentic" prefixes
5612 */
5613 if (strncmp(src, "Genuine ", 8) == 0)
5614 src += 8;
5615 if (strncmp(src, "Authentic ", 10) == 0)
5616 src += 10;
5617
5618 /*
5619 * Now do an in-place copy.
5620 * Map (R) to (r) and (TM) to (tm).
5621 * The era of teletypes is long gone, and there's
5622 * -really- no need to shout.
5623 */
5624 while (*src != '\0') {
5625 if (src[0] == '(') {
5626 if (strncmp(src + 1, "R)", 2) == 0) {
5627 (void) strncpy(dst, "(r)", 3);
5628 src += 3;
5629 dst += 3;
5630 continue;
5631 }
5632 if (strncmp(src + 1, "TM)", 3) == 0) {
5633 (void) strncpy(dst, "(tm)", 4);
5634 src += 4;
5635 dst += 4;
5636 continue;
5637 }
5638 }
5639 *dst++ = *src++;
5640 }
5641 *dst = '\0';
5642
5643 /*
5644 * Finally, remove any trailing spaces
5645 */
5646 while (--dst > cpi->cpi_brandstr)
5647 if (*dst == ' ')
5648 *dst = '\0';
5649 else
5650 break;
5651 } else
5652 fabricate_brandstr(cpi);
5653 }
5654 }
5655
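/*
 * Pairs an aux vector hardware capability bit (avm_av) with the x86
 * featureset flag (avm_feat) whose presence implies it.
 */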
5656 typedef struct {
5657 uint32_t avm_av;
5658 uint32_t avm_feat;
5659 } av_feat_map_t;
5660
5661 /*
5662 * These arrays are used to map features that we should add based on x86
5663 * features that are present. As a large number depend on kernel features,
5664 * rather than rechecking and clearing CPUID everywhere, we simply map these.
5665 * There is an array of these for each hwcap word. Some features aren't tracked
5666 * in the kernel x86 featureset and that's ok. They will not show up in here.
5667 */
5668 static const av_feat_map_t x86fset_to_av1[] = {
5669 { AV_386_CX8, X86FSET_CX8 },
5670 { AV_386_SEP, X86FSET_SEP },
5671 { AV_386_AMD_SYSC, X86FSET_ASYSC },
5672 { AV_386_CMOV, X86FSET_CMOV },
5673 { AV_386_FXSR, X86FSET_SSE },
5674 { AV_386_SSE, X86FSET_SSE },
5675 { AV_386_SSE2, X86FSET_SSE2 },
5676 { AV_386_SSE3, X86FSET_SSE3 },
5677 { AV_386_CX16, X86FSET_CX16 },
5678 { AV_386_TSCP, X86FSET_TSCP },
5679 { AV_386_AMD_SSE4A, X86FSET_SSE4A },
5680 { AV_386_SSSE3, X86FSET_SSSE3 },
5681 { AV_386_SSE4_1, X86FSET_SSE4_1 },
5682 { AV_386_SSE4_2, X86FSET_SSE4_2 },
5683 { AV_386_AES, X86FSET_AES },
5684 { AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5685 { AV_386_XSAVE, X86FSET_XSAVE },
5686 { AV_386_AVX, X86FSET_AVX },
5687 { AV_386_VMX, X86FSET_VMX },
5688 { AV_386_AMD_SVM, X86FSET_SVM }
5689 };
5690
5691 static const av_feat_map_t x86fset_to_av2[] = {
5692 { AV_386_2_F16C, X86FSET_F16C },
5693 { AV_386_2_RDRAND, X86FSET_RDRAND },
5694 { AV_386_2_BMI1, X86FSET_BMI1 },
5695 { AV_386_2_BMI2, X86FSET_BMI2 },
5696 { AV_386_2_FMA, X86FSET_FMA },
5697 { AV_386_2_AVX2, X86FSET_AVX2 },
5698 { AV_386_2_ADX, X86FSET_ADX },
5699 { AV_386_2_RDSEED, X86FSET_RDSEED },
5700 { AV_386_2_AVX512F, X86FSET_AVX512F },
5701 { AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5702 { AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5703 { AV_386_2_AVX512PF, X86FSET_AVX512PF },
5704 { AV_386_2_AVX512ER, X86FSET_AVX512ER },
5705 { AV_386_2_AVX512CD, X86FSET_AVX512CD },
5706 { AV_386_2_AVX512BW, X86FSET_AVX512BW },
5707 { AV_386_2_AVX512VL, X86FSET_AVX512VL },
5708 { AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5709 { AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5710 { AV_386_2_SHA, X86FSET_SHA },
5711 { AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5712 { AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5713 { AV_386_2_CLWB, X86FSET_CLWB },
5714 { AV_386_2_MONITORX, X86FSET_MONITORX },
5715 { AV_386_2_CLZERO, X86FSET_CLZERO },
5716 { AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5717 { AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5718 { AV_386_2_VAES, X86FSET_VAES },
5719 { AV_386_2_GFNI, X86FSET_GFNI },
5720 { AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5721 { AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5722 };
5723
5724 static const av_feat_map_t x86fset_to_av3[] = {
5725 { AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5726 { AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5727 };
5728
5729 /*
5730 * This routine is called out of bind_hwcap() much later in the life
5731 * of the kernel (post_startup()). The job of this routine is to resolve
5732 * the hardware feature support and kernel support for those features into
5733 * what we're actually going to tell applications via the aux vector.
5734 *
5735 * Most of the aux vector is derived from the x86_featureset array vector where
5736 * a given feature indicates that an aux vector should be plumbed through. This
5737 * allows the kernel to use one tracking mechanism for these based on whether or
5738 * not it has the required hardware support (most often xsave). Most newer
5739 * features are added there in case we need them in the kernel. Otherwise,
5740 * features are evaluated based on looking at the cpuid features that remain. If
5741 * you find yourself wanting to clear out cpuid features for some reason, they
5742 * should instead be driven by the feature set so we have a consistent view.
5743 */
5744
5745 static void
5746 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5747 {
5748 uint_t *hwcap_out = (uint_t *)arg;
5749 struct cpuid_info *cpi;
5750 uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5751
5752 cpi = cpu->cpu_m.mcpu_cpi;
5753
5754 for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5755 if (is_x86_feature(x86_featureset,
5756 x86fset_to_av1[i].avm_feat)) {
5757 hwcap_flags |= x86fset_to_av1[i].avm_av;
5758 }
5759 }
5760
5761 for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5762 if (is_x86_feature(x86_featureset,
5763 x86fset_to_av2[i].avm_feat)) {
5764 hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5765 }
5766 }
5767
5768 for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5769 if (is_x86_feature(x86_featureset,
5770 x86fset_to_av3[i].avm_feat)) {
5771 hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
5772 }
5773 }
5774
5775 /*
5776 * From here on out we're working through features that don't have
5777 * corresponding kernel feature flags for various reasons that are
5778 * mostly just due to the historical implementation.
5779 */
5780 if (cpi->cpi_maxeax >= 1) {
5781 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5782 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5783
5784 *edx = CPI_FEATURES_EDX(cpi);
5785 *ecx = CPI_FEATURES_ECX(cpi);
5786
5787 /*
5788 * [no explicit support required beyond x87 fp context]
5789 */
5790 if (!fpu_exists)
5791 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5792
5793 /*
5794 * Now map the supported feature vector to things that we
5795 * think userland will care about.
5796 */
5797 if (*ecx & CPUID_INTC_ECX_MOVBE)
5798 hwcap_flags |= AV_386_MOVBE;
5799
5800 if (*ecx & CPUID_INTC_ECX_POPCNT)
5801 hwcap_flags |= AV_386_POPCNT;
5802 if (*edx & CPUID_INTC_EDX_FPU)
5803 hwcap_flags |= AV_386_FPU;
5804 if (*edx & CPUID_INTC_EDX_MMX)
5805 hwcap_flags |= AV_386_MMX;
5806 if (*edx & CPUID_INTC_EDX_TSC)
5807 hwcap_flags |= AV_386_TSC;
5808 }
5809
5810 /*
5811 * Check a few miscellaneous features.
5812 */
5813 if (cpi->cpi_xmaxeax < 0x80000001)
5814 goto resolve_done;
5815
5816 switch (cpi->cpi_vendor) {
5817 uint32_t *edx, *ecx;
5818
5819 case X86_VENDOR_Intel:
5820 /*
5821 * Seems like Intel duplicated what was necessary
5822 * here to make the initial crop of 64-bit OS's work.
5823 * Hopefully, those are the only "extended" bits
5824 * they'll add.
5825 */
5826 /*FALLTHROUGH*/
5827
5828 case X86_VENDOR_AMD:
5829 case X86_VENDOR_HYGON:
5830 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5831 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5832
5833 *edx = CPI_FEATURES_XTD_EDX(cpi);
5834 *ecx = CPI_FEATURES_XTD_ECX(cpi);
5835
5836 /*
5837 * [no explicit support required beyond
5838 * x87 fp context and exception handlers]
5839 */
5840 if (!fpu_exists)
5841 *edx &= ~(CPUID_AMD_EDX_MMXamd |
5842 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5843
5844 /*
5845 * Now map the supported feature vector to
5846 * things that we think userland will care about.
5847 */
5848 if (*edx & CPUID_AMD_EDX_MMXamd)
5849 hwcap_flags |= AV_386_AMD_MMX;
5850 if (*edx & CPUID_AMD_EDX_3DNow)
5851 hwcap_flags |= AV_386_AMD_3DNow;
5852 if (*edx & CPUID_AMD_EDX_3DNowx)
5853 hwcap_flags |= AV_386_AMD_3DNowx;
5854
5855 switch (cpi->cpi_vendor) {
5856 case X86_VENDOR_AMD:
5857 case X86_VENDOR_HYGON:
5858 if (*ecx & CPUID_AMD_ECX_AHF64)
5859 hwcap_flags |= AV_386_AHF;
5860 if (*ecx & CPUID_AMD_ECX_LZCNT)
5861 hwcap_flags |= AV_386_AMD_LZCNT;
5862 break;
5863
5864 case X86_VENDOR_Intel:
5865 if (*ecx & CPUID_AMD_ECX_LZCNT)
5866 hwcap_flags |= AV_386_AMD_LZCNT;
5867 /*
5868 * Aarrgh.
5869 * Intel uses a different bit in the same word.
5870 */
5871 if (*ecx & CPUID_INTC_ECX_AHF64)
5872 hwcap_flags |= AV_386_AHF;
5873 break;
5874 default:
5875 break;
5876 }
5877 break;
5878
5879 default:
5880 break;
5881 }
5882
5883 resolve_done:
5884 if (hwcap_out != NULL) {
5885 hwcap_out[0] = hwcap_flags;
5886 hwcap_out[1] = hwcap_flags_2;
5887 hwcap_out[2] = hwcap_flags_3;
5888 }
5889 }
5890
5891
5892 /*
5893 * Simulate the cpuid instruction using the data we previously
5894 * captured about this CPU. We try our best to return the truth
5895 * about the hardware, independently of kernel support.
5896 */
5897 uint32_t
5898 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5899 {
5900 struct cpuid_info *cpi;
5901 struct cpuid_regs *xcp;
5902
5903 if (cpu == NULL)
5904 cpu = CPU;
5905 cpi = cpu->cpu_m.mcpu_cpi;
5906
5907 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5908
5909 /*
5910 * CPUID data is cached in two separate places: cpi_std for standard
5911 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5912 */
5913 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5914 xcp = &cpi->cpi_std[cp->cp_eax];
5915 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5916 cp->cp_eax <= cpi->cpi_xmaxeax &&
5917 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5918 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5919 } else {
5920 /*
5921 * The caller is asking for data from an input parameter which
5922 * the kernel has not cached. In this case we go fetch from
5923 * the hardware and return the data directly to the user.
5924 */
5925 return (__cpuid_insn(cp));
5926 }
5927
5928 cp->cp_eax = xcp->cp_eax;
5929 cp->cp_ebx = xcp->cp_ebx;
5930 cp->cp_ecx = xcp->cp_ecx;
5931 cp->cp_edx = xcp->cp_edx;
5932 return (cp->cp_eax);
5933 }
5934
5935 boolean_t
5936 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
5937 {
5938 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5939 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5940 }
5941
5942 int
5943 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5944 {
5945 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5946
5947 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5948 }
5949
5950 int
5951 cpuid_is_cmt(cpu_t *cpu)
5952 {
5953 if (cpu == NULL)
5954 cpu = CPU;
5955
5956 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5957
5958 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5959 }
5960
5961 /*
5962 * AMD and Intel both implement the 64-bit variant of the syscall
5963 * instruction (syscallq), so if there's -any- support for syscall,
5964 * cpuid currently says "yes, we support this".
5965 *
5966 * However, Intel decided to -not- implement the 32-bit variant of the
5967 * syscall instruction, so we provide a predicate to allow our caller
5968 * to test that subtlety here.
5969 *
5970 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
5971 * even in the case where the hardware would in fact support it.
5972 */
5973 /*ARGSUSED*/
5974 int
5975 cpuid_syscall32_insn(cpu_t *cpu)
5976 {
5977 ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
5978
5979 #if !defined(__xpv)
5980 if (cpu == NULL)
5981 cpu = CPU;
5982
5983 /*CSTYLED*/
5984 {
5985 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5986
5987 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5988 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5989 cpi->cpi_xmaxeax >= 0x80000001 &&
5990 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5991 return (1);
5992 }
5993 #endif
5994 return (0);
5995 }
5996
5997 int
5998 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5999 {
6000 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6001
6002 static const char fmt[] =
6003 "x86 (%s %X family %d model %d step %d clock %d MHz)";
6004 static const char fmt_ht[] =
6005 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
6006
6007 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6008
6009 if (cpuid_is_cmt(cpu))
6010 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
6011 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6012 cpi->cpi_family, cpi->cpi_model,
6013 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6014 return (snprintf(s, n, fmt,
6015 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6016 cpi->cpi_family, cpi->cpi_model,
6017 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6018 }
6019
6020 const char *
6021 cpuid_getvendorstr(cpu_t *cpu)
6022 {
6023 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6024 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
6025 }
6026
6027 uint_t
6028 cpuid_getvendor(cpu_t *cpu)
6029 {
6030 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6031 return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
6032 }
6033
6034 uint_t
6035 cpuid_getfamily(cpu_t *cpu)
6036 {
6037 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6038 return (cpu->cpu_m.mcpu_cpi->cpi_family);
6039 }
6040
6041 uint_t
6042 cpuid_getmodel(cpu_t *cpu)
6043 {
6044 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6045 return (cpu->cpu_m.mcpu_cpi->cpi_model);
6046 }
6047
6048 uint_t
6049 cpuid_get_ncpu_per_chip(cpu_t *cpu)
6050 {
6051 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6052 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
6053 }
6054
6055 uint_t
6056 cpuid_get_ncore_per_chip(cpu_t *cpu)
6057 {
6058 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6059 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
6060 }
6061
6062 uint_t
6063 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
6064 {
6065 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6066 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
6067 }
6068
6069 id_t
6070 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
6071 {
6072 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6073 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6074 }
6075
6076 uint_t
6077 cpuid_getstep(cpu_t *cpu)
6078 {
6079 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6080 return (cpu->cpu_m.mcpu_cpi->cpi_step);
6081 }
6082
6083 uint_t
6084 cpuid_getsig(struct cpu *cpu)
6085 {
6086 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6087 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
6088 }
6089
6090 uint32_t
6091 cpuid_getchiprev(struct cpu *cpu)
6092 {
6093 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6094 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
6095 }
6096
6097 const char *
6098 cpuid_getchiprevstr(struct cpu *cpu)
6099 {
6100 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6101 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
6102 }
6103
6104 uint32_t
6105 cpuid_getsockettype(struct cpu *cpu)
6106 {
6107 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6108 return (cpu->cpu_m.mcpu_cpi->cpi_socket);
6109 }
6110
6111 const char *
6112 cpuid_getsocketstr(cpu_t *cpu)
6113 {
6114 static const char *socketstr = NULL;
6115 struct cpuid_info *cpi;
6116
6117 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6118 cpi = cpu->cpu_m.mcpu_cpi;
6119
6120 /* Assume that socket types are the same across the system */
6121 if (socketstr == NULL)
6122 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
6123 cpi->cpi_model, cpi->cpi_step);
6124
6125
6126 return (socketstr);
6127 }
6128
6129 x86_uarchrev_t
6130 cpuid_getuarchrev(cpu_t *cpu)
6131 {
6132 return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
6133 }
6134
6135 int
6136 cpuid_get_chipid(cpu_t *cpu)
6137 {
6138 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6139
6140 if (cpuid_is_cmt(cpu))
6141 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6142 return (cpu->cpu_id);
6143 }
6144
6145 id_t
6146 cpuid_get_coreid(cpu_t *cpu)
6147 {
6148 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6149 return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6150 }
6151
6152 int
6153 cpuid_get_pkgcoreid(cpu_t *cpu)
6154 {
6155 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6156 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6157 }
6158
6159 int
6160 cpuid_get_clogid(cpu_t *cpu)
6161 {
6162 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6163 return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6164 }
6165
6166 int
6167 cpuid_get_cacheid(cpu_t *cpu)
6168 {
6169 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6170 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6171 }
6172
6173 uint_t
6174 cpuid_get_procnodeid(cpu_t *cpu)
6175 {
6176 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6177 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6178 }
6179
6180 uint_t
6181 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6182 {
6183 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6184 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6185 }
6186
6187 uint_t
6188 cpuid_get_compunitid(cpu_t *cpu)
6189 {
6190 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6191 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6192 }
6193
6194 uint_t
6195 cpuid_get_cores_per_compunit(cpu_t *cpu)
6196 {
6197 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6198 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6199 }
6200
6201 uint32_t
6202 cpuid_get_apicid(cpu_t *cpu)
6203 {
6204 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6205 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6206 return (UINT32_MAX);
6207 } else {
6208 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6209 }
6210 }
6211
6212 void
6213 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6214 {
6215 struct cpuid_info *cpi;
6216
6217 if (cpu == NULL)
6218 cpu = CPU;
6219 cpi = cpu->cpu_m.mcpu_cpi;
6220
6221 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6222
6223 if (pabits)
6224 *pabits = cpi->cpi_pabits;
6225 if (vabits)
6226 *vabits = cpi->cpi_vabits;
6227 }
6228
6229 size_t
6230 cpuid_get_xsave_size(void)
6231 {
6232 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6233 sizeof (struct xsave_state)));
6234 }
6235
6236 /*
6237 * Export information about known offsets to the kernel. We only care about
6238 * things we have actually enabled support for in %xcr0.
6239 */
6240 void
6241 cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
6242 {
6243 size_t size, off;
6244
6245 VERIFY3U(bit & xsave_bv_all, !=, 0);
6246
6247 if (sizep == NULL)
6248 sizep = &size;
6249 if (offp == NULL)
6250 offp = &off;
6251
6252 switch (bit) {
6253 case XFEATURE_LEGACY_FP:
6254 case XFEATURE_SSE:
6255 *sizep = sizeof (struct fxsave_state);
6256 *offp = 0;
6257 break;
6258 case XFEATURE_AVX:
6259 *sizep = cpuid_info0.cpi_xsave.ymm_size;
6260 *offp = cpuid_info0.cpi_xsave.ymm_offset;
6261 break;
6262 case XFEATURE_AVX512_OPMASK:
6263 *sizep = cpuid_info0.cpi_xsave.opmask_size;
6264 *offp = cpuid_info0.cpi_xsave.opmask_offset;
6265 break;
6266 case XFEATURE_AVX512_ZMM:
6267 *sizep = cpuid_info0.cpi_xsave.zmmlo_size;
6268 *offp = cpuid_info0.cpi_xsave.zmmlo_offset;
6269 break;
6270 case XFEATURE_AVX512_HI_ZMM:
6271 *sizep = cpuid_info0.cpi_xsave.zmmhi_size;
6272 *offp = cpuid_info0.cpi_xsave.zmmhi_offset;
6273 break;
6274 default:
6275 panic("asked for unsupported xsave feature: 0x%lx", bit);
6276 }
6277 }
6278
6279 /*
6280 * Return true if the CPUs on this system require 'pointer clearing' for the
6281 * floating point error pointer exception handling. In the past, this has been
6282 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6283 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6284 * feature bit and is reflected in the cpi_fp_amd_save member.
6285 */
6286 boolean_t
6287 cpuid_need_fp_excp_handling(void)
6288 {
6289 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6290 cpuid_info0.cpi_fp_amd_save != 0);
6291 }
6292
6293 /*
6294 * Returns the number of data TLB entries for a corresponding
6295 * pagesize. If it can't be computed, or isn't known, the
6296 * routine returns zero. If you ask about an architecturally
6297 * impossible pagesize, the routine will panic (so that the
6298 * hat implementor knows that things are inconsistent.)
6299 */
6300 uint_t
6301 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6302 {
6303 struct cpuid_info *cpi;
6304 uint_t dtlb_nent = 0;
6305
6306 if (cpu == NULL)
6307 cpu = CPU;
6308 cpi = cpu->cpu_m.mcpu_cpi;
6309
6310 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6311
6312 /*
6313 * Check the L2 TLB info
6314 */
6315 if (cpi->cpi_xmaxeax >= 0x80000006) {
6316 struct cpuid_regs *cp = &cpi->cpi_extd[6];
6317
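/*
 * Leaf 0x80000006 describes the L2 TLBs: %ebx covers 4K pages and
 * %eax covers 2M/4M pages, with the data TLB entry count in bits
 * 27:16 (or bits 15:0 when the TLB is unified).
 */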
6318 switch (pagesize) {
6319
6320 case 4 * 1024:
6321 /*
6322 * All zero in the top 16 bits of the register
6323 * indicates a unified TLB. Size is in low 16 bits.
6324 */
6325 if ((cp->cp_ebx & 0xffff0000) == 0)
6326 dtlb_nent = cp->cp_ebx & 0x0000ffff;
6327 else
6328 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6329 break;
6330
6331 case 2 * 1024 * 1024:
6332 if ((cp->cp_eax & 0xffff0000) == 0)
6333 dtlb_nent = cp->cp_eax & 0x0000ffff;
6334 else
6335 dtlb_nent = BITX(cp->cp_eax, 27, 16);
6336 break;
6337
6338 default:
6339 panic("unknown L2 pagesize");
6340 /*NOTREACHED*/
6341 }
6342 }
6343
6344 if (dtlb_nent != 0)
6345 return (dtlb_nent);
6346
6347 /*
6348 * No L2 TLB support for this size, try L1.
6349 */
6350 if (cpi->cpi_xmaxeax >= 0x80000005) {
6351 struct cpuid_regs *cp = &cpi->cpi_extd[5];
6352
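/*
 * Leaf 0x80000005 describes the L1 TLBs: %ebx covers 4K pages and
 * %eax covers 2M/4M pages, with the data TLB entry count in bits
 * 23:16.
 */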
6353 switch (pagesize) {
6354 case 4 * 1024:
6355 dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6356 break;
6357 case 2 * 1024 * 1024:
6358 dtlb_nent = BITX(cp->cp_eax, 23, 16);
6359 break;
6360 default:
6361 panic("unknown L1 d-TLB pagesize");
6362 /*NOTREACHED*/
6363 }
6364 }
6365
6366 return (dtlb_nent);
6367 }
6368
6369 /*
6370 * Return 0 if the erratum is not present or not applicable, positive
6371 * if it is, and negative if the status of the erratum is unknown.
6372 *
6373 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6374 * Processors" #25759, Rev 3.57, August 2005
6375 */
6376 int
6377 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6378 {
6379 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6380 uint_t eax;
6381
6382 /*
6383 * Bail out if this CPU isn't an AMD CPU, or if it's
6384 * a legacy (32-bit) AMD CPU.
6385 */
6386 if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6387 cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6388 cpi->cpi_family == 6) {
6389 return (0);
6390 }
6391
6392 eax = cpi->cpi_std[1].cp_eax;
6393
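/*
 * The macros below match specific processor revisions by the raw
 * family/model/stepping signature in leaf 1 %eax, per the revision
 * guide cited above.
 */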
6394 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50)
6395 #define SH_B3(eax) (eax == 0xf51)
6396 #define B(eax) (SH_B0(eax) || SH_B3(eax))
6397
6398 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58)
6399
6400 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6401 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6402 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2)
6403 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6404
6405 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6406 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0)
6407 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0)
6408 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6409
6410 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6411 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */
6412 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0)
6413 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71)
6414 #define BH_E4(eax) (eax == 0x20fb1)
6415 #define SH_E5(eax) (eax == 0x20f42)
6416 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2)
6417 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32)
6418 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6419 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6420 DH_E6(eax) || JH_E6(eax))
6421
6422 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6423 #define DR_B0(eax) (eax == 0x100f20)
6424 #define DR_B1(eax) (eax == 0x100f21)
6425 #define DR_BA(eax) (eax == 0x100f2a)
6426 #define DR_B2(eax) (eax == 0x100f22)
6427 #define DR_B3(eax) (eax == 0x100f23)
6428 #define RB_C0(eax) (eax == 0x100f40)
6429
6430 switch (erratum) {
6431 case 1:
6432 return (cpi->cpi_family < 0x10);
6433 case 51: /* what does the asterisk mean? */
6434 return (B(eax) || SH_C0(eax) || CG(eax));
6435 case 52:
6436 return (B(eax));
6437 case 57:
6438 return (cpi->cpi_family <= 0x11);
6439 case 58:
6440 return (B(eax));
6441 case 60:
6442 return (cpi->cpi_family <= 0x11);
6443 case 61:
6444 case 62:
6445 case 63:
6446 case 64:
6447 case 65:
6448 case 66:
6449 case 68:
6450 case 69:
6451 case 70:
6452 case 71:
6453 return (B(eax));
6454 case 72:
6455 return (SH_B0(eax));
6456 case 74:
6457 return (B(eax));
6458 case 75:
6459 return (cpi->cpi_family < 0x10);
6460 case 76:
6461 return (B(eax));
6462 case 77:
6463 return (cpi->cpi_family <= 0x11);
6464 case 78:
6465 return (B(eax) || SH_C0(eax));
6466 case 79:
6467 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6468 case 80:
6469 case 81:
6470 case 82:
6471 return (B(eax));
6472 case 83:
6473 return (B(eax) || SH_C0(eax) || CG(eax));
6474 case 85:
6475 return (cpi->cpi_family < 0x10);
6476 case 86:
6477 return (SH_C0(eax) || CG(eax));
6478 case 88:
6479 return (B(eax) || SH_C0(eax));
6480 case 89:
6481 return (cpi->cpi_family < 0x10);
6482 case 90:
6483 return (B(eax) || SH_C0(eax) || CG(eax));
6484 case 91:
6485 case 92:
6486 return (B(eax) || SH_C0(eax));
6487 case 93:
6488 return (SH_C0(eax));
6489 case 94:
6490 return (B(eax) || SH_C0(eax) || CG(eax));
6491 case 95:
6492 return (B(eax) || SH_C0(eax));
6493 case 96:
6494 return (B(eax) || SH_C0(eax) || CG(eax));
6495 case 97:
6496 case 98:
6497 return (SH_C0(eax) || CG(eax));
6498 case 99:
6499 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6500 case 100:
6501 return (B(eax) || SH_C0(eax));
6502 case 101:
6503 case 103:
6504 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6505 case 104:
6506 return (SH_C0(eax) || CG(eax) || D0(eax));
6507 case 105:
6508 case 106:
6509 case 107:
6510 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6511 case 108:
6512 return (DH_CG(eax));
6513 case 109:
6514 return (SH_C0(eax) || CG(eax) || D0(eax));
6515 case 110:
6516 return (D0(eax) || EX(eax));
6517 case 111:
6518 return (CG(eax));
6519 case 112:
6520 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6521 case 113:
6522 return (eax == 0x20fc0);
6523 case 114:
6524 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6525 case 115:
6526 return (SH_E0(eax) || JH_E1(eax));
6527 case 116:
6528 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6529 case 117:
6530 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6531 case 118:
6532 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6533 JH_E6(eax));
6534 case 121:
6535 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6536 case 122:
6537 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6538 case 123:
6539 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6540 case 131:
6541 return (cpi->cpi_family < 0x10);
6542 case 6336786:
6543
6544 /*
6545 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6546 * if this is a K8 family or newer processor. We're testing for
6547 * this 'erratum' to determine whether or not we have a constant
6548 * TSC.
6549 *
6550 * Our current fix for this is to disable the C1-Clock ramping.
6551 * However, this doesn't work on newer processor families nor
6552 * does it work when virtualized as those devices don't exist.
6553 */
6554 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6555 return (0);
6556 }
6557
6558 if (CPI_FAMILY(cpi) == 0xf) {
6559 struct cpuid_regs regs;
6560 regs.cp_eax = 0x80000007;
6561 			(void) __cpuid_insn(&regs);
6562 return (!(regs.cp_edx & 0x100));
6563 }
6564 return (0);
6565 case 147:
6566 /*
6567 * This erratum (K8 #147) is not present on family 10 and newer.
6568 */
6569 if (cpi->cpi_family >= 0x10) {
6570 return (0);
6571 }
6572 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6573 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6574
6575 case 6671130:
6576 /*
6577 		 * Check for processors (pre-Shanghai) that do not provide
6578 		 * optimal management of 1GB PTEs in their TLB.
6579 */
6580 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6581
6582 case 298:
6583 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6584 DR_B2(eax) || RB_C0(eax));
6585
6586 case 721:
6587 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6588
6589 default:
6590 return (-1);
6591
6592 }
6593 }
6594
6595 /*
6596 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6597 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6598 */
6599 int
6600 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6601 {
6602 struct cpuid_info *cpi;
6603 uint_t osvwid;
6604 static int osvwfeature = -1;
6605 uint64_t osvwlength;
6606
6607
6608 cpi = cpu->cpu_m.mcpu_cpi;
6609
6610 /* confirm OSVW supported */
6611 if (osvwfeature == -1) {
6612 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6613 } else {
6614 /* assert that osvw feature setting is consistent on all cpus */
6615 ASSERT(osvwfeature ==
6616 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6617 }
6618 if (!osvwfeature)
6619 return (-1);
6620
6621 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6622
6623 switch (erratum) {
6624 case 298: /* osvwid is 0 */
6625 osvwid = 0;
6626 if (osvwlength <= (uint64_t)osvwid) {
6627 /* osvwid 0 is unknown */
6628 return (-1);
6629 }
6630
6631 /*
6632 * Check the OSVW STATUS MSR to determine the state
6633 * of the erratum where:
6634 * 0 - fixed by HW
6635 * 1 - BIOS has applied the workaround when BIOS
6636 * workaround is available. (Or for other errata,
6637 * OS workaround is required.)
6638 * For a value of 1, caller will confirm that the
6639 * erratum 298 workaround has indeed been applied by BIOS.
6640 *
6641 * A 1 may be set in cpus that have a HW fix
6642 * in a mixed cpu system. Regarding erratum 298:
6643 * In a multiprocessor platform, the workaround above
6644 * should be applied to all processors regardless of
6645 * silicon revision when an affected processor is
6646 * present.
6647 */
6648
6649 return (rdmsr(MSR_AMD_OSVW_STATUS +
6650 (osvwid / OSVW_ID_CNT_PER_MSR)) &
6651 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6652
6653 default:
6654 return (-1);
6655 }
6656 }
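
/*
 * Worked example of the status lookup above, assuming OSVW_ID_CNT_PER_MSR is
 * 64 (i.e. 64 status bits per OSVW status MSR): osvwid 0 reads
 * MSR_AMD_OSVW_STATUS and tests bit 0, while a hypothetical osvwid of 70
 * would read MSR_AMD_OSVW_STATUS + 1 (70 / 64 == 1) and test bit 6
 * (70 % 64 == 6).
 */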
6657
6658 static const char assoc_str[] = "associativity";
6659 static const char line_str[] = "line-size";
6660 static const char size_str[] = "size";
6661
6662 static void
6663 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6664 uint32_t val)
6665 {
6666 char buf[128];
6667
6668 /*
6669 * ndi_prop_update_int() is used because it is desirable for
6670 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6671 */
6672 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6673 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6674 }
6675
6676 /*
6677 * Intel-style cache/tlb description
6678 *
6679 * Standard cpuid level 2 gives a randomly ordered
6680 * selection of tags that index into a table that describes
6681 * cache and tlb properties.
6682 */
6683
6684 static const char l1_icache_str[] = "l1-icache";
6685 static const char l1_dcache_str[] = "l1-dcache";
6686 static const char l2_cache_str[] = "l2-cache";
6687 static const char l3_cache_str[] = "l3-cache";
6688 static const char itlb4k_str[] = "itlb-4K";
6689 static const char dtlb4k_str[] = "dtlb-4K";
6690 static const char itlb2M_str[] = "itlb-2M";
6691 static const char itlb4M_str[] = "itlb-4M";
6692 static const char dtlb4M_str[] = "dtlb-4M";
6693 static const char dtlb24_str[] = "dtlb0-2M-4M";
6694 static const char itlb424_str[] = "itlb-4K-2M-4M";
6695 static const char itlb24_str[] = "itlb-2M-4M";
6696 static const char dtlb44_str[] = "dtlb-4K-4M";
6697 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6698 static const char sl2_cache_str[] = "sectored-l2-cache";
6699 static const char itrace_str[] = "itrace-cache";
6700 static const char sl3_cache_str[] = "sectored-l3-cache";
6701 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6702
6703 static const struct cachetab {
6704 uint8_t ct_code;
6705 uint8_t ct_assoc;
6706 uint16_t ct_line_size;
6707 size_t ct_size;
6708 const char *ct_label;
6709 } intel_ctab[] = {
6710 /*
6711 * maintain descending order!
6712 *
6713 * Codes ignored - Reason
6714 * ----------------------
6715 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6716 * f0H/f1H - Currently we do not interpret prefetch size by design
6717 */
6718 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6719 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6720 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6721 { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6722 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6723 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6724 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6725 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6726 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6727 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6728 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6729 { 0xd0, 4, 64, 512*1024, l3_cache_str},
6730 { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6731 { 0xc0, 4, 0, 8, dtlb44_str },
6732 { 0xba, 4, 0, 64, dtlb4k_str },
6733 { 0xb4, 4, 0, 256, dtlb4k_str },
6734 { 0xb3, 4, 0, 128, dtlb4k_str },
6735 { 0xb2, 4, 0, 64, itlb4k_str },
6736 { 0xb0, 4, 0, 128, itlb4k_str },
6737 { 0x87, 8, 64, 1024*1024, l2_cache_str},
6738 { 0x86, 4, 64, 512*1024, l2_cache_str},
6739 { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6740 { 0x84, 8, 32, 1024*1024, l2_cache_str},
6741 { 0x83, 8, 32, 512*1024, l2_cache_str},
6742 { 0x82, 8, 32, 256*1024, l2_cache_str},
6743 { 0x80, 8, 64, 512*1024, l2_cache_str},
6744 { 0x7f, 2, 64, 512*1024, l2_cache_str},
6745 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6746 { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6747 { 0x7b, 8, 64, 512*1024, sl2_cache_str},
6748 { 0x7a, 8, 64, 256*1024, sl2_cache_str},
6749 { 0x79, 8, 64, 128*1024, sl2_cache_str},
6750 { 0x78, 8, 64, 1024*1024, l2_cache_str},
6751 { 0x73, 8, 0, 64*1024, itrace_str},
6752 { 0x72, 8, 0, 32*1024, itrace_str},
6753 { 0x71, 8, 0, 16*1024, itrace_str},
6754 { 0x70, 8, 0, 12*1024, itrace_str},
6755 { 0x68, 4, 64, 32*1024, sl1_dcache_str},
6756 { 0x67, 4, 64, 16*1024, sl1_dcache_str},
6757 { 0x66, 4, 64, 8*1024, sl1_dcache_str},
6758 { 0x60, 8, 64, 16*1024, sl1_dcache_str},
6759 { 0x5d, 0, 0, 256, dtlb44_str},
6760 { 0x5c, 0, 0, 128, dtlb44_str},
6761 { 0x5b, 0, 0, 64, dtlb44_str},
6762 { 0x5a, 4, 0, 32, dtlb24_str},
6763 { 0x59, 0, 0, 16, dtlb4k_str},
6764 { 0x57, 4, 0, 16, dtlb4k_str},
6765 { 0x56, 4, 0, 16, dtlb4M_str},
6766 { 0x55, 0, 0, 7, itlb24_str},
6767 { 0x52, 0, 0, 256, itlb424_str},
6768 { 0x51, 0, 0, 128, itlb424_str},
6769 { 0x50, 0, 0, 64, itlb424_str},
6770 { 0x4f, 0, 0, 32, itlb4k_str},
6771 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6772 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6773 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6774 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6775 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6776 { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6777 { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6778 { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6779 { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6780 { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6781 { 0x44, 4, 32, 1024*1024, l2_cache_str},
6782 { 0x43, 4, 32, 512*1024, l2_cache_str},
6783 { 0x42, 4, 32, 256*1024, l2_cache_str},
6784 { 0x41, 4, 32, 128*1024, l2_cache_str},
6785 { 0x3e, 4, 64, 512*1024, sl2_cache_str},
6786 { 0x3d, 6, 64, 384*1024, sl2_cache_str},
6787 { 0x3c, 4, 64, 256*1024, sl2_cache_str},
6788 { 0x3b, 2, 64, 128*1024, sl2_cache_str},
6789 { 0x3a, 6, 64, 192*1024, sl2_cache_str},
6790 { 0x39, 4, 64, 128*1024, sl2_cache_str},
6791 { 0x30, 8, 64, 32*1024, l1_icache_str},
6792 { 0x2c, 8, 64, 32*1024, l1_dcache_str},
6793 { 0x29, 8, 64, 4096*1024, sl3_cache_str},
6794 { 0x25, 8, 64, 2048*1024, sl3_cache_str},
6795 { 0x23, 8, 64, 1024*1024, sl3_cache_str},
6796 { 0x22, 4, 64, 512*1024, sl3_cache_str},
6797 { 0x0e, 6, 64, 24*1024, l1_dcache_str},
6798 { 0x0d, 4, 32, 16*1024, l1_dcache_str},
6799 { 0x0c, 4, 32, 16*1024, l1_dcache_str},
6800 { 0x0b, 4, 0, 4, itlb4M_str},
6801 { 0x0a, 2, 32, 8*1024, l1_dcache_str},
6802 { 0x08, 4, 32, 16*1024, l1_icache_str},
6803 { 0x06, 4, 32, 8*1024, l1_icache_str},
6804 { 0x05, 4, 0, 32, dtlb4M_str},
6805 { 0x04, 4, 0, 8, dtlb4M_str},
6806 { 0x03, 4, 0, 64, dtlb4k_str},
6807 { 0x02, 4, 0, 2, itlb4M_str},
6808 { 0x01, 4, 0, 32, itlb4k_str},
6809 { 0 }
6810 };
6811
6812 static const struct cachetab cyrix_ctab[] = {
6813 { 0x70, 4, 0, 32, "tlb-4K" },
6814 { 0x80, 4, 16, 16*1024, "l1-cache" },
6815 { 0 }
6816 };
6817
6818 /*
6819 * Search a cache table for a matching entry
6820 */
6821 static const struct cachetab *
6822 find_cacheent(const struct cachetab *ct, uint_t code)
6823 {
6824 if (code != 0) {
6825 for (; ct->ct_code != 0; ct++)
6826 if (ct->ct_code <= code)
6827 break;
6828 if (ct->ct_code == code)
6829 return (ct);
6830 }
6831 return (NULL);
6832 }
6833
6834 /*
6835 * Populate cachetab entry with L2 or L3 cache-information using
6836 * cpuid function 4. This function is called from intel_walk_cacheinfo()
6837 * when descriptor 0x49 is encountered. It returns 0 if no such cache
6838 * information is found.
6839 */
6840 static int
6841 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6842 {
6843 uint32_t level, i;
6844 int ret = 0;
6845
6846 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6847 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6848
6849 if (level == 2 || level == 3) {
6850 ct->ct_assoc =
6851 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6852 ct->ct_line_size =
6853 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6854 ct->ct_size = ct->ct_assoc *
6855 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6856 ct->ct_line_size *
6857 (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6858
6859 if (level == 2) {
6860 ct->ct_label = l2_cache_str;
6861 } else if (level == 3) {
6862 ct->ct_label = l3_cache_str;
6863 }
6864 ret = 1;
6865 }
6866 }
6867
6868 return (ret);
6869 }
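
/*
 * Worked example of the size computation above: a leaf-4 entry reporting
 * EBX[31:22] = 7 (8 ways), EBX[21:12] = 0 (1 partition), EBX[11:0] = 63
 * (64-byte lines), and ECX = 2047 (2048 sets) describes an
 * 8 * 1 * 64 * 2048 = 1 MiB cache.
 */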
6870
6871 /*
6872 * Walk the cacheinfo descriptor, applying 'func' to every valid element
6873 * The walk is terminated if the walker returns non-zero.
6874 */
6875 static void
6876 intel_walk_cacheinfo(struct cpuid_info *cpi,
6877 void *arg, int (*func)(void *, const struct cachetab *))
6878 {
6879 const struct cachetab *ct;
6880 struct cachetab des_49_ct, des_b1_ct;
6881 uint8_t *dp;
6882 int i;
6883
6884 if ((dp = cpi->cpi_cacheinfo) == NULL)
6885 return;
6886 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6887 /*
6888 * For overloaded descriptor 0x49 we use cpuid function 4
6889 * if supported by the current processor, to create
6890 * cache information.
6891 * For overloaded descriptor 0xb1 we use X86_PAE flag
6892 * to disambiguate the cache information.
6893 */
6894 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6895 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6896 ct = &des_49_ct;
6897 } else if (*dp == 0xb1) {
6898 des_b1_ct.ct_code = 0xb1;
6899 des_b1_ct.ct_assoc = 4;
6900 des_b1_ct.ct_line_size = 0;
6901 if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6902 des_b1_ct.ct_size = 8;
6903 des_b1_ct.ct_label = itlb2M_str;
6904 } else {
6905 des_b1_ct.ct_size = 4;
6906 des_b1_ct.ct_label = itlb4M_str;
6907 }
6908 ct = &des_b1_ct;
6909 } else {
6910 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6911 continue;
6912 }
6913 }
6914
6915 if (func(arg, ct) != 0) {
6916 break;
6917 }
6918 }
6919 }
6920
6921 /*
6922 * (Like the Intel one, except for Cyrix CPUs)
6923 */
6924 static void
6925 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6926 void *arg, int (*func)(void *, const struct cachetab *))
6927 {
6928 const struct cachetab *ct;
6929 uint8_t *dp;
6930 int i;
6931
6932 if ((dp = cpi->cpi_cacheinfo) == NULL)
6933 return;
6934 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6935 /*
6936 * Search Cyrix-specific descriptor table first ..
6937 */
6938 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6939 if (func(arg, ct) != 0)
6940 break;
6941 continue;
6942 }
6943 /*
6944 * .. else fall back to the Intel one
6945 */
6946 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6947 if (func(arg, ct) != 0)
6948 break;
6949 continue;
6950 }
6951 }
6952 }
6953
6954 /*
6955 * A cacheinfo walker that adds associativity, line-size, and size properties
6956 * to the devinfo node it is passed as an argument.
6957 */
6958 static int
6959 add_cacheent_props(void *arg, const struct cachetab *ct)
6960 {
6961 dev_info_t *devi = arg;
6962
6963 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6964 if (ct->ct_line_size != 0)
6965 add_cache_prop(devi, ct->ct_label, line_str,
6966 ct->ct_line_size);
6967 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6968 return (0);
6969 }
6970
6971
6972 static const char fully_assoc[] = "fully-associative?";
6973
6974 /*
6975 * AMD style cache/tlb description
6976 *
6977 * Extended functions 5 and 6 directly describe properties of
6978 * tlbs and various cache levels.
6979 */
6980 static void
6981 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6982 {
6983 switch (assoc) {
6984 case 0: /* reserved; ignore */
6985 break;
6986 default:
6987 add_cache_prop(devi, label, assoc_str, assoc);
6988 break;
6989 case 0xff:
6990 add_cache_prop(devi, label, fully_assoc, 1);
6991 break;
6992 }
6993 }
6994
6995 static void
6996 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6997 {
6998 if (size == 0)
6999 return;
7000 add_cache_prop(devi, label, size_str, size);
7001 add_amd_assoc(devi, label, assoc);
7002 }
7003
7004 static void
7005 add_amd_cache(dev_info_t *devi, const char *label,
7006 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7007 {
7008 if (size == 0 || line_size == 0)
7009 return;
7010 add_amd_assoc(devi, label, assoc);
7011 /*
7012 * Most AMD parts have a sectored cache. Multiple cache lines are
7013 * associated with each tag. A sector consists of all cache lines
7014 * associated with a tag. For example, the AMD K6-III has a sector
7015 * size of 2 cache lines per tag.
7016 */
7017 if (lines_per_tag != 0)
7018 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7019 add_cache_prop(devi, label, line_str, line_size);
7020 add_cache_prop(devi, label, size_str, size * 1024);
7021 }
7022
7023 static void
7024 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7025 {
7026 switch (assoc) {
7027 case 0: /* off */
7028 break;
7029 case 1:
7030 case 2:
7031 case 4:
7032 add_cache_prop(devi, label, assoc_str, assoc);
7033 break;
7034 case 6:
7035 add_cache_prop(devi, label, assoc_str, 8);
7036 break;
7037 case 8:
7038 add_cache_prop(devi, label, assoc_str, 16);
7039 break;
7040 case 0xf:
7041 add_cache_prop(devi, label, fully_assoc, 1);
7042 break;
7043 default: /* reserved; ignore */
7044 break;
7045 }
7046 }
7047
7048 static void
7049 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7050 {
7051 if (size == 0 || assoc == 0)
7052 return;
7053 add_amd_l2_assoc(devi, label, assoc);
7054 add_cache_prop(devi, label, size_str, size);
7055 }
7056
7057 static void
7058 add_amd_l2_cache(dev_info_t *devi, const char *label,
7059 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7060 {
7061 if (size == 0 || assoc == 0 || line_size == 0)
7062 return;
7063 add_amd_l2_assoc(devi, label, assoc);
7064 if (lines_per_tag != 0)
7065 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7066 add_cache_prop(devi, label, line_str, line_size);
7067 add_cache_prop(devi, label, size_str, size * 1024);
7068 }
7069
7070 static void
7071 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
7072 {
7073 struct cpuid_regs *cp;
7074
7075 if (cpi->cpi_xmaxeax < 0x80000005)
7076 return;
7077 cp = &cpi->cpi_extd[5];
7078
7079 /*
7080 * 4M/2M L1 TLB configuration
7081 *
7082 * We report the size for 2M pages because AMD uses two
7083 * TLB entries for one 4M page.
7084 */
7085 add_amd_tlb(devi, "dtlb-2M",
7086 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
7087 add_amd_tlb(devi, "itlb-2M",
7088 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
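
	/*
	 * Worked example of the decode above (hypothetical register value):
	 * cp_eax == 0x04200440 would yield a 4-way, 32-entry dtlb-2M
	 * (bits [31:24] and [23:16]) and a 4-way, 64-entry itlb-2M
	 * (bits [15:8] and [7:0]).
	 */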
7089
7090 /*
7091 * 4K L1 TLB configuration
7092 */
7093
7094 switch (cpi->cpi_vendor) {
7095 uint_t nentries;
7096 case X86_VENDOR_TM:
7097 if (cpi->cpi_family >= 5) {
7098 /*
7099 * Crusoe processors have 256 TLB entries, but
7100 * cpuid data format constrains them to only
7101 * reporting 255 of them.
7102 */
7103 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
7104 nentries = 256;
7105 /*
7106 * Crusoe processors also have a unified TLB
7107 */
7108 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
7109 nentries);
7110 break;
7111 }
7112 /*FALLTHROUGH*/
7113 default:
7114 add_amd_tlb(devi, itlb4k_str,
7115 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
7116 add_amd_tlb(devi, dtlb4k_str,
7117 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
7118 break;
7119 }
7120
7121 /*
7122 * data L1 cache configuration
7123 */
7124
7125 add_amd_cache(devi, l1_dcache_str,
7126 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
7127 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
7128
7129 /*
7130 * code L1 cache configuration
7131 */
7132
7133 add_amd_cache(devi, l1_icache_str,
7134 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
7135 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
7136
7137 if (cpi->cpi_xmaxeax < 0x80000006)
7138 return;
7139 cp = &cpi->cpi_extd[6];
7140
7141 /* Check for a unified L2 TLB for large pages */
7142
7143 if (BITX(cp->cp_eax, 31, 16) == 0)
7144 add_amd_l2_tlb(devi, "l2-tlb-2M",
7145 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7146 else {
7147 add_amd_l2_tlb(devi, "l2-dtlb-2M",
7148 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7149 add_amd_l2_tlb(devi, "l2-itlb-2M",
7150 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7151 }
7152
7153 /* Check for a unified L2 TLB for 4K pages */
7154
7155 if (BITX(cp->cp_ebx, 31, 16) == 0) {
7156 add_amd_l2_tlb(devi, "l2-tlb-4K",
7157 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7158 } else {
7159 add_amd_l2_tlb(devi, "l2-dtlb-4K",
7160 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7161 add_amd_l2_tlb(devi, "l2-itlb-4K",
7162 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7163 }
7164
7165 add_amd_l2_cache(devi, l2_cache_str,
7166 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
7167 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
7168 }
7169
7170 /*
7171  * There are two basic ways that the x86 world describes its cache
7172 * and tlb architecture - Intel's way and AMD's way.
7173 *
7174 * Return which flavor of cache architecture we should use
7175 */
7176 static int
7177 x86_which_cacheinfo(struct cpuid_info *cpi)
7178 {
7179 switch (cpi->cpi_vendor) {
7180 case X86_VENDOR_Intel:
7181 if (cpi->cpi_maxeax >= 2)
7182 return (X86_VENDOR_Intel);
7183 break;
7184 case X86_VENDOR_AMD:
7185 /*
7186 * The K5 model 1 was the first part from AMD that reported
7187 * cache sizes via extended cpuid functions.
7188 */
7189 if (cpi->cpi_family > 5 ||
7190 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7191 return (X86_VENDOR_AMD);
7192 break;
7193 case X86_VENDOR_HYGON:
7194 return (X86_VENDOR_AMD);
7195 case X86_VENDOR_TM:
7196 if (cpi->cpi_family >= 5)
7197 return (X86_VENDOR_AMD);
7198 /*FALLTHROUGH*/
7199 default:
7200 /*
7201 * If they have extended CPU data for 0x80000005
7202 * then we assume they have AMD-format cache
7203 * information.
7204 *
7205 * If not, and the vendor happens to be Cyrix,
7206 		 * then try our Cyrix-specific handler.
7207 *
7208 * If we're not Cyrix, then assume we're using Intel's
7209 * table-driven format instead.
7210 */
7211 if (cpi->cpi_xmaxeax >= 0x80000005)
7212 return (X86_VENDOR_AMD);
7213 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7214 return (X86_VENDOR_Cyrix);
7215 else if (cpi->cpi_maxeax >= 2)
7216 return (X86_VENDOR_Intel);
7217 break;
7218 }
7219 return (-1);
7220 }
7221
7222 void
7223 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7224 struct cpuid_info *cpi)
7225 {
7226 dev_info_t *cpu_devi;
7227 int create;
7228
7229 cpu_devi = (dev_info_t *)dip;
7230
7231 /* device_type */
7232 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7233 "device_type", "cpu");
7234
7235 /* reg */
7236 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7237 "reg", cpu_id);
7238
7239 /* cpu-mhz, and clock-frequency */
7240 if (cpu_freq > 0) {
7241 long long mul;
7242
7243 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7244 "cpu-mhz", cpu_freq);
7245 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7246 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7247 "clock-frequency", (int)mul);
7248 }
7249
7250 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7251
7252 /* vendor-id */
7253 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7254 "vendor-id", cpi->cpi_vendorstr);
7255
7256 if (cpi->cpi_maxeax == 0) {
7257 return;
7258 }
7259
7260 /*
7261 * family, model, and step
7262 */
7263 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7264 "family", CPI_FAMILY(cpi));
7265 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7266 "cpu-model", CPI_MODEL(cpi));
7267 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7268 "stepping-id", CPI_STEP(cpi));
7269
7270 /* type */
7271 switch (cpi->cpi_vendor) {
7272 case X86_VENDOR_Intel:
7273 create = 1;
7274 break;
7275 default:
7276 create = 0;
7277 break;
7278 }
7279 if (create)
7280 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7281 "type", CPI_TYPE(cpi));
7282
7283 /* ext-family */
7284 switch (cpi->cpi_vendor) {
7285 case X86_VENDOR_Intel:
7286 case X86_VENDOR_AMD:
7287 create = cpi->cpi_family >= 0xf;
7288 break;
7289 case X86_VENDOR_HYGON:
7290 create = 1;
7291 break;
7292 default:
7293 create = 0;
7294 break;
7295 }
7296 if (create)
7297 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7298 "ext-family", CPI_FAMILY_XTD(cpi));
7299
7300 /* ext-model */
7301 switch (cpi->cpi_vendor) {
7302 case X86_VENDOR_Intel:
7303 create = IS_EXTENDED_MODEL_INTEL(cpi);
7304 break;
7305 case X86_VENDOR_AMD:
7306 create = CPI_FAMILY(cpi) == 0xf;
7307 break;
7308 case X86_VENDOR_HYGON:
7309 create = 1;
7310 break;
7311 default:
7312 create = 0;
7313 break;
7314 }
7315 if (create)
7316 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7317 "ext-model", CPI_MODEL_XTD(cpi));
7318
7319 /* generation */
7320 switch (cpi->cpi_vendor) {
7321 case X86_VENDOR_AMD:
7322 case X86_VENDOR_HYGON:
7323 /*
7324 * AMD K5 model 1 was the first part to support this
7325 */
7326 create = cpi->cpi_xmaxeax >= 0x80000001;
7327 break;
7328 default:
7329 create = 0;
7330 break;
7331 }
7332 if (create)
7333 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7334 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7335
7336 /* brand-id */
7337 switch (cpi->cpi_vendor) {
7338 case X86_VENDOR_Intel:
7339 /*
7340 * brand id first appeared on Pentium III Xeon model 8,
7341 * and Celeron model 8 processors and Opteron
7342 */
7343 create = cpi->cpi_family > 6 ||
7344 (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7345 break;
7346 case X86_VENDOR_AMD:
7347 create = cpi->cpi_family >= 0xf;
7348 break;
7349 case X86_VENDOR_HYGON:
7350 create = 1;
7351 break;
7352 default:
7353 create = 0;
7354 break;
7355 }
7356 if (create && cpi->cpi_brandid != 0) {
7357 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7358 "brand-id", cpi->cpi_brandid);
7359 }
7360
7361 /* chunks, and apic-id */
7362 switch (cpi->cpi_vendor) {
7363 /*
7364 * first available on Pentium IV and Opteron (K8)
7365 */
7366 case X86_VENDOR_Intel:
7367 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7368 break;
7369 case X86_VENDOR_AMD:
7370 create = cpi->cpi_family >= 0xf;
7371 break;
7372 case X86_VENDOR_HYGON:
7373 create = 1;
7374 break;
7375 default:
7376 create = 0;
7377 break;
7378 }
7379 if (create) {
7380 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7381 "chunks", CPI_CHUNKS(cpi));
7382 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7383 "apic-id", cpi->cpi_apicid);
7384 if (cpi->cpi_chipid >= 0) {
7385 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7386 "chip#", cpi->cpi_chipid);
7387 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7388 "clog#", cpi->cpi_clogid);
7389 }
7390 }
7391
7392 /* cpuid-features */
7393 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7394 "cpuid-features", CPI_FEATURES_EDX(cpi));
7395
7396
7397 /* cpuid-features-ecx */
7398 switch (cpi->cpi_vendor) {
7399 case X86_VENDOR_Intel:
7400 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7401 break;
7402 case X86_VENDOR_AMD:
7403 create = cpi->cpi_family >= 0xf;
7404 break;
7405 case X86_VENDOR_HYGON:
7406 create = 1;
7407 break;
7408 default:
7409 create = 0;
7410 break;
7411 }
7412 if (create)
7413 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7414 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7415
7416 /* ext-cpuid-features */
7417 switch (cpi->cpi_vendor) {
7418 case X86_VENDOR_Intel:
7419 case X86_VENDOR_AMD:
7420 case X86_VENDOR_HYGON:
7421 case X86_VENDOR_Cyrix:
7422 case X86_VENDOR_TM:
7423 case X86_VENDOR_Centaur:
7424 create = cpi->cpi_xmaxeax >= 0x80000001;
7425 break;
7426 default:
7427 create = 0;
7428 break;
7429 }
7430 if (create) {
7431 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7432 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7433 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7434 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7435 }
7436
7437 /*
7438 * Brand String first appeared in Intel Pentium IV, AMD K5
7439 	 * model 1, and Cyrix GXm. On earlier models we try to
7440 	 * simulate something similar .. so this string should always
7441 	 * say -something- about the processor, however lame.
7442 */
7443 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7444 "brand-string", cpi->cpi_brandstr);
7445
7446 /*
7447 * Finally, cache and tlb information
7448 */
7449 switch (x86_which_cacheinfo(cpi)) {
7450 case X86_VENDOR_Intel:
7451 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7452 break;
7453 case X86_VENDOR_Cyrix:
7454 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7455 break;
7456 case X86_VENDOR_AMD:
7457 amd_cache_info(cpi, cpu_devi);
7458 break;
7459 default:
7460 break;
7461 }
7462 }
7463
7464 struct l2info {
7465 int *l2i_csz;
7466 int *l2i_lsz;
7467 int *l2i_assoc;
7468 int l2i_ret;
7469 };
7470
7471 /*
7472 * A cacheinfo walker that fetches the size, line-size and associativity
7473 * of the L2 cache
7474 */
7475 static int
7476 intel_l2cinfo(void *arg, const struct cachetab *ct)
7477 {
7478 struct l2info *l2i = arg;
7479 int *ip;
7480
7481 if (ct->ct_label != l2_cache_str &&
7482 ct->ct_label != sl2_cache_str)
7483 return (0); /* not an L2 -- keep walking */
7484
7485 if ((ip = l2i->l2i_csz) != NULL)
7486 *ip = ct->ct_size;
7487 if ((ip = l2i->l2i_lsz) != NULL)
7488 *ip = ct->ct_line_size;
7489 if ((ip = l2i->l2i_assoc) != NULL)
7490 *ip = ct->ct_assoc;
7491 l2i->l2i_ret = ct->ct_size;
7492 return (1); /* was an L2 -- terminate walk */
7493 }
7494
7495 /*
7496 * AMD L2/L3 Cache and TLB Associativity Field Definition:
7497 *
7498 * Unlike the associativity for the L1 cache and tlb where the 8 bit
7499 * value is the associativity, the associativity for the L2 cache and
7500 * tlb is encoded in the following table. The 4 bit L2 value serves as
7501 * an index into the amd_afd[] array to determine the associativity.
7502 * -1 is undefined. 0 is fully associative.
7503 */
7504
7505 static int amd_afd[] =
7506 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
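
/*
 * Worked example of the encoding above: amd_l2cacheinfo() below reads the
 * 4-bit associativity field from ECX[15:12] of extended function 6; a value
 * of 0x6 indexes amd_afd[6] and decodes to 8-way, 0xf decodes to 0 (fully
 * associative), and 0x0 (L2 disabled) is filtered out before the table is
 * consulted. Reserved encodings decode to -1 and trip the ASSERT below.
 */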
7507
7508 static void
7509 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7510 {
7511 struct cpuid_regs *cp;
7512 uint_t size, assoc;
7513 int i;
7514 int *ip;
7515
7516 if (cpi->cpi_xmaxeax < 0x80000006)
7517 return;
7518 cp = &cpi->cpi_extd[6];
7519
7520 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7521 (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7522 uint_t cachesz = size * 1024;
7523 assoc = amd_afd[i];
7524
7525 ASSERT(assoc != -1);
7526
7527 if ((ip = l2i->l2i_csz) != NULL)
7528 *ip = cachesz;
7529 if ((ip = l2i->l2i_lsz) != NULL)
7530 *ip = BITX(cp->cp_ecx, 7, 0);
7531 if ((ip = l2i->l2i_assoc) != NULL)
7532 *ip = assoc;
7533 l2i->l2i_ret = cachesz;
7534 }
7535 }
7536
7537 int
7538 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7539 {
7540 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7541 struct l2info __l2info, *l2i = &__l2info;
7542
7543 l2i->l2i_csz = csz;
7544 l2i->l2i_lsz = lsz;
7545 l2i->l2i_assoc = assoc;
7546 l2i->l2i_ret = -1;
7547
7548 switch (x86_which_cacheinfo(cpi)) {
7549 case X86_VENDOR_Intel:
7550 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7551 break;
7552 case X86_VENDOR_Cyrix:
7553 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7554 break;
7555 case X86_VENDOR_AMD:
7556 amd_l2cacheinfo(cpi, l2i);
7557 break;
7558 default:
7559 break;
7560 }
7561 return (l2i->l2i_ret);
7562 }
7563
7564 #if !defined(__xpv)
7565
7566 uint32_t *
7567 cpuid_mwait_alloc(cpu_t *cpu)
7568 {
7569 uint32_t *ret;
7570 size_t mwait_size;
7571
7572 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7573
7574 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7575 if (mwait_size == 0)
7576 return (NULL);
7577
7578 /*
7579 * kmem_alloc() returns cache line size aligned data for mwait_size
7580 * allocations. mwait_size is currently cache line sized. Neither
7581 	 * of these implementation details is guaranteed to be true in the
7582 * future.
7583 *
7584 * First try allocating mwait_size as kmem_alloc() currently returns
7585 * correctly aligned memory. If kmem_alloc() does not return
7586 * mwait_size aligned memory, then use mwait_size ROUNDUP.
7587 *
7588 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7589 * decide to free this memory.
7590 */
7591 ret = kmem_zalloc(mwait_size, KM_SLEEP);
7592 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7593 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7594 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7595 *ret = MWAIT_RUNNING;
7596 return (ret);
7597 } else {
7598 kmem_free(ret, mwait_size);
7599 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7600 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7601 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7602 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7603 *ret = MWAIT_RUNNING;
7604 return (ret);
7605 }
7606 }
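
/*
 * Illustrative arithmetic for the fallback path above, assuming a 64-byte
 * monitor line (mwait_size == 64): if kmem_zalloc() happened to return an
 * unaligned buffer at offset 0x1020, the second allocation of 128 bytes
 * would span [0x1020, 0x10a0) and P2ROUNDUP(0x1020, 64) == 0x1040, leaving
 * a full, properly aligned 64-byte monitor line inside the allocation.
 */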
7607
7608 void
7609 cpuid_mwait_free(cpu_t *cpu)
7610 {
7611 if (cpu->cpu_m.mcpu_cpi == NULL) {
7612 return;
7613 }
7614
7615 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7616 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7617 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7618 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7619 }
7620
7621 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7622 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7623 }
7624
7625 void
7626 patch_tsc_read(int flag)
7627 {
7628 size_t cnt;
7629
7630 switch (flag) {
7631 case TSC_NONE:
7632 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7633 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7634 break;
7635 case TSC_RDTSC_LFENCE:
7636 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7637 (void) memcpy((void *)tsc_read,
7638 (void *)&_tsc_lfence_start, cnt);
7639 break;
7640 case TSC_TSCP:
7641 cnt = &_tscp_end - &_tscp_start;
7642 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7643 break;
7644 default:
7645 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7646 		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7647 break;
7648 }
7649 tsc_type = flag;
7650 }
7651
7652 int
7653 cpuid_deep_cstates_supported(void)
7654 {
7655 struct cpuid_info *cpi;
7656 struct cpuid_regs regs;
7657
7658 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7659 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7660
7661 cpi = CPU->cpu_m.mcpu_cpi;
7662
7663 switch (cpi->cpi_vendor) {
7664 case X86_VENDOR_Intel:
7665 if (cpi->cpi_xmaxeax < 0x80000007)
7666 return (0);
7667
7668 /*
7669 * Does TSC run at a constant rate in all C-states?
7670 */
7671 regs.cp_eax = 0x80000007;
7672 		(void) __cpuid_insn(&regs);
7673 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7674
7675 default:
7676 return (0);
7677 }
7678 }
7679
7680 #endif /* !__xpv */
7681
7682 void
7683 post_startup_cpu_fixups(void)
7684 {
7685 #ifndef __xpv
7686 /*
7687 * Some AMD processors support C1E state. Entering this state will
7688 * cause the local APIC timer to stop, which we can't deal with at
7689 * this time.
7690 */
7691 if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7692 on_trap_data_t otd;
7693 uint64_t reg;
7694
7695 if (!on_trap(&otd, OT_DATA_ACCESS)) {
7696 reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7697 /* Disable C1E state if it is enabled by BIOS */
7698 if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7699 AMD_ACTONCMPHALT_MASK) {
7700 reg &= ~(AMD_ACTONCMPHALT_MASK <<
7701 AMD_ACTONCMPHALT_SHIFT);
7702 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7703 }
7704 }
7705 no_trap();
7706 }
7707 #endif /* !__xpv */
7708 }
7709
7710 void
7711 enable_pcid(void)
7712 {
7713 if (x86_use_pcid == -1)
7714 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7715
7716 if (x86_use_invpcid == -1) {
7717 x86_use_invpcid = is_x86_feature(x86_featureset,
7718 X86FSET_INVPCID);
7719 }
7720
7721 if (!x86_use_pcid)
7722 return;
7723
7724 /*
7725 	 * Intel says that on setting PCIDE, it immediately starts using the PCID
7726 * bits; better make sure there's nothing there.
7727 */
7728 ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7729
7730 setcr4(getcr4() | CR4_PCIDE);
7731 }
7732
7733 /*
7734 * Setup necessary registers to enable XSAVE feature on this processor.
7735 * This function needs to be called early enough, so that no xsave/xrstor
7736 * ops will execute on the processor before the MSRs are properly set up.
7737 *
7738 * Current implementation has the following assumption:
7739 * - cpuid_pass_basic() is done, so that X86 features are known.
7740 * - fpu_probe() is done, so that fp_save_mech is chosen.
7741 */
7742 void
7743 xsave_setup_msr(cpu_t *cpu)
7744 {
7745 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7746 ASSERT(fp_save_mech == FP_XSAVE);
7747 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7748
7749 /* Enable OSXSAVE in CR4. */
7750 setcr4(getcr4() | CR4_OSXSAVE);
7751 /*
7752 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7753 * correct value.
7754 */
7755 cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7756 setup_xfem();
7757 }
7758
7759 /*
7760 * Starting with the Westmere processor the local
7761 * APIC timer will continue running in all C-states,
7762 * including the deepest C-states.
7763 */
7764 int
7765 cpuid_arat_supported(void)
7766 {
7767 struct cpuid_info *cpi;
7768 struct cpuid_regs regs;
7769
7770 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7771 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7772
7773 cpi = CPU->cpu_m.mcpu_cpi;
7774
7775 switch (cpi->cpi_vendor) {
7776 case X86_VENDOR_Intel:
7777 /*
7778 * Always-running Local APIC Timer is
7779 * indicated by CPUID.6.EAX[2].
7780 */
7781 if (cpi->cpi_maxeax >= 6) {
7782 regs.cp_eax = 6;
7783 			(void) cpuid_insn(NULL, &regs);
7784 return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7785 } else {
7786 return (0);
7787 }
7788 default:
7789 return (0);
7790 }
7791 }
7792
7793 /*
7794 * Check support for Intel ENERGY_PERF_BIAS feature
7795 */
7796 int
7797 cpuid_iepb_supported(struct cpu *cp)
7798 {
7799 struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7800 struct cpuid_regs regs;
7801
7802 ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7803 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7804
7805 if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7806 return (0);
7807 }
7808
7809 /*
7810 * Intel ENERGY_PERF_BIAS MSR is indicated by
7811 * capability bit CPUID.6.ECX.3
7812 */
7813 if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7814 return (0);
7815
7816 regs.cp_eax = 0x6;
7817 	(void) cpuid_insn(NULL, &regs);
7818 return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7819 }
7820
7821 /*
7822 * Check support for TSC deadline timer
7823 *
7824 * TSC deadline timer provides a superior software programming
7825 * model over local APIC timer that eliminates "time drifts".
7826 * Instead of specifying a relative time, software specifies an
7827 * absolute time as the target at which the processor should
7828 * generate a timer event.
7829 */
7830 int
7831 cpuid_deadline_tsc_supported(void)
7832 {
7833 struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7834 struct cpuid_regs regs;
7835
7836 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7837 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7838
7839 switch (cpi->cpi_vendor) {
7840 case X86_VENDOR_Intel:
7841 if (cpi->cpi_maxeax >= 1) {
7842 regs.cp_eax = 1;
7843 			(void) cpuid_insn(NULL, &regs);
7844 return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7845 } else {
7846 return (0);
7847 }
7848 default:
7849 return (0);
7850 }
7851 }
7852
7853 #if !defined(__xpv)
7854 /*
7855 * Patch in versions of bcopy for high performance Intel Nhm processors
7856 * and later...
7857 */
7858 void
7859 patch_memops(uint_t vendor)
7860 {
7861 size_t cnt, i;
7862 caddr_t to, from;
7863
7864 if ((vendor == X86_VENDOR_Intel) &&
7865 is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7866 cnt = &bcopy_patch_end - &bcopy_patch_start;
7867 to = &bcopy_ck_size;
7868 from = &bcopy_patch_start;
7869 for (i = 0; i < cnt; i++) {
7870 *to++ = *from++;
7871 }
7872 }
7873 }
7874 #endif /* !__xpv */
7875
7876 /*
7877  * We're being asked to tell the system how many bits are required to represent
7878  * the various core and strand IDs. While it's tempting to derive this based
7879 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7880 * correct. Instead, this needs to be based on the number of bits that the APIC
7881 * allows for these different configurations. We only update these to a larger
7882 * value if we find one.
7883 */
7884 void
7885 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7886 {
7887 struct cpuid_info *cpi;
7888
7889 VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7890 cpi = cpu->cpu_m.mcpu_cpi;
7891
7892 if (cpi->cpi_ncore_bits > *core_nbits) {
7893 *core_nbits = cpi->cpi_ncore_bits;
7894 }
7895
7896 if (cpi->cpi_nthread_bits > *strand_nbits) {
7897 *strand_nbits = cpi->cpi_nthread_bits;
7898 }
7899 }
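
/*
 * Worked example with hypothetical values: if cpi_nthread_bits is 1 and
 * cpi_ncore_bits is 3, the APIC ID reserves one low-order bit to identify
 * the strand within a core and the next three bits to identify the core
 * within the package, so up to 2 strands x 8 cores can be represented even
 * if fewer are actually populated.
 */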
7900
7901 void
7902 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7903 {
7904 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7905 struct cpuid_regs cp;
7906
7907 /*
7908 * Reread the CPUID portions that we need for various security
7909 * information.
7910 */
7911 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7912 /*
7913 * Check if we now have leaf 7 available to us.
7914 */
7915 if (cpi->cpi_maxeax < 7) {
7916 bzero(&cp, sizeof (cp));
7917 cp.cp_eax = 0;
7918 cpi->cpi_maxeax = __cpuid_insn(&cp);
7919 if (cpi->cpi_maxeax < 7)
7920 return;
7921 }
7922
7923 bzero(&cp, sizeof (cp));
7924 cp.cp_eax = 7;
7925 cp.cp_ecx = 0;
7926 (void) __cpuid_insn(&cp);
7927 cpi->cpi_std[7] = cp;
7928 } else if (cpi->cpi_vendor == X86_VENDOR_AMD ||
7929 cpi->cpi_vendor == X86_VENDOR_HYGON) {
7930 /* No xcpuid support */
7931 if (cpi->cpi_family < 5 ||
7932 (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7933 return;
7934
7935 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7936 bzero(&cp, sizeof (cp));
7937 cp.cp_eax = CPUID_LEAF_EXT_0;
7938 cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7939 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7940 return;
7941 }
7942 }
7943
7944 /*
7945 * Most AMD features are in leaf 8. Automatic IBRS was added in
7946 * leaf 0x21. So we also check that.
7947 */
7948 bzero(&cp, sizeof (cp));
7949 cp.cp_eax = CPUID_LEAF_EXT_8;
7950 (void) __cpuid_insn(&cp);
7951 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7952 cpi->cpi_extd[8] = cp;
7953
7954 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21) {
7955 return;
7956 }
7957
7958 bzero(&cp, sizeof (cp));
7959 cp.cp_eax = CPUID_LEAF_EXT_21;
7960 (void) __cpuid_insn(&cp);
7961 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
7962 cpi->cpi_extd[0x21] = cp;
7963 } else {
7964 /*
7965 * Nothing to do here. Return an empty set which has already
7966 * been zeroed for us.
7967 */
7968 return;
7969 }
7970 cpuid_scan_security(cpu, fset);
7971 }
7972
7973 /* ARGSUSED */
7974 static int
7975 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7976 {
7977 uchar_t *fset;
7978 boolean_t first_pass = (boolean_t)arg1;
7979
7980 fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7981 if (first_pass && CPU->cpu_id != 0)
7982 return (0);
7983 if (!first_pass && CPU->cpu_id == 0)
7984 return (0);
7985 cpuid_pass_ucode(CPU, fset);
7986
7987 return (0);
7988 }
7989
7990 /*
7991  * After a microcode update where the version has changed, we need to
7992 * rescan CPUID. To do this we check every CPU to make sure that they have the
7993 * same microcode. Then we perform a cross call to all such CPUs. It's the
7994 * caller's job to make sure that no one else can end up doing an update while
7995 * this is going on.
7996 *
7997 * We assume that the system is microcode capable if we're called.
7998 */
7999 void
8000 cpuid_post_ucodeadm(void)
8001 {
8002 uint32_t rev;
8003 int i;
8004 struct cpu *cpu;
8005 cpuset_t cpuset;
8006 void *argdata;
8007 uchar_t *f0;
8008
8009 argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
8010
8011 mutex_enter(&cpu_lock);
8012 cpu = cpu_get(0);
8013 rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
8014 CPUSET_ONLY(cpuset, 0);
8015 for (i = 1; i < max_ncpus; i++) {
8016 if ((cpu = cpu_get(i)) == NULL)
8017 continue;
8018
8019 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
8020 panic("post microcode update CPU %d has differing "
8021 "microcode revision (%u) from CPU 0 (%u)",
8022 i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
8023 }
8024 CPUSET_ADD(cpuset, i);
8025 }
8026
8027 /*
8028 * We do the cross calls in two passes. The first pass is only for the
8029 * boot CPU. The second pass is for all of the other CPUs. This allows
8030 * the boot CPU to go through and change behavior related to patching or
8031 * whether or not Enhanced IBRS needs to be enabled and then allow all
8032 * other CPUs to follow suit.
8033 */
8034 kpreempt_disable();
8035 xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
8036 cpuid_post_ucodeadm_xc);
8037 xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
8038 cpuid_post_ucodeadm_xc);
8039 kpreempt_enable();
8040
8041 /*
8042 * OK, now look at each CPU and see if their feature sets are equal.
8043 */
8044 f0 = argdata;
8045 for (i = 1; i < max_ncpus; i++) {
8046 uchar_t *fset;
8047 if (!CPU_IN_SET(cpuset, i))
8048 continue;
8049
8050 fset = (uchar_t *)((uintptr_t)argdata +
8051 sizeof (x86_featureset) * i);
8052
8053 if (!compare_x86_featureset(f0, fset)) {
8054 panic("Post microcode update CPU %d has "
8055 			    "differing security feature set (%p) from CPU 0 "
8056 "(%p), not appending to feature set", i,
8057 (void *)fset, (void *)f0);
8058 }
8059 }
8060
8061 mutex_exit(&cpu_lock);
8062
8063 for (i = 0; i < NUM_X86_FEATURES; i++) {
8064 cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
8065 x86_feature_names[i]);
8066 if (is_x86_feature(f0, i)) {
8067 add_x86_feature(x86_featureset, i);
8068 }
8069 }
8070 kmem_free(argdata, sizeof (x86_featureset) * NCPU);
8071 }
8072
8073 typedef void (*cpuid_pass_f)(cpu_t *, void *);
8074
8075 typedef struct cpuid_pass_def {
8076 cpuid_pass_t cpd_pass;
8077 cpuid_pass_f cpd_func;
8078 } cpuid_pass_def_t;
8079
8080 /*
8081 * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
8082 * normal sense and should not appear here.
8083 */
8084 static const cpuid_pass_def_t cpuid_pass_defs[] = {
8085 { CPUID_PASS_PRELUDE, cpuid_pass_prelude },
8086 { CPUID_PASS_IDENT, cpuid_pass_ident },
8087 { CPUID_PASS_BASIC, cpuid_pass_basic },
8088 { CPUID_PASS_EXTENDED, cpuid_pass_extended },
8089 { CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
8090 { CPUID_PASS_RESOLVE, cpuid_pass_resolve },
8091 };
8092
8093 void
8094 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
8095 {
8096 VERIFY3S(pass, !=, CPUID_PASS_NONE);
8097
8098 if (cp == NULL)
8099 cp = CPU;
8100
8101 /*
8102 * Space statically allocated for BSP, ensure pointer is set
8103 */
8104 if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
8105 cp->cpu_m.mcpu_cpi = &cpuid_info0;
8106
8107 ASSERT(cpuid_checkpass(cp, pass - 1));
8108
8109 for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
8110 if (cpuid_pass_defs[i].cpd_pass == pass) {
8111 cpuid_pass_defs[i].cpd_func(cp, arg);
8112 cp->cpu_m.mcpu_cpi->cpi_pass = pass;
8113 return;
8114 }
8115 }
8116
8117 panic("unable to execute invalid cpuid pass %d on cpu%d\n",
8118 pass, cp->cpu_id);
8119 }
8120
8121 /*
8122 * Extract the processor family from a chiprev. Processor families are not the
8123 * same as cpuid families; see comments above and in x86_archext.h.
8124 */
8125 x86_processor_family_t
8126 chiprev_family(const x86_chiprev_t cr)
8127 {
8128 return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
8129 }
8130
8131 /*
8132 * A chiprev matches its template if the vendor and family are identical and the
8133 * revision of the chiprev matches one of the bits set in the template. Callers
8134 * may bitwise-OR together chiprevs of the same vendor and family to form the
8135 * template, or use the _ANY variant. It is not possible to match chiprevs of
8136 * multiple vendors or processor families with a single call. Note that this
8137 * function operates on processor families, not cpuid families.
8138 */
8139 boolean_t
8140 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
8141 {
8142 return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
8143 _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
8144 (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
8145 }
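
/*
 * Illustrative use of chiprev_matches() (the chiprev constants here are
 * examples only; any revisions of the same vendor and processor family may
 * be OR'd together, and apply_workaround() is a hypothetical caller):
 *
 *	if (chiprev_matches(cpuid_getchiprev(CPU),
 *	    X86_CHIPREV_AMD_MILAN_B0 | X86_CHIPREV_AMD_MILAN_B1))
 *		apply_workaround();
 */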
8146
8147 /*
8148 * A chiprev is at least min if the vendor and family are identical and the
8149 * revision of the chiprev is at least as recent as that of min. Processor
8150 * families are considered unordered and cannot be compared using this function.
8151 * Note that this function operates on processor families, not cpuid families.
8152 * Use of the _ANY chiprev variant with this function is not useful; it will
8153 * always return B_FALSE if the _ANY variant is supplied as the minimum
8154 * revision. To determine only whether a chiprev is of a given processor
8155 * family, test the return value of chiprev_family() instead.
8156 */
8157 boolean_t
8158 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
8159 {
8160 return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
8161 _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
8162 _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
8163 }
8164
8165 /*
8166 * The uarch functions operate in a manner similar to the chiprev functions
8167 * above. While it is tempting to allow these to operate on microarchitectures
8168 * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
8169 * than ZEN2), we elect not to do so because a manufacturer may supply
8170 * processors of multiple different microarchitecture families each of which may
8171 * be internally ordered but unordered with respect to those of other families.
8172 */
8173 x86_uarch_t
8174 uarchrev_uarch(const x86_uarchrev_t ur)
8175 {
8176 return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
8177 }
8178
8179 boolean_t
8180 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
8181 {
8182 return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
8183 _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
8184 (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
8185 }
8186
8187 boolean_t
8188 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
8189 {
8190 return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
8191 _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
8192 _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
8193 }
8194
8195 /*
8196 * Topology cache related information. This is yet another cache interface that
8197  * we're exposing; it is intended to be used when we have either Intel Leaf 4 or
8198 * AMD Leaf 8x1D (introduced with Zen 1).
8199 */
8200 static boolean_t
8201 cpuid_cache_topo_sup(const struct cpuid_info *cpi)
8202 {
8203 switch (cpi->cpi_vendor) {
8204 case X86_VENDOR_Intel:
8205 if (cpi->cpi_maxeax >= 4) {
8206 return (B_TRUE);
8207 }
8208 break;
8209 case X86_VENDOR_AMD:
8210 case X86_VENDOR_HYGON:
8211 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
8212 is_x86_feature(x86_featureset, X86FSET_TOPOEXT)) {
8213 return (B_TRUE);
8214 }
8215 break;
8216 default:
8217 break;
8218 }
8219
8220 return (B_FALSE);
8221 }
8222
8223 int
8224 cpuid_getncaches(struct cpu *cpu, uint32_t *ncache)
8225 {
8226 const struct cpuid_info *cpi;
8227
8228 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8229 cpi = cpu->cpu_m.mcpu_cpi;
8230
8231 if (!cpuid_cache_topo_sup(cpi)) {
8232 return (ENOTSUP);
8233 }
8234
8235 *ncache = cpi->cpi_cache_leaf_size;
8236 return (0);
8237 }
8238
8239 int
8240 cpuid_getcache(struct cpu *cpu, uint32_t cno, x86_cache_t *cache)
8241 {
8242 const struct cpuid_info *cpi;
8243 const struct cpuid_regs *cp;
8244
8245 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8246 cpi = cpu->cpu_m.mcpu_cpi;
8247
8248 if (!cpuid_cache_topo_sup(cpi)) {
8249 return (ENOTSUP);
8250 }
8251
8252 if (cno >= cpi->cpi_cache_leaf_size) {
8253 return (EINVAL);
8254 }
8255
8256 	bzero(cache, sizeof (*cache));
8257 cp = cpi->cpi_cache_leaves[cno];
8258 switch (CPI_CACHE_TYPE(cp)) {
8259 case CPI_CACHE_TYPE_DATA:
8260 cache->xc_type = X86_CACHE_TYPE_DATA;
8261 break;
8262 case CPI_CACHE_TYPE_INSTR:
8263 cache->xc_type = X86_CACHE_TYPE_INST;
8264 break;
8265 case CPI_CACHE_TYPE_UNIFIED:
8266 cache->xc_type = X86_CACHE_TYPE_UNIFIED;
8267 break;
8268 case CPI_CACHE_TYPE_DONE:
8269 default:
8270 return (EINVAL);
8271 }
8272 cache->xc_level = CPI_CACHE_LVL(cp);
8273 if (CPI_FULL_ASSOC_CACHE(cp) != 0) {
8274 cache->xc_flags |= X86_CACHE_F_FULL_ASSOC;
8275 }
8276 cache->xc_nparts = CPI_CACHE_PARTS(cp) + 1;
8277 /*
8278 * The number of sets is reserved on AMD if the CPU is tagged as fully
8279 	 * associative, whereas it is considered valid on Intel.
8280 */
8281 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
8282 CPI_FULL_ASSOC_CACHE(cp) != 0) {
8283 cache->xc_nsets = 1;
8284 } else {
8285 cache->xc_nsets = CPI_CACHE_SETS(cp) + 1;
8286 }
8287 cache->xc_nways = CPI_CACHE_WAYS(cp) + 1;
8288 cache->xc_line_size = CPI_CACHE_COH_LN_SZ(cp) + 1;
8289 cache->xc_size = cache->xc_nparts * cache->xc_nsets * cache->xc_nways *
8290 cache->xc_line_size;
8291 /*
8292 	 * We're looking for the number of bits needed to cover the number of
8293 	 * CPUs that share this cache. Normally we would use the value minus
8294 	 * one, but the CPUID field is already encoded as the actual value
8295 	 * minus one, so we don't modify it at all.
8296 */
8297 cache->xc_apic_shift = highbit(CPI_NTHR_SHR_CACHE(cp));
8298
8299 /*
8300 * To construct a unique ID we construct a uint64_t that looks as
8301 * follows:
8302 *
8303 * [47:40] cache level
8304 * [39:32] CPUID cache type
8305 * [31:00] shifted APIC ID
8306 *
8307 * The shifted APIC ID gives us a guarantee that a given cache entry is
8308 	 * unique within its peers. The other two numbers ensure that the ID is
8309 	 * unique among the different caches within the CPU. If we just had the
8310 * APIC ID shifted over by the indicated number of bits we'd end up with
8311 * an ID of zero for the L1I, L1D, L2, and L3.
8312 *
8313 * The format of this ID is private to the system and can change across
8314 * a reboot for the time being.
8315 */
8316 cache->xc_id = (uint64_t)cache->xc_level << 40;
8317 cache->xc_id |= (uint64_t)cache->xc_type << 32;
8318 cache->xc_id |= (uint64_t)cpi->cpi_apicid >> cache->xc_apic_shift;
8319
8320 return (0);
8321 }
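
/*
 * Worked example of the ID construction above (hypothetical values): for a
 * level-2 cache shared between two strands (xc_apic_shift == 1) on the
 * logical CPU with APIC ID 0x5, the ID is
 * (2ULL << 40) | ((uint64_t)xc_type << 32) | (0x5 >> 1); both sharing
 * strands (APIC IDs 0x4 and 0x5) compute the same shifted value of 0x2 and
 * therefore the same xc_id, while the level and type bits keep it distinct
 * from that CPU's other caches.
 */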