Print this page
    
5513 KM_NORMALPRI should be documented in kmem_alloc(9f) and kmem_cache_create(9f) man pages
14465 Present KM_NOSLEEP_LAZY as documented interface
Change-Id: I002ec28ddf390650f1fcba1ca94f6abfdb241439
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/intel/io/imc/imc.c
          +++ new/usr/src/uts/intel/io/imc/imc.c
   1    1  /*
   2    2   * This file and its contents are supplied under the terms of the
   3    3   * Common Development and Distribution License ("CDDL"), version 1.0.
   4    4   * You may only use this file in accordance with the terms of version
   5    5   * 1.0 of the CDDL.
   6    6   *
   7    7   * A full copy of the text of the CDDL should have accompanied this
   8    8   * source.  A copy of the CDDL is also available via the Internet at
   9    9   * http://www.illumos.org/license/CDDL.
  10   10   */
  11   11  
  12   12  /*
  13   13   * Copyright 2019 Joyent, Inc.
  14   14   */
  15   15  
  16   16  /*
  17   17   * Generic Intel Integrated Memory Controller (IMC) Driver
  18   18   *
  19   19   * This driver talks to the CPU's IMC to understand the detailed topology of the
  20   20   * processor and to determine how to map between physical addresses to the
  21   21   * corresponding DIMM. This driver supports the following generations of Intel
  22   22   * chips:
  23   23   *
  24   24   *  - Sandy Bridge
  25   25   *  - Ivy Bridge
  26   26   *  - Haswell
  27   27   *  - Broadwell
  28   28   *  - Skylake / Cascade Lake
  29   29   *
  30   30   * Memory Decoding
  31   31   * ---------------
  32   32   *
  33   33   * For more detailed summaries of the memory decoding process, please refer to
  34   34   * the Intel External Design Specifications for the corresponding processor.
  35   35   * What follows is a rough overview of how the memory decoding system works.
  36   36   *
  37   37   * First, we'd like to define the following concepts:
  38   38   *
  39   39   * SYSTEM ADDRESS
  40   40   *
  41   41   *      This is a physical address that the operating system normally uses. This
  42   42   *      address may refer to DRAM, it may refer to memory mapped PCI
  43   43   *      configuration space or device registers, or it may refer to other parts
  44   44   *      of the system's memory map, such as the extended advanced programmable
  45   45   *      interrupt controller (xAPIC), etc.
  46   46   *
  47   47   * DIMM
  48   48   *
  49   49   *      Dual-inline memory module. This refers to a physical stick of volatile
  50   50   *      memory that is inserted into a slot on the motherboard.
  51   51   *
  52   52   * RANK
  53   53   *
  54   54   *      A potential sub-division of a DIMM. A DIMM's memory capacity is divided
  55   55   *      into a number of equal sized ranks. For example, an 8 GiB DIMM, may have
  56   56   *      1 8 GiB rank, 2 4 GiB ranks, or 4 2 GiB ranks.
  57   57   *
  58   58   * RANK ADDRESS
  59   59   *
  60   60   *      An address that exists in the context of a given rank on a DIMM. All
  61   61   *      ranks have overlapping addresses, so the address 0x400 exists on all
  62   62   *      ranks on a given DIMM.
  63   63   *
  64   64   * CHANNEL
  65   65   *
  66   66   *      Multiple DIMMs may be combined into a single channel. The channel
  67   67   *      represents the combined memory of all the DIMMs. A given channel only
  68   68   *      ever exists on a socket and is bound to a single memory controller.
  69   69   *
  70   70   * CHANNEL ADDRESS
  71   71   *
  72   72   *      This is an address that exists logically on a channel. Each address on a
  73   73   *      channel maps to a corresponding DIMM that exists on that channel. The
  74   74   *      address space on one channel is independent from that on another. This
  75   75   *      means that address 0x1000 can exist on each memory channel in the
  76   76   *      system.
  77   77   *
  78   78   * INTERLEAVE
  79   79   *
  80   80   *      There are several different cases where interleaving occurs on the
  81   81   *      system. For example, addresses may be interleaved across sockets,
  82   82   *      memory channels, or DIMM ranks. When addresses are interleaved, then
  83   83   *      some number of bits in an address are used to select which target to go
  84   84   *      to (usually through a look up table). The effect of interleaving is that
  85   85   *      addresses that are next to one another may not all go to the same
  86   86   *      device. The following image shows a non-interleaving case.
  87   87   *
  88   88   *      0x0fff +-----+             +-----+ 0x7ff
  89   89   *             |     |\___________/|     |
  90   90   *             |     |  __________ | (b) |
  91   91   *             |     | /          \|     |
  92   92   *      0x0800 |=====|=            +-----+ 0x000       +-----+ 0x7ff
  93   93   *             |     | \______________________________/|     |
  94   94   *             |     | _______________________________ | (a) |
  95   95   *             |     |/                               \|     |
  96   96   *      0x0000 +-----+                                 +-----+ 0x000
  97   97   *
  98   98   *      In this example of non-interleaving, addresses 0x0000 to 0x07ff go to
   99   99   *      device (a). While, addresses 0x0800 to 0x0fff go to device (b).
 100  100   *      However, each range is divided into the same number of components.
 101  101   *
 102  102   *      If instead, we were to look at that with interleaving, what we might say
 103  103   *      is that rather than splitting the range in half, we might say that if
 104  104   *      the address has bit 8 set (0x100), then it goes to (b), otherwise it
 105  105   *      goes to (a). This means that addresses 0x000 to 0x0ff, would go to (a).
 106  106   *      0x100 to 0x1ff would go to (b). 0x200 to 0x2ff would go back to (a)
  107  107   *      again, and then 0x300 to 0x3ff would go back to (b). This would continue
 108  108   *      for a while. This would instead look something more like:
 109  109   *
 110  110   *
 111  111   *      0x0fff +-----+       A: 0x7ff +---------+   B: 0x7ff +---------+
 112  112   *             | (b) |                | e00-eff |            | f00-fff |
 113  113   *      0x0f00 |-----|          0x700 +---------+      0x700 +---------+
 114  114   *             | (a) |                | c00-cff |            | d00-dff |
 115  115   *      0x0e00 ~~~~~~~          0x600 +---------+      0x600 +---------+
 116  116   *               ***                  | a00-aff |            | b00-bff |
 117  117   *      0x0400 ~~~~~~~          0x500 +---------+      0x500 +---------+
 118  118   *             | (b) |                | 800-8ff |            | 900-9ff |
 119  119   *      0x0300 |-----|          0x400 +---------+      0x400 +---------+
 120  120   *             | (a) |                | 600-6ff |            | 700-7ff |
 121  121   *      0x0200 |-----|          0x300 +---------+      0x300 +---------+
 122  122   *             | (b) |                | 400-4ff |            | 500-5ff |
 123  123   *      0x0100 |-----|          0x200 +---------+      0x200 +---------+
 124  124   *             | (a) |                | 200-2ff |            | 300-3ff |
 125  125   *      0x0000 +-----+          0x100 +---------+      0x100 +---------+
 126  126   *                                    | 000-0ff |            | 100-1ff |
 127  127   *                              0x000 +---------+      0x000 +---------+
 128  128   *
 129  129   *      In this example we've performed two-way interleaving. The number of ways
 130  130   *      that something can interleave varies based on what we're interleaving
 131  131   *      between.
 132  132   *
 133  133   * MEMORY CONTROLLER
 134  134   *
 135  135   *      A given processor die (see uts/i86pc/os/cpuid.c) contains a number of
  136  136   *      memory controllers, usually one or two. Each memory controller supports a
 137  137   *      given number of DIMMs, which are divided across multiple channels.
 138  138   *
 139  139   * TARGET ADDRESS DECODER
 140  140   *
 141  141   *      The target address decoder (TAD) is responsible for taking a system
 142  142   *      address and transforming it into a channel address based on the rules
 143  143   *      that are present. Each memory controller has a corresponding TAD. The
 144  144   *      TAD is often contained in a device called a 'Home Agent'.
 145  145   *
 146  146   * SYSTEM ADDRESS DECODER
 147  147   *
 148  148   *      The system address decoder (SAD) is responsible for taking a system
 149  149   *      address and directing it to the right place, whether this be memory or
 150  150   *      otherwise. There is a single memory controller per socket (see
 151  151   *      uts/i86pc/os/cpuid.c) that is shared between all the cores currently.
 152  152   *
 153  153   * NODE IDENTIFIER
 154  154   *
 155  155   *      The node identifier is used to uniquely identify an element in the
 156  156   *      various routing topologies on the die (see uts/i86pc/os/cpuid.c for the
 157  157   *      definition of 'die'). One can roughly think about this as a unique
 158  158   *      identifier for the socket itself. In general, the primary node ID for a
 159  159   *      socket should map to the socket APIC ID.
 160  160   *
 161  161   * Finding Devices
 162  162   * ---------------
 163  163   *
 164  164   * There is a bit of a chicken and egg problem on Intel systems and in the
 165  165   * device driver interface. The information that we need in the system is spread
 166  166   * out amongst a large number of different PCI devices that the processor
 167  167   * exposes. The number of such devices can vary based on the processor
 168  168   * generation and the specific SKU in the processor. To deal with this, we break
 169  169   * the driver into two different components: a stub driver and the full driver.
 170  170   *
 171  171   * The stub driver has aliases for all known PCI devices that we might attach to
 172  172   * in a given generation on the system. This driver is called 'imcstub'. When a
 173  173   * stub attaches, it just registers itself with the main driver, upon which it
 174  174   * has a module dependency.
 175  175   *
 176  176   * The main driver, 'imc', is a pseudo-device driver. When it first attaches, it
 177  177   * kicks off a scan of the device tree which takes place in a task queue. Once
 178  178   * there, it determines the number of devices that it expects to exist by
 179  179   * walking the tree and comparing it against the generation-specific table.
 180  180   *
 181  181   * If all devices are found, we'll go ahead and read through all the devices and
 182  182   * build a map of all the information we need to understand the topology of the
 183  183   * system and to be able to decode addresses. We do this here, because we can be
 184  184   * asked to perform decoding in dangerous contexts (after taking an MCE, panic,
 185  185   * etc) where we don't want to have to rely on the broader kernel functioning at
 186  186   * this point in time.
 187  187   *
 188  188   * Once our topology is built, we'll create minor nodes which are used by the
 189  189   * fault management architecture to query for information and register our
 190  190   * decoding functionality with the kernel.
 191  191   *
 192  192   * PCI Numbering
 193  193   * -------------
 194  194   *
 195  195   * For each device that we care about, Intel defines the device and function
 196  196   * that we can expect to find the information and PCI configuration space
 197  197   * registers that we care about at. However, the PCI bus is not well defined.
 198  198   * Devices that are on the same socket use the same set of bus numbers; however,
 199  199   * some sockets have multiple device numbers that they'll use to represent
 200  200   * different classes. These bus numbers are programmed by systems firmware as
 201  201   * part of powering on the system. This means, that we need the ability to
 202  202   * map together these disparate ranges ourselves.
 203  203   *
 204  204   * There is a device called a utility box (UBOX), which exists per-socket and
 205  205   * maps the different sockets together. We use this to determine which devices
 206  206   * correspond to which sockets.
 207  207   *
 208  208   * Mapping Sockets
 209  209   * ---------------
 210  210   *
 211  211   * Another wrinkle is that the way that the OS sees the numbering of the CPUs is
 212  212   * generally based on the APIC ID (see uts/i86pc/os/cpuid.c for more
 213  213   * information). However, to map to the corresponding socket, we need to look at
 214  214   * the socket's node ID. The order of PCI buses in the system is not required to
 215  215   * have any relation to the socket ID. Therefore, we have to have yet another
 216  216   * indirection table in the imc_t.
 217  217   *
 218  218   * Exposing Data
 219  219   * -------------
 220  220   *
 221  221   * We expose topology data to FMA using the OS-private memory controller
 222  222   * interfaces. By creating minor nodes of the type, 'ddi_mem_ctrl', there are a
 223  223   * number of specific interfaces that we can then implement. The ioctl API asks
 224  224   * us for a snapshot of data, which basically has us go through and send an
 225  225   * nvlist_t to userland. This nvlist_t is constructed as part of the scan
 226  226   * process. This nvlist uses the version 1 format, which more explicitly encodes
 227  227   * the topology in a series of nested nvlists.
 228  228   *
 229  229   * In addition, the tool /usr/lib/fm/fmd/mcdecode can be used to query the
 230  230   * decoder and ask it to perform decoding.
 231  231   *
 232  232   * Decoding Addresses
 233  233   * ------------------
 234  234   *
 235  235   * The decoding logic can be found in common/imc/imc_decode.c. This file is
 236  236   * shared between the kernel and userland to allow for easier testing and
 237  237   * additional flexibility in operation. The decoding process happens in a few
 238  238   * different phases.
 239  239   *
 240  240   * The first phase, is to determine which memory controller on which socket is
 241  241   * responsible for this data. To determine this, we use the system address
 242  242   * decoder and walk the rules, looking for the correct target. There are various
 243  243   * manipulations to the address that exist which are used to determine which
 244  244   * index we use. The way that we interpret the output of the rule varies
 245  245   * somewhat based on the generation. Sandy Bridge just has a node ID which
 246  246   * points us to the socket with its single IMC. On Ivy Bridge through Broadwell,
 247  247   * the memory controller to use is also encoded in part of the node ID. Finally,
 248  248   * on Skylake, the SAD tells us which socket to look at. The socket in question
 249  249   * then has a routing table which tells us which channel on which memory
 250  250   * controller that is local to that socket.
 251  251   *
 252  252   * Once we have the target memory controller, we walk the list of target address
 253  253   * decoder rules. These rules can help tell us which channel we care about
 254  254   * (which is required on Sandy Bridge through Broadwell) and then describe some
 255  255   * amount of the interleaving rules which are used to turn the system address
 256  256   * into a channel address.
 257  257   *
 258  258   * Once we know the channel and the channel address, we walk the rank interleave
 259  259   * rules which help us determine which DIMM and the corresponding rank on it
 260  260   * that the corresponding channel address is on. It also has logic that we need
 261  261   * to use to determine how to transform a channel address into an address on
 262  262   * that specific rank. Once we have that, then the initial decoding is done.
 263  263   *
 264  264   * The logic in imc_decode.c is abstracted away from the broader kernel CMI
 265  265   * logic.  This is on purpose and allows us not only an easier time unit testing
 266  266   * the logic, but also allows us to express more high fidelity errors that are
 267  267   * translated into a much smaller subset. This logic is exercised in the
 268  268   * 'imc_test' program which is built in 'test/os-tests/tests/imc'.
 269  269   *
 270  270   * Limitations
 271  271   * -----------
 272  272   *
 273  273   * Currently, this driver has the following limitations:
 274  274   *
 275  275   *  o It doesn't decode the row and column addresses.
 276  276   *  o It doesn't encode from a DIMM address to a system address.
 277  277   *  o It doesn't properly support lockstep and mirroring modes on Sandy Bridge -
 278  278   *    Broadwell platforms.
 279  279   *  o It doesn't support virtual lockstep and adaptive mirroring on Purley
 280  280   *    platforms.
 281  281   *  o It doesn't properly handle Intel Optane (3D-X Point) NVDIMMs.
 282  282   *  o It doesn't know how to decode three way channel interleaving.
 283  283   *
 284  284   * None of these are intrinsic problems to the driver, it's mostly a matter of
 285  285   * having proper documentation and testing.
 286  286   */
 287  287  
 288  288  #include <sys/modctl.h>
 289  289  #include <sys/conf.h>
 290  290  #include <sys/devops.h>
 291  291  #include <sys/ddi.h>
 292  292  #include <sys/sunddi.h>
 293  293  #include <sys/types.h>
 294  294  #include <sys/file.h>
 295  295  #include <sys/errno.h>
 296  296  #include <sys/open.h>
 297  297  #include <sys/cred.h>
 298  298  #include <sys/pci.h>
 299  299  #include <sys/sysmacros.h>
 300  300  #include <sys/avl.h>
 301  301  #include <sys/stat.h>
 302  302  #include <sys/policy.h>
 303  303  
 304  304  #include <sys/cpu_module.h>
 305  305  #include <sys/mc.h>
 306  306  #include <sys/mc_intel.h>
 307  307  
 308  308  #include "imc.h"
 309  309  
 310  310  /*
 311  311   * These tables contain generational data that varies between processor
 312  312   * generation such as the maximum number of sockets, memory controllers, and the
 313  313   * offsets of the various registers.
 314  314   */
 315  315  
/*
 * Sandy Bridge (SNB) generation parameters: topology limits (sockets, IMCs,
 * channels, DIMMs, ranks) and the PCI configuration-space register offsets
 * used to walk the SAD/TAD/RIR decoding rules described in the theory
 * statement at the top of this file. Offset values are presumably taken from
 * the Intel External Design Specification for this generation — confirm
 * against that document before changing any of them.
 */
  316  316  static const imc_gen_data_t imc_gen_data_snb = {
  317  317          .igd_max_sockets = 4,
  318  318          .igd_max_imcs = 2,
  319  319          .igd_max_channels = 4,
  320  320          .igd_max_dimms = 3,
  321  321          .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
  322  322          .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
  323  323              IMC_REG_MC_MTR2 },
  324  324          .igd_mcmtr_offset = 0x7c,
  325  325          .igd_tolm_offset = 0x80,
  326  326          .igd_tohm_low_offset = 0x84,
  327  327          .igd_sad_dram_offset = 0x80,	/* SNB: 10 SAD DRAM rules at 0x80 (IVB+ differ) */
  328  328          .igd_sad_ndram_rules = 10,
  329  329          .igd_sad_nodeid_offset = 0x40,
  330  330          .igd_tad_nrules = 12,
  331  331          .igd_tad_rule_offset = 0x40,
  332  332          .igd_tad_chan_offset = 0x90,
  333  333          .igd_tad_sysdef = 0x80,
  334  334          .igd_tad_sysdef2 = 0x84,
  335  335          .igd_mc_mirror = 0xac,
  336  336          .igd_rir_nways = 5,
  337  337          .igd_rir_way_offset = 0x108,
  338  338          .igd_rir_nileaves = 8,
  339  339          .igd_rir_ileave_offset = 0x120,
  340  340          .igd_ubox_cpubusno_offset = 0xd0,
  341  341  };
 342  342  
/*
 * Ivy Bridge (IVB) generation parameters. Identical to Sandy Bridge except
 * for the SAD DRAM rule set, which moves to offset 0x60 and doubles to 20
 * rules. Offsets presumably come from the generation's Intel External Design
 * Specification — verify there before modifying.
 */
  343  343  static const imc_gen_data_t imc_gen_data_ivb = {
  344  344          .igd_max_sockets = 4,
  345  345          .igd_max_imcs = 2,
  346  346          .igd_max_channels = 4,
  347  347          .igd_max_dimms = 3,
  348  348          .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
  349  349          .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
  350  350              IMC_REG_MC_MTR2 },
  351  351          .igd_mcmtr_offset = 0x7c,
  352  352          .igd_tolm_offset = 0x80,
  353  353          .igd_tohm_low_offset = 0x84,
  354  354          .igd_sad_dram_offset = 0x60,	/* differs from SNB (0x80, 10 rules) */
  355  355          .igd_sad_ndram_rules = 20,
  356  356          .igd_sad_nodeid_offset = 0x40,
  357  357          .igd_tad_nrules = 12,
  358  358          .igd_tad_rule_offset = 0x40,
  359  359          .igd_tad_chan_offset = 0x90,
  360  360          .igd_tad_sysdef = 0x80,
  361  361          .igd_tad_sysdef2 = 0x84,
  362  362          .igd_mc_mirror = 0xac,
  363  363          .igd_rir_nways = 5,
  364  364          .igd_rir_way_offset = 0x108,
  365  365          .igd_rir_nileaves = 8,
  366  366          .igd_rir_ileave_offset = 0x120,
  367  367          .igd_ubox_cpubusno_offset = 0xd0,
  368  368  };
 369  369  
/*
 * Shared generation parameters for Haswell and Broadwell ("has_brd"), which
 * use the same register layout. Notable differences from IVB: a larger rank
 * limit (IMC_MTR_DDR_RANKS_MAX_HAS_SKX), TOLM/TOHM relocated to 0xd0/0xd4,
 * and a separate high half of TOHM (igd_tohm_hi_offset), absent on SNB/IVB.
 * Offsets presumably come from the Intel External Design Specifications for
 * these generations — verify there before modifying.
 */
  370  370  static const imc_gen_data_t imc_gen_data_has_brd = {
  371  371          .igd_max_sockets = 4,
  372  372          .igd_max_imcs = 2,
  373  373          .igd_max_channels = 4,
  374  374          .igd_max_dimms = 3,
  375  375          .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX,
  376  376          .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
  377  377              IMC_REG_MC_MTR2 },
  378  378          .igd_mcmtr_offset = 0x7c,
  379  379          .igd_tolm_offset = 0xd0,
  380  380          .igd_tohm_low_offset = 0xd4,
  381  381          .igd_tohm_hi_offset = 0xd8,	/* TOHM grows a high register on these parts */
  382  382          .igd_sad_dram_offset = 0x60,
  383  383          .igd_sad_ndram_rules = 20,
  384  384          .igd_sad_nodeid_offset = 0x40,
  385  385          .igd_tad_nrules = 12,
  386  386          .igd_tad_rule_offset = 0x40,
  387  387          .igd_tad_chan_offset = 0x90,
  388  388          .igd_tad_sysdef = 0x80,
  389  389          .igd_tad_sysdef2 = 0x84,
  390  390          .igd_mc_mirror = 0xac,
  391  391          .igd_rir_nways = 5,
  392  392          .igd_rir_way_offset = 0x108,
  393  393          .igd_rir_nileaves = 8,
  394  394          .igd_rir_ileave_offset = 0x120,
  395  395          .igd_ubox_cpubusno_offset = 0xd0,
  396  396  };
 397  397  
/*
 * Skylake / Cascade Lake (SKX) generation parameters. This generation
 * diverges substantially from earlier ones: up to 8 sockets, 3 channels per
 * IMC with 2 DIMMs each, only two MTR registers, a topology register
 * (igd_topo_offset) with no SNB-IVB style sysdef/mirror registers, and fewer
 * RIR ways/interleaves. Per the theory statement above, on SKX the SAD points
 * at a socket and a routing table picks the channel/IMC, which is why several
 * SNB-era fields are simply absent here. Offsets presumably come from the
 * SKX Intel External Design Specification — verify there before modifying.
 */
  398  398  static const imc_gen_data_t imc_gen_data_skx = {
  399  399          .igd_max_sockets = 8,
  400  400          .igd_max_imcs = 2,
  401  401          .igd_max_channels = 3,
  402  402          .igd_max_dimms = 2,
  403  403          .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
  404  404          .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1 },
  405  405          .igd_mcmtr_offset = 0x87c,
  406  406          .igd_topo_offset = 0x88,	/* SKX-only topology register */
  407  407          .igd_tolm_offset = 0xd0,
  408  408          .igd_tohm_low_offset = 0xd4,
  409  409          .igd_tohm_hi_offset = 0xd8,
  410  410          .igd_sad_dram_offset = 0x60,
  411  411          .igd_sad_ndram_rules = 24,
  412  412          .igd_sad_nodeid_offset = 0xc0,
  413  413          .igd_tad_nrules = 8,
  414  414          .igd_tad_rule_offset = 0x850,
  415  415          .igd_tad_chan_offset = 0x90,
  416  416          .igd_rir_nways = 4,
  417  417          .igd_rir_way_offset = 0x108,
  418  418          .igd_rir_nileaves = 4,
  419  419          .igd_rir_ileave_offset = 0x120,
  420  420          .igd_ubox_cpubusno_offset = 0xcc,	/* differs from 0xd0 on prior gens */
  421  421  };
 422  422  
 423  423  /*
 424  424   * This table contains all of the devices that we're looking for from a stub
 425  425   * perspective. These are organized by generation. Different generations behave
 426  426   * in slightly different ways. For example, Sandy Bridge through Broadwell use
 427  427   * unique PCI IDs for each PCI device/function combination that appears. Whereas
 428  428   * Skylake based systems use the same PCI ID; however, different device/function
 429  429   * values indicate that the IDs are used for different purposes.
 430  430   */
 431  431  /* BEGIN CSTYLED */
 432  432  static const imc_stub_table_t imc_stub_table[] = {
 433  433          /* Sandy Bridge */
 434  434          { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN0, 0x3ca8, 15, 0, "IMC 0 Main 0" },
 435  435          { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN1, 0x3c71, 15, 1, "IMC 0 Main 0" },
 436  436          { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL0, 0x3caa, 15, 2, "IMC 0 Channel 0 Info" },
 437  437          { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL1, 0x3cab, 15, 3, "IMC 0 Channel 1 Info" },
 438  438          { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL2, 0x3cac, 15, 4, "IMC 0 Channel 2 Info" },
 439  439          { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL3, 0x3cad, 15, 5, "IMC 0 Channel 3 Info" },
 440  440          { IMC_GEN_SANDY, IMC_TYPE_SAD_DRAM, 0x3cf4, 12, 6, "SAD DRAM Rules" },
 441  441          { IMC_GEN_SANDY, IMC_TYPE_SAD_MMIO, 0x3cf5, 13, 6, "SAD MMIO Rules" },
 442  442          { IMC_GEN_SANDY, IMC_TYPE_SAD_MISC, 0x3cf6, 12, 7, "SAD Memory Map" },
 443  443          { IMC_GEN_SANDY, IMC_TYPE_UBOX, 0x3ce0, 11, 0, "UBox" },
 444  444          { IMC_GEN_SANDY, IMC_TYPE_UBOX_CPUBUSNO, 0x3ce3, 11, 3, "UBox Scratch" },
 445  445          { IMC_GEN_SANDY, IMC_TYPE_HA0, 0x3ca0, 14, 0, "Home Agent" },
 446  446          /* Ivy Bridge */
 447  447          { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN0, 0x0ea8, 15, 0, "IMC 0 Main 0" },
 448  448          { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN1, 0x0e71, 15, 1, "IMC 0 Main 1" },
 449  449          { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL0, 0x0eaa, 15, 2, "IMC 0 Channel 0 Info" },
 450  450          { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL1, 0x0eab, 15, 3, "IMC 0 Channel 1 Info" },
 451  451          { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL2, 0x0eac, 15, 4, "IMC 0 Channel 2 Info" },
 452  452          { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL3, 0x0ead, 15, 5, "IMC 0 Channel 3 Info" },
 453  453          { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN0, 0x0e68, 29, 0, "IMC 1 Main 0" },
 454  454          { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN1, 0x0e79, 29, 1, "IMC 1 Main 1" },
 455  455          { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL0, 0x0e6a, 15, 2, "IMC 1 Channel 0 Info" },
 456  456          { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL1, 0x0e6b, 15, 3, "IMC 1 Channel 1 Info" },
 457  457          { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL2, 0x0e6c, 15, 4, "IMC 1 Channel 2 Info" },
 458  458          { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL3, 0x0e6d, 15, 5, "IMC 1 Channel 3 Info" },
 459  459          { IMC_GEN_IVY, IMC_TYPE_SAD_DRAM, 0x0ec8, 22, 0, "SAD DRAM Rules" },
 460  460          { IMC_GEN_IVY, IMC_TYPE_SAD_MMIO, 0x0ec9, 22, 1, "SAD MMIO Rules" },
 461  461          { IMC_GEN_IVY, IMC_TYPE_SAD_MISC, 0x0eca, 22, 2, "SAD Memory Map" },
 462  462          { IMC_GEN_IVY, IMC_TYPE_UBOX, 0x0e1e, 11, 0, "UBox" },
 463  463          { IMC_GEN_IVY, IMC_TYPE_UBOX_CPUBUSNO, 0x0e1f, 11, 3, "UBox Scratch" },
 464  464          { IMC_GEN_IVY, IMC_TYPE_HA0, 0x0ea0, 14, 0, "Home Agent 0" },
 465  465          { IMC_GEN_IVY, IMC_TYPE_HA1, 0x0e60, 28, 0, "Home Agent 1" },
 466  466          /* Haswell */
 467  467          { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN0, 0x2fa8, 19, 0, "IMC 0 Main 0" },
 468  468          { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN1, 0x2f71, 19, 1, "IMC 0 Main 1" },
 469  469          { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL0, 0x2faa, 19, 2, "IMC 0 Channel 0 Info" },
 470  470          { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL1, 0x2fab, 19, 3, "IMC 0 Channel 1 Info" },
 471  471          { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL2, 0x2fac, 19, 4, "IMC 0 Channel 2 Info" },
 472  472          { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL3, 0x2fad, 19, 5, "IMC 0 Channel 3 Info" },
 473  473          { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN0, 0x2f68, 22, 0, "IMC 1 Main 0" },
 474  474          { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN1, 0x2f79, 22, 1, "IMC 1 Main 1" },
 475  475          { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL0, 0x2f6a, 22, 2, "IMC 1 Channel 0 Info" },
 476  476          { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL1, 0x2f6b, 22, 3, "IMC 1 Channel 1 Info" },
 477  477          { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL2, 0x2f6c, 22, 4, "IMC 1 Channel 2 Info" },
 478  478          { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL3, 0x2f6d, 22, 5, "IMC 1 Channel 3 Info" },
 479  479          { IMC_GEN_HASWELL, IMC_TYPE_SAD_DRAM, 0x2ffc, 15, 4, "SAD DRAM Rules" },
 480  480          { IMC_GEN_HASWELL, IMC_TYPE_SAD_MMIO, 0x2ffd, 15, 5, "SAD MMIO Rules" },
 481  481          { IMC_GEN_HASWELL, IMC_TYPE_VTD_MISC, 0x2f28, 5, 0, "Misc. Vritualization" },
 482  482          { IMC_GEN_HASWELL, IMC_TYPE_UBOX, 0x2f1e, 16, 5, "UBox" },
 483  483          { IMC_GEN_HASWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x2f1f, 16, 7, "UBox Scratch" },
 484  484          { IMC_GEN_HASWELL, IMC_TYPE_HA0, 0x2fa0, 18, 0, "Home Agent 0" },
 485  485          { IMC_GEN_HASWELL, IMC_TYPE_HA1, 0x2f60, 18, 4, "Home Agent 1" },
 486  486          /* Broadwell Devices */
 487  487          { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN0, 0x6fa8, 19, 0, "IMC 0 Main 0" },
 488  488          { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN1, 0x6f71, 19, 1, "IMC 0 Main 1" },
 489  489          { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL0, 0x6faa, 19, 2, "IMC 0 Channel 0 Info" },
 490  490          { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL1, 0x6fab, 19, 3, "IMC 0 Channel 1 Info" },
 491  491          { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL2, 0x6fac, 19, 4, "IMC 0 Channel 2 Info" },
 492  492          { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL3, 0x6fad, 19, 5, "IMC 0 Channel 3 Info" },
 493  493          { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN0, 0x6f68, 22, 0, "IMC 1 Main 0" },
 494  494          { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN1, 0x6f79, 22, 1, "IMC 1 Main 1" },
 495  495          { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL0, 0x6f6a, 22, 2, "IMC 1 Channel 0 Info" },
 496  496          { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL1, 0x6f6b, 22, 3, "IMC 1 Channel 1 Info" },
 497  497          { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL2, 0x6f6c, 22, 4, "IMC 1 Channel 2 Info" },
 498  498          { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL3, 0x6f6d, 22, 5, "IMC 1 Channel 3 Info" },
 499  499          { IMC_GEN_BROADWELL, IMC_TYPE_SAD_DRAM, 0x6ffc, 15, 4, "SAD DRAM Rules" },
 500  500          { IMC_GEN_BROADWELL, IMC_TYPE_SAD_MMIO, 0x6ffd, 15, 5, "SAD MMIO Rules" },
 501  501          { IMC_GEN_BROADWELL, IMC_TYPE_VTD_MISC, 0x6f28, 5, 0, "Misc. Vritualization" },
 502  502          { IMC_GEN_BROADWELL, IMC_TYPE_UBOX, 0x6f1e, 16, 5, "UBox" },
 503  503          { IMC_GEN_BROADWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x6f1f, 16, 7, "UBox Scratch" },
 504  504          { IMC_GEN_BROADWELL, IMC_TYPE_HA0, 0x6fa0, 18, 0, "Home Agent 0" },
 505  505          { IMC_GEN_BROADWELL, IMC_TYPE_HA1, 0x6f60, 18, 4, "Home Agent 1" },
 506  506          /* Skylake and Cascade Lake Devices */
 507  507          { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_M2M, 0x2066, 8, 0, "IMC 0 M2M" },
 508  508          { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_M2M, 0x2066, 9, 0, "IMC 0 M2M" },
 509  509          { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_MAIN0, 0x2040, 10, 0, "IMC 0 Main / Channel 0" },
 510  510          { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_MAIN0, 0x2040, 12, 0, "IMC 0 Main / Channel 0" },
 511  511          { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL1, 0x2044, 10, 4, "IMC 0 Channel 1" },
 512  512          { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL2, 0x2048, 11, 0, "IMC 0 Channel 2" },
 513  513          { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL1, 0x2044, 12, 4, "IMC 1 Channel 1" },
 514  514          { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL2, 0x2048, 13, 0, "IMC 1 Channel 2" },
 515  515          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_DRAM, 0x2054, 29, 0, "SAD DRAM Rules" },
 516  516          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MMIO, 0x2055, 29, 1, "SAD MMIO Rules" },
 517  517          { IMC_GEN_SKYLAKE, IMC_TYPE_VTD_MISC, 0x2024, 5, 0, "Misc. Virtualization" },
 518  518  
 519  519          /*
 520  520           * There is one SAD MC Route type device per core! Because of this a
 521  521           * wide array of device and functions are allocated. For now, we list
 522  522           * all 28 of them out.
 523  523           */
 524  524          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 0, "Per-Core SAD" },
 525  525          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 1, "Per-Core SAD" },
 526  526          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 2, "Per-Core SAD" },
 527  527          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 3, "Per-Core SAD" },
 528  528          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 4, "Per-Core SAD" },
 529  529          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 5, "Per-Core SAD" },
 530  530          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 6, "Per-Core SAD" },
 531  531          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 7, "Per-Core SAD" },
 532  532          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 0, "Per-Core SAD" },
 533  533          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 1, "Per-Core SAD" },
 534  534          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 2, "Per-Core SAD" },
 535  535          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 3, "Per-Core SAD" },
 536  536          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 4, "Per-Core SAD" },
 537  537          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 5, "Per-Core SAD" },
 538  538          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 6, "Per-Core SAD" },
 539  539          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 7, "Per-Core SAD" },
 540  540          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 0, "Per-Core SAD" },
 541  541          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 1, "Per-Core SAD" },
 542  542          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 2, "Per-Core SAD" },
 543  543          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 3, "Per-Core SAD" },
 544  544          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 4, "Per-Core SAD" },
 545  545          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 5, "Per-Core SAD" },
 546  546          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 6, "Per-Core SAD" },
 547  547          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 7, "Per-Core SAD" },
 548  548          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 0, "Per-Core SAD" },
 549  549          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 1, "Per-Core SAD" },
 550  550          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 2, "Per-Core SAD" },
 551  551          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 3, "Per-Core SAD" },
 552  552          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 4, "Per-Core SAD" },
 553  553          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 5, "Per-Core SAD" },
 554  554          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 6, "Per-Core SAD" },
 555  555          { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 7, "Per-Core SAD" },
 556  556  
 557  557          { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX, 0x2014, 8, 0, "UBox" },
 558  558          { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX_CPUBUSNO, 0x2016, 8, 2, "DECS" },
 559  559  };
 560  560  /* END CSTYLED */
 561  561  
/*
 * PCI vendor ID for Intel Corporation; every stub device in the table above
 * is matched against this vendor.
 */
#define IMC_PCI_VENDOR_INTC     0x8086

/*
 * Our IMC data is global and statically set up during a combination of
 * _init(9E) and attach(9E). While we have a module dependency between the PCI
 * stub driver, imcstub, and this pseudo-driver, imc, the dependencies don't
 * guarantee that the imc driver has finished attaching. As such we make sure
 * that it can operate without it being attached in any way.
 */
static imc_t *imc_data = NULL;

/*
 * By default we should not allow the stubs to detach as we don't have a good
 * way of forcing them to attach again. This is provided in case someone does
 * want to allow the driver to unload.
 *
 * Tunable: set to non-zero (e.g. via /etc/system) to permit stub detach.
 */
int imc_allow_detach = 0;
 579  579  
 580  580  static void
 581  581  imc_set_gen_data(imc_t *imc)
 582  582  {
 583  583          switch (imc->imc_gen) {
 584  584          case IMC_GEN_SANDY:
 585  585                  imc->imc_gen_data = &imc_gen_data_snb;
 586  586                  break;
 587  587          case IMC_GEN_IVY:
 588  588                  imc->imc_gen_data = &imc_gen_data_ivb;
 589  589                  break;
 590  590          case IMC_GEN_HASWELL:
 591  591          case IMC_GEN_BROADWELL:
 592  592                  imc->imc_gen_data = &imc_gen_data_has_brd;
 593  593                  break;
 594  594          case IMC_GEN_SKYLAKE:
 595  595                  imc->imc_gen_data = &imc_gen_data_skx;
 596  596                  break;
 597  597          default:
 598  598                  dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
 599  599                      "set to unknown generation: %u", imc->imc_gen);
 600  600          }
 601  601  }
 602  602  
 603  603  /*
 604  604   * If our device (dev_info_t) does not have a non-zero unit address, then
 605  605   * devfsadmd will not pay attention to us at all. Therefore we need to set the
 606  606   * unit address below, before we create minor nodes.
 607  607   *
 608  608   * The rest of the system expects us to have one minor node per socket. The
 609  609   * minor node ID should be the ID of the socket.
 610  610   */
 611  611  static boolean_t
 612  612  imc_create_minors(imc_t *imc)
 613  613  {
 614  614          uint_t i;
 615  615  
 616  616          ddi_set_name_addr(imc->imc_dip, "1");
 617  617          for (i = 0; i < imc->imc_nsockets; i++) {
 618  618                  char buf[MAXNAMELEN];
 619  619  
 620  620                  if (snprintf(buf, sizeof (buf), "mc-imc-%u", i) >=
 621  621                      sizeof (buf)) {
 622  622                          goto fail;
 623  623                  }
 624  624  
 625  625                  if (ddi_create_minor_node(imc->imc_dip, buf, S_IFCHR, i,
 626  626                      "ddi_mem_ctrl", 0) != DDI_SUCCESS) {
 627  627                          dev_err(imc->imc_dip, CE_WARN, "failed to create "
 628  628                              "minor node %u: %s", i, buf);
 629  629                          goto fail;
 630  630                  }
 631  631          }
 632  632          return (B_TRUE);
 633  633  
 634  634  fail:
 635  635          ddi_remove_minor_node(imc->imc_dip, NULL);
 636  636          return (B_FALSE);
 637  637  }
 638  638  
 639  639  /*
 640  640   * Check the current MC route value for this SAD. On Skylake systems there is
 641  641   * one per core. Every core should agree. If not, we will not trust the SAD
 642  642   * MCROUTE values and this will cause system address decoding to fail on
 643  643   * skylake.
 644  644   */
 645  645  static void
 646  646  imc_mcroute_check(imc_t *imc, imc_sad_t *sad, imc_stub_t *stub)
 647  647  {
 648  648          uint32_t val;
 649  649  
 650  650          val = pci_config_get32(stub->istub_cfgspace,
 651  651              IMC_REG_SKX_SAD_MC_ROUTE_TABLE);
 652  652          if (val == PCI_EINVAL32) {
 653  653                  sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
 654  654                  return;
 655  655          }
 656  656  
 657  657          if ((sad->isad_flags & IMC_SAD_MCROUTE_VALID) == 0 && val != 0) {
 658  658                  sad->isad_flags |= IMC_SAD_MCROUTE_VALID;
 659  659                  sad->isad_mcroute.ismc_raw_mcroute = val;
 660  660                  return;
 661  661          }
 662  662  
 663  663          /*
 664  664           * Occasionally we see MC ROUTE table entries with a value of zero.
 665  665           * We should ignore those for now.
 666  666           */
 667  667          if (val != sad->isad_mcroute.ismc_raw_mcroute && val != 0) {
 668  668                  dev_err(imc->imc_dip, CE_WARN, "SAD MC_ROUTE_TABLE mismatch "
 669  669                      "with socket. SAD has val 0x%x, system has %x\n",
 670  670                      val, sad->isad_mcroute.ismc_raw_mcroute);
 671  671                  sad->isad_valid |= IMC_SAD_V_BAD_MCROUTE;
 672  672          }
 673  673  }
 674  674  
 675  675  /*
 676  676   * On Skylake, many of the devices that we care about are on separate PCI Buses.
 677  677   * These can be mapped together by the DECS register. However, we need to know
 678  678   * how to map different buses together so that we can more usefully associate
 679  679   * information. The set of buses is all present in the DECS register. We'll
 680  680   * effectively assign sockets to buses. This is also still something that comes
 681  681   * up on pre-Skylake systems as well.
 682  682   */
 683  683  static boolean_t
 684  684  imc_map_buses(imc_t *imc)
 685  685  {
 686  686          imc_stub_t *stub;
 687  687          uint_t nsock;
 688  688  
 689  689          /*
 690  690           * Find the UBOX_DECS registers so we can establish socket mappings. On
 691  691           * Skylake, there are three different sets of buses that we need to
 692  692           * cover all of our devices, while there are only two before that.
 693  693           */
 694  694          for (nsock = 0, stub = avl_first(&imc->imc_stubs); stub != NULL;
 695  695              stub = AVL_NEXT(&imc->imc_stubs, stub)) {
 696  696                  uint32_t busno;
 697  697  
 698  698                  if (stub->istub_table->imcs_type != IMC_TYPE_UBOX_CPUBUSNO) {
 699  699                          continue;
 700  700                  }
 701  701  
 702  702                  busno = pci_config_get32(stub->istub_cfgspace,
 703  703                      imc->imc_gen_data->igd_ubox_cpubusno_offset);
 704  704                  if (busno == PCI_EINVAL32) {
 705  705                          dev_err(imc->imc_dip, CE_WARN, "failed to read "
 706  706                              "UBOX_DECS CPUBUSNO0: invalid PCI read");
 707  707                          return (B_FALSE);
 708  708                  }
 709  709  
 710  710                  if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
 711  711                          imc->imc_sockets[nsock].isock_nbus = 3;
 712  712                          imc->imc_sockets[nsock].isock_bus[0] =
 713  713                              IMC_UBOX_CPUBUSNO_0(busno);
 714  714                          imc->imc_sockets[nsock].isock_bus[1] =
 715  715                              IMC_UBOX_CPUBUSNO_1(busno);
 716  716                          imc->imc_sockets[nsock].isock_bus[2] =
 717  717                              IMC_UBOX_CPUBUSNO_2(busno);
 718  718                  } else {
 719  719                          imc->imc_sockets[nsock].isock_bus[0] =
 720  720                              IMC_UBOX_CPUBUSNO_0(busno);
 721  721                          imc->imc_sockets[nsock].isock_bus[1] =
 722  722                              IMC_UBOX_CPUBUSNO_1(busno);
 723  723                          imc->imc_sockets[nsock].isock_nbus = 2;
 724  724                  }
 725  725                  nsock++;
 726  726          }
 727  727          imc->imc_nsockets = nsock;
 728  728  
 729  729          return (B_TRUE);
 730  730  }
 731  731  
 732  732  /*
 733  733   * For a given stub that we've found, map it to its corresponding socket based
 734  734   * on the PCI bus that it has.
 735  735   */
 736  736  static imc_socket_t *
 737  737  imc_map_find_socket(imc_t *imc, imc_stub_t *stub)
 738  738  {
 739  739          uint_t i;
 740  740  
 741  741          for (i = 0; i < imc->imc_nsockets; i++) {
 742  742                  uint_t bus;
 743  743  
 744  744                  for (bus = 0; bus < imc->imc_sockets[i].isock_nbus; bus++) {
 745  745                          if (imc->imc_sockets[i].isock_bus[bus] ==
 746  746                              stub->istub_bus) {
 747  747                                  return (&imc->imc_sockets[i]);
 748  748                          }
 749  749                  }
 750  750          }
 751  751  
 752  752          return (NULL);
 753  753  }
 754  754  
 755  755  static boolean_t
 756  756  imc_map_stubs(imc_t *imc)
 757  757  {
 758  758          imc_stub_t *stub;
 759  759  
 760  760          if (!imc_map_buses(imc)) {
 761  761                  return (B_FALSE);
 762  762          }
 763  763  
 764  764          stub = avl_first(&imc->imc_stubs);
 765  765          for (stub = avl_first(&imc->imc_stubs); stub != NULL;
 766  766              stub = AVL_NEXT(&imc->imc_stubs, stub)) {
 767  767                  imc_socket_t *sock = imc_map_find_socket(imc, stub);
 768  768  
 769  769                  if (sock == NULL) {
 770  770                          dev_err(imc->imc_dip, CE_WARN, "found stub type %u "
 771  771                              "PCI%x,%x with bdf %u/%u/%u that does not match a "
 772  772                              "known PCI bus for any of %u sockets",
 773  773                              stub->istub_table->imcs_type, stub->istub_vid,
 774  774                              stub->istub_did, stub->istub_bus, stub->istub_dev,
 775  775                              stub->istub_func, imc->imc_nsockets);
 776  776                          continue;
 777  777                  }
 778  778  
 779  779                  /*
 780  780                   * We don't have to worry about duplicates here. We check to
 781  781                   * make sure that we have unique bdfs here.
 782  782                   */
 783  783                  switch (stub->istub_table->imcs_type) {
 784  784                  case IMC_TYPE_MC0_M2M:
 785  785                          sock->isock_imcs[0].icn_m2m = stub;
 786  786                          break;
 787  787                  case IMC_TYPE_MC1_M2M:
 788  788                          sock->isock_imcs[1].icn_m2m = stub;
 789  789                          break;
 790  790                  case IMC_TYPE_MC0_MAIN0:
 791  791                          sock->isock_nimc++;
 792  792                          sock->isock_imcs[0].icn_main0 = stub;
 793  793  
 794  794                          /*
 795  795                           * On Skylake, the MAIN0 does double duty as channel
 796  796                           * zero and as the TAD.
 797  797                           */
 798  798                          if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
 799  799                                  sock->isock_imcs[0].icn_nchannels++;
 800  800                                  sock->isock_imcs[0].icn_channels[0].ich_desc =
 801  801                                      stub;
 802  802                                  sock->isock_tad[0].itad_stub = stub;
 803  803                                  sock->isock_ntad++;
 804  804                          }
 805  805                          break;
 806  806                  case IMC_TYPE_MC0_MAIN1:
 807  807                          sock->isock_imcs[0].icn_main1 = stub;
 808  808                          break;
 809  809                  case IMC_TYPE_MC1_MAIN0:
 810  810                          sock->isock_nimc++;
 811  811                          sock->isock_imcs[1].icn_main0 = stub;
 812  812  
 813  813                          /*
 814  814                           * On Skylake, the MAIN0 does double duty as channel
 815  815                           * zero and as the TAD.
 816  816                           */
 817  817                          if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
 818  818                                  sock->isock_imcs[1].icn_nchannels++;
 819  819                                  sock->isock_imcs[1].icn_channels[0].ich_desc =
 820  820                                      stub;
 821  821                                  sock->isock_tad[1].itad_stub = stub;
 822  822                                  sock->isock_ntad++;
 823  823                          }
 824  824                          break;
 825  825                  case IMC_TYPE_MC1_MAIN1:
 826  826                          sock->isock_imcs[1].icn_main1 = stub;
 827  827                          break;
 828  828                  case IMC_TYPE_MC0_CHANNEL0:
 829  829                          sock->isock_imcs[0].icn_nchannels++;
 830  830                          sock->isock_imcs[0].icn_channels[0].ich_desc = stub;
 831  831                          break;
 832  832                  case IMC_TYPE_MC0_CHANNEL1:
 833  833                          sock->isock_imcs[0].icn_nchannels++;
 834  834                          sock->isock_imcs[0].icn_channels[1].ich_desc = stub;
 835  835                          break;
 836  836                  case IMC_TYPE_MC0_CHANNEL2:
 837  837                          sock->isock_imcs[0].icn_nchannels++;
 838  838                          sock->isock_imcs[0].icn_channels[2].ich_desc = stub;
 839  839                          break;
 840  840                  case IMC_TYPE_MC0_CHANNEL3:
 841  841                          sock->isock_imcs[0].icn_nchannels++;
 842  842                          sock->isock_imcs[0].icn_channels[3].ich_desc = stub;
 843  843                          break;
 844  844                  case IMC_TYPE_MC1_CHANNEL0:
 845  845                          sock->isock_imcs[1].icn_nchannels++;
 846  846                          sock->isock_imcs[1].icn_channels[0].ich_desc = stub;
 847  847                          break;
 848  848                  case IMC_TYPE_MC1_CHANNEL1:
 849  849                          sock->isock_imcs[1].icn_nchannels++;
 850  850                          sock->isock_imcs[1].icn_channels[1].ich_desc = stub;
 851  851                          break;
 852  852                  case IMC_TYPE_MC1_CHANNEL2:
 853  853                          sock->isock_imcs[1].icn_nchannels++;
 854  854                          sock->isock_imcs[1].icn_channels[2].ich_desc = stub;
 855  855                          break;
 856  856                  case IMC_TYPE_MC1_CHANNEL3:
 857  857                          sock->isock_imcs[1].icn_nchannels++;
 858  858                          sock->isock_imcs[1].icn_channels[3].ich_desc = stub;
 859  859                          break;
 860  860                  case IMC_TYPE_SAD_DRAM:
 861  861                          sock->isock_sad.isad_dram = stub;
 862  862                          break;
 863  863                  case IMC_TYPE_SAD_MMIO:
 864  864                          sock->isock_sad.isad_mmio = stub;
 865  865                          break;
 866  866                  case IMC_TYPE_SAD_MISC:
 867  867                          sock->isock_sad.isad_tolh = stub;
 868  868                          break;
 869  869                  case IMC_TYPE_VTD_MISC:
 870  870                          /*
 871  871                           * Some systems have multiple VT-D Misc. entry points
 872  872                           * in the system. In this case, only use the first one
 873  873                           * we find.
 874  874                           */
 875  875                          if (imc->imc_gvtd_misc == NULL) {
 876  876                                  imc->imc_gvtd_misc = stub;
 877  877                          }
 878  878                          break;
 879  879                  case IMC_TYPE_SAD_MCROUTE:
 880  880                          ASSERT3U(imc->imc_gen, >=, IMC_GEN_SKYLAKE);
 881  881                          imc_mcroute_check(imc, &sock->isock_sad, stub);
 882  882                          break;
 883  883                  case IMC_TYPE_UBOX:
 884  884                          sock->isock_ubox = stub;
 885  885                          break;
 886  886                  case IMC_TYPE_HA0:
 887  887                          sock->isock_ntad++;
 888  888                          sock->isock_tad[0].itad_stub = stub;
 889  889                          break;
 890  890                  case IMC_TYPE_HA1:
 891  891                          sock->isock_ntad++;
 892  892                          sock->isock_tad[1].itad_stub = stub;
 893  893                          break;
 894  894                  case IMC_TYPE_UBOX_CPUBUSNO:
 895  895                          sock->isock_cpubusno = stub;
 896  896                          break;
 897  897                  default:
 898  898                          /*
 899  899                           * Attempt to still attach if we can.
 900  900                           */
 901  901                          dev_err(imc->imc_dip, CE_WARN, "Encountered unknown "
 902  902                              "IMC type (%u) on PCI %x,%x",
 903  903                              stub->istub_table->imcs_type,
 904  904                              stub->istub_vid, stub->istub_did);
 905  905                          break;
 906  906                  }
 907  907          }
 908  908  
 909  909          return (B_TRUE);
 910  910  }
 911  911  
 912  912  /*
 913  913   * Go through and fix up various aspects of the stubs mappings on systems. The
 914  914   * following are a list of what we need to fix up:
 915  915   *
 916  916   *  1. On Haswell and newer systems, there is only one global VT-d device. We
 917  917   *     need to go back and map that to all of the per-socket imc_sad_t entries.
 918  918   */
 919  919  static void
 920  920  imc_fixup_stubs(imc_t *imc)
 921  921  {
 922  922          if (imc->imc_gen >= IMC_GEN_HASWELL) {
 923  923                  uint_t i;
 924  924  
 925  925                  for (i = 0; i < imc->imc_nsockets; i++) {
 926  926                          ASSERT3P(imc->imc_sockets[i].isock_sad.isad_tolh,
 927  927                              ==, NULL);
 928  928                          imc->imc_sockets[i].isock_sad.isad_tolh =
 929  929                              imc->imc_gvtd_misc;
 930  930                  }
 931  931          }
 932  932  }
 933  933  
 934  934  /*
 935  935   * In the wild we've hit a few odd cases where not all devices are exposed that
 936  936   * we might expect by firmware. In particular we've seen and validate the
 937  937   * following cases:
 938  938   *
 939  939   *  o We don't find all of the channel devices that we expect, e.g. we have the
 940  940   *    stubs for channels 1-3, but not 0. That has been seen on an Intel S2600CW
 941  941   *    with an E5-2630v3.
 942  942   */
 943  943  static boolean_t
 944  944  imc_validate_stubs(imc_t *imc)
 945  945  {
 946  946          for (uint_t sock = 0; sock < imc->imc_nsockets; sock++) {
 947  947                  imc_socket_t *socket = &imc->imc_sockets[sock];
 948  948  
 949  949                  for (uint_t mc = 0; mc < socket->isock_nimc; mc++) {
 950  950                          imc_mc_t *mcp = &socket->isock_imcs[mc];
 951  951  
 952  952                          for (uint_t chan = 0; chan < mcp->icn_nchannels;
 953  953                              chan++) {
 954  954                                  if (mcp->icn_channels[chan].ich_desc == NULL) {
 955  955                                          dev_err(imc->imc_dip, CE_WARN,
 956  956                                              "!missing device for socket %u/"
 957  957                                              "imc %u/channel %u", sock, mc,
 958  958                                              chan);
 959  959                                          return (B_FALSE);
 960  960                                  }
 961  961                          }
 962  962                  }
 963  963          }
 964  964  
 965  965          return (B_TRUE);
 966  966  }
 967  967  
 968  968  /*
 969  969   * Attempt to map all of the discovered sockets to the corresponding APIC based
 970  970   * socket. We do these mappings by getting the node id of the socket and
 971  971   * adjusting it to make sure that no home agent is present in it. We use the
 972  972   * UBOX to avoid any home agent related bits that are present in other
 973  973   * registers.
 974  974   */
 975  975  static void
 976  976  imc_map_sockets(imc_t *imc)
 977  977  {
 978  978          uint_t i;
 979  979  
 980  980          for (i = 0; i < imc->imc_nsockets; i++) {
 981  981                  uint32_t nodeid;
 982  982                  ddi_acc_handle_t h;
 983  983  
 984  984                  h = imc->imc_sockets[i].isock_ubox->istub_cfgspace;
 985  985                  nodeid = pci_config_get32(h,
 986  986                      imc->imc_gen_data->igd_sad_nodeid_offset);
 987  987                  if (nodeid == PCI_EINVAL32) {
 988  988                          imc->imc_sockets[i].isock_valid |=
 989  989                              IMC_SOCKET_V_BAD_NODEID;
 990  990                          continue;
 991  991                  }
 992  992  
 993  993                  imc->imc_sockets[i].isock_nodeid = IMC_NODEID_UBOX_MASK(nodeid);
 994  994                  imc->imc_spointers[nodeid] = &imc->imc_sockets[i];
 995  995          }
 996  996  }
 997  997  
 998  998  /*
 999  999   * Decode the MTR, accounting for variances between processor generations.
1000 1000   */
1001 1001  static void
1002 1002  imc_decode_mtr(imc_t *imc, imc_mc_t *icn, imc_dimm_t *dimm, uint32_t mtr)
1003 1003  {
1004 1004          uint8_t disable;
1005 1005  
1006 1006          /*
1007 1007           * Check present first, before worrying about anything else.
1008 1008           */
1009 1009          if (imc->imc_gen < IMC_GEN_SKYLAKE &&
1010 1010              IMC_MTR_PRESENT_SNB_BRD(mtr) == 0) {
1011 1011                  dimm->idimm_present = B_FALSE;
1012 1012                  return;
1013 1013          } else if (imc->imc_gen >= IMC_GEN_SKYLAKE &&
1014 1014              IMC_MTR_PRESENT_SKYLAKE(mtr) == 0) {
1015 1015                  dimm->idimm_present = B_FALSE;
1016 1016                  return;
1017 1017          }
1018 1018  
1019 1019          dimm->idimm_present = B_TRUE;
1020 1020          dimm->idimm_ncolumns = IMC_MTR_CA_WIDTH(mtr) + IMC_MTR_CA_BASE;
1021 1021          if (dimm->idimm_ncolumns < IMC_MTR_CA_MIN ||
1022 1022              dimm->idimm_ncolumns > IMC_MTR_CA_MAX) {
1023 1023                  dimm->idimm_valid |= IMC_DIMM_V_BAD_COLUMNS;
1024 1024          }
1025 1025  
1026 1026          dimm->idimm_nrows = IMC_MTR_RA_WIDTH(mtr) + IMC_MTR_RA_BASE;
1027 1027          if (dimm->idimm_nrows < IMC_MTR_RA_MIN ||
1028 1028              dimm->idimm_nrows > IMC_MTR_RA_MAX) {
1029 1029                  dimm->idimm_valid |= IMC_DIMM_V_BAD_ROWS;
1030 1030          }
1031 1031  
1032 1032          /*
1033 1033           * Determine Density, this information is not present on Sandy Bridge.
1034 1034           */
1035 1035          switch (imc->imc_gen) {
1036 1036          case IMC_GEN_IVY:
1037 1037                  dimm->idimm_density = 1U << IMC_MTR_DENSITY_IVY_BRD(mtr);
1038 1038                  break;
1039 1039          case IMC_GEN_HASWELL:
1040 1040          case IMC_GEN_BROADWELL:
1041 1041                  switch (IMC_MTR_DENSITY_IVY_BRD(mtr)) {
1042 1042                  case 0:
1043 1043                  default:
1044 1044                          dimm->idimm_density = 0;
1045 1045                          dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
1046 1046                          break;
1047 1047                  case 1:
1048 1048                          dimm->idimm_density = 2;
1049 1049                          break;
1050 1050                  case 2:
1051 1051                          dimm->idimm_density = 4;
1052 1052                          break;
1053 1053                  case 3:
1054 1054                          dimm->idimm_density = 8;
1055 1055                          break;
1056 1056                  }
1057 1057                  break;
1058 1058          case IMC_GEN_SKYLAKE:
1059 1059                  switch (IMC_MTR_DENSITY_SKX(mtr)) {
1060 1060                  case 0:
1061 1061                  default:
1062 1062                          dimm->idimm_density = 0;
1063 1063                          dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
1064 1064                          break;
1065 1065                  case 1:
1066 1066                          dimm->idimm_density = 2;
1067 1067                          break;
1068 1068                  case 2:
1069 1069                          dimm->idimm_density = 4;
1070 1070                          break;
1071 1071                  case 3:
1072 1072                          dimm->idimm_density = 8;
1073 1073                          break;
1074 1074                  case 4:
1075 1075                          dimm->idimm_density = 16;
1076 1076                          break;
1077 1077                  case 5:
1078 1078                          dimm->idimm_density = 12;
1079 1079                          break;
1080 1080                  }
1081 1081                  break;
1082 1082          case IMC_GEN_UNKNOWN:
1083 1083          case IMC_GEN_SANDY:
1084 1084                  dimm->idimm_density = 0;
1085 1085                  break;
1086 1086          }
1087 1087  
1088 1088          /*
1089 1089           * The values of width are the same on IVY->SKX, but the bits are
1090 1090           * different. This doesn't exist on SNB.
1091 1091           */
1092 1092          if (imc->imc_gen > IMC_GEN_SANDY) {
1093 1093                  uint8_t width;
1094 1094  
1095 1095                  if (imc->imc_gen >= IMC_GEN_BROADWELL) {
1096 1096                          width = IMC_MTR_WIDTH_BRD_SKX(mtr);
1097 1097                  } else {
1098 1098                          width = IMC_MTR_WIDTH_IVB_HAS(mtr);
1099 1099                  }
1100 1100                  switch (width) {
1101 1101                  case 0:
1102 1102                          dimm->idimm_width = 4;
1103 1103                          break;
1104 1104                  case 1:
1105 1105                          dimm->idimm_width = 8;
1106 1106                          break;
1107 1107                  case 2:
1108 1108                          dimm->idimm_width = 16;
1109 1109                          break;
1110 1110                  default:
1111 1111                          dimm->idimm_width = 0;
1112 1112                          dimm->idimm_valid |= IMC_DIMM_V_BAD_WIDTH;
1113 1113                          break;
1114 1114                  }
1115 1115          } else {
1116 1116                  dimm->idimm_width = 0;
1117 1117          }
1118 1118  
1119 1119          dimm->idimm_nranks = 1 << IMC_MTR_DDR_RANKS(mtr);
1120 1120          switch (imc->imc_gen) {
1121 1121          case IMC_GEN_HASWELL:
1122 1122          case IMC_GEN_BROADWELL:
1123 1123          case IMC_GEN_SKYLAKE:
1124 1124                  if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX_HAS_SKX) {
1125 1125                          dimm->idimm_nranks = 0;
1126 1126                          dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
1127 1127                  }
1128 1128                  break;
1129 1129          default:
1130 1130                  if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX) {
1131 1131                          dimm->idimm_nranks = 0;
1132 1132                          dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
1133 1133                  }
1134 1134          }
1135 1135  
1136 1136          disable = IMC_MTR_RANK_DISABLE(mtr);
1137 1137          dimm->idimm_ranks_disabled[0] = (disable & 0x1) != 0;
1138 1138          dimm->idimm_ranks_disabled[1] = (disable & 0x2) != 0;
1139 1139          dimm->idimm_ranks_disabled[2] = (disable & 0x4) != 0;
1140 1140          dimm->idimm_ranks_disabled[3] = (disable & 0x8) != 0;
1141 1141  
1142 1142          /*
1143 1143           * Only Haswell and later have this information.
1144 1144           */
1145 1145          if (imc->imc_gen >= IMC_GEN_HASWELL) {
1146 1146                  dimm->idimm_hdrl = IMC_MTR_HDRL_HAS_SKX(mtr) != 0;
1147 1147                  dimm->idimm_hdrl_parity = IMC_MTR_HDRL_PARITY_HAS_SKX(mtr) != 0;
1148 1148                  dimm->idimm_3dsranks = IMC_MTR_3DSRANKS_HAS_SKX(mtr);
1149 1149                  if (dimm->idimm_3dsranks != 0) {
1150 1150                          dimm->idimm_3dsranks = 1 << dimm->idimm_3dsranks;
1151 1151                  }
1152 1152          }
1153 1153  
1154 1154  
1155 1155          if (icn->icn_dimm_type == IMC_DIMM_DDR4) {
1156 1156                  dimm->idimm_nbanks = 16;
1157 1157          } else {
1158 1158                  dimm->idimm_nbanks = 8;
1159 1159          }
1160 1160  
1161 1161          /*
1162 1162           * To calculate the DIMM size we need first take the number of rows and
1163 1163           * columns. This gives us the number of slots per chip. In a given rank
1164 1164           * there are nbanks of these. There are nrank entries of those. Each of
1165 1165           * these slots can fit a byte.
1166 1166           */
1167 1167          dimm->idimm_size = dimm->idimm_nbanks * dimm->idimm_nranks * 8 *
1168 1168              (1ULL << (dimm->idimm_ncolumns + dimm->idimm_nrows));
1169 1169  }
1170 1170  
1171 1171  static void
1172 1172  imc_fill_dimms(imc_t *imc, imc_mc_t *icn, imc_channel_t *chan)
1173 1173  {
1174 1174          uint_t i;
1175 1175  
1176 1176          /*
1177 1177           * There's one register for each DIMM that might be present, we always
1178 1178           * read that information to determine information about the DIMMs.
1179 1179           */
1180 1180          chan->ich_ndimms = imc->imc_gen_data->igd_max_dimms;
1181 1181          for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
1182 1182                  uint32_t mtr;
1183 1183                  imc_dimm_t *dimm = &chan->ich_dimms[i];
1184 1184  
1185 1185                  bzero(dimm, sizeof (imc_dimm_t));
1186 1186                  mtr = pci_config_get32(chan->ich_desc->istub_cfgspace,
1187 1187                      imc->imc_gen_data->igd_mtr_offsets[i]);
1188 1188                  dimm->idimm_mtr = mtr;
1189 1189                  /*
1190 1190                   * We don't really expect to get a bad PCIe read. However, if we
1191 1191                   * do, treat that for the moment as though the DIMM is bad.
1192 1192                   */
1193 1193                  if (mtr == PCI_EINVAL32) {
1194 1194                          dimm->idimm_valid |= IMC_DIMM_V_BAD_PCI_READ;
1195 1195                          continue;
1196 1196                  }
1197 1197  
1198 1198                  imc_decode_mtr(imc, icn, dimm, mtr);
1199 1199          }
1200 1200  }
1201 1201  
1202 1202  static boolean_t
1203 1203  imc_fill_controller(imc_t *imc, imc_mc_t *icn)
1204 1204  {
1205 1205          uint32_t mcmtr;
1206 1206  
1207 1207          mcmtr = pci_config_get32(icn->icn_main0->istub_cfgspace,
1208 1208              imc->imc_gen_data->igd_mcmtr_offset);
1209 1209          if (mcmtr == PCI_EINVAL32) {
1210 1210                  icn->icn_invalid = B_TRUE;
1211 1211                  return (B_FALSE);
1212 1212          }
1213 1213  
1214 1214          icn->icn_closed = IMC_MCMTR_CLOSED_PAGE(mcmtr) != 0;
1215 1215          if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1216 1216                  icn->icn_lockstep = IMC_MCMTR_LOCKSTEP(mcmtr) != 0;
1217 1217          } else {
1218 1218                  icn->icn_lockstep = B_FALSE;
1219 1219          }
1220 1220  
1221 1221          icn->icn_ecc = IMC_MCMTR_ECC_ENABLED(mcmtr) != 0;
1222 1222  
1223 1223          /*
1224 1224           * SNB and IVB only support DDR3. Haswell and Broadwell may support
1225 1225           * DDR4, depends on the SKU. Skylake only supports DDR4.
1226 1226           */
1227 1227          switch (imc->imc_gen) {
1228 1228          case IMC_GEN_SANDY:
1229 1229          case IMC_GEN_IVY:
1230 1230                  icn->icn_dimm_type = IMC_DIMM_DDR3;
1231 1231                  break;
1232 1232          case IMC_GEN_HASWELL:
1233 1233          case IMC_GEN_BROADWELL:
1234 1234                  if (IMC_MCMTR_DDR4_HAS_BRD(mcmtr)) {
1235 1235                          icn->icn_dimm_type = IMC_DIMM_DDR4;
1236 1236                  } else {
1237 1237                          icn->icn_dimm_type = IMC_DIMM_DDR3;
1238 1238                  }
1239 1239                  break;
1240 1240          default:
1241 1241                  /*
1242 1242                   * Skylake and on are all DDR4.
1243 1243                   */
1244 1244                  icn->icn_dimm_type = IMC_DIMM_DDR4;
1245 1245                  break;
1246 1246          }
1247 1247  
1248 1248          if (imc->imc_gen >= IMC_GEN_SKYLAKE && icn->icn_m2m != NULL) {
1249 1249                  icn->icn_topo = pci_config_get32(icn->icn_m2m->istub_cfgspace,
1250 1250                      imc->imc_gen_data->igd_topo_offset);
1251 1251          }
1252 1252  
1253 1253          return (B_TRUE);
1254 1254  }
1255 1255  
1256 1256  /*
1257 1257   * Walk the IMC data and fill in the information on DIMMs and the memory
1258 1258   * controller configurations.
1259 1259   */
1260 1260  static void
1261 1261  imc_fill_data(imc_t *imc)
1262 1262  {
1263 1263          uint_t csock, cmc, cchan;
1264 1264  
1265 1265          for (csock = 0; csock < imc->imc_nsockets; csock++) {
1266 1266                  imc_socket_t *sock = &imc->imc_sockets[csock];
1267 1267  
1268 1268                  for (cmc = 0; cmc < sock->isock_nimc; cmc++) {
1269 1269                          imc_mc_t *icn = &sock->isock_imcs[cmc];
1270 1270  
1271 1271                          if (!imc_fill_controller(imc, icn))
1272 1272                                  continue;
1273 1273  
1274 1274                          for (cchan = 0; cchan < icn->icn_nchannels; cchan++) {
1275 1275                                  imc_fill_dimms(imc, icn,
1276 1276                                      &icn->icn_channels[cchan]);
1277 1277                          }
1278 1278                  }
1279 1279          }
1280 1280  }
1281 1281  
1282 1282  static nvlist_t *
1283 1283  imc_nvl_create_dimm(imc_t *imc, imc_dimm_t *dimm)
1284 1284  {
1285 1285          nvlist_t *nvl;
1286 1286  
1287 1287          nvl = fnvlist_alloc();
1288 1288          fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_PRESENT,
1289 1289              dimm->idimm_present);
1290 1290          if (!dimm->idimm_present) {
1291 1291                  return (nvl);
1292 1292          }
1293 1293  
1294 1294          fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_SIZE, dimm->idimm_size);
1295 1295          fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NCOLS,
1296 1296              dimm->idimm_ncolumns);
1297 1297          fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NROWS,
1298 1298              dimm->idimm_nrows);
1299 1299  
1300 1300          if (imc->imc_gen > IMC_GEN_SANDY) {
1301 1301                  fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_DENSITY,
1302 1302                      dimm->idimm_density * (1ULL << 30));
1303 1303                  fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_WIDTH,
1304 1304                      dimm->idimm_width);
1305 1305          }
1306 1306          fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_RANKS,
1307 1307              dimm->idimm_nranks);
1308 1308          fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_BANKS,
1309 1309              dimm->idimm_nbanks);
1310 1310          fnvlist_add_boolean_array(nvl, MCINTEL_NVLIST_V1_DIMM_RDIS,
1311 1311              dimm->idimm_ranks_disabled, IMC_MAX_RANK_DISABLE);
1312 1312  
1313 1313          if (imc->imc_gen >= IMC_GEN_HASWELL) {
1314 1314                  fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRL,
1315 1315                      dimm->idimm_hdrl);
1316 1316                  fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRLP,
1317 1317                      dimm->idimm_hdrl_parity);
1318 1318                  if (dimm->idimm_3dsranks > 0) {
1319 1319                          fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_3DRANK,
1320 1320                              dimm->idimm_3dsranks);
1321 1321                  }
1322 1322          }
1323 1323  
1324 1324          return (nvl);
1325 1325  }
1326 1326  
1327 1327  static nvlist_t *
1328 1328  imc_nvl_create_channel(imc_t *imc, imc_channel_t *chan)
1329 1329  {
1330 1330          nvlist_t *nvl;
1331 1331          nvlist_t *dimms[IMC_MAX_DIMMPERCHAN];
1332 1332          uint_t i;
1333 1333  
1334 1334          nvl = fnvlist_alloc();
1335 1335          fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_CHAN_NDPC,
1336 1336              imc->imc_gen_data->igd_max_dimms);
1337 1337          for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
1338 1338                  dimms[i] = imc_nvl_create_dimm(imc, &chan->ich_dimms[i]);
1339 1339          }
1340 1340  
1341 1341          fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_CHAN_DIMMS,
1342 1342              dimms, i);
1343 1343  
1344 1344          for (; i > 0; i--) {
1345 1345                  nvlist_free(dimms[i-1]);
1346 1346          }
1347 1347  
1348 1348          return (nvl);
1349 1349  }
1350 1350  
1351 1351  static nvlist_t *
1352 1352  imc_nvl_create_mc(imc_t *imc, imc_mc_t *icn)
1353 1353  {
1354 1354          nvlist_t *nvl;
1355 1355          nvlist_t *channels[IMC_MAX_CHANPERMC];
1356 1356          uint_t i;
1357 1357  
1358 1358          nvl = fnvlist_alloc();
1359 1359          fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_MC_NCHAN, icn->icn_nchannels);
1360 1360          fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_MC_ECC,
1361 1361              icn->icn_ecc);
1362 1362          if (icn->icn_lockstep) {
1363 1363                  fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
1364 1364                      MCINTEL_NVLIST_V1_MC_CHAN_MODE_LOCK);
1365 1365          } else {
1366 1366                  fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
1367 1367                      MCINTEL_NVLIST_V1_MC_CHAN_MODE_INDEP);
1368 1368  
1369 1369          }
1370 1370  
1371 1371          if (icn->icn_closed) {
1372 1372                  fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
1373 1373                      MCINTEL_NVLIST_V1_MC_POLICY_CLOSED);
1374 1374          } else {
1375 1375                  fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
1376 1376                      MCINTEL_NVLIST_V1_MC_POLICY_OPEN);
1377 1377          }
1378 1378  
1379 1379          for (i = 0; i < icn->icn_nchannels; i++) {
1380 1380                  channels[i] = imc_nvl_create_channel(imc,
1381 1381                      &icn->icn_channels[i]);
1382 1382          }
1383 1383          fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MC_CHANNELS,
1384 1384              channels, icn->icn_nchannels);
1385 1385          for (i = 0; i < icn->icn_nchannels; i++) {
1386 1386                  nvlist_free(channels[i]);
1387 1387          }
1388 1388  
1389 1389          return (nvl);
1390 1390  }
1391 1391  
/*
 * Pack the per-socket topology nvlist into the XDR-encoded buffer that is
 * handed out to consumers. Does nothing if the nvlist has not been built yet
 * or if a packed buffer already exists.
 *
 * sleep selects the allocation policy for nvlist_pack(): B_TRUE allows a
 * blocking KM_SLEEP allocation; otherwise KM_NOSLEEP_LAZY (the documented
 * form of KM_NOSLEEP | KM_NORMALPRI) is used so a failed allocation simply
 * leaves isock_buf unset rather than blocking.
 */
static void
imc_nvl_pack(imc_socket_t *sock, boolean_t sleep)
{
	char *buf = NULL;
	size_t len = 0;
	int kmflag;

	if (sock->isock_nvl == NULL)
		return;

	if (sock->isock_buf != NULL)
		return;

	if (sleep) {
		kmflag = KM_SLEEP;
	} else {
		kmflag = KM_NOSLEEP_LAZY;
	}

	if (nvlist_pack(sock->isock_nvl, &buf, &len, NV_ENCODE_XDR,
	    kmflag) != 0) {
		return;
	}

	/* Success: publish the buffer and bump the generation count. */
	sock->isock_buf = buf;
	sock->isock_buflen = len;
	sock->isock_gen++;
}
1420 1420  
/*
 * Pack the decoder dump nvlist into an XDR buffer, creating the dump first
 * if it does not exist yet. Allocation uses KM_NOSLEEP_LAZY, so on memory
 * pressure we return with imc_decoder_buf still unset instead of blocking;
 * a later call will retry.
 */
static void
imc_decoder_pack(imc_t *imc)
{
	char *buf = NULL;
	size_t len = 0;

	/* Already packed; nothing to do. */
	if (imc->imc_decoder_buf != NULL)
		return;

	if (imc->imc_decoder_dump == NULL) {
		imc->imc_decoder_dump = imc_dump_decoder(imc);
	}

	if (nvlist_pack(imc->imc_decoder_dump, &buf, &len, NV_ENCODE_XDR,
	    KM_NOSLEEP_LAZY) != 0) {
		return;
	}

	imc->imc_decoder_buf = buf;
	imc->imc_decoder_len = len;
}
1442 1442  
1443 1443  static void
1444 1444  imc_nvl_create(imc_t *imc)
1445 1445  {
1446 1446          uint_t csock;
1447 1447          for (csock = 0; csock < imc->imc_nsockets; csock++) {
1448 1448                  uint_t i;
1449 1449                  nvlist_t *nvl;
1450 1450                  nvlist_t *mcs[IMC_MAX_IMCPERSOCK];
1451 1451                  imc_socket_t *sock = &imc->imc_sockets[csock];
1452 1452  
1453 1453                  nvl = fnvlist_alloc();
1454 1454                  fnvlist_add_uint8(nvl, MCINTEL_NVLIST_VERSTR,
1455 1455                      MCINTEL_NVLIST_VERS1);
1456 1456                  fnvlist_add_uint8(nvl, MCINTEL_NVLIST_V1_NMC,
1457 1457                      sock->isock_nimc);
1458 1458  
1459 1459                  for (i = 0; i < sock->isock_nimc; i++) {
1460 1460                          mcs[i] = imc_nvl_create_mc(imc, &sock->isock_imcs[i]);
1461 1461                  }
1462 1462  
1463 1463                  fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MCS,
1464 1464                      mcs, sock->isock_nimc);
1465 1465  
1466 1466                  for (i = 0; i < sock->isock_nimc; i++) {
1467 1467                          nvlist_free(mcs[i]);
1468 1468                  }
1469 1469  
1470 1470                  sock->isock_nvl = nvl;
1471 1471                  imc_nvl_pack(sock, B_TRUE);
1472 1472          }
1473 1473  }
1474 1474  
/*
 * Determine the top of low and high memory. These determine whether transaction
 * addresses target main memory or not. Unfortunately, the way that these are
 * stored and fetched changes with different generations.
 *
 * On failure a validity flag is set on the SAD rather than returning an
 * error; later consumers check isad_valid.
 */
static void
imc_sad_read_tohm(imc_t *imc, imc_sad_t *sad)
{
	uint32_t tolm, tohm_low, tohm_hi;

	tolm = pci_config_get32(sad->isad_tolh->istub_cfgspace,
	    imc->imc_gen_data->igd_tolm_offset);
	tohm_low = pci_config_get32(sad->isad_tolh->istub_cfgspace,
	    imc->imc_gen_data->igd_tohm_low_offset);
	/*
	 * Not every generation splits TOHM across two registers; a zero
	 * offset in the generation data means there is no high half.
	 */
	if (imc->imc_gen_data->igd_tohm_hi_offset != 0) {
		tohm_hi = pci_config_get32(sad->isad_tolh->istub_cfgspace,
		    imc->imc_gen_data->igd_tohm_hi_offset);
	} else {
		tohm_hi = 0;
	}

	if (tolm == PCI_EINVAL32 || tohm_low == PCI_EINVAL32 ||
	    tohm_hi == PCI_EINVAL32) {
		sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
		return;
	}

	switch (imc->imc_gen) {
	case IMC_GEN_SANDY:
	case IMC_GEN_IVY:
		/* SNB/IVB encode both values as shifted masked fields. */
		sad->isad_tolm = ((uint64_t)tolm & IMC_TOLM_SNB_IVY_MASK) <<
		    IMC_TOLM_SNB_IVY_SHIFT;
		sad->isad_tohm = ((uint64_t)tohm_low & IMC_TOHM_SNB_IVY_MASK) <<
		    IMC_TOLM_SNB_IVY_SHIFT;
		break;
	case IMC_GEN_HASWELL:
	case IMC_GEN_BROADWELL:
	case IMC_GEN_SKYLAKE:
		/* TOHM is split: masked low 32 bits plus a high register. */
		sad->isad_tolm = (uint64_t)tolm & IMC_TOLM_HAS_SKX_MASK;
		sad->isad_tohm = ((uint64_t)tohm_low &
		    IMC_TOHM_LOW_HAS_SKX_MASK) | ((uint64_t)tohm_hi << 32);

		/*
		 * Adjust the values to turn them into an exclusive range.
		 */
		sad->isad_tolm += IMC_TOLM_HAS_SKY_EXCL;
		sad->isad_tohm += IMC_TOHM_HAS_SKY_EXCL;
		break;
	default:
		dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
		    "set to unknown generation: %u", imc->imc_gen);
		return;
	}
}
1529 1529  
/*
 * Decode one raw SAD DRAM rule register into rule: enable bit, interleave
 * mode, attribute/type, exclusive limit address, and (post-Sandy Bridge)
 * the a7/mod2/mod3 interleave details. Decode failures set validity flags
 * on the SAD rather than returning an error.
 */
static void
imc_sad_fill_rule(imc_t *imc, imc_sad_t *sad, imc_sad_rule_t *rule,
    uint32_t raw)
{
	uint_t attr;
	uint64_t limit;
	bzero(rule, sizeof (imc_sad_rule_t));

	rule->isr_raw_dram = raw;
	rule->isr_enable = IMC_SAD_DRAM_RULE_ENABLE(raw) != 0;
	/* The interleave-mode encoding differs pre- and post-Skylake. */
	if (imc->imc_gen < IMC_GEN_SKYLAKE) {
		switch (IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(raw)) {
		case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6:
			rule->isr_imode = IMC_SAD_IMODE_8t6;
			break;
		case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR:
			rule->isr_imode = IMC_SAD_IMODE_8t6XOR;
			break;
		}
	} else {
		switch (IMC_SAD_DRAM_INTERLEAVE_SKX(raw)) {
		case IMC_SAD_DRAM_INTERLEAVE_SKX_8t6:
			rule->isr_imode = IMC_SAD_IMODE_8t6;
			break;
		case IMC_SAD_DRAM_INTERLEAVE_SKX_10t8:
			rule->isr_imode = IMC_SAD_IMODE_10t8;
			break;
		case IMC_SAD_DRAM_INTERLEAVE_SKX_14t12:
			rule->isr_imode = IMC_SAD_IMODE_14t12;
			break;
		case IMC_SAD_DRAM_INTERLEAVE_SKX_32t30:
			rule->isr_imode = IMC_SAD_IMODE_32t30;
			break;
		}
	}

	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
		attr = IMC_SAD_DRAM_ATTR_SKX(raw);
	} else {
		attr = IMC_SAD_DRAM_ATTR_SNB_BRD(raw);
	}

	switch (attr) {
	case IMC_SAD_DRAM_ATTR_DRAM:
		rule->isr_type = IMC_SAD_TYPE_DRAM;
		break;
	case IMC_SAD_DRAM_ATTR_MMCFG:
		rule->isr_type = IMC_SAD_TYPE_MMCFG;
		break;
	case IMC_SAD_DRAM_ATTR_NXM:
		/* NXM is only a valid attribute on Skylake and later. */
		if (imc->imc_gen < IMC_GEN_SKYLAKE) {
			sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
		}
		rule->isr_type = IMC_SAD_TYPE_NXM;
		break;
	default:
		sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
		break;
	}

	/*
	 * Fetch the limit which represents bits 45:26 and then adjust this so
	 * that it is exclusive.
	 */
	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
		limit = IMC_SAD_DRAM_LIMIT_SKX(raw);
	} else {
		limit = IMC_SAD_DRAM_LIMIT_SNB_BRD(raw);
	}
	rule->isr_limit = (limit << IMC_SAD_DRAM_LIMIT_SHIFT) +
	    IMC_SAD_DRAM_LIMIT_EXCLUSIVE;

	/*
	 * The rest of this does not apply to Sandy Bridge.
	 */
	if (imc->imc_gen == IMC_GEN_SANDY)
		return;

	/* Ivy Bridge through Broadwell only add the a7 mode bit. */
	if (imc->imc_gen >= IMC_GEN_IVY && imc->imc_gen < IMC_GEN_SKYLAKE) {
		rule->isr_a7mode = IMC_SAD_DRAM_A7_IVB_BRD(raw) != 0;
		return;
	}

	/* Skylake+: decode the mod2/mod3 channel selection type. */
	switch (IMC_SAD_DRAM_MOD23_SKX(raw)) {
	case IMC_SAD_DRAM_MOD23_MOD3:
		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD3;
		break;
	case IMC_SAD_DRAM_MOD23_MOD2_C01:
		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_01;
		break;
	case IMC_SAD_DRAM_MOD23_MOD2_C12:
		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_12;
		break;
	case IMC_SAD_DRAM_MOD23_MOD2_C02:
		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_02;
		break;
	}

	rule->isr_need_mod3 = IMC_SAD_DRAM_MOD3_SKX(raw) != 0;
	/*
	 * NOTE(review): this switch tests IMC_SAD_DRAM_MOD3_SKX(raw) — the
	 * same value used as a boolean just above — but its cases are the
	 * IMC_SAD_DRAM_MOD3_MODE_* constants. It looks like this should use
	 * a mode-extraction macro (e.g. IMC_SAD_DRAM_MOD3_MODE_SKX) instead;
	 * confirm against the register definitions in imc.h.
	 */
	switch (IMC_SAD_DRAM_MOD3_SKX(raw)) {
	case IMC_SAD_DRAM_MOD3_MODE_45t6:
		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t6;
		break;
	case IMC_SAD_DRAM_MOD3_MODE_45t8:
		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t8;
		break;
	case IMC_SAD_DRAM_MOD3_MODE_45t12:
		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t12;
		break;
	default:
		sad->isad_valid |= IMC_SAD_V_BAD_MOD3;
		break;
	}
}
1644 1644  
1645 1645  static void
1646 1646  imc_sad_fill_rule_interleave(imc_t *imc, imc_sad_rule_t *rule, uint32_t raw)
1647 1647  {
1648 1648          uint_t i;
1649 1649          uint32_t mlen, mbase, skipbits, skipafter;
1650 1650  
1651 1651          rule->isr_raw_interleave = raw;
1652 1652  
1653 1653          /*
1654 1654           * Right now all architectures always have the maximum number of SAD
1655 1655           * interleave targets.
1656 1656           */
1657 1657          rule->isr_ntargets = IMC_MAX_SAD_INTERLEAVE;
1658 1658  
1659 1659          /*
1660 1660           * Sandy Bridge has a gap in the interleave list due to the fact that it
1661 1661           * uses a smaller length.
1662 1662           */
1663 1663          if (imc->imc_gen > IMC_GEN_SANDY) {
1664 1664                  mlen = IMC_SAD_ILEAVE_IVB_SKX_LEN;
1665 1665                  mbase = IMC_SAD_ILEAVE_IVB_SKX_MASK;
1666 1666                  skipbits = skipafter = 0;
1667 1667          } else {
1668 1668                  mlen = IMC_SAD_ILEAVE_SNB_LEN;
1669 1669                  mbase = IMC_SAD_ILEAVE_SNB_MASK;
1670 1670                  skipbits = 2;
1671 1671                  skipafter = 4;
1672 1672          }
1673 1673  
1674 1674          for (i = 0; i < rule->isr_ntargets; i++) {
1675 1675                  uint32_t mask, shift;
1676 1676  
1677 1677                  shift = i * mlen;
1678 1678                  if (i >= skipafter)
1679 1679                          shift += skipbits;
1680 1680                  mask = mbase << shift;
1681 1681                  rule->isr_targets[i] = (raw & mask) >> shift;
1682 1682          }
1683 1683  }
1684 1684  
1685 1685  static void
1686 1686  imc_sad_read_dram_rules(imc_t *imc, imc_sad_t *sad)
1687 1687  {
1688 1688          uint_t i;
1689 1689          off_t off;
1690 1690  
1691 1691          sad->isad_nrules = imc->imc_gen_data->igd_sad_ndram_rules;
1692 1692          for (i = 0, off = imc->imc_gen_data->igd_sad_dram_offset;
1693 1693              i < sad->isad_nrules; i++, off += sizeof (uint64_t)) {
1694 1694                  uint32_t dram, interleave;
1695 1695                  imc_sad_rule_t *rule = &sad->isad_rules[i];
1696 1696  
1697 1697                  dram = pci_config_get32(sad->isad_dram->istub_cfgspace, off);
1698 1698                  interleave = pci_config_get32(sad->isad_dram->istub_cfgspace,
1699 1699                      off + 4);
1700 1700  
1701 1701                  if (dram == PCI_EINVAL32 || interleave == PCI_EINVAL32) {
1702 1702                          sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1703 1703                          return;
1704 1704                  }
1705 1705  
1706 1706                  imc_sad_fill_rule(imc, sad, rule, dram);
1707 1707                  imc_sad_fill_rule_interleave(imc, rule, interleave);
1708 1708          }
1709 1709  }
1710 1710  
1711 1711  static void
1712 1712  imc_sad_decode_mcroute(imc_t *imc, imc_sad_t *sad)
1713 1713  {
1714 1714          uint_t i;
1715 1715          imc_sad_mcroute_table_t *mc = &sad->isad_mcroute;
1716 1716  
1717 1717          if (imc->imc_gen < IMC_GEN_SKYLAKE)
1718 1718                  return;
1719 1719          if (sad->isad_valid != 0)
1720 1720                  return;
1721 1721  
1722 1722          mc->ismc_nroutes = IMC_MAX_SAD_MCROUTES;
1723 1723          for (i = 0; i < IMC_MAX_SAD_MCROUTES; i++) {
1724 1724                  uint_t chanoff, ringoff;
1725 1725  
1726 1726                  ringoff = i * IMC_MC_ROUTE_RING_BITS;
1727 1727                  chanoff = i * IMC_MC_ROUTE_CHAN_BITS + IMC_MC_ROUTE_CHAN_OFFSET;
1728 1728  
1729 1729                  mc->ismc_mcroutes[i].ismce_imc = (mc->ismc_raw_mcroute >>
1730 1730                      ringoff) & IMC_MC_ROUTE_RING_MASK;
1731 1731                  mc->ismc_mcroutes[i].ismce_pchannel = (mc->ismc_raw_mcroute >>
1732 1732                      chanoff) & IMC_MC_ROUTE_CHAN_MASK;
1733 1733          }
1734 1734  }
1735 1735  
1736 1736  /*
1737 1737   * Initialize the SAD. To do this we have to do a few different things:
1738 1738   *
1739 1739   * 1. Determine where the top of low and high memory is.
1740 1740   * 2. Read and decode all of the rules for the SAD
1741 1741   * 3. On systems with a route table, decode the raw routes
1742 1742   *
1743 1743   * At this point in time, we treat TOLM and TOHM as a per-socket construct, even
1744 1744   * though it really should be global, this just makes life a bit simpler.
1745 1745   */
1746 1746  static void
1747 1747  imc_decoder_init_sad(imc_t *imc)
1748 1748  {
1749 1749          uint_t i;
1750 1750  
1751 1751          for (i = 0; i < imc->imc_nsockets; i++) {
1752 1752                  imc_sad_read_tohm(imc, &imc->imc_sockets[i].isock_sad);
1753 1753                  imc_sad_read_dram_rules(imc, &imc->imc_sockets[i].isock_sad);
1754 1754                  imc_sad_decode_mcroute(imc, &imc->imc_sockets[i].isock_sad);
1755 1755          }
1756 1756  }
1757 1757  
/*
 * Decode one raw TAD rule register: the exclusive limit, socket and channel
 * wayness, default granularities, and — before Skylake — the interleave
 * targets and the base derived from the previous rule's limit. prev is the
 * previously decoded rule, or NULL for the first rule.
 */
static void
imc_tad_fill_rule(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *prev,
    imc_tad_rule_t *rule, uint32_t val)
{
	uint64_t limit;

	/* Adjust the raw limit into an exclusive address. */
	limit = IMC_TAD_LIMIT(val);
	rule->itr_limit = (limit << IMC_TAD_LIMIT_SHIFT) +
	    IMC_TAD_LIMIT_EXCLUSIVE;
	rule->itr_raw = val;

	switch (IMC_TAD_SOCK_WAY(val)) {
	case IMC_TAD_SOCK_WAY_1:
		rule->itr_sock_way = 1;
		break;
	case IMC_TAD_SOCK_WAY_2:
		rule->itr_sock_way = 2;
		break;
	case IMC_TAD_SOCK_WAY_4:
		rule->itr_sock_way = 4;
		break;
	case IMC_TAD_SOCK_WAY_8:
		rule->itr_sock_way = 8;
		break;
	}

	rule->itr_chan_way = IMC_TAD_CHAN_WAY(val) + 1;
	/* Default granularities; Skylake overrides these separately. */
	rule->itr_sock_gran = IMC_TAD_GRAN_64B;
	rule->itr_chan_gran = IMC_TAD_GRAN_64B;

	/*
	 * Starting with Skylake the targets that are used are no longer part of
	 * the TAD. Those come from the IMC route table.
	 */
	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
		rule->itr_ntargets = 0;
		return;
	}

	rule->itr_ntargets = IMC_TAD_SNB_BRD_NTARGETS;
	rule->itr_targets[0] = IMC_TAD_TARG0(val);
	rule->itr_targets[1] = IMC_TAD_TARG1(val);
	rule->itr_targets[2] = IMC_TAD_TARG2(val);
	rule->itr_targets[3] = IMC_TAD_TARG3(val);

	/* Rules are contiguous: each base is one past the previous limit. */
	if (prev == NULL) {
		rule->itr_base = 0;
	} else {
		rule->itr_base = prev->itr_limit + 1;
	}
}
1809 1809  
1810 1810  static void
1811 1811  imc_tad_fill_skx(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *rule,
1812 1812      uint32_t val)
1813 1813  {
1814 1814          uint64_t base;
1815 1815  
1816 1816          rule->itr_raw_gran = val;
1817 1817          base = IMC_TAD_BASE_BASE(val);
1818 1818          rule->itr_base = base << IMC_TAD_BASE_SHIFT;
1819 1819  
1820 1820          switch (IMC_TAD_BASE_CHAN_GRAN(val)) {
1821 1821          case IMC_TAD_BASE_CHAN_GRAN_64B:
1822 1822                  rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1823 1823                  break;
1824 1824          case IMC_TAD_BASE_CHAN_GRAN_256B:
1825 1825                  rule->itr_sock_gran = IMC_TAD_GRAN_256B;
1826 1826                  break;
1827 1827          case IMC_TAD_BASE_CHAN_GRAN_4KB:
1828 1828                  rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
1829 1829                  break;
1830 1830          default:
1831 1831                  tad->itad_valid |= IMC_TAD_V_BAD_CHAN_GRAN;
1832 1832                  return;
1833 1833          }
1834 1834  
1835 1835          switch (IMC_TAD_BASE_SOCK_GRAN(val)) {
1836 1836          case IMC_TAD_BASE_SOCK_GRAN_64B:
1837 1837                  rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1838 1838                  break;
1839 1839          case IMC_TAD_BASE_SOCK_GRAN_256B:
1840 1840                  rule->itr_sock_gran = IMC_TAD_GRAN_256B;
1841 1841                  break;
1842 1842          case IMC_TAD_BASE_SOCK_GRAN_4KB:
1843 1843                  rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
1844 1844                  break;
1845 1845          case IMC_TAD_BASE_SOCK_GRAN_1GB:
1846 1846                  rule->itr_sock_gran = IMC_TAD_GRAN_1GB;
1847 1847                  break;
1848 1848          }
1849 1849  }
1850 1850  
/*
 * When mirroring is enabled, at least in Sandy Bridge to Broadwell, it's
 * suggested that the channel wayness will take this into account and therefore
 * should be accurately reflected.
 */
/*
 * Read and decode every TAD rule for this TAD. On Skylake each rule is split
 * across two registers (the wayness half at a fixed additional offset plus
 * the base/granularity half at the base offset); earlier generations use a
 * single register. A failed PCI read flags the TAD and stops the walk.
 */
static void
imc_tad_read_rules(imc_t *imc, imc_tad_t *tad)
{
	uint_t i;
	off_t baseoff;
	imc_tad_rule_t *prev;

	tad->itad_nrules = imc->imc_gen_data->igd_tad_nrules;
	for (i = 0, baseoff = imc->imc_gen_data->igd_tad_rule_offset,
	    prev = NULL; i < tad->itad_nrules;
	    i++, baseoff += sizeof (uint32_t)) {
		uint32_t val;
		off_t off;
		imc_tad_rule_t *rule = &tad->itad_rules[i];

		/*
		 * On Skylake, the TAD rules are split among two registers. The
		 * latter set mimics what exists on pre-Skylake.
		 */
		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
			off = baseoff + IMC_SKX_WAYNESS_OFFSET;
		} else {
			off = baseoff;
		}

		val = pci_config_get32(tad->itad_stub->istub_cfgspace, off);
		if (val == PCI_EINVAL32) {
			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
			return;
		}

		/* prev supplies the base for the pre-Skylake decode. */
		imc_tad_fill_rule(imc, tad, prev, rule, val);
		prev = rule;
		if (imc->imc_gen < IMC_GEN_SKYLAKE)
			continue;

		/* Skylake only: fetch and decode the second register. */
		val = pci_config_get32(tad->itad_stub->istub_cfgspace, baseoff);
		if (val == PCI_EINVAL32) {
			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
			return;
		}

		imc_tad_fill_skx(imc, tad, rule, val);
	}
}
1901 1901  
1902 1902  /*
1903 1903   * Check for features which change how decoding works.
1904 1904   */
1905 1905  static void
1906 1906  imc_tad_read_features(imc_t *imc, imc_tad_t *tad, imc_mc_t *mc)
1907 1907  {
1908 1908          uint32_t val;
1909 1909  
1910 1910          /*
1911 1911           * Determine whether or not lockstep mode or mirroring are enabled.
1912 1912           * These change the behavior of how we're supposed to interpret channel
1913 1913           * wayness. Lockstep is available in the TAD's features. Mirroring is
1914 1914           * available on the IMC's features. This isn't present in Skylake+. On
1915 1915           * Skylake Mirorring is a property of the SAD rule and there is no
1916 1916           * lockstep.
1917 1917           */
1918 1918          switch (imc->imc_gen) {
1919 1919          case IMC_GEN_SANDY:
1920 1920          case IMC_GEN_IVY:
1921 1921          case IMC_GEN_HASWELL:
1922 1922          case IMC_GEN_BROADWELL:
1923 1923                  val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1924 1924                      imc->imc_gen_data->igd_tad_sysdef);
1925 1925                  if (val == PCI_EINVAL32) {
1926 1926                          tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1927 1927                          return;
1928 1928                  }
1929 1929                  if (IMC_TAD_SYSDEF_LOCKSTEP(val)) {
1930 1930                          tad->itad_flags |= IMC_TAD_FLAG_LOCKSTEP;
1931 1931                  }
1932 1932  
1933 1933                  val = pci_config_get32(mc->icn_main1->istub_cfgspace,
1934 1934                      imc->imc_gen_data->igd_mc_mirror);
1935 1935                  if (val == PCI_EINVAL32) {
1936 1936                          tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1937 1937                          return;
1938 1938                  }
1939 1939                  if (IMC_MC_MIRROR_SNB_BRD(val)) {
1940 1940                          tad->itad_flags |= IMC_TAD_FLAG_MIRROR;
1941 1941                  }
1942 1942                  break;
1943 1943          default:
1944 1944                  break;
1945 1945          }
1946 1946  
1947 1947          /*
1948 1948           * Now, go through and look at values that'll change how we do the
1949 1949           * channel index and adddress calculation. These are only present
1950 1950           * between Ivy Bridge and Broadwell. They don't exist on Sandy Bridge
1951 1951           * and they don't exist on Skylake+.
1952 1952           */
1953 1953          switch (imc->imc_gen) {
1954 1954          case IMC_GEN_IVY:
1955 1955          case IMC_GEN_HASWELL:
1956 1956          case IMC_GEN_BROADWELL:
1957 1957                  val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1958 1958                      imc->imc_gen_data->igd_tad_sysdef2);
1959 1959                  if (val == PCI_EINVAL32) {
1960 1960                          tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1961 1961                          return;
1962 1962                  }
1963 1963                  if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
1964 1964                          tad->itad_flags |= IMC_TAD_FLAG_CHANSHIFT;
1965 1965                  }
1966 1966                  if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
1967 1967                          tad->itad_flags |= IMC_TAD_FLAG_CHANHASH;
1968 1968                  }
1969 1969                  break;
1970 1970          default:
1971 1971                  break;
1972 1972          }
1973 1973  }
1974 1974  
1975 1975  /*
1976 1976   * Read the IMC channel interleave records
1977 1977   */
1978 1978  static void
1979 1979  imc_tad_read_interleave(imc_t *imc, imc_channel_t *chan)
1980 1980  {
1981 1981          uint_t i;
1982 1982          off_t off;
1983 1983  
1984 1984          chan->ich_ntad_offsets = imc->imc_gen_data->igd_tad_nrules;
1985 1985          for (i = 0, off = imc->imc_gen_data->igd_tad_chan_offset;
1986 1986              i < chan->ich_ntad_offsets; i++, off += sizeof (uint32_t)) {
1987 1987                  uint32_t val;
1988 1988                  uint64_t offset;
1989 1989  
1990 1990                  val = pci_config_get32(chan->ich_desc->istub_cfgspace,
1991 1991                      off);
1992 1992                  if (val == PCI_EINVAL32) {
1993 1993                          chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
1994 1994                          return;
1995 1995                  }
1996 1996  
1997 1997                  if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1998 1998                          offset = IMC_TADCHAN_OFFSET_SKX(val);
1999 1999                  } else {
2000 2000                          offset = IMC_TADCHAN_OFFSET_SNB_BRD(val);
2001 2001                  }
2002 2002  
2003 2003                  chan->ich_tad_offsets[i] = offset << IMC_TADCHAN_OFFSET_SHIFT;
2004 2004                  chan->ich_tad_offsets_raw[i] = val;
2005 2005          }
2006 2006  }
2007 2007  
2008 2008  static void
2009 2009  imc_decoder_init_tad(imc_t *imc)
2010 2010  {
2011 2011          uint_t i;
2012 2012  
2013 2013          for (i = 0; i < imc->imc_nsockets; i++) {
2014 2014                  uint_t j;
2015 2015  
2016 2016                  for (j = 0; j < imc->imc_sockets[i].isock_ntad; j++) {
2017 2017                          imc_tad_read_features(imc,
2018 2018                              &imc->imc_sockets[i].isock_tad[j],
2019 2019                              &imc->imc_sockets[i].isock_imcs[j]);
2020 2020                          imc_tad_read_rules(imc,
2021 2021                              &imc->imc_sockets[i].isock_tad[j]);
2022 2022                  }
2023 2023          }
2024 2024  
2025 2025          for (i = 0; i < imc->imc_nsockets; i++) {
2026 2026                  uint_t j;
2027 2027                  imc_socket_t *sock = &imc->imc_sockets[i];
2028 2028  
2029 2029                  for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2030 2030                          uint_t k;
2031 2031                          imc_mc_t *mc = &sock->isock_imcs[j];
2032 2032  
2033 2033                          for (k = 0; k < mc->icn_nchannels; k++) {
2034 2034                                  imc_channel_t *chan = &mc->icn_channels[k];
2035 2035                                  imc_tad_read_interleave(imc, chan);
2036 2036                          }
2037 2037                  }
2038 2038          }
2039 2039  }
2040 2040  
/*
 * Read the rank interleave offset registers for a single RIR rule (rirno)
 * on the given channel, filling in each entry's target and byte offset.
 */
static void
imc_rir_read_ileave_offsets(imc_t *imc, imc_channel_t *chan,
    imc_rank_ileave_t *rank, uint_t rirno, boolean_t contig)
{
	uint_t i;
	off_t off, incr;

	/*
	 * Rank interleave offset registers come in two forms. Either they are
	 * contiguous for a given wayness, meaning that all of the entries for
	 * wayness zero are contiguous, or they are sparse, meaning that there
	 * is a bank for entry zero for all wayness, then entry one for all
	 * wayness, etc.
	 */
	if (contig) {
		/* All of this rule's entries sit back to back. */
		off = imc->imc_gen_data->igd_rir_ileave_offset +
		    (rirno * imc->imc_gen_data->igd_rir_nileaves *
		    sizeof (uint32_t));
		incr = sizeof (uint32_t);
	} else {
		/* Entries are strided across per-entry banks. */
		off = imc->imc_gen_data->igd_rir_ileave_offset +
		    (rirno * sizeof (uint32_t));
		incr = imc->imc_gen_data->igd_rir_nileaves * sizeof (uint32_t);
	}
	for (i = 0; i < rank->irle_nentries; i++, off += incr) {
		uint32_t val;
		uint64_t offset;
		imc_rank_ileave_entry_t *ent = &rank->irle_entries[i];

		val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
		if (val == PCI_EINVAL32) {
			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
			return;
		}

		/* Broadwell uses a different target field layout. */
		switch (imc->imc_gen) {
		case IMC_GEN_BROADWELL:
			ent->irle_target = IMC_RIR_OFFSET_TARGET_BRD(val);
			break;
		default:
			ent->irle_target = IMC_RIR_OFFSET_TARGET(val);
			break;
		}
		/* The offset field layout changed with Haswell. */
		if (imc->imc_gen >= IMC_GEN_HASWELL) {
			offset = IMC_RIR_OFFSET_OFFSET_HAS_SKX(val);
		} else {
			offset = IMC_RIR_OFFSET_OFFSET_SNB_IVB(val);
		}
		ent->irle_offset = offset << IMC_RIR_OFFSET_SHIFT;
	}
}
2092 2092  
/*
 * Read each rank interleave (RIR) wayness register on a channel, decoding
 * the enable bit, wayness, and limit, and then pull in the matching rank
 * interleave offset entries.
 */
static void
imc_rir_read_wayness(imc_t *imc, imc_channel_t *chan)
{
	uint_t i;
	off_t off;

	chan->ich_nrankileaves = imc->imc_gen_data->igd_rir_nways;
	for (i = 0, off = imc->imc_gen_data->igd_rir_way_offset;
	    i < chan->ich_nrankileaves; i++, off += sizeof (uint32_t)) {
		uint32_t val;
		uint64_t lim;
		imc_rank_ileave_t *ent = &chan->ich_rankileaves[i];

		val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
		if (val == PCI_EINVAL32) {
			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
			return;
		}

		ent->irle_raw = val;
		ent->irle_enabled = IMC_RIR_WAYNESS_ENABLED(val) != 0;
		/* The register stores wayness as a power of two. */
		ent->irle_nways = 1 << IMC_RIR_WAYNESS_WAY(val);
		ent->irle_nwaysbits = IMC_RIR_WAYNESS_WAY(val);
		/* The limit field layout changed with Haswell. */
		if (imc->imc_gen >= IMC_GEN_HASWELL) {
			lim = IMC_RIR_LIMIT_HAS_SKX(val);
		} else {
			lim = IMC_RIR_LIMIT_SNB_IVB(val);
		}

		ent->irle_limit = (lim << IMC_RIR_LIMIT_SHIFT) +
		    IMC_RIR_LIMIT_EXCLUSIVE;

		ent->irle_nentries = imc->imc_gen_data->igd_rir_nileaves;
		/*
		 * Skylake lays the offset registers out sparsely; older
		 * generations keep them contiguous per rule.
		 */
		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
			imc_rir_read_ileave_offsets(imc, chan, ent, i, B_FALSE);
		} else {
			imc_rir_read_ileave_offsets(imc, chan, ent, i, B_TRUE);
		}
	}
}
2133 2133  
2134 2134  static void
2135 2135  imc_decoder_init_rir(imc_t *imc)
2136 2136  {
2137 2137          uint_t i;
2138 2138  
2139 2139          for (i = 0; i < imc->imc_nsockets; i++) {
2140 2140                  uint_t j;
2141 2141                  imc_socket_t *sock = &imc->imc_sockets[i];
2142 2142  
2143 2143                  for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2144 2144                          uint_t k;
2145 2145                          imc_mc_t *mc = &sock->isock_imcs[j];
2146 2146  
2147 2147                          for (k = 0; k < mc->icn_nchannels; k++) {
2148 2148                                  imc_channel_t *chan = &mc->icn_channels[k];
2149 2149                                  imc_rir_read_wayness(imc, chan);
2150 2150                          }
2151 2151                  }
2152 2152          }
2153 2153  }
2154 2154  
2155 2155  static cmi_errno_t
2156 2156  imc_mc_patounum(void *arg, uint64_t pa, uint8_t valid_hi, uint8_t valid_lo,
2157 2157      uint32_t synd, int syndtype, mc_unum_t *unump)
2158 2158  {
2159 2159          imc_t *imc = arg;
2160 2160          uint_t i;
2161 2161          imc_decode_state_t dec;
2162 2162  
2163 2163          bzero(&dec, sizeof (dec));
2164 2164          if (!imc_decode_pa(imc, pa, &dec)) {
2165 2165                  switch (dec.ids_fail) {
2166 2166                  case IMC_DECODE_F_LEGACY_RANGE:
2167 2167                  case IMC_DECODE_F_OUTSIDE_DRAM:
2168 2168                          return (CMIERR_MC_NOTDIMMADDR);
2169 2169                  default:
2170 2170                          return (CMIERR_MC_BADSTATE);
2171 2171                  }
2172 2172          }
2173 2173  
2174 2174          unump->unum_board = 0;
2175 2175          /*
2176 2176           * The chip id needs to be in the order that the OS expects it, which
2177 2177           * may not be our order.
2178 2178           */
2179 2179          for (i = 0; i < imc->imc_nsockets; i++) {
2180 2180                  if (imc->imc_spointers[i] == dec.ids_socket)
2181 2181                          break;
2182 2182          }
2183 2183          if (i == imc->imc_nsockets) {
2184 2184                  return (CMIERR_MC_BADSTATE);
2185 2185          }
2186 2186          unump->unum_chip = i;
2187 2187          unump->unum_mc = dec.ids_tadid;
2188 2188          unump->unum_chan = dec.ids_channelid;
2189 2189          unump->unum_cs = dec.ids_dimmid;
2190 2190          unump->unum_rank = dec.ids_rankid;
2191 2191          unump->unum_offset = dec.ids_rankaddr;
2192 2192          for (i = 0; i < MC_UNUM_NDIMM; i++) {
2193 2193                  unump->unum_dimms[i] = MC_INVALNUM;
2194 2194          }
2195 2195  
2196 2196          return (CMI_SUCCESS);
2197 2197  }
2198 2198  
/*
 * cmi_mc_unumtopa entry point. Translating a unum back into a physical
 * address is not implemented by this driver, so always fail with
 * CMIERR_UNKNOWN.
 */
static cmi_errno_t
imc_mc_unumtopa(void *arg, mc_unum_t *unum, nvlist_t *nvl, uint64_t *pa)
{
	return (CMIERR_UNKNOWN);
}
2204 2204  
/*
 * Memory-controller operations vector handed to the CMI framework via
 * cmi_mc_register_global(); only PA-to-unum decoding is supported.
 */
static const cmi_mc_ops_t imc_mc_ops = {
	.cmi_mc_patounum = imc_mc_patounum,
	.cmi_mc_unumtopa = imc_mc_unumtopa
};
2209 2209  
2210 2210  /*
2211 2211   * This is where we really finish attaching and become open for business. This
2212 2212   * occurs once we have all of the expected stubs attached. Here's where all of
2213 2213   * the real fun begins.
2214 2214   */
2215 2215  static void
2216 2216  imc_attach_complete(void *arg)
2217 2217  {
2218 2218          imc_t *imc = arg;
2219 2219          cmi_errno_t err;
2220 2220  
2221 2221          imc_set_gen_data(imc);
2222 2222  
2223 2223          /*
2224 2224           * On SKX and newer, we can fail to map PCI buses at this point due to
2225 2225           * bad PCIe reads.
2226 2226           */
2227 2227          if (!imc_map_stubs(imc)) {
2228 2228                  goto done;
2229 2229          }
2230 2230  
2231 2231          if (!imc_validate_stubs(imc)) {
2232 2232                  imc->imc_flags |= IMC_F_VALIDATE_FAILED;
2233 2233                  goto done;
2234 2234          }
2235 2235  
2236 2236          imc_fixup_stubs(imc);
2237 2237          imc_map_sockets(imc);
2238 2238  
2239 2239          if (!imc_create_minors(imc)) {
2240 2240                  goto done;
2241 2241          }
2242 2242  
2243 2243          imc_fill_data(imc);
2244 2244          imc_nvl_create(imc);
2245 2245  
2246 2246          /*
2247 2247           * Gather additional information that we need so that we can properly
2248 2248           * initialize the memory decoder and encoder.
2249 2249           */
2250 2250          imc_decoder_init_sad(imc);
2251 2251          imc_decoder_init_tad(imc);
2252 2252          imc_decoder_init_rir(imc);
2253 2253  
2254 2254          /*
2255 2255           * Register decoder functions. This may fail. If so, try and complain
2256 2256           * loudly, but stay active to allow other data to be useful. Register a
2257 2257           * global handle.
2258 2258           */
2259 2259          if ((err = cmi_mc_register_global(&imc_mc_ops, imc)) != CMI_SUCCESS) {
2260 2260                  imc->imc_flags |= IMC_F_MCREG_FAILED;
2261 2261                  dev_err(imc->imc_dip, CE_WARN, "failed to register memory "
2262 2262                      "decoding operations: 0x%x", err);
2263 2263          }
2264 2264  
2265 2265  done:
2266 2266          mutex_enter(&imc->imc_lock);
2267 2267          imc->imc_flags &= IMC_F_ATTACH_DISPATCHED;
2268 2268          imc->imc_flags |= IMC_F_ATTACH_COMPLETE;
2269 2269          mutex_exit(&imc->imc_lock);
2270 2270  }
2271 2271  
2272 2272  static int
2273 2273  imc_stub_comparator(const void *l, const void *r)
2274 2274  {
2275 2275          const imc_stub_t *sl = l, *sr = r;
2276 2276          if (sl->istub_bus > sr->istub_bus)
2277 2277                  return (1);
2278 2278          if (sl->istub_bus < sr->istub_bus)
2279 2279                  return (-1);
2280 2280          if (sl->istub_dev > sr->istub_dev)
2281 2281                  return (1);
2282 2282          if (sl->istub_dev < sr->istub_dev)
2283 2283                  return (-1);
2284 2284          if (sl->istub_func > sr->istub_func)
2285 2285                  return (1);
2286 2286          if (sl->istub_func < sr->istub_func)
2287 2287                  return (-1);
2288 2288          return (0);
2289 2289  }
2290 2290  
2291 2291  static int
2292 2292  imc_stub_scan_cb(dev_info_t *dip, void *arg)
2293 2293  {
2294 2294          int vid, did;
2295 2295          const imc_stub_table_t *table;
2296 2296          imc_t *imc = arg;
2297 2297          int *regs;
2298 2298          uint_t i, nregs;
2299 2299  
2300 2300          if (dip == ddi_root_node()) {
2301 2301                  return (DDI_WALK_CONTINUE);
2302 2302          }
2303 2303  
2304 2304          /*
2305 2305           * Get the dev info name. PCI devices will always be children of PCI
2306 2306           * devices today on x86. If we reach something that has a device name
2307 2307           * that's not PCI, then we can prune it's children.
2308 2308           */
2309 2309          if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2310 2310                  return (DDI_WALK_PRUNECHILD);
2311 2311          }
2312 2312  
2313 2313          /*
2314 2314           * Get the device and vendor ID and see if this is something the imc
2315 2315           * knows about or cares about.
2316 2316           */
2317 2317          vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2318 2318              "vendor-id", PCI_EINVAL16);
2319 2319          did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2320 2320              "device-id", PCI_EINVAL16);
2321 2321          if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2322 2322                  return (DDI_WALK_CONTINUE);
2323 2323          }
2324 2324  
2325 2325          if (vid != IMC_PCI_VENDOR_INTC) {
2326 2326                  return (DDI_WALK_PRUNECHILD);
2327 2327          }
2328 2328  
2329 2329          if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2330 2330              "reg", ®s, &nregs) != DDI_PROP_SUCCESS) {
2331 2331                  return (DDI_WALK_CONTINUE);
2332 2332          }
2333 2333  
2334 2334          if (nregs == 0) {
2335 2335                  ddi_prop_free(regs);
2336 2336                  return (DDI_WALK_CONTINUE);
2337 2337          }
2338 2338  
2339 2339  
2340 2340          table = NULL;
2341 2341          for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2342 2342                  if (imc_stub_table[i].imcs_devid == did &&
2343 2343                      imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2344 2344                      imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2345 2345                          table = &imc_stub_table[i];
2346 2346                          break;
2347 2347                  }
2348 2348          }
2349 2349          ddi_prop_free(regs);
2350 2350  
2351 2351          /*
2352 2352           * Not a match, not interesting.
2353 2353           */
2354 2354          if (table == NULL) {
2355 2355                  return (DDI_WALK_CONTINUE);
2356 2356          }
2357 2357  
2358 2358          mutex_enter(&imc->imc_lock);
2359 2359          imc->imc_nscanned++;
2360 2360          mutex_exit(&imc->imc_lock);
2361 2361  
2362 2362          return (DDI_WALK_CONTINUE);
2363 2363  }
2364 2364  
2365 2365  /*
2366 2366   * From here, go through and see how many of the devices that we know about.
2367 2367   */
2368 2368  static void
2369 2369  imc_stub_scan(void *arg)
2370 2370  {
2371 2371          imc_t *imc = arg;
2372 2372          boolean_t dispatch = B_FALSE;
2373 2373  
2374 2374          /*
2375 2375           * Zero out the scan results in case we've been detached and reattached.
2376 2376           */
2377 2377          mutex_enter(&imc->imc_lock);
2378 2378          imc->imc_nscanned = 0;
2379 2379          mutex_exit(&imc->imc_lock);
2380 2380  
2381 2381          ddi_walk_devs(ddi_root_node(), imc_stub_scan_cb, imc);
2382 2382  
2383 2383          mutex_enter(&imc->imc_lock);
2384 2384          imc->imc_flags |= IMC_F_SCAN_COMPLETE;
2385 2385          imc->imc_flags &= ~IMC_F_SCAN_DISPATCHED;
2386 2386  
2387 2387          /*
2388 2388           * If the scan found no nodes, then that means that we're on a hardware
2389 2389           * platform that we don't support. Therefore, there's no reason to do
2390 2390           * anything here.
2391 2391           */
2392 2392          if (imc->imc_nscanned == 0) {
2393 2393                  imc->imc_flags |= IMC_F_UNSUP_PLATFORM;
2394 2394                  mutex_exit(&imc->imc_lock);
2395 2395                  return;
2396 2396          }
2397 2397  
2398 2398          if (avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2399 2399                  imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2400 2400                  dispatch = B_TRUE;
2401 2401          }
2402 2402  
2403 2403          mutex_exit(&imc->imc_lock);
2404 2404  
2405 2405          if (dispatch) {
2406 2406                  (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2407 2407                      imc, DDI_SLEEP);
2408 2408          }
2409 2409  }
2410 2410  
2411 2411  /*
2412 2412   * By default, refuse to allow stubs to detach.
2413 2413   */
2414 2414  int
2415 2415  imc_detach_stub(dev_info_t *dip, ddi_detach_cmd_t cmd)
2416 2416  {
2417 2417          imc_stub_t *stub;
2418 2418          imc_t *imc = imc_data;
2419 2419  
2420 2420          mutex_enter(&imc->imc_lock);
2421 2421  
2422 2422          /*
2423 2423           * By default, we do not allow stubs to detach. However, if the driver
2424 2424           * has attached to devices on a platform it doesn't recognize or
2425 2425           * support or if the override flag has been set, then allow detach to
2426 2426           * proceed.
2427 2427           */
2428 2428          if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) == 0 &&
2429 2429              imc_allow_detach == 0) {
2430 2430                  mutex_exit(&imc->imc_lock);
2431 2431                  return (DDI_FAILURE);
2432 2432          }
2433 2433  
2434 2434          for (stub = avl_first(&imc->imc_stubs); stub != NULL;
2435 2435              stub = AVL_NEXT(&imc->imc_stubs, stub)) {
2436 2436                  if (stub->istub_dip == dip) {
2437 2437                          break;
2438 2438                  }
2439 2439          }
2440 2440  
2441 2441          /*
2442 2442           * A device was attached to us that we somehow don't know about. Allow
2443 2443           * this to proceed.
2444 2444           */
2445 2445          if (stub == NULL) {
2446 2446                  mutex_exit(&imc->imc_lock);
2447 2447                  return (DDI_SUCCESS);
2448 2448          }
2449 2449  
2450 2450          pci_config_teardown(&stub->istub_cfgspace);
2451 2451          avl_remove(&imc->imc_stubs, stub);
2452 2452          kmem_free(stub, sizeof (imc_stub_t));
2453 2453          mutex_exit(&imc->imc_lock);
2454 2454  
2455 2455          return (DDI_SUCCESS);
2456 2456  }
2457 2457  
2458 2458  int
2459 2459  imc_attach_stub(dev_info_t *dip, ddi_attach_cmd_t cmd)
2460 2460  {
2461 2461          imc_stub_t *stub, *lookup;
2462 2462          int did, vid, *regs;
2463 2463          uint_t i, nregs;
2464 2464          const imc_stub_table_t *table;
2465 2465          avl_index_t idx;
2466 2466          boolean_t dispatch = B_FALSE;
2467 2467          imc_t *imc = imc_data;
2468 2468  
2469 2469          if (cmd != DDI_ATTACH) {
2470 2470                  return (DDI_FAILURE);
2471 2471          }
2472 2472  
2473 2473          /*
2474 2474           * We've been asked to attach a stub. First, determine if this is even a
2475 2475           * PCI device that we should care about. Then, append it to our global
2476 2476           * list and kick off the configuration task. Note that we do this
2477 2477           * configuration task in a taskq so that we don't interfere with the
2478 2478           * normal attach / detach path processing.
2479 2479           */
2480 2480          if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2481 2481                  return (DDI_FAILURE);
2482 2482          }
2483 2483  
2484 2484          /*
2485 2485           * Get the device and vendor ID and see if this is something the imc
2486 2486           * knows about or cares about.
2487 2487           */
2488 2488          vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2489 2489              "vendor-id", PCI_EINVAL16);
2490 2490          did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2491 2491              "device-id", PCI_EINVAL16);
2492 2492          if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2493 2493                  return (DDI_FAILURE);
2494 2494          }
2495 2495  
2496 2496          /*
2497 2497           * Only accept INTC parts on the imc driver.
2498 2498           */
2499 2499          if (vid != IMC_PCI_VENDOR_INTC) {
2500 2500                  return (DDI_FAILURE);
2501 2501          }
2502 2502  
2503 2503          if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2504 2504              "reg", ®s, &nregs) != DDI_PROP_SUCCESS) {
2505 2505                  return (DDI_FAILURE);
2506 2506          }
2507 2507  
2508 2508          if (nregs == 0) {
2509 2509                  ddi_prop_free(regs);
2510 2510                  return (DDI_FAILURE);
2511 2511          }
2512 2512  
2513 2513          /*
2514 2514           * Determine if this matches a known device.
2515 2515           */
2516 2516          table = NULL;
2517 2517          for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2518 2518                  if (imc_stub_table[i].imcs_devid == did &&
2519 2519                      imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2520 2520                      imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2521 2521                          table = &imc_stub_table[i];
2522 2522                          break;
2523 2523                  }
2524 2524          }
2525 2525  
2526 2526          if (i == ARRAY_SIZE(imc_stub_table)) {
2527 2527                  ddi_prop_free(regs);
2528 2528                  return (DDI_FAILURE);
2529 2529          }
2530 2530  
2531 2531          /*
2532 2532           * We've found something. Make sure the generation matches our current
2533 2533           * one. If it does, construct the entry and append it to the list.
2534 2534           */
2535 2535          mutex_enter(&imc->imc_lock);
2536 2536          if (imc->imc_gen != IMC_GEN_UNKNOWN && imc->imc_gen !=
2537 2537              table->imcs_gen) {
2538 2538                  mutex_exit(&imc->imc_lock);
2539 2539                  ddi_prop_free(regs);
2540 2540                  dev_err(dip, CE_WARN, "Encountered IMC stub device (%u/%u) "
2541 2541                      "that has different hardware generation (%u) from current "
2542 2542                      "generation (%u)", vid, did, table->imcs_gen, imc->imc_gen);
2543 2543                  return (DDI_FAILURE);
2544 2544          } else {
2545 2545                  imc->imc_gen = table->imcs_gen;
2546 2546          }
2547 2547          mutex_exit(&imc->imc_lock);
2548 2548  
2549 2549          stub = kmem_zalloc(sizeof (imc_stub_t), KM_SLEEP);
2550 2550          stub->istub_dip = dip;
2551 2551          stub->istub_vid = vid;
2552 2552          stub->istub_did = did;
2553 2553          stub->istub_bus = PCI_REG_BUS_G(regs[0]);
2554 2554          stub->istub_dev = PCI_REG_DEV_G(regs[0]);
2555 2555          stub->istub_func = PCI_REG_FUNC_G(regs[0]);
2556 2556          ddi_prop_free(regs);
2557 2557          stub->istub_table = table;
2558 2558  
2559 2559          if (pci_config_setup(dip, &stub->istub_cfgspace) != DDI_SUCCESS) {
2560 2560                  kmem_free(stub, sizeof (stub));
2561 2561                  dev_err(dip, CE_WARN, "Failed to set up PCI config space "
2562 2562                      "for IMC stub device %s (%u/%u)", ddi_node_name(dip),
2563 2563                      vid, did);
2564 2564                  return (DDI_FAILURE);
2565 2565          }
2566 2566  
2567 2567          mutex_enter(&imc->imc_lock);
2568 2568          if ((lookup = avl_find(&imc->imc_stubs, stub, &idx)) != NULL) {
2569 2569                  dev_err(dip, CE_WARN, "IMC stub %s (%u/%u) has duplicate "
2570 2570                      "bdf %u/%u/%u with %s (%u/%u), not attaching",
2571 2571                      ddi_node_name(imc->imc_dip), vid, did,
2572 2572                      stub->istub_bus, stub->istub_dev, stub->istub_func,
2573 2573                      ddi_node_name(lookup->istub_dip), lookup->istub_vid,
2574 2574                      lookup->istub_did);
2575 2575                  mutex_exit(&imc->imc_lock);
2576 2576                  pci_config_teardown(&stub->istub_cfgspace);
2577 2577                  kmem_free(stub, sizeof (stub));
2578 2578  
2579 2579                  return (DDI_FAILURE);
2580 2580          }
2581 2581          avl_insert(&imc->imc_stubs, stub, idx);
2582 2582  
2583 2583          if ((imc->imc_flags & IMC_F_ALL_FLAGS) == IMC_F_SCAN_COMPLETE &&
2584 2584              avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2585 2585                  imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2586 2586                  dispatch = B_TRUE;
2587 2587          }
2588 2588          mutex_exit(&imc->imc_lock);
2589 2589  
2590 2590          if (dispatch) {
2591 2591                  (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2592 2592                      imc, DDI_SLEEP);
2593 2593          }
2594 2594  
2595 2595          return (DDI_SUCCESS);
2596 2596  }
2597 2597  
2598 2598  static int
2599 2599  imc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2600 2600  {
2601 2601          imc_t *imc = imc_data;
2602 2602  
2603 2603          if ((flag & (FEXCL | FNDELAY)) != 0)
2604 2604                  return (EINVAL);
2605 2605  
2606 2606          if (otyp != OTYP_CHR)
2607 2607                  return (EINVAL);
2608 2608  
2609 2609          mutex_enter(&imc->imc_lock);
2610 2610  
2611 2611          if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) != 0) {
2612 2612                  mutex_exit(&imc->imc_lock);
2613 2613                  return (ENOTSUP);
2614 2614          }
2615 2615  
2616 2616          /*
2617 2617           * It's possible that someone has come in during the window between when
2618 2618           * we've created the minor node and when we've finished doing work.
2619 2619           */
2620 2620          if ((imc->imc_flags & IMC_F_ATTACH_COMPLETE) == 0) {
2621 2621                  mutex_exit(&imc->imc_lock);
2622 2622                  return (EAGAIN);
2623 2623          }
2624 2624  
2625 2625          /*
2626 2626           * It's not clear how someone would get a minor that we didn't create.
2627 2627           * But be paranoid and make sure.
2628 2628           */
2629 2629          if (getminor(*devp) >= imc->imc_nsockets) {
2630 2630                  mutex_exit(&imc->imc_lock);
2631 2631                  return (EINVAL);
2632 2632          }
2633 2633  
2634 2634          /*
2635 2635           * Make sure this socket entry has been filled in.
2636 2636           */
2637 2637          if (imc->imc_spointers[getminor(*devp)] == NULL) {
2638 2638                  mutex_exit(&imc->imc_lock);
2639 2639                  return (EINVAL);
2640 2640          }
2641 2641  
2642 2642          mutex_exit(&imc->imc_lock);
2643 2643  
2644 2644          return (0);
2645 2645  }
2646 2646  
2647 2647  static void
2648 2648  imc_ioctl_decode(imc_t *imc, mc_encode_ioc_t *encode)
2649 2649  {
2650 2650          imc_decode_state_t dec;
2651 2651          uint_t i;
2652 2652  
2653 2653          bzero(&dec, sizeof (dec));
2654 2654          if (!imc_decode_pa(imc, encode->mcei_pa, &dec)) {
2655 2655                  encode->mcei_err = (uint32_t)dec.ids_fail;
2656 2656                  encode->mcei_errdata = dec.ids_fail_data;
2657 2657                  return;
2658 2658          }
2659 2659  
2660 2660          encode->mcei_errdata = 0;
2661 2661          encode->mcei_err = 0;
2662 2662          encode->mcei_board = 0;
2663 2663          for (i = 0; i < imc->imc_nsockets; i++) {
2664 2664                  if (imc->imc_spointers[i] == dec.ids_socket)
2665 2665                          break;
2666 2666          }
2667 2667          encode->mcei_chip = i;
2668 2668          encode->mcei_mc = dec.ids_tadid;
2669 2669          encode->mcei_chan = dec.ids_channelid;
2670 2670          encode->mcei_dimm = dec.ids_dimmid;
2671 2671          encode->mcei_rank_addr = dec.ids_rankaddr;
2672 2672          encode->mcei_rank = dec.ids_rankid;
2673 2673          encode->mcei_row = UINT32_MAX;
2674 2674          encode->mcei_column = UINT32_MAX;
2675 2675          encode->mcei_pad = 0;
2676 2676  }
2677 2677  
2678 2678  static int
2679 2679  imc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2680 2680      int *rvalp)
2681 2681  {
2682 2682          int ret;
2683 2683          minor_t m;
2684 2684          mc_snapshot_info_t info;
2685 2685          mc_encode_ioc_t encode;
2686 2686          imc_t *imc = imc_data;
2687 2687          imc_socket_t *sock;
2688 2688  
2689 2689          mutex_enter(&imc->imc_lock);
2690 2690          m = getminor(dev);
2691 2691          if (m >= imc->imc_nsockets) {
2692 2692                  ret = EINVAL;
2693 2693                  goto done;
2694 2694          }
2695 2695          sock = imc->imc_spointers[m];
2696 2696          if (sock == NULL) {
2697 2697                  ret = EINVAL;
2698 2698                  goto done;
2699 2699          }
2700 2700  
2701 2701          /*
2702 2702           * Note, other memory controller drivers don't check mode for reading
2703 2703           * data nor do they care who can read it from a credential perspective.
2704 2704           * As such we don't either at this time.
2705 2705           */
2706 2706          switch (cmd) {
2707 2707          case MC_IOC_SNAPSHOT_INFO:
2708 2708                  imc_nvl_pack(sock, B_FALSE);
2709 2709                  if (sock->isock_buf == NULL) {
2710 2710                          ret = EIO;
2711 2711                          break;
2712 2712                  }
2713 2713  
2714 2714                  info.mcs_size = sock->isock_buflen;
2715 2715                  info.mcs_gen = sock->isock_gen;
2716 2716  
2717 2717                  if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2718 2718                          ret = EFAULT;
2719 2719                          break;
2720 2720                  }
2721 2721  
2722 2722                  ret = 0;
2723 2723                  break;
2724 2724          case MC_IOC_SNAPSHOT:
2725 2725                  imc_nvl_pack(sock, B_FALSE);
2726 2726                  if (sock->isock_buf == NULL) {
2727 2727                          ret = EIO;
2728 2728                          break;
2729 2729                  }
2730 2730  
2731 2731                  if (ddi_copyout(sock->isock_buf, (void *)arg,
2732 2732                      sock->isock_buflen, mode) != 0) {
2733 2733                          ret = EFAULT;
2734 2734                          break;
2735 2735                  }
2736 2736  
2737 2737                  ret = 0;
2738 2738                  break;
2739 2739          case MC_IOC_DECODE_SNAPSHOT_INFO:
2740 2740                  imc_decoder_pack(imc);
2741 2741                  if (imc->imc_decoder_buf == NULL) {
2742 2742                          ret = EIO;
2743 2743                          break;
2744 2744                  }
2745 2745  
2746 2746                  info.mcs_size = imc->imc_decoder_len;
2747 2747                  info.mcs_gen = imc->imc_spointers[0]->isock_gen;
2748 2748  
2749 2749                  if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2750 2750                          ret = EFAULT;
2751 2751                          break;
2752 2752                  }
2753 2753  
2754 2754                  ret = 0;
2755 2755                  break;
2756 2756          case MC_IOC_DECODE_SNAPSHOT:
2757 2757                  imc_decoder_pack(imc);
2758 2758                  if (imc->imc_decoder_buf == NULL) {
2759 2759                          ret = EIO;
2760 2760                          break;
2761 2761                  }
2762 2762  
2763 2763                  if (ddi_copyout(imc->imc_decoder_buf, (void *)arg,
2764 2764                      imc->imc_decoder_len, mode) != 0) {
2765 2765                          ret = EFAULT;
2766 2766                          break;
2767 2767                  }
2768 2768  
2769 2769                  ret = 0;
2770 2770                  break;
2771 2771          case MC_IOC_DECODE_PA:
2772 2772                  if (crgetzoneid(credp) != GLOBAL_ZONEID ||
2773 2773                      drv_priv(credp) != 0) {
2774 2774                          ret = EPERM;
2775 2775                          break;
2776 2776                  }
2777 2777  
2778 2778                  if (ddi_copyin((void *)arg, &encode, sizeof (encode),
2779 2779                      mode & FKIOCTL) != 0) {
2780 2780                          ret = EPERM;
2781 2781                          break;
2782 2782                  }
2783 2783  
2784 2784                  imc_ioctl_decode(imc, &encode);
2785 2785                  ret = 0;
2786 2786  
2787 2787                  if (ddi_copyout(&encode, (void *)arg, sizeof (encode),
2788 2788                      mode & FKIOCTL) != 0) {
2789 2789                          ret = EPERM;
2790 2790                          break;
2791 2791                  }
2792 2792                  break;
2793 2793          default:
2794 2794                  ret = EINVAL;
2795 2795                  goto done;
2796 2796          }
2797 2797  
2798 2798  done:
2799 2799          mutex_exit(&imc->imc_lock);
2800 2800          return (ret);
2801 2801  }
2802 2802  
/*
 * close(9E) entry point. We keep no per-open state, so there is nothing to
 * tear down here.
 */
static int
imc_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	return (0);
}
2808 2808  
2809 2809  static int
2810 2810  imc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2811 2811  {
2812 2812          if (cmd != DDI_ATTACH) {
2813 2813                  return (DDI_FAILURE);
2814 2814          }
2815 2815  
2816 2816          if (imc_data == NULL || imc_data->imc_dip != NULL) {
2817 2817                  return (DDI_FAILURE);
2818 2818          }
2819 2819  
2820 2820          mutex_enter(&imc_data->imc_lock);
2821 2821          if ((imc_data->imc_taskq = ddi_taskq_create(dip, "imc", 1,
2822 2822              TASKQ_DEFAULTPRI, 0)) == NULL) {
2823 2823                  mutex_exit(&imc_data->imc_lock);
2824 2824                  return (DDI_FAILURE);
2825 2825          }
2826 2826  
2827 2827          imc_data->imc_dip = dip;
2828 2828          imc_data->imc_flags |= IMC_F_SCAN_DISPATCHED;
2829 2829          mutex_exit(&imc_data->imc_lock);
2830 2830  
2831 2831          (void) ddi_taskq_dispatch(imc_data->imc_taskq, imc_stub_scan, imc_data,
2832 2832              DDI_SLEEP);
2833 2833  
2834 2834          return (DDI_SUCCESS);
2835 2835  }
2836 2836  
2837 2837  /*
2838 2838   * We only export a single instance.
2839 2839   */
2840 2840  static int
2841 2841  imc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
2842 2842  {
2843 2843          /*
2844 2844           * getinfo(9E) shouldn't be called if we're not attached. But be
2845 2845           * paranoid.
2846 2846           */
2847 2847          if (imc_data == NULL || imc_data->imc_dip == NULL) {
2848 2848                  return (DDI_FAILURE);
2849 2849          }
2850 2850  
2851 2851          switch (infocmd) {
2852 2852          case DDI_INFO_DEVT2DEVINFO:
2853 2853                  *resultp = imc_data->imc_dip;
2854 2854                  break;
2855 2855          case DDI_INFO_DEVT2INSTANCE:
2856 2856                  *resultp = (void *)0;
2857 2857                  break;
2858 2858          default:
2859 2859                  return (DDI_FAILURE);
2860 2860          }
2861 2861  
2862 2862          return (DDI_SUCCESS);
2863 2863  }
2864 2864  
2865 2865  static int
2866 2866  imc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2867 2867  {
2868 2868          if (cmd != DDI_DETACH) {
2869 2869                  return (DDI_FAILURE);
2870 2870          }
2871 2871  
2872 2872          if (imc_data == NULL || imc_data->imc_dip) {
2873 2873                  return (DDI_FAILURE);
2874 2874          }
2875 2875  
2876 2876          mutex_enter(&imc_data->imc_lock);
2877 2877  
2878 2878          /*
2879 2879           * While a scan or attach is outstanding, don't allow us to detach.
2880 2880           */
2881 2881          if ((imc_data->imc_flags &
2882 2882              (IMC_F_SCAN_DISPATCHED | IMC_F_ATTACH_DISPATCHED)) != 0) {
2883 2883                  mutex_exit(&imc_data->imc_lock);
2884 2884                  return (DDI_FAILURE);
2885 2885          }
2886 2886  
2887 2887          /*
2888 2888           * Because the stub driver depends on the imc driver, we shouldn't be
2889 2889           * able to have any entries in this list when we detach. However, we
2890 2890           * check just to make sure.
2891 2891           */
2892 2892          if (!avl_is_empty(&imc_data->imc_stubs)) {
2893 2893                  mutex_exit(&imc_data->imc_lock);
2894 2894                  return (DDI_FAILURE);
2895 2895          }
2896 2896  
2897 2897          nvlist_free(imc_data->imc_decoder_dump);
2898 2898          imc_data->imc_decoder_dump = NULL;
2899 2899          if (imc_data->imc_decoder_buf != NULL) {
2900 2900                  kmem_free(imc_data->imc_decoder_buf, imc_data->imc_decoder_len);
2901 2901                  imc_data->imc_decoder_buf = NULL;
2902 2902                  imc_data->imc_decoder_len = 0;
2903 2903          }
2904 2904  
2905 2905          ddi_remove_minor_node(imc_data->imc_dip, NULL);
2906 2906          imc_data->imc_dip = NULL;
2907 2907          mutex_exit(&imc_data->imc_lock);
2908 2908  
2909 2909          ddi_taskq_wait(imc_data->imc_taskq);
2910 2910          ddi_taskq_destroy(imc_data->imc_taskq);
2911 2911          imc_data->imc_taskq = NULL;
2912 2912  
2913 2913          return (DDI_SUCCESS);
2914 2914  }
2915 2915  
2916 2916  static void
2917 2917  imc_free(void)
2918 2918  {
2919 2919          if (imc_data == NULL) {
2920 2920                  return;
2921 2921          }
2922 2922  
2923 2923          VERIFY(avl_is_empty(&imc_data->imc_stubs));
2924 2924          avl_destroy(&imc_data->imc_stubs);
2925 2925          mutex_destroy(&imc_data->imc_lock);
2926 2926          kmem_free(imc_data, sizeof (imc_t));
2927 2927          imc_data = NULL;
2928 2928  }
2929 2929  
2930 2930  static void
2931 2931  imc_alloc(void)
2932 2932  {
2933 2933          imc_data = kmem_zalloc(sizeof (imc_t), KM_SLEEP);
2934 2934  
2935 2935          mutex_init(&imc_data->imc_lock, NULL, MUTEX_DRIVER, NULL);
2936 2936          avl_create(&imc_data->imc_stubs, imc_stub_comparator,
2937 2937              sizeof (imc_stub_t), offsetof(imc_stub_t, istub_link));
2938 2938  }
2939 2939  
/*
 * Character/block entry points. Only open, close, and ioctl are meaningful;
 * everything else is nodev/nochpoll.
 */
static struct cb_ops imc_cb_ops = {
	.cb_open = imc_open,
	.cb_close = imc_close,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = imc_ioctl,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};
2959 2959  
/*
 * Device operations. No quiesce is needed as the driver only reads PCI
 * config space through its stubs.
 */
static struct dev_ops imc_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = imc_getinfo,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = imc_attach,
	.devo_detach = imc_detach,
	.devo_reset = nodev,
	.devo_cb_ops = &imc_cb_ops,
	.devo_quiesce = ddi_quiesce_not_needed
};
2972 2972  
/* Loadable driver module linkage. */
static struct modldrv imc_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Intel Integrated Memory Controller Driver",
	.drv_dev_ops = &imc_dev_ops
};
2978 2978  
/* Module linkage: a single driver module. */
static struct modlinkage imc_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &imc_modldrv, NULL }
};
2983 2983  
2984 2984  int
2985 2985  _init(void)
2986 2986  {
2987 2987          int ret;
2988 2988  
2989 2989          if ((ret = mod_install(&imc_modlinkage)) == 0) {
2990 2990                  imc_alloc();
2991 2991          }
2992 2992  
2993 2993          return (ret);
2994 2994  }
2995 2995  
int
_info(struct modinfo *modinfop)
{
	/* Standard module information entry point. */
	return (mod_info(&imc_modlinkage, modinfop));
}
3001 3001  
3002 3002  int
3003 3003  _fini(void)
3004 3004  {
3005 3005          int ret;
3006 3006  
3007 3007          if ((ret = mod_remove(&imc_modlinkage)) == 0) {
3008 3008                  imc_free();
3009 3009          }
3010 3010          return (ret);
3011 3011  }
  
    | 
      ↓ open down ↓ | 
    1566 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX