1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2019 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * Generic Intel Integrated Memory Controller (IMC) Driver
  18  *
  19  * This driver talks to the CPU's IMC to understand the detailed topology of the
  20  * processor and to determine how to map between physical addresses to the
  21  * corresponding DIMM. This driver supports the following generations of Intel
  22  * chips:
  23  *
  24  *  - Sandy Bridge
  25  *  - Ivy Bridge
  26  *  - Haswell
  27  *  - Broadwell
  28  *  - Skylake / Cascade Lake
  29  *
  30  * Memory Decoding
  31  * ---------------
  32  *
  33  * For more detailed summaries of the memory decoding process, please refer to
  34  * the Intel External Design Specifications for the corresponding processor.
  35  * What follows is a rough overview of how the memory decoding system works.
  36  *
  37  * First, we'd like to define the following concepts:
  38  *
  39  * SYSTEM ADDRESS
  40  *
  41  *      This is a physical address that the operating system normally uses. This
  42  *      address may refer to DRAM, it may refer to memory mapped PCI
  43  *      configuration space or device registers, or it may refer to other parts
  44  *      of the system's memory map, such as the extended advanced programmable
  45  *      interrupt controller (xAPIC), etc.
  46  *
  47  * DIMM
  48  *
  49  *      Dual-inline memory module. This refers to a physical stick of volatile
  50  *      memory that is inserted into a slot on the motherboard.
  51  *
  52  * RANK
  53  *
  54  *      A potential sub-division of a DIMM. A DIMM's memory capacity is divided
  55  *      into a number of equal sized ranks. For example, an 8 GiB DIMM, may have
  56  *      1 8 GiB rank, 2 4 GiB ranks, or 4 2 GiB ranks.
  57  *
  58  * RANK ADDRESS
  59  *
  60  *      An address that exists in the context of a given rank on a DIMM. All
  61  *      ranks have overlapping addresses, so the address 0x400 exists on all
  62  *      ranks on a given DIMM.
  63  *
  64  * CHANNEL
  65  *
  66  *      Multiple DIMMs may be combined into a single channel. The channel
  67  *      represents the combined memory of all the DIMMs. A given channel only
  68  *      ever exists on a socket and is bound to a single memory controller.
  69  *
  70  * CHANNEL ADDRESS
  71  *
  72  *      This is an address that exists logically on a channel. Each address on a
  73  *      channel maps to a corresponding DIMM that exists on that channel. The
  74  *      address space on one channel is independent from that on another. This
  75  *      means that address 0x1000 can exist on each memory channel in the
  76  *      system.
  77  *
  78  * INTERLEAVE
  79  *
  80  *      There are several different cases where interleaving occurs on the
  81  *      system. For example, addresses may be interleaved across sockets,
  82  *      memory channels, or DIMM ranks. When addresses are interleaved, then
  83  *      some number of bits in an address are used to select which target to go
  84  *      to (usually through a look up table). The effect of interleaving is that
  85  *      addresses that are next to one another may not all go to the same
  86  *      device. The following image shows a non-interleaving case.
  87  *
  88  *      0x0fff +-----+             +-----+ 0x7ff
  89  *             |     |\___________/|     |
  90  *             |     |  __________ | (b) |
  91  *             |     | /          \|     |
  92  *      0x0800 |=====|=            +-----+ 0x000       +-----+ 0x7ff
  93  *             |     | \______________________________/|     |
  94  *             |     | _______________________________ | (a) |
  95  *             |     |/                               \|     |
  96  *      0x0000 +-----+                                 +-----+ 0x000
  97  *
  98  *      In this example of non-interleaving, addresses 0x0000 to 0x07ff go to
 *      device (a). While addresses 0x0800 to 0x0fff go to device (b).
 100  *      However, each range is divided into the same number of components.
 101  *
 102  *      If instead, we were to look at that with interleaving, what we might say
 103  *      is that rather than splitting the range in half, we might say that if
 104  *      the address has bit 8 set (0x100), then it goes to (b), otherwise it
 105  *      goes to (a). This means that addresses 0x000 to 0x0ff, would go to (a).
 *      0x100 to 0x1ff would go to (b). 0x200 to 0x2ff would go back to (a)
 *      again, and then 0x300 to 0x3ff would go back to (b). This would continue
 108  *      for a while. This would instead look something more like:
 109  *
 110  *
 111  *      0x0fff +-----+       A: 0x7ff +---------+   B: 0x7ff +---------+
 112  *             | (b) |                | e00-eff |            | f00-fff |
 113  *      0x0f00 |-----|          0x700 +---------+      0x700 +---------+
 114  *             | (a) |                | c00-cff |            | d00-dff |
 115  *      0x0e00 ~~~~~~~          0x600 +---------+      0x600 +---------+
 116  *               ***                  | a00-aff |            | b00-bff |
 117  *      0x0400 ~~~~~~~          0x500 +---------+      0x500 +---------+
 118  *             | (b) |                | 800-8ff |            | 900-9ff |
 119  *      0x0300 |-----|          0x400 +---------+      0x400 +---------+
 120  *             | (a) |                | 600-6ff |            | 700-7ff |
 121  *      0x0200 |-----|          0x300 +---------+      0x300 +---------+
 122  *             | (b) |                | 400-4ff |            | 500-5ff |
 123  *      0x0100 |-----|          0x200 +---------+      0x200 +---------+
 124  *             | (a) |                | 200-2ff |            | 300-3ff |
 125  *      0x0000 +-----+          0x100 +---------+      0x100 +---------+
 126  *                                    | 000-0ff |            | 100-1ff |
 127  *                              0x000 +---------+      0x000 +---------+
 128  *
 129  *      In this example we've performed two-way interleaving. The number of ways
 130  *      that something can interleave varies based on what we're interleaving
 131  *      between.
 132  *
 133  * MEMORY CONTROLLER
 134  *
 135  *      A given processor die (see uts/i86pc/os/cpuid.c) contains a number of
 *      memory controllers, usually one or two. Each memory controller supports a
 137  *      given number of DIMMs, which are divided across multiple channels.
 138  *
 139  * TARGET ADDRESS DECODER
 140  *
 141  *      The target address decoder (TAD) is responsible for taking a system
 142  *      address and transforming it into a channel address based on the rules
 143  *      that are present. Each memory controller has a corresponding TAD. The
 144  *      TAD is often contained in a device called a 'Home Agent'.
 145  *
 146  * SYSTEM ADDRESS DECODER
 147  *
 148  *      The system address decoder (SAD) is responsible for taking a system
 149  *      address and directing it to the right place, whether this be memory or
 *      otherwise. There is a single system address decoder per socket (see
 *      uts/i86pc/os/cpuid.c) that is shared between all the cores currently.
 152  *
 153  * NODE IDENTIFIER
 154  *
 155  *      The node identifier is used to uniquely identify an element in the
 156  *      various routing topologies on the die (see uts/i86pc/os/cpuid.c for the
 157  *      definition of 'die'). One can roughly think about this as a unique
 158  *      identifier for the socket itself. In general, the primary node ID for a
 159  *      socket should map to the socket APIC ID.
 160  *
 161  * Finding Devices
 162  * ---------------
 163  *
 164  * There is a bit of a chicken and egg problem on Intel systems and in the
 165  * device driver interface. The information that we need in the system is spread
 166  * out amongst a large number of different PCI devices that the processor
 167  * exposes. The number of such devices can vary based on the processor
 168  * generation and the specific SKU in the processor. To deal with this, we break
 169  * the driver into two different components: a stub driver and the full driver.
 170  *
 171  * The stub driver has aliases for all known PCI devices that we might attach to
 172  * in a given generation on the system. This driver is called 'imcstub'. When a
 173  * stub attaches, it just registers itself with the main driver, upon which it
 174  * has a module dependency.
 175  *
 176  * The main driver, 'imc', is a pseudo-device driver. When it first attaches, it
 177  * kicks off a scan of the device tree which takes place in a task queue. Once
 178  * there, it determines the number of devices that it expects to exist by
 179  * walking the tree and comparing it against the generation-specific table.
 180  *
 181  * If all devices are found, we'll go ahead and read through all the devices and
 182  * build a map of all the information we need to understand the topology of the
 183  * system and to be able to decode addresses. We do this here, because we can be
 184  * asked to perform decoding in dangerous contexts (after taking an MCE, panic,
 185  * etc) where we don't want to have to rely on the broader kernel functioning at
 186  * this point in time.
 187  *
 188  * Once our topology is built, we'll create minor nodes which are used by the
 189  * fault management architecture to query for information and register our
 190  * decoding functionality with the kernel.
 191  *
 192  * PCI Numbering
 193  * -------------
 194  *
 195  * For each device that we care about, Intel defines the device and function
 196  * that we can expect to find the information and PCI configuration space
 197  * registers that we care about at. However, the PCI bus is not well defined.
 198  * Devices that are on the same socket use the same set of bus numbers; however,
 * some sockets have multiple bus numbers that they'll use to represent
 200  * different classes. These bus numbers are programmed by systems firmware as
 201  * part of powering on the system. This means, that we need the ability to
 202  * map together these disparate ranges ourselves.
 203  *
 204  * There is a device called a utility box (UBOX), which exists per-socket and
 205  * maps the different sockets together. We use this to determine which devices
 206  * correspond to which sockets.
 207  *
 208  * Mapping Sockets
 209  * ---------------
 210  *
 211  * Another wrinkle is that the way that the OS sees the numbering of the CPUs is
 212  * generally based on the APIC ID (see uts/i86pc/os/cpuid.c for more
 213  * information). However, to map to the corresponding socket, we need to look at
 214  * the socket's node ID. The order of PCI buses in the system is not required to
 215  * have any relation to the socket ID. Therefore, we have to have yet another
 216  * indirection table in the imc_t.
 217  *
 218  * Exposing Data
 219  * -------------
 220  *
 221  * We expose topology data to FMA using the OS-private memory controller
 222  * interfaces. By creating minor nodes of the type, 'ddi_mem_ctrl', there are a
 223  * number of specific interfaces that we can then implement. The ioctl API asks
 224  * us for a snapshot of data, which basically has us go through and send an
 225  * nvlist_t to userland. This nvlist_t is constructed as part of the scan
 226  * process. This nvlist uses the version 1 format, which more explicitly encodes
 227  * the topology in a series of nested nvlists.
 228  *
 229  * In addition, the tool /usr/lib/fm/fmd/mcdecode can be used to query the
 230  * decoder and ask it to perform decoding.
 231  *
 232  * Decoding Addresses
 233  * ------------------
 234  *
 235  * The decoding logic can be found in common/imc/imc_decode.c. This file is
 236  * shared between the kernel and userland to allow for easier testing and
 237  * additional flexibility in operation. The decoding process happens in a few
 238  * different phases.
 239  *
 240  * The first phase, is to determine which memory controller on which socket is
 241  * responsible for this data. To determine this, we use the system address
 242  * decoder and walk the rules, looking for the correct target. There are various
 243  * manipulations to the address that exist which are used to determine which
 244  * index we use. The way that we interpret the output of the rule varies
 245  * somewhat based on the generation. Sandy Bridge just has a node ID which
 246  * points us to the socket with its single IMC. On Ivy Bridge through Broadwell,
 247  * the memory controller to use is also encoded in part of the node ID. Finally,
 248  * on Skylake, the SAD tells us which socket to look at. The socket in question
 249  * then has a routing table which tells us which channel on which memory
 250  * controller that is local to that socket.
 251  *
 252  * Once we have the target memory controller, we walk the list of target address
 253  * decoder rules. These rules can help tell us which channel we care about
 254  * (which is required on Sandy Bridge through Broadwell) and then describe some
 255  * amount of the interleaving rules which are used to turn the system address
 256  * into a channel address.
 257  *
 258  * Once we know the channel and the channel address, we walk the rank interleave
 259  * rules which help us determine which DIMM and the corresponding rank on it
 260  * that the corresponding channel address is on. It also has logic that we need
 261  * to use to determine how to transform a channel address into an address on
 262  * that specific rank. Once we have that, then the initial decoding is done.
 263  *
 264  * The logic in imc_decode.c is abstracted away from the broader kernel CMI
 265  * logic.  This is on purpose and allows us not only an easier time unit testing
 266  * the logic, but also allows us to express more high fidelity errors that are
 267  * translated into a much smaller subset. This logic is exercised in the
 268  * 'imc_test' program which is built in 'test/os-tests/tests/imc'.
 269  *
 270  * Limitations
 271  * -----------
 272  *
 273  * Currently, this driver has the following limitations:
 274  *
 275  *  o It doesn't decode the row and column addresses.
 276  *  o It doesn't encode from a DIMM address to a system address.
 277  *  o It doesn't properly support lockstep and mirroring modes on Sandy Bridge -
 278  *    Broadwell platforms.
 279  *  o It doesn't support virtual lockstep and adaptive mirroring on Purley
 280  *    platforms.
 281  *  o It doesn't properly handle Intel Optane (3D-X Point) NVDIMMs.
 282  *  o It doesn't know how to decode three way channel interleaving.
 283  *
 284  * None of these are intrinsic problems to the driver, it's mostly a matter of
 285  * having proper documentation and testing.
 286  */
 287 
 288 #include <sys/modctl.h>
 289 #include <sys/conf.h>
 290 #include <sys/devops.h>
 291 #include <sys/ddi.h>
 292 #include <sys/sunddi.h>
 293 #include <sys/types.h>
 294 #include <sys/file.h>
 295 #include <sys/errno.h>
 296 #include <sys/open.h>
 297 #include <sys/cred.h>
 298 #include <sys/pci.h>
 299 #include <sys/sysmacros.h>
 300 #include <sys/avl.h>
 301 #include <sys/stat.h>
 302 #include <sys/policy.h>
 303 
 304 #include <sys/cpu_module.h>
 305 #include <sys/mc.h>
 306 #include <sys/mc_intel.h>
 307 
 308 #include "imc.h"
 309 
 310 /*
 311  * These tables contain generational data that varies between processor
 312  * generation such as the maximum number of sockets, memory controllers, and the
 313  * offsets of the various registers.
 314  */
 315 
/*
 * Sandy Bridge generation constants: topology limits (sockets, IMCs,
 * channels, DIMMs, ranks) and the PCI configuration space offsets of the
 * decoder registers that this driver reads. The specific offset values are
 * taken from the generation's Intel External Design Specification (EDS) and
 * cannot be verified from this file alone.
 */
static const imc_gen_data_t imc_gen_data_snb = {
	.igd_max_sockets = 4,
	.igd_max_imcs = 2,
	.igd_max_channels = 4,
	.igd_max_dimms = 3,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
	/* One MTR register per supported DIMM slot (3 on this generation). */
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
	    IMC_REG_MC_MTR2 },
	.igd_mcmtr_offset = 0x7c,
	.igd_tolm_offset = 0x80,
	.igd_tohm_low_offset = 0x84,
	.igd_sad_dram_offset = 0x80,
	.igd_sad_ndram_rules = 10,
	.igd_sad_nodeid_offset = 0x40,
	.igd_tad_nrules = 12,
	.igd_tad_rule_offset = 0x40,
	.igd_tad_chan_offset = 0x90,
	.igd_tad_sysdef = 0x80,
	.igd_tad_sysdef2 = 0x84,
	.igd_mc_mirror = 0xac,
	.igd_rir_nways = 5,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 8,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xd0,
};
 342 
/*
 * Ivy Bridge generation constants. Identical to the Sandy Bridge table above
 * except for the SAD DRAM rules: they start at offset 0x60 (not 0x80) and
 * there are 20 of them (not 10). Offsets per the generation's Intel EDS.
 */
static const imc_gen_data_t imc_gen_data_ivb = {
	.igd_max_sockets = 4,
	.igd_max_imcs = 2,
	.igd_max_channels = 4,
	.igd_max_dimms = 3,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
	    IMC_REG_MC_MTR2 },
	.igd_mcmtr_offset = 0x7c,
	.igd_tolm_offset = 0x80,
	.igd_tohm_low_offset = 0x84,
	.igd_sad_dram_offset = 0x60,
	.igd_sad_ndram_rules = 20,
	.igd_sad_nodeid_offset = 0x40,
	.igd_tad_nrules = 12,
	.igd_tad_rule_offset = 0x40,
	.igd_tad_chan_offset = 0x90,
	.igd_tad_sysdef = 0x80,
	.igd_tad_sysdef2 = 0x84,
	.igd_mc_mirror = 0xac,
	.igd_rir_nways = 5,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 8,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xd0,
};
 369 
/*
 * Constants shared by the Haswell and Broadwell generations (see
 * imc_set_gen_data(), which points both at this table). Relative to the
 * earlier generations this moves the TOLM/TOHM registers (0xd0/0xd4), adds
 * a high TOHM register (igd_tohm_hi_offset, unused by SNB/IVB), and uses a
 * different rank limit. Offsets per the generation's Intel EDS.
 */
static const imc_gen_data_t imc_gen_data_has_brd = {
	.igd_max_sockets = 4,
	.igd_max_imcs = 2,
	.igd_max_channels = 4,
	.igd_max_dimms = 3,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX,
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
	    IMC_REG_MC_MTR2 },
	.igd_mcmtr_offset = 0x7c,
	.igd_tolm_offset = 0xd0,
	.igd_tohm_low_offset = 0xd4,
	.igd_tohm_hi_offset = 0xd8,
	.igd_sad_dram_offset = 0x60,
	.igd_sad_ndram_rules = 20,
	.igd_sad_nodeid_offset = 0x40,
	.igd_tad_nrules = 12,
	.igd_tad_rule_offset = 0x40,
	.igd_tad_chan_offset = 0x90,
	.igd_tad_sysdef = 0x80,
	.igd_tad_sysdef2 = 0x84,
	.igd_mc_mirror = 0xac,
	.igd_rir_nways = 5,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 8,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xd0,
};
 397 
/*
 * Skylake / Cascade Lake generation constants. This generation differs in
 * topology (up to 8 sockets; 3 channels with 2 DIMMs each, hence only two
 * MTR offsets) and adds igd_topo_offset. The TAD sysdef/sysdef2 and mirror
 * fields are not initialized here, so they default to zero -- this
 * generation's decode path evidently does not use them. Offsets per the
 * generation's Intel EDS.
 */
static const imc_gen_data_t imc_gen_data_skx = {
	.igd_max_sockets = 8,
	.igd_max_imcs = 2,
	.igd_max_channels = 3,
	.igd_max_dimms = 2,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1 },
	.igd_mcmtr_offset = 0x87c,
	.igd_topo_offset = 0x88,
	.igd_tolm_offset = 0xd0,
	.igd_tohm_low_offset = 0xd4,
	.igd_tohm_hi_offset = 0xd8,
	.igd_sad_dram_offset = 0x60,
	.igd_sad_ndram_rules = 24,
	.igd_sad_nodeid_offset = 0xc0,
	.igd_tad_nrules = 8,
	.igd_tad_rule_offset = 0x850,
	.igd_tad_chan_offset = 0x90,
	.igd_rir_nways = 4,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 4,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xcc,
};
 422 
 423 /*
 424  * This table contains all of the devices that we're looking for from a stub
 425  * perspective. These are organized by generation. Different generations behave
 426  * in slightly different ways. For example, Sandy Bridge through Broadwell use
 427  * unique PCI IDs for each PCI device/function combination that appears. Whereas
 428  * Skylake based systems use the same PCI ID; however, different device/function
 429  * values indicate that the IDs are used for different purposes.
 430  */
 431 /* BEGIN CSTYLED */
 432 static const imc_stub_table_t imc_stub_table[] = {
 433         /* Sandy Bridge */
 434         { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN0, 0x3ca8, 15, 0, "IMC 0 Main 0" },
 435         { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN1, 0x3c71, 15, 1, "IMC 0 Main 0" },
 436         { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL0, 0x3caa, 15, 2, "IMC 0 Channel 0 Info" },
 437         { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL1, 0x3cab, 15, 3, "IMC 0 Channel 1 Info" },
 438         { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL2, 0x3cac, 15, 4, "IMC 0 Channel 2 Info" },
 439         { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL3, 0x3cad, 15, 5, "IMC 0 Channel 3 Info" },
 440         { IMC_GEN_SANDY, IMC_TYPE_SAD_DRAM, 0x3cf4, 12, 6, "SAD DRAM Rules" },
 441         { IMC_GEN_SANDY, IMC_TYPE_SAD_MMIO, 0x3cf5, 13, 6, "SAD MMIO Rules" },
 442         { IMC_GEN_SANDY, IMC_TYPE_SAD_MISC, 0x3cf6, 12, 7, "SAD Memory Map" },
 443         { IMC_GEN_SANDY, IMC_TYPE_UBOX, 0x3ce0, 11, 0, "UBox" },
 444         { IMC_GEN_SANDY, IMC_TYPE_UBOX_CPUBUSNO, 0x3ce3, 11, 3, "UBox Scratch" },
 445         { IMC_GEN_SANDY, IMC_TYPE_HA0, 0x3ca0, 14, 0, "Home Agent" },
 446         /* Ivy Bridge */
 447         { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN0, 0x0ea8, 15, 0, "IMC 0 Main 0" },
 448         { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN1, 0x0e71, 15, 1, "IMC 0 Main 1" },
 449         { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL0, 0x0eaa, 15, 2, "IMC 0 Channel 0 Info" },
 450         { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL1, 0x0eab, 15, 3, "IMC 0 Channel 1 Info" },
 451         { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL2, 0x0eac, 15, 4, "IMC 0 Channel 2 Info" },
 452         { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL3, 0x0ead, 15, 5, "IMC 0 Channel 3 Info" },
 453         { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN0, 0x0e68, 29, 0, "IMC 1 Main 0" },
 454         { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN1, 0x0e79, 29, 1, "IMC 1 Main 1" },
 455         { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL0, 0x0e6a, 15, 2, "IMC 1 Channel 0 Info" },
 456         { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL1, 0x0e6b, 15, 3, "IMC 1 Channel 1 Info" },
 457         { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL2, 0x0e6c, 15, 4, "IMC 1 Channel 2 Info" },
 458         { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL3, 0x0e6d, 15, 5, "IMC 1 Channel 3 Info" },
 459         { IMC_GEN_IVY, IMC_TYPE_SAD_DRAM, 0x0ec8, 22, 0, "SAD DRAM Rules" },
 460         { IMC_GEN_IVY, IMC_TYPE_SAD_MMIO, 0x0ec9, 22, 1, "SAD MMIO Rules" },
 461         { IMC_GEN_IVY, IMC_TYPE_SAD_MISC, 0x0eca, 22, 2, "SAD Memory Map" },
 462         { IMC_GEN_IVY, IMC_TYPE_UBOX, 0x0e1e, 11, 0, "UBox" },
 463         { IMC_GEN_IVY, IMC_TYPE_UBOX_CPUBUSNO, 0x0e1f, 11, 3, "UBox Scratch" },
 464         { IMC_GEN_IVY, IMC_TYPE_HA0, 0x0ea0, 14, 0, "Home Agent 0" },
 465         { IMC_GEN_IVY, IMC_TYPE_HA1, 0x0e60, 28, 0, "Home Agent 1" },
 466         /* Haswell */
 467         { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN0, 0x2fa8, 19, 0, "IMC 0 Main 0" },
 468         { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN1, 0x2f71, 19, 1, "IMC 0 Main 1" },
 469         { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL0, 0x2faa, 19, 2, "IMC 0 Channel 0 Info" },
 470         { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL1, 0x2fab, 19, 3, "IMC 0 Channel 1 Info" },
 471         { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL2, 0x2fac, 19, 4, "IMC 0 Channel 2 Info" },
 472         { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL3, 0x2fad, 19, 5, "IMC 0 Channel 3 Info" },
 473         { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN0, 0x2f68, 22, 0, "IMC 1 Main 0" },
 474         { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN1, 0x2f79, 22, 1, "IMC 1 Main 1" },
 475         { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL0, 0x2f6a, 22, 2, "IMC 1 Channel 0 Info" },
 476         { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL1, 0x2f6b, 22, 3, "IMC 1 Channel 1 Info" },
 477         { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL2, 0x2f6c, 22, 4, "IMC 1 Channel 2 Info" },
 478         { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL3, 0x2f6d, 22, 5, "IMC 1 Channel 3 Info" },
 479         { IMC_GEN_HASWELL, IMC_TYPE_SAD_DRAM, 0x2ffc, 15, 4, "SAD DRAM Rules" },
 480         { IMC_GEN_HASWELL, IMC_TYPE_SAD_MMIO, 0x2ffd, 15, 5, "SAD MMIO Rules" },
 481         { IMC_GEN_HASWELL, IMC_TYPE_VTD_MISC, 0x2f28, 5, 0, "Misc. Vritualization" },
 482         { IMC_GEN_HASWELL, IMC_TYPE_UBOX, 0x2f1e, 16, 5, "UBox" },
 483         { IMC_GEN_HASWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x2f1f, 16, 7, "UBox Scratch" },
 484         { IMC_GEN_HASWELL, IMC_TYPE_HA0, 0x2fa0, 18, 0, "Home Agent 0" },
 485         { IMC_GEN_HASWELL, IMC_TYPE_HA1, 0x2f60, 18, 4, "Home Agent 1" },
 486         /* Broadwell Devices */
 487         { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN0, 0x6fa8, 19, 0, "IMC 0 Main 0" },
 488         { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN1, 0x6f71, 19, 1, "IMC 0 Main 1" },
 489         { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL0, 0x6faa, 19, 2, "IMC 0 Channel 0 Info" },
 490         { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL1, 0x6fab, 19, 3, "IMC 0 Channel 1 Info" },
 491         { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL2, 0x6fac, 19, 4, "IMC 0 Channel 2 Info" },
 492         { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL3, 0x6fad, 19, 5, "IMC 0 Channel 3 Info" },
 493         { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN0, 0x6f68, 22, 0, "IMC 1 Main 0" },
 494         { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN1, 0x6f79, 22, 1, "IMC 1 Main 1" },
 495         { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL0, 0x6f6a, 22, 2, "IMC 1 Channel 0 Info" },
 496         { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL1, 0x6f6b, 22, 3, "IMC 1 Channel 1 Info" },
 497         { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL2, 0x6f6c, 22, 4, "IMC 1 Channel 2 Info" },
 498         { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL3, 0x6f6d, 22, 5, "IMC 1 Channel 3 Info" },
 499         { IMC_GEN_BROADWELL, IMC_TYPE_SAD_DRAM, 0x6ffc, 15, 4, "SAD DRAM Rules" },
 500         { IMC_GEN_BROADWELL, IMC_TYPE_SAD_MMIO, 0x6ffd, 15, 5, "SAD MMIO Rules" },
 501         { IMC_GEN_BROADWELL, IMC_TYPE_VTD_MISC, 0x6f28, 5, 0, "Misc. Vritualization" },
 502         { IMC_GEN_BROADWELL, IMC_TYPE_UBOX, 0x6f1e, 16, 5, "UBox" },
 503         { IMC_GEN_BROADWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x6f1f, 16, 7, "UBox Scratch" },
 504         { IMC_GEN_BROADWELL, IMC_TYPE_HA0, 0x6fa0, 18, 0, "Home Agent 0" },
 505         { IMC_GEN_BROADWELL, IMC_TYPE_HA1, 0x6f60, 18, 4, "Home Agent 1" },
 506         /* Skylake and Cascade Lake Devices */
 507         { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_M2M, 0x2066, 8, 0, "IMC 0 M2M" },
 508         { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_M2M, 0x2066, 9, 0, "IMC 0 M2M" },
 509         { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_MAIN0, 0x2040, 10, 0, "IMC 0 Main / Channel 0" },
 510         { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_MAIN0, 0x2040, 12, 0, "IMC 0 Main / Channel 0" },
 511         { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL1, 0x2044, 10, 4, "IMC 0 Channel 1" },
 512         { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL2, 0x2048, 11, 0, "IMC 0 Channel 2" },
 513         { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL1, 0x2044, 12, 4, "IMC 1 Channel 1" },
 514         { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL2, 0x2048, 13, 0, "IMC 1 Channel 2" },
 515         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_DRAM, 0x2054, 29, 0, "SAD DRAM Rules" },
 516         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MMIO, 0x2055, 29, 1, "SAD MMIO Rules" },
 517         { IMC_GEN_SKYLAKE, IMC_TYPE_VTD_MISC, 0x2024, 5, 0, "Misc. Virtualization" },
 518 
 519         /*
 520          * There is one SAD MC Route type device per core! Because of this a
 521          * wide array of device and functions are allocated. For now, we list
 522          * all 28 of them out.
 523          */
 524         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 0, "Per-Core SAD" },
 525         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 1, "Per-Core SAD" },
 526         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 2, "Per-Core SAD" },
 527         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 3, "Per-Core SAD" },
 528         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 4, "Per-Core SAD" },
 529         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 5, "Per-Core SAD" },
 530         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 6, "Per-Core SAD" },
 531         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 7, "Per-Core SAD" },
 532         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 0, "Per-Core SAD" },
 533         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 1, "Per-Core SAD" },
 534         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 2, "Per-Core SAD" },
 535         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 3, "Per-Core SAD" },
 536         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 4, "Per-Core SAD" },
 537         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 5, "Per-Core SAD" },
 538         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 6, "Per-Core SAD" },
 539         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 7, "Per-Core SAD" },
 540         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 0, "Per-Core SAD" },
 541         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 1, "Per-Core SAD" },
 542         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 2, "Per-Core SAD" },
 543         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 3, "Per-Core SAD" },
 544         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 4, "Per-Core SAD" },
 545         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 5, "Per-Core SAD" },
 546         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 6, "Per-Core SAD" },
 547         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 7, "Per-Core SAD" },
 548         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 0, "Per-Core SAD" },
 549         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 1, "Per-Core SAD" },
 550         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 2, "Per-Core SAD" },
 551         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 3, "Per-Core SAD" },
 552         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 4, "Per-Core SAD" },
 553         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 5, "Per-Core SAD" },
 554         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 6, "Per-Core SAD" },
 555         { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 7, "Per-Core SAD" },
 556 
 557         { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX, 0x2014, 8, 0, "UBox" },
 558         { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX_CPUBUSNO, 0x2016, 8, 2, "DECS" },
 559 };
 560 /* END CSTYLED */
 561 
/*
 * PCI vendor ID assigned to Intel Corporation.
 */
#define IMC_PCI_VENDOR_INTC     0x8086

/*
 * Our IMC data is global and statically set up during a combination of
 * _init(9E) and attach(9E). While we have a module dependency between the PCI
 * stub driver, imcstub, and this pseudo-driver, imc, the dependencies don't
 * guarantee that the imc driver has finished attaching. As such we make sure
 * that it can operate without it being attached in any way.
 */
static imc_t *imc_data = NULL;

/*
 * By default we should not allow the stubs to detach as we don't have a good
 * way of forcing them to attach again. This is provided in case someone does
 * want to allow the driver to unload: setting this tunable to a non-zero
 * value permits stub detach.
 */
int imc_allow_detach = 0;
 579 
 580 static void
 581 imc_set_gen_data(imc_t *imc)
 582 {
 583         switch (imc->imc_gen) {
 584         case IMC_GEN_SANDY:
 585                 imc->imc_gen_data = &imc_gen_data_snb;
 586                 break;
 587         case IMC_GEN_IVY:
 588                 imc->imc_gen_data = &imc_gen_data_ivb;
 589                 break;
 590         case IMC_GEN_HASWELL:
 591         case IMC_GEN_BROADWELL:
 592                 imc->imc_gen_data = &imc_gen_data_has_brd;
 593                 break;
 594         case IMC_GEN_SKYLAKE:
 595                 imc->imc_gen_data = &imc_gen_data_skx;
 596                 break;
 597         default:
 598                 dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
 599                     "set to unknown generation: %u", imc->imc_gen);
 600         }
 601 }
 602 
 603 /*
 604  * If our device (dev_info_t) does not have a non-zero unit address, then
 605  * devfsadmd will not pay attention to us at all. Therefore we need to set the
 606  * unit address below, before we create minor nodes.
 607  *
 608  * The rest of the system expects us to have one minor node per socket. The
 609  * minor node ID should be the ID of the socket.
 610  */
 611 static boolean_t
 612 imc_create_minors(imc_t *imc)
 613 {
 614         uint_t i;
 615 
 616         ddi_set_name_addr(imc->imc_dip, "1");
 617         for (i = 0; i < imc->imc_nsockets; i++) {
 618                 char buf[MAXNAMELEN];
 619 
 620                 if (snprintf(buf, sizeof (buf), "mc-imc-%u", i) >=
 621                     sizeof (buf)) {
 622                         goto fail;
 623                 }
 624 
 625                 if (ddi_create_minor_node(imc->imc_dip, buf, S_IFCHR, i,
 626                     "ddi_mem_ctrl", 0) != DDI_SUCCESS) {
 627                         dev_err(imc->imc_dip, CE_WARN, "failed to create "
 628                             "minor node %u: %s", i, buf);
 629                         goto fail;
 630                 }
 631         }
 632         return (B_TRUE);
 633 
 634 fail:
 635         ddi_remove_minor_node(imc->imc_dip, NULL);
 636         return (B_FALSE);
 637 }
 638 
 639 /*
 640  * Check the current MC route value for this SAD. On Skylake systems there is
 641  * one per core. Every core should agree. If not, we will not trust the SAD
 642  * MCROUTE values and this will cause system address decoding to fail on
 643  * skylake.
 644  */
 645 static void
 646 imc_mcroute_check(imc_t *imc, imc_sad_t *sad, imc_stub_t *stub)
 647 {
 648         uint32_t val;
 649 
 650         val = pci_config_get32(stub->istub_cfgspace,
 651             IMC_REG_SKX_SAD_MC_ROUTE_TABLE);
 652         if (val == PCI_EINVAL32) {
 653                 sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
 654                 return;
 655         }
 656 
 657         if ((sad->isad_flags & IMC_SAD_MCROUTE_VALID) == 0 && val != 0) {
 658                 sad->isad_flags |= IMC_SAD_MCROUTE_VALID;
 659                 sad->isad_mcroute.ismc_raw_mcroute = val;
 660                 return;
 661         }
 662 
 663         /*
 664          * Occasionally we see MC ROUTE table entries with a value of zero.
 665          * We should ignore those for now.
 666          */
 667         if (val != sad->isad_mcroute.ismc_raw_mcroute && val != 0) {
 668                 dev_err(imc->imc_dip, CE_WARN, "SAD MC_ROUTE_TABLE mismatch "
 669                     "with socket. SAD has val 0x%x, system has %x\n",
 670                     val, sad->isad_mcroute.ismc_raw_mcroute);
 671                 sad->isad_valid |= IMC_SAD_V_BAD_MCROUTE;
 672         }
 673 }
 674 
 675 /*
 676  * On Skylake, many of the devices that we care about are on separate PCI Buses.
 677  * These can be mapped together by the DECS register. However, we need to know
 678  * how to map different buses together so that we can more usefully associate
 679  * information. The set of buses is all present in the DECS register. We'll
 680  * effectively assign sockets to buses. This is also still something that comes
 681  * up on pre-Skylake systems as well.
 682  */
 683 static boolean_t
 684 imc_map_buses(imc_t *imc)
 685 {
 686         imc_stub_t *stub;
 687         uint_t nsock;
 688 
 689         /*
 690          * Find the UBOX_DECS registers so we can establish socket mappings. On
 691          * Skylake, there are three different sets of buses that we need to
 692          * cover all of our devices, while there are only two before that.
 693          */
 694         for (nsock = 0, stub = avl_first(&imc->imc_stubs); stub != NULL;
 695             stub = AVL_NEXT(&imc->imc_stubs, stub)) {
 696                 uint32_t busno;
 697 
 698                 if (stub->istub_table->imcs_type != IMC_TYPE_UBOX_CPUBUSNO) {
 699                         continue;
 700                 }
 701 
 702                 busno = pci_config_get32(stub->istub_cfgspace,
 703                     imc->imc_gen_data->igd_ubox_cpubusno_offset);
 704                 if (busno == PCI_EINVAL32) {
 705                         dev_err(imc->imc_dip, CE_WARN, "failed to read "
 706                             "UBOX_DECS CPUBUSNO0: invalid PCI read");
 707                         return (B_FALSE);
 708                 }
 709 
 710                 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
 711                         imc->imc_sockets[nsock].isock_nbus = 3;
 712                         imc->imc_sockets[nsock].isock_bus[0] =
 713                             IMC_UBOX_CPUBUSNO_0(busno);
 714                         imc->imc_sockets[nsock].isock_bus[1] =
 715                             IMC_UBOX_CPUBUSNO_1(busno);
 716                         imc->imc_sockets[nsock].isock_bus[2] =
 717                             IMC_UBOX_CPUBUSNO_2(busno);
 718                 } else {
 719                         imc->imc_sockets[nsock].isock_bus[0] =
 720                             IMC_UBOX_CPUBUSNO_0(busno);
 721                         imc->imc_sockets[nsock].isock_bus[1] =
 722                             IMC_UBOX_CPUBUSNO_1(busno);
 723                         imc->imc_sockets[nsock].isock_nbus = 2;
 724                 }
 725                 nsock++;
 726         }
 727         imc->imc_nsockets = nsock;
 728 
 729         return (B_TRUE);
 730 }
 731 
 732 /*
 733  * For a given stub that we've found, map it to its corresponding socket based
 734  * on the PCI bus that it has.
 735  */
 736 static imc_socket_t *
 737 imc_map_find_socket(imc_t *imc, imc_stub_t *stub)
 738 {
 739         uint_t i;
 740 
 741         for (i = 0; i < imc->imc_nsockets; i++) {
 742                 uint_t bus;
 743 
 744                 for (bus = 0; bus < imc->imc_sockets[i].isock_nbus; bus++) {
 745                         if (imc->imc_sockets[i].isock_bus[bus] ==
 746                             stub->istub_bus) {
 747                                 return (&imc->imc_sockets[i]);
 748                         }
 749                 }
 750         }
 751 
 752         return (NULL);
 753 }
 754 
 755 static boolean_t
 756 imc_map_stubs(imc_t *imc)
 757 {
 758         imc_stub_t *stub;
 759 
 760         if (!imc_map_buses(imc)) {
 761                 return (B_FALSE);
 762         }
 763 
 764         stub = avl_first(&imc->imc_stubs);
 765         for (stub = avl_first(&imc->imc_stubs); stub != NULL;
 766             stub = AVL_NEXT(&imc->imc_stubs, stub)) {
 767                 imc_socket_t *sock = imc_map_find_socket(imc, stub);
 768 
 769                 if (sock == NULL) {
 770                         dev_err(imc->imc_dip, CE_WARN, "found stub type %u "
 771                             "PCI%x,%x with bdf %u/%u/%u that does not match a "
 772                             "known PCI bus for any of %u sockets",
 773                             stub->istub_table->imcs_type, stub->istub_vid,
 774                             stub->istub_did, stub->istub_bus, stub->istub_dev,
 775                             stub->istub_func, imc->imc_nsockets);
 776                         continue;
 777                 }
 778 
 779                 /*
 780                  * We don't have to worry about duplicates here. We check to
 781                  * make sure that we have unique bdfs here.
 782                  */
 783                 switch (stub->istub_table->imcs_type) {
 784                 case IMC_TYPE_MC0_M2M:
 785                         sock->isock_imcs[0].icn_m2m = stub;
 786                         break;
 787                 case IMC_TYPE_MC1_M2M:
 788                         sock->isock_imcs[1].icn_m2m = stub;
 789                         break;
 790                 case IMC_TYPE_MC0_MAIN0:
 791                         sock->isock_nimc++;
 792                         sock->isock_imcs[0].icn_main0 = stub;
 793 
 794                         /*
 795                          * On Skylake, the MAIN0 does double duty as channel
 796                          * zero and as the TAD.
 797                          */
 798                         if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
 799                                 sock->isock_imcs[0].icn_nchannels++;
 800                                 sock->isock_imcs[0].icn_channels[0].ich_desc =
 801                                     stub;
 802                                 sock->isock_tad[0].itad_stub = stub;
 803                                 sock->isock_ntad++;
 804                         }
 805                         break;
 806                 case IMC_TYPE_MC0_MAIN1:
 807                         sock->isock_imcs[0].icn_main1 = stub;
 808                         break;
 809                 case IMC_TYPE_MC1_MAIN0:
 810                         sock->isock_nimc++;
 811                         sock->isock_imcs[1].icn_main0 = stub;
 812 
 813                         /*
 814                          * On Skylake, the MAIN0 does double duty as channel
 815                          * zero and as the TAD.
 816                          */
 817                         if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
 818                                 sock->isock_imcs[1].icn_nchannels++;
 819                                 sock->isock_imcs[1].icn_channels[0].ich_desc =
 820                                     stub;
 821                                 sock->isock_tad[1].itad_stub = stub;
 822                                 sock->isock_ntad++;
 823                         }
 824                         break;
 825                 case IMC_TYPE_MC1_MAIN1:
 826                         sock->isock_imcs[1].icn_main1 = stub;
 827                         break;
 828                 case IMC_TYPE_MC0_CHANNEL0:
 829                         sock->isock_imcs[0].icn_nchannels++;
 830                         sock->isock_imcs[0].icn_channels[0].ich_desc = stub;
 831                         break;
 832                 case IMC_TYPE_MC0_CHANNEL1:
 833                         sock->isock_imcs[0].icn_nchannels++;
 834                         sock->isock_imcs[0].icn_channels[1].ich_desc = stub;
 835                         break;
 836                 case IMC_TYPE_MC0_CHANNEL2:
 837                         sock->isock_imcs[0].icn_nchannels++;
 838                         sock->isock_imcs[0].icn_channels[2].ich_desc = stub;
 839                         break;
 840                 case IMC_TYPE_MC0_CHANNEL3:
 841                         sock->isock_imcs[0].icn_nchannels++;
 842                         sock->isock_imcs[0].icn_channels[3].ich_desc = stub;
 843                         break;
 844                 case IMC_TYPE_MC1_CHANNEL0:
 845                         sock->isock_imcs[1].icn_nchannels++;
 846                         sock->isock_imcs[1].icn_channels[0].ich_desc = stub;
 847                         break;
 848                 case IMC_TYPE_MC1_CHANNEL1:
 849                         sock->isock_imcs[1].icn_nchannels++;
 850                         sock->isock_imcs[1].icn_channels[1].ich_desc = stub;
 851                         break;
 852                 case IMC_TYPE_MC1_CHANNEL2:
 853                         sock->isock_imcs[1].icn_nchannels++;
 854                         sock->isock_imcs[1].icn_channels[2].ich_desc = stub;
 855                         break;
 856                 case IMC_TYPE_MC1_CHANNEL3:
 857                         sock->isock_imcs[1].icn_nchannels++;
 858                         sock->isock_imcs[1].icn_channels[3].ich_desc = stub;
 859                         break;
 860                 case IMC_TYPE_SAD_DRAM:
 861                         sock->isock_sad.isad_dram = stub;
 862                         break;
 863                 case IMC_TYPE_SAD_MMIO:
 864                         sock->isock_sad.isad_mmio = stub;
 865                         break;
 866                 case IMC_TYPE_SAD_MISC:
 867                         sock->isock_sad.isad_tolh = stub;
 868                         break;
 869                 case IMC_TYPE_VTD_MISC:
 870                         /*
 871                          * Some systems have multiple VT-D Misc. entry points
 872                          * in the system. In this case, only use the first one
 873                          * we find.
 874                          */
 875                         if (imc->imc_gvtd_misc == NULL) {
 876                                 imc->imc_gvtd_misc = stub;
 877                         }
 878                         break;
 879                 case IMC_TYPE_SAD_MCROUTE:
 880                         ASSERT3U(imc->imc_gen, >=, IMC_GEN_SKYLAKE);
 881                         imc_mcroute_check(imc, &sock->isock_sad, stub);
 882                         break;
 883                 case IMC_TYPE_UBOX:
 884                         sock->isock_ubox = stub;
 885                         break;
 886                 case IMC_TYPE_HA0:
 887                         sock->isock_ntad++;
 888                         sock->isock_tad[0].itad_stub = stub;
 889                         break;
 890                 case IMC_TYPE_HA1:
 891                         sock->isock_ntad++;
 892                         sock->isock_tad[1].itad_stub = stub;
 893                         break;
 894                 case IMC_TYPE_UBOX_CPUBUSNO:
 895                         sock->isock_cpubusno = stub;
 896                         break;
 897                 default:
 898                         /*
 899                          * Attempt to still attach if we can.
 900                          */
 901                         dev_err(imc->imc_dip, CE_WARN, "Encountered unknown "
 902                             "IMC type (%u) on PCI %x,%x",
 903                             stub->istub_table->imcs_type,
 904                             stub->istub_vid, stub->istub_did);
 905                         break;
 906                 }
 907         }
 908 
 909         return (B_TRUE);
 910 }
 911 
 912 /*
 913  * Go through and fix up various aspects of the stubs mappings on systems. The
 914  * following are a list of what we need to fix up:
 915  *
 916  *  1. On Haswell and newer systems, there is only one global VT-d device. We
 917  *     need to go back and map that to all of the per-socket imc_sad_t entries.
 918  */
 919 static void
 920 imc_fixup_stubs(imc_t *imc)
 921 {
 922         if (imc->imc_gen >= IMC_GEN_HASWELL) {
 923                 uint_t i;
 924 
 925                 for (i = 0; i < imc->imc_nsockets; i++) {
 926                         ASSERT3P(imc->imc_sockets[i].isock_sad.isad_tolh,
 927                             ==, NULL);
 928                         imc->imc_sockets[i].isock_sad.isad_tolh =
 929                             imc->imc_gvtd_misc;
 930                 }
 931         }
 932 }
 933 
 934 /*
 935  * In the wild we've hit a few odd cases where not all devices are exposed that
 936  * we might expect by firmware. In particular we've seen and validate the
 937  * following cases:
 938  *
 939  *  o We don't find all of the channel devices that we expect, e.g. we have the
 940  *    stubs for channels 1-3, but not 0. That has been seen on an Intel S2600CW
 941  *    with an E5-2630v3.
 942  */
 943 static boolean_t
 944 imc_validate_stubs(imc_t *imc)
 945 {
 946         for (uint_t sock = 0; sock < imc->imc_nsockets; sock++) {
 947                 imc_socket_t *socket = &imc->imc_sockets[sock];
 948 
 949                 for (uint_t mc = 0; mc < socket->isock_nimc; mc++) {
 950                         imc_mc_t *mcp = &socket->isock_imcs[mc];
 951 
 952                         for (uint_t chan = 0; chan < mcp->icn_nchannels;
 953                             chan++) {
 954                                 if (mcp->icn_channels[chan].ich_desc == NULL) {
 955                                         dev_err(imc->imc_dip, CE_WARN,
 956                                             "!missing device for socket %u/"
 957                                             "imc %u/channel %u", sock, mc,
 958                                             chan);
 959                                         return (B_FALSE);
 960                                 }
 961                         }
 962                 }
 963         }
 964 
 965         return (B_TRUE);
 966 }
 967 
 968 /*
 969  * Attempt to map all of the discovered sockets to the corresponding APIC based
 970  * socket. We do these mappings by getting the node id of the socket and
 971  * adjusting it to make sure that no home agent is present in it. We use the
 972  * UBOX to avoid any home agent related bits that are present in other
 973  * registers.
 974  */
 975 static void
 976 imc_map_sockets(imc_t *imc)
 977 {
 978         uint_t i;
 979 
 980         for (i = 0; i < imc->imc_nsockets; i++) {
 981                 uint32_t nodeid;
 982                 ddi_acc_handle_t h;
 983 
 984                 h = imc->imc_sockets[i].isock_ubox->istub_cfgspace;
 985                 nodeid = pci_config_get32(h,
 986                     imc->imc_gen_data->igd_sad_nodeid_offset);
 987                 if (nodeid == PCI_EINVAL32) {
 988                         imc->imc_sockets[i].isock_valid |=
 989                             IMC_SOCKET_V_BAD_NODEID;
 990                         continue;
 991                 }
 992 
 993                 imc->imc_sockets[i].isock_nodeid = IMC_NODEID_UBOX_MASK(nodeid);
 994                 imc->imc_spointers[nodeid] = &imc->imc_sockets[i];
 995         }
 996 }
 997 
 998 /*
 999  * Decode the MTR, accounting for variances between processor generations.
1000  */
1001 static void
1002 imc_decode_mtr(imc_t *imc, imc_mc_t *icn, imc_dimm_t *dimm, uint32_t mtr)
1003 {
1004         uint8_t disable;
1005 
1006         /*
1007          * Check present first, before worrying about anything else.
1008          */
1009         if (imc->imc_gen < IMC_GEN_SKYLAKE &&
1010             IMC_MTR_PRESENT_SNB_BRD(mtr) == 0) {
1011                 dimm->idimm_present = B_FALSE;
1012                 return;
1013         } else if (imc->imc_gen >= IMC_GEN_SKYLAKE &&
1014             IMC_MTR_PRESENT_SKYLAKE(mtr) == 0) {
1015                 dimm->idimm_present = B_FALSE;
1016                 return;
1017         }
1018 
1019         dimm->idimm_present = B_TRUE;
1020         dimm->idimm_ncolumns = IMC_MTR_CA_WIDTH(mtr) + IMC_MTR_CA_BASE;
1021         if (dimm->idimm_ncolumns < IMC_MTR_CA_MIN ||
1022             dimm->idimm_ncolumns > IMC_MTR_CA_MAX) {
1023                 dimm->idimm_valid |= IMC_DIMM_V_BAD_COLUMNS;
1024         }
1025 
1026         dimm->idimm_nrows = IMC_MTR_RA_WIDTH(mtr) + IMC_MTR_RA_BASE;
1027         if (dimm->idimm_nrows < IMC_MTR_RA_MIN ||
1028             dimm->idimm_nrows > IMC_MTR_RA_MAX) {
1029                 dimm->idimm_valid |= IMC_DIMM_V_BAD_ROWS;
1030         }
1031 
1032         /*
1033          * Determine Density, this information is not present on Sandy Bridge.
1034          */
1035         switch (imc->imc_gen) {
1036         case IMC_GEN_IVY:
1037                 dimm->idimm_density = 1U << IMC_MTR_DENSITY_IVY_BRD(mtr);
1038                 break;
1039         case IMC_GEN_HASWELL:
1040         case IMC_GEN_BROADWELL:
1041                 switch (IMC_MTR_DENSITY_IVY_BRD(mtr)) {
1042                 case 0:
1043                 default:
1044                         dimm->idimm_density = 0;
1045                         dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
1046                         break;
1047                 case 1:
1048                         dimm->idimm_density = 2;
1049                         break;
1050                 case 2:
1051                         dimm->idimm_density = 4;
1052                         break;
1053                 case 3:
1054                         dimm->idimm_density = 8;
1055                         break;
1056                 }
1057                 break;
1058         case IMC_GEN_SKYLAKE:
1059                 switch (IMC_MTR_DENSITY_SKX(mtr)) {
1060                 case 0:
1061                 default:
1062                         dimm->idimm_density = 0;
1063                         dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
1064                         break;
1065                 case 1:
1066                         dimm->idimm_density = 2;
1067                         break;
1068                 case 2:
1069                         dimm->idimm_density = 4;
1070                         break;
1071                 case 3:
1072                         dimm->idimm_density = 8;
1073                         break;
1074                 case 4:
1075                         dimm->idimm_density = 16;
1076                         break;
1077                 case 5:
1078                         dimm->idimm_density = 12;
1079                         break;
1080                 }
1081                 break;
1082         case IMC_GEN_UNKNOWN:
1083         case IMC_GEN_SANDY:
1084                 dimm->idimm_density = 0;
1085                 break;
1086         }
1087 
1088         /*
1089          * The values of width are the same on IVY->SKX, but the bits are
1090          * different. This doesn't exist on SNB.
1091          */
1092         if (imc->imc_gen > IMC_GEN_SANDY) {
1093                 uint8_t width;
1094 
1095                 if (imc->imc_gen >= IMC_GEN_BROADWELL) {
1096                         width = IMC_MTR_WIDTH_BRD_SKX(mtr);
1097                 } else {
1098                         width = IMC_MTR_WIDTH_IVB_HAS(mtr);
1099                 }
1100                 switch (width) {
1101                 case 0:
1102                         dimm->idimm_width = 4;
1103                         break;
1104                 case 1:
1105                         dimm->idimm_width = 8;
1106                         break;
1107                 case 2:
1108                         dimm->idimm_width = 16;
1109                         break;
1110                 default:
1111                         dimm->idimm_width = 0;
1112                         dimm->idimm_valid |= IMC_DIMM_V_BAD_WIDTH;
1113                         break;
1114                 }
1115         } else {
1116                 dimm->idimm_width = 0;
1117         }
1118 
1119         dimm->idimm_nranks = 1 << IMC_MTR_DDR_RANKS(mtr);
1120         switch (imc->imc_gen) {
1121         case IMC_GEN_HASWELL:
1122         case IMC_GEN_BROADWELL:
1123         case IMC_GEN_SKYLAKE:
1124                 if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX_HAS_SKX) {
1125                         dimm->idimm_nranks = 0;
1126                         dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
1127                 }
1128                 break;
1129         default:
1130                 if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX) {
1131                         dimm->idimm_nranks = 0;
1132                         dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
1133                 }
1134         }
1135 
1136         disable = IMC_MTR_RANK_DISABLE(mtr);
1137         dimm->idimm_ranks_disabled[0] = (disable & 0x1) != 0;
1138         dimm->idimm_ranks_disabled[1] = (disable & 0x2) != 0;
1139         dimm->idimm_ranks_disabled[2] = (disable & 0x4) != 0;
1140         dimm->idimm_ranks_disabled[3] = (disable & 0x8) != 0;
1141 
1142         /*
1143          * Only Haswell and later have this information.
1144          */
1145         if (imc->imc_gen >= IMC_GEN_HASWELL) {
1146                 dimm->idimm_hdrl = IMC_MTR_HDRL_HAS_SKX(mtr) != 0;
1147                 dimm->idimm_hdrl_parity = IMC_MTR_HDRL_PARITY_HAS_SKX(mtr) != 0;
1148                 dimm->idimm_3dsranks = IMC_MTR_3DSRANKS_HAS_SKX(mtr);
1149                 if (dimm->idimm_3dsranks != 0) {
1150                         dimm->idimm_3dsranks = 1 << dimm->idimm_3dsranks;
1151                 }
1152         }
1153 
1154 
1155         if (icn->icn_dimm_type == IMC_DIMM_DDR4) {
1156                 dimm->idimm_nbanks = 16;
1157         } else {
1158                 dimm->idimm_nbanks = 8;
1159         }
1160 
1161         /*
1162          * To calculate the DIMM size we need first take the number of rows and
1163          * columns. This gives us the number of slots per chip. In a given rank
1164          * there are nbanks of these. There are nrank entries of those. Each of
1165          * these slots can fit a byte.
1166          */
1167         dimm->idimm_size = dimm->idimm_nbanks * dimm->idimm_nranks * 8 *
1168             (1ULL << (dimm->idimm_ncolumns + dimm->idimm_nrows));
1169 }
1170 
1171 static void
1172 imc_fill_dimms(imc_t *imc, imc_mc_t *icn, imc_channel_t *chan)
1173 {
1174         uint_t i;
1175 
1176         /*
1177          * There's one register for each DIMM that might be present, we always
1178          * read that information to determine information about the DIMMs.
1179          */
1180         chan->ich_ndimms = imc->imc_gen_data->igd_max_dimms;
1181         for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
1182                 uint32_t mtr;
1183                 imc_dimm_t *dimm = &chan->ich_dimms[i];
1184 
1185                 bzero(dimm, sizeof (imc_dimm_t));
1186                 mtr = pci_config_get32(chan->ich_desc->istub_cfgspace,
1187                     imc->imc_gen_data->igd_mtr_offsets[i]);
1188                 dimm->idimm_mtr = mtr;
1189                 /*
1190                  * We don't really expect to get a bad PCIe read. However, if we
1191                  * do, treat that for the moment as though the DIMM is bad.
1192                  */
1193                 if (mtr == PCI_EINVAL32) {
1194                         dimm->idimm_valid |= IMC_DIMM_V_BAD_PCI_READ;
1195                         continue;
1196                 }
1197 
1198                 imc_decode_mtr(imc, icn, dimm, mtr);
1199         }
1200 }
1201 
1202 static boolean_t
1203 imc_fill_controller(imc_t *imc, imc_mc_t *icn)
1204 {
1205         uint32_t mcmtr;
1206 
1207         mcmtr = pci_config_get32(icn->icn_main0->istub_cfgspace,
1208             imc->imc_gen_data->igd_mcmtr_offset);
1209         if (mcmtr == PCI_EINVAL32) {
1210                 icn->icn_invalid = B_TRUE;
1211                 return (B_FALSE);
1212         }
1213 
1214         icn->icn_closed = IMC_MCMTR_CLOSED_PAGE(mcmtr) != 0;
1215         if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1216                 icn->icn_lockstep = IMC_MCMTR_LOCKSTEP(mcmtr) != 0;
1217         } else {
1218                 icn->icn_lockstep = B_FALSE;
1219         }
1220 
1221         icn->icn_ecc = IMC_MCMTR_ECC_ENABLED(mcmtr) != 0;
1222 
1223         /*
1224          * SNB and IVB only support DDR3. Haswell and Broadwell may support
1225          * DDR4, depends on the SKU. Skylake only supports DDR4.
1226          */
1227         switch (imc->imc_gen) {
1228         case IMC_GEN_SANDY:
1229         case IMC_GEN_IVY:
1230                 icn->icn_dimm_type = IMC_DIMM_DDR3;
1231                 break;
1232         case IMC_GEN_HASWELL:
1233         case IMC_GEN_BROADWELL:
1234                 if (IMC_MCMTR_DDR4_HAS_BRD(mcmtr)) {
1235                         icn->icn_dimm_type = IMC_DIMM_DDR4;
1236                 } else {
1237                         icn->icn_dimm_type = IMC_DIMM_DDR3;
1238                 }
1239                 break;
1240         default:
1241                 /*
1242                  * Skylake and on are all DDR4.
1243                  */
1244                 icn->icn_dimm_type = IMC_DIMM_DDR4;
1245                 break;
1246         }
1247 
1248         if (imc->imc_gen >= IMC_GEN_SKYLAKE && icn->icn_m2m != NULL) {
1249                 icn->icn_topo = pci_config_get32(icn->icn_m2m->istub_cfgspace,
1250                     imc->imc_gen_data->igd_topo_offset);
1251         }
1252 
1253         return (B_TRUE);
1254 }
1255 
1256 /*
1257  * Walk the IMC data and fill in the information on DIMMs and the memory
1258  * controller configurations.
1259  */
1260 static void
1261 imc_fill_data(imc_t *imc)
1262 {
1263         uint_t csock, cmc, cchan;
1264 
1265         for (csock = 0; csock < imc->imc_nsockets; csock++) {
1266                 imc_socket_t *sock = &imc->imc_sockets[csock];
1267 
1268                 for (cmc = 0; cmc < sock->isock_nimc; cmc++) {
1269                         imc_mc_t *icn = &sock->isock_imcs[cmc];
1270 
1271                         if (!imc_fill_controller(imc, icn))
1272                                 continue;
1273 
1274                         for (cchan = 0; cchan < icn->icn_nchannels; cchan++) {
1275                                 imc_fill_dimms(imc, icn,
1276                                     &icn->icn_channels[cchan]);
1277                         }
1278                 }
1279         }
1280 }
1281 
1282 static nvlist_t *
1283 imc_nvl_create_dimm(imc_t *imc, imc_dimm_t *dimm)
1284 {
1285         nvlist_t *nvl;
1286 
1287         nvl = fnvlist_alloc();
1288         fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_PRESENT,
1289             dimm->idimm_present);
1290         if (!dimm->idimm_present) {
1291                 return (nvl);
1292         }
1293 
1294         fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_SIZE, dimm->idimm_size);
1295         fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NCOLS,
1296             dimm->idimm_ncolumns);
1297         fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NROWS,
1298             dimm->idimm_nrows);
1299 
1300         if (imc->imc_gen > IMC_GEN_SANDY) {
1301                 fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_DENSITY,
1302                     dimm->idimm_density * (1ULL << 30));
1303                 fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_WIDTH,
1304                     dimm->idimm_width);
1305         }
1306         fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_RANKS,
1307             dimm->idimm_nranks);
1308         fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_BANKS,
1309             dimm->idimm_nbanks);
1310         fnvlist_add_boolean_array(nvl, MCINTEL_NVLIST_V1_DIMM_RDIS,
1311             dimm->idimm_ranks_disabled, IMC_MAX_RANK_DISABLE);
1312 
1313         if (imc->imc_gen >= IMC_GEN_HASWELL) {
1314                 fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRL,
1315                     dimm->idimm_hdrl);
1316                 fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRLP,
1317                     dimm->idimm_hdrl_parity);
1318                 if (dimm->idimm_3dsranks > 0) {
1319                         fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_3DRANK,
1320                             dimm->idimm_3dsranks);
1321                 }
1322         }
1323 
1324         return (nvl);
1325 }
1326 
1327 static nvlist_t *
1328 imc_nvl_create_channel(imc_t *imc, imc_channel_t *chan)
1329 {
1330         nvlist_t *nvl;
1331         nvlist_t *dimms[IMC_MAX_DIMMPERCHAN];
1332         uint_t i;
1333 
1334         nvl = fnvlist_alloc();
1335         fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_CHAN_NDPC,
1336             imc->imc_gen_data->igd_max_dimms);
1337         for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
1338                 dimms[i] = imc_nvl_create_dimm(imc, &chan->ich_dimms[i]);
1339         }
1340 
1341         fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_CHAN_DIMMS,
1342             dimms, i);
1343 
1344         for (; i > 0; i--) {
1345                 nvlist_free(dimms[i-1]);
1346         }
1347 
1348         return (nvl);
1349 }
1350 
1351 static nvlist_t *
1352 imc_nvl_create_mc(imc_t *imc, imc_mc_t *icn)
1353 {
1354         nvlist_t *nvl;
1355         nvlist_t *channels[IMC_MAX_CHANPERMC];
1356         uint_t i;
1357 
1358         nvl = fnvlist_alloc();
1359         fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_MC_NCHAN, icn->icn_nchannels);
1360         fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_MC_ECC,
1361             icn->icn_ecc);
1362         if (icn->icn_lockstep) {
1363                 fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
1364                     MCINTEL_NVLIST_V1_MC_CHAN_MODE_LOCK);
1365         } else {
1366                 fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
1367                     MCINTEL_NVLIST_V1_MC_CHAN_MODE_INDEP);
1368 
1369         }
1370 
1371         if (icn->icn_closed) {
1372                 fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
1373                     MCINTEL_NVLIST_V1_MC_POLICY_CLOSED);
1374         } else {
1375                 fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
1376                     MCINTEL_NVLIST_V1_MC_POLICY_OPEN);
1377         }
1378 
1379         for (i = 0; i < icn->icn_nchannels; i++) {
1380                 channels[i] = imc_nvl_create_channel(imc,
1381                     &icn->icn_channels[i]);
1382         }
1383         fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MC_CHANNELS,
1384             channels, icn->icn_nchannels);
1385         for (i = 0; i < icn->icn_nchannels; i++) {
1386                 nvlist_free(channels[i]);
1387         }
1388 
1389         return (nvl);
1390 }
1391 
1392 static void
1393 imc_nvl_pack(imc_socket_t *sock, boolean_t sleep)
1394 {
1395         char *buf = NULL;
1396         size_t len = 0;
1397         int kmflag;
1398 
1399         if (sock->isock_nvl == NULL)
1400                 return;
1401 
1402         if (sock->isock_buf != NULL)
1403                 return;
1404 
1405         if (sleep) {
1406                 kmflag = KM_SLEEP;
1407         } else {
1408                 kmflag = KM_NOSLEEP | KM_NORMALPRI;
1409         }
1410 
1411         if (nvlist_pack(sock->isock_nvl, &buf, &len, NV_ENCODE_XDR,
1412             kmflag) != 0) {
1413                 return;
1414         }
1415 
1416         sock->isock_buf = buf;
1417         sock->isock_buflen = len;
1418         sock->isock_gen++;
1419 }
1420 
1421 static void
1422 imc_decoder_pack(imc_t *imc)
1423 {
1424         char *buf = NULL;
1425         size_t len = 0;
1426 
1427         if (imc->imc_decoder_buf != NULL)
1428                 return;
1429 
1430         if (imc->imc_decoder_dump == NULL) {
1431                 imc->imc_decoder_dump = imc_dump_decoder(imc);
1432         }
1433 
1434         if (nvlist_pack(imc->imc_decoder_dump, &buf, &len, NV_ENCODE_XDR,
1435             KM_NOSLEEP | KM_NORMALPRI) != 0) {
1436                 return;
1437         }
1438 
1439         imc->imc_decoder_buf = buf;
1440         imc->imc_decoder_len = len;
1441 }
1442 
1443 static void
1444 imc_nvl_create(imc_t *imc)
1445 {
1446         uint_t csock;
1447         for (csock = 0; csock < imc->imc_nsockets; csock++) {
1448                 uint_t i;
1449                 nvlist_t *nvl;
1450                 nvlist_t *mcs[IMC_MAX_IMCPERSOCK];
1451                 imc_socket_t *sock = &imc->imc_sockets[csock];
1452 
1453                 nvl = fnvlist_alloc();
1454                 fnvlist_add_uint8(nvl, MCINTEL_NVLIST_VERSTR,
1455                     MCINTEL_NVLIST_VERS1);
1456                 fnvlist_add_uint8(nvl, MCINTEL_NVLIST_V1_NMC,
1457                     sock->isock_nimc);
1458 
1459                 for (i = 0; i < sock->isock_nimc; i++) {
1460                         mcs[i] = imc_nvl_create_mc(imc, &sock->isock_imcs[i]);
1461                 }
1462 
1463                 fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MCS,
1464                     mcs, sock->isock_nimc);
1465 
1466                 for (i = 0; i < sock->isock_nimc; i++) {
1467                         nvlist_free(mcs[i]);
1468                 }
1469 
1470                 sock->isock_nvl = nvl;
1471                 imc_nvl_pack(sock, B_TRUE);
1472         }
1473 }
1474 
1475 /*
1476  * Determine the top of low and high memory. These determine whether transaction
1477  * addresses target main memory or not. Unfortunately, the way that these are
1478  * stored and fetched changes with different generations.
1479  */
1480 static void
1481 imc_sad_read_tohm(imc_t *imc, imc_sad_t *sad)
1482 {
1483         uint32_t tolm, tohm_low, tohm_hi;
1484 
1485         tolm = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1486             imc->imc_gen_data->igd_tolm_offset);
1487         tohm_low = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1488             imc->imc_gen_data->igd_tohm_low_offset);
1489         if (imc->imc_gen_data->igd_tohm_hi_offset != 0) {
1490                 tohm_hi = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1491                     imc->imc_gen_data->igd_tohm_hi_offset);
1492         } else {
1493                 tohm_hi = 0;
1494         }
1495 
1496         if (tolm == PCI_EINVAL32 || tohm_low == PCI_EINVAL32 ||
1497             tohm_hi == PCI_EINVAL32) {
1498                 sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1499                 return;
1500         }
1501 
1502         switch (imc->imc_gen) {
1503         case IMC_GEN_SANDY:
1504         case IMC_GEN_IVY:
1505                 sad->isad_tolm = ((uint64_t)tolm & IMC_TOLM_SNB_IVY_MASK) <<
1506                     IMC_TOLM_SNB_IVY_SHIFT;
1507                 sad->isad_tohm = ((uint64_t)tohm_low & IMC_TOHM_SNB_IVY_MASK) <<
1508                     IMC_TOLM_SNB_IVY_SHIFT;
1509                 break;
1510         case IMC_GEN_HASWELL:
1511         case IMC_GEN_BROADWELL:
1512         case IMC_GEN_SKYLAKE:
1513                 sad->isad_tolm = (uint64_t)tolm & IMC_TOLM_HAS_SKX_MASK;
1514                 sad->isad_tohm = ((uint64_t)tohm_low &
1515                     IMC_TOHM_LOW_HAS_SKX_MASK) | ((uint64_t)tohm_hi << 32);
1516 
1517                 /*
1518                  * Adjust the values to turn them into an exclusive range.
1519                  */
1520                 sad->isad_tolm += IMC_TOLM_HAS_SKY_EXCL;
1521                 sad->isad_tohm += IMC_TOHM_HAS_SKY_EXCL;
1522                 break;
1523         default:
1524                 dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
1525                     "set to unknown generation: %u", imc->imc_gen);
1526                 return;
1527         }
1528 }
1529 
1530 static void
1531 imc_sad_fill_rule(imc_t *imc, imc_sad_t *sad, imc_sad_rule_t *rule,
1532     uint32_t raw)
1533 {
1534         uint_t attr;
1535         uint64_t limit;
1536         bzero(rule, sizeof (imc_sad_rule_t));
1537 
1538         rule->isr_raw_dram = raw;
1539         rule->isr_enable = IMC_SAD_DRAM_RULE_ENABLE(raw) != 0;
1540         if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1541                 switch (IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(raw)) {
1542                 case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6:
1543                         rule->isr_imode = IMC_SAD_IMODE_8t6;
1544                         break;
1545                 case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR:
1546                         rule->isr_imode = IMC_SAD_IMODE_8t6XOR;
1547                         break;
1548                 }
1549         } else {
1550                 switch (IMC_SAD_DRAM_INTERLEAVE_SKX(raw)) {
1551                 case IMC_SAD_DRAM_INTERLEAVE_SKX_8t6:
1552                         rule->isr_imode = IMC_SAD_IMODE_8t6;
1553                         break;
1554                 case IMC_SAD_DRAM_INTERLEAVE_SKX_10t8:
1555                         rule->isr_imode = IMC_SAD_IMODE_10t8;
1556                         break;
1557                 case IMC_SAD_DRAM_INTERLEAVE_SKX_14t12:
1558                         rule->isr_imode = IMC_SAD_IMODE_14t12;
1559                         break;
1560                 case IMC_SAD_DRAM_INTERLEAVE_SKX_32t30:
1561                         rule->isr_imode = IMC_SAD_IMODE_32t30;
1562                         break;
1563                 }
1564         }
1565 
1566         if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1567                 attr = IMC_SAD_DRAM_ATTR_SKX(raw);
1568         } else {
1569                 attr = IMC_SAD_DRAM_ATTR_SNB_BRD(raw);
1570         }
1571 
1572         switch (attr) {
1573         case IMC_SAD_DRAM_ATTR_DRAM:
1574                 rule->isr_type = IMC_SAD_TYPE_DRAM;
1575                 break;
1576         case IMC_SAD_DRAM_ATTR_MMCFG:
1577                 rule->isr_type = IMC_SAD_TYPE_MMCFG;
1578                 break;
1579         case IMC_SAD_DRAM_ATTR_NXM:
1580                 if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1581                         sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
1582                 }
1583                 rule->isr_type = IMC_SAD_TYPE_NXM;
1584                 break;
1585         default:
1586                 sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
1587                 break;
1588         }
1589 
1590         /*
1591          * Fetch the limit which represents bits 45:26 and then adjust this so
1592          * that it is exclusive.
1593          */
1594         if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1595                 limit = IMC_SAD_DRAM_LIMIT_SKX(raw);
1596         } else {
1597                 limit = IMC_SAD_DRAM_LIMIT_SNB_BRD(raw);
1598         }
1599         rule->isr_limit = (limit << IMC_SAD_DRAM_LIMIT_SHIFT) +
1600             IMC_SAD_DRAM_LIMIT_EXCLUSIVE;
1601 
1602         /*
1603          * The rest of this does not apply to Sandy Bridge.
1604          */
1605         if (imc->imc_gen == IMC_GEN_SANDY)
1606                 return;
1607 
1608         if (imc->imc_gen >= IMC_GEN_IVY && imc->imc_gen < IMC_GEN_SKYLAKE) {
1609                 rule->isr_a7mode = IMC_SAD_DRAM_A7_IVB_BRD(raw) != 0;
1610                 return;
1611         }
1612 
1613         switch (IMC_SAD_DRAM_MOD23_SKX(raw)) {
1614         case IMC_SAD_DRAM_MOD23_MOD3:
1615                 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD3;
1616                 break;
1617         case IMC_SAD_DRAM_MOD23_MOD2_C01:
1618                 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_01;
1619                 break;
1620         case IMC_SAD_DRAM_MOD23_MOD2_C12:
1621                 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_12;
1622                 break;
1623         case IMC_SAD_DRAM_MOD23_MOD2_C02:
1624                 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_02;
1625                 break;
1626         }
1627 
1628         rule->isr_need_mod3 = IMC_SAD_DRAM_MOD3_SKX(raw) != 0;
1629         switch (IMC_SAD_DRAM_MOD3_SKX(raw)) {
1630         case IMC_SAD_DRAM_MOD3_MODE_45t6:
1631                 rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t6;
1632                 break;
1633         case IMC_SAD_DRAM_MOD3_MODE_45t8:
1634                 rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t8;
1635                 break;
1636         case IMC_SAD_DRAM_MOD3_MODE_45t12:
1637                 rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t12;
1638                 break;
1639         default:
1640                 sad->isad_valid |= IMC_SAD_V_BAD_MOD3;
1641                 break;
1642         }
1643 }
1644 
1645 static void
1646 imc_sad_fill_rule_interleave(imc_t *imc, imc_sad_rule_t *rule, uint32_t raw)
1647 {
1648         uint_t i;
1649         uint32_t mlen, mbase, skipbits, skipafter;
1650 
1651         rule->isr_raw_interleave = raw;
1652 
1653         /*
1654          * Right now all architectures always have the maximum number of SAD
1655          * interleave targets.
1656          */
1657         rule->isr_ntargets = IMC_MAX_SAD_INTERLEAVE;
1658 
1659         /*
1660          * Sandy Bridge has a gap in the interleave list due to the fact that it
1661          * uses a smaller length.
1662          */
1663         if (imc->imc_gen > IMC_GEN_SANDY) {
1664                 mlen = IMC_SAD_ILEAVE_IVB_SKX_LEN;
1665                 mbase = IMC_SAD_ILEAVE_IVB_SKX_MASK;
1666                 skipbits = skipafter = 0;
1667         } else {
1668                 mlen = IMC_SAD_ILEAVE_SNB_LEN;
1669                 mbase = IMC_SAD_ILEAVE_SNB_MASK;
1670                 skipbits = 2;
1671                 skipafter = 4;
1672         }
1673 
1674         for (i = 0; i < rule->isr_ntargets; i++) {
1675                 uint32_t mask, shift;
1676 
1677                 shift = i * mlen;
1678                 if (i >= skipafter)
1679                         shift += skipbits;
1680                 mask = mbase << shift;
1681                 rule->isr_targets[i] = (raw & mask) >> shift;
1682         }
1683 }
1684 
1685 static void
1686 imc_sad_read_dram_rules(imc_t *imc, imc_sad_t *sad)
1687 {
1688         uint_t i;
1689         off_t off;
1690 
1691         sad->isad_nrules = imc->imc_gen_data->igd_sad_ndram_rules;
1692         for (i = 0, off = imc->imc_gen_data->igd_sad_dram_offset;
1693             i < sad->isad_nrules; i++, off += sizeof (uint64_t)) {
1694                 uint32_t dram, interleave;
1695                 imc_sad_rule_t *rule = &sad->isad_rules[i];
1696 
1697                 dram = pci_config_get32(sad->isad_dram->istub_cfgspace, off);
1698                 interleave = pci_config_get32(sad->isad_dram->istub_cfgspace,
1699                     off + 4);
1700 
1701                 if (dram == PCI_EINVAL32 || interleave == PCI_EINVAL32) {
1702                         sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1703                         return;
1704                 }
1705 
1706                 imc_sad_fill_rule(imc, sad, rule, dram);
1707                 imc_sad_fill_rule_interleave(imc, rule, interleave);
1708         }
1709 }
1710 
1711 static void
1712 imc_sad_decode_mcroute(imc_t *imc, imc_sad_t *sad)
1713 {
1714         uint_t i;
1715         imc_sad_mcroute_table_t *mc = &sad->isad_mcroute;
1716 
1717         if (imc->imc_gen < IMC_GEN_SKYLAKE)
1718                 return;
1719         if (sad->isad_valid != 0)
1720                 return;
1721 
1722         mc->ismc_nroutes = IMC_MAX_SAD_MCROUTES;
1723         for (i = 0; i < IMC_MAX_SAD_MCROUTES; i++) {
1724                 uint_t chanoff, ringoff;
1725 
1726                 ringoff = i * IMC_MC_ROUTE_RING_BITS;
1727                 chanoff = i * IMC_MC_ROUTE_CHAN_BITS + IMC_MC_ROUTE_CHAN_OFFSET;
1728 
1729                 mc->ismc_mcroutes[i].ismce_imc = (mc->ismc_raw_mcroute >>
1730                     ringoff) & IMC_MC_ROUTE_RING_MASK;
1731                 mc->ismc_mcroutes[i].ismce_pchannel = (mc->ismc_raw_mcroute >>
1732                     chanoff) & IMC_MC_ROUTE_CHAN_MASK;
1733         }
1734 }
1735 
1736 /*
1737  * Initialize the SAD. To do this we have to do a few different things:
1738  *
1739  * 1. Determine where the top of low and high memory is.
1740  * 2. Read and decode all of the rules for the SAD
1741  * 3. On systems with a route table, decode the raw routes
1742  *
1743  * At this point in time, we treat TOLM and TOHM as a per-socket construct, even
1744  * though it really should be global, this just makes life a bit simpler.
1745  */
1746 static void
1747 imc_decoder_init_sad(imc_t *imc)
1748 {
1749         uint_t i;
1750 
1751         for (i = 0; i < imc->imc_nsockets; i++) {
1752                 imc_sad_read_tohm(imc, &imc->imc_sockets[i].isock_sad);
1753                 imc_sad_read_dram_rules(imc, &imc->imc_sockets[i].isock_sad);
1754                 imc_sad_decode_mcroute(imc, &imc->imc_sockets[i].isock_sad);
1755         }
1756 }
1757 
/*
 * Decode a raw TAD wayness/limit register into rule. prev is the previously
 * decoded rule (NULL for the first one); on pre-Skylake parts it supplies
 * this rule's base address. tad is accepted for symmetry with the other fill
 * routines but is not referenced here.
 */
static void
imc_tad_fill_rule(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *prev,
    imc_tad_rule_t *rule, uint32_t val)
{
        uint64_t limit;

        /* The limit is shifted up and made into an exclusive upper bound. */
        limit = IMC_TAD_LIMIT(val);
        rule->itr_limit = (limit << IMC_TAD_LIMIT_SHIFT) +
            IMC_TAD_LIMIT_EXCLUSIVE;
        rule->itr_raw = val;

        /*
         * Socket wayness is a power of two. There is no default case;
         * presumably the four encodings exhaust the field -- confirm against
         * the register definition.
         */
        switch (IMC_TAD_SOCK_WAY(val)) {
        case IMC_TAD_SOCK_WAY_1:
                rule->itr_sock_way = 1;
                break;
        case IMC_TAD_SOCK_WAY_2:
                rule->itr_sock_way = 2;
                break;
        case IMC_TAD_SOCK_WAY_4:
                rule->itr_sock_way = 4;
                break;
        case IMC_TAD_SOCK_WAY_8:
                rule->itr_sock_way = 8;
                break;
        }

        rule->itr_chan_way = IMC_TAD_CHAN_WAY(val) + 1;
        /* Default granularities; Skylake overrides these separately. */
        rule->itr_sock_gran = IMC_TAD_GRAN_64B;
        rule->itr_chan_gran = IMC_TAD_GRAN_64B;

        /*
         * Starting with Skylake the targets that are used are no longer part of
         * the TAD. Those come from the IMC route table.
         */
        if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
                rule->itr_ntargets = 0;
                return;
        }

        rule->itr_ntargets = IMC_TAD_SNB_BRD_NTARGETS;
        rule->itr_targets[0] = IMC_TAD_TARG0(val);
        rule->itr_targets[1] = IMC_TAD_TARG1(val);
        rule->itr_targets[2] = IMC_TAD_TARG2(val);
        rule->itr_targets[3] = IMC_TAD_TARG3(val);

        /* Pre-Skylake rule bases chain off the previous rule's limit. */
        if (prev == NULL) {
                rule->itr_base = 0;
        } else {
                rule->itr_base = prev->itr_limit + 1;
        }
}
1809 
1810 static void
1811 imc_tad_fill_skx(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *rule,
1812     uint32_t val)
1813 {
1814         uint64_t base;
1815 
1816         rule->itr_raw_gran = val;
1817         base = IMC_TAD_BASE_BASE(val);
1818         rule->itr_base = base << IMC_TAD_BASE_SHIFT;
1819 
1820         switch (IMC_TAD_BASE_CHAN_GRAN(val)) {
1821         case IMC_TAD_BASE_CHAN_GRAN_64B:
1822                 rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1823                 break;
1824         case IMC_TAD_BASE_CHAN_GRAN_256B:
1825                 rule->itr_sock_gran = IMC_TAD_GRAN_256B;
1826                 break;
1827         case IMC_TAD_BASE_CHAN_GRAN_4KB:
1828                 rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
1829                 break;
1830         default:
1831                 tad->itad_valid |= IMC_TAD_V_BAD_CHAN_GRAN;
1832                 return;
1833         }
1834 
1835         switch (IMC_TAD_BASE_SOCK_GRAN(val)) {
1836         case IMC_TAD_BASE_SOCK_GRAN_64B:
1837                 rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1838                 break;
1839         case IMC_TAD_BASE_SOCK_GRAN_256B:
1840                 rule->itr_sock_gran = IMC_TAD_GRAN_256B;
1841                 break;
1842         case IMC_TAD_BASE_SOCK_GRAN_4KB:
1843                 rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
1844                 break;
1845         case IMC_TAD_BASE_SOCK_GRAN_1GB:
1846                 rule->itr_sock_gran = IMC_TAD_GRAN_1GB;
1847                 break;
1848         }
1849 }
1850 
1851 /*
1852  * When mirroring is enabled, at least in Sandy Bridge to Broadwell, it's
1853  * suggested that the channel wayness will take this into account and therefore
1854  * should be accurately reflected.
1855  */
1856 static void
1857 imc_tad_read_rules(imc_t *imc, imc_tad_t *tad)
1858 {
1859         uint_t i;
1860         off_t baseoff;
1861         imc_tad_rule_t *prev;
1862 
1863         tad->itad_nrules = imc->imc_gen_data->igd_tad_nrules;
1864         for (i = 0, baseoff = imc->imc_gen_data->igd_tad_rule_offset,
1865             prev = NULL; i < tad->itad_nrules;
1866             i++, baseoff += sizeof (uint32_t)) {
1867                 uint32_t val;
1868                 off_t off;
1869                 imc_tad_rule_t *rule = &tad->itad_rules[i];
1870 
1871                 /*
1872                  * On Skylake, the TAD rules are split among two registers. The
1873                  * latter set mimics what exists on pre-Skylake.
1874                  */
1875                 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1876                         off = baseoff + IMC_SKX_WAYNESS_OFFSET;
1877                 } else {
1878                         off = baseoff;
1879                 }
1880 
1881                 val = pci_config_get32(tad->itad_stub->istub_cfgspace, off);
1882                 if (val == PCI_EINVAL32) {
1883                         tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1884                         return;
1885                 }
1886 
1887                 imc_tad_fill_rule(imc, tad, prev, rule, val);
1888                 prev = rule;
1889                 if (imc->imc_gen < IMC_GEN_SKYLAKE)
1890                         continue;
1891 
1892                 val = pci_config_get32(tad->itad_stub->istub_cfgspace, baseoff);
1893                 if (val == PCI_EINVAL32) {
1894                         tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1895                         return;
1896                 }
1897 
1898                 imc_tad_fill_skx(imc, tad, rule, val);
1899         }
1900 }
1901 
1902 /*
1903  * Check for features which change how decoding works.
1904  */
1905 static void
1906 imc_tad_read_features(imc_t *imc, imc_tad_t *tad, imc_mc_t *mc)
1907 {
1908         uint32_t val;
1909 
1910         /*
1911          * Determine whether or not lockstep mode or mirroring are enabled.
1912          * These change the behavior of how we're supposed to interpret channel
1913          * wayness. Lockstep is available in the TAD's features. Mirroring is
1914          * available on the IMC's features. This isn't present in Skylake+. On
1915          * Skylake Mirorring is a property of the SAD rule and there is no
1916          * lockstep.
1917          */
1918         switch (imc->imc_gen) {
1919         case IMC_GEN_SANDY:
1920         case IMC_GEN_IVY:
1921         case IMC_GEN_HASWELL:
1922         case IMC_GEN_BROADWELL:
1923                 val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1924                     imc->imc_gen_data->igd_tad_sysdef);
1925                 if (val == PCI_EINVAL32) {
1926                         tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1927                         return;
1928                 }
1929                 if (IMC_TAD_SYSDEF_LOCKSTEP(val)) {
1930                         tad->itad_flags |= IMC_TAD_FLAG_LOCKSTEP;
1931                 }
1932 
1933                 val = pci_config_get32(mc->icn_main1->istub_cfgspace,
1934                     imc->imc_gen_data->igd_mc_mirror);
1935                 if (val == PCI_EINVAL32) {
1936                         tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1937                         return;
1938                 }
1939                 if (IMC_MC_MIRROR_SNB_BRD(val)) {
1940                         tad->itad_flags |= IMC_TAD_FLAG_MIRROR;
1941                 }
1942                 break;
1943         default:
1944                 break;
1945         }
1946 
1947         /*
1948          * Now, go through and look at values that'll change how we do the
1949          * channel index and adddress calculation. These are only present
1950          * between Ivy Bridge and Broadwell. They don't exist on Sandy Bridge
1951          * and they don't exist on Skylake+.
1952          */
1953         switch (imc->imc_gen) {
1954         case IMC_GEN_IVY:
1955         case IMC_GEN_HASWELL:
1956         case IMC_GEN_BROADWELL:
1957                 val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1958                     imc->imc_gen_data->igd_tad_sysdef2);
1959                 if (val == PCI_EINVAL32) {
1960                         tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1961                         return;
1962                 }
1963                 if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
1964                         tad->itad_flags |= IMC_TAD_FLAG_CHANSHIFT;
1965                 }
1966                 if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
1967                         tad->itad_flags |= IMC_TAD_FLAG_CHANHASH;
1968                 }
1969                 break;
1970         default:
1971                 break;
1972         }
1973 }
1974 
1975 /*
1976  * Read the IMC channel interleave records
1977  */
1978 static void
1979 imc_tad_read_interleave(imc_t *imc, imc_channel_t *chan)
1980 {
1981         uint_t i;
1982         off_t off;
1983 
1984         chan->ich_ntad_offsets = imc->imc_gen_data->igd_tad_nrules;
1985         for (i = 0, off = imc->imc_gen_data->igd_tad_chan_offset;
1986             i < chan->ich_ntad_offsets; i++, off += sizeof (uint32_t)) {
1987                 uint32_t val;
1988                 uint64_t offset;
1989 
1990                 val = pci_config_get32(chan->ich_desc->istub_cfgspace,
1991                     off);
1992                 if (val == PCI_EINVAL32) {
1993                         chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
1994                         return;
1995                 }
1996 
1997                 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1998                         offset = IMC_TADCHAN_OFFSET_SKX(val);
1999                 } else {
2000                         offset = IMC_TADCHAN_OFFSET_SNB_BRD(val);
2001                 }
2002 
2003                 chan->ich_tad_offsets[i] = offset << IMC_TADCHAN_OFFSET_SHIFT;
2004                 chan->ich_tad_offsets_raw[i] = val;
2005         }
2006 }
2007 
2008 static void
2009 imc_decoder_init_tad(imc_t *imc)
2010 {
2011         uint_t i;
2012 
2013         for (i = 0; i < imc->imc_nsockets; i++) {
2014                 uint_t j;
2015 
2016                 for (j = 0; j < imc->imc_sockets[i].isock_ntad; j++) {
2017                         imc_tad_read_features(imc,
2018                             &imc->imc_sockets[i].isock_tad[j],
2019                             &imc->imc_sockets[i].isock_imcs[j]);
2020                         imc_tad_read_rules(imc,
2021                             &imc->imc_sockets[i].isock_tad[j]);
2022                 }
2023         }
2024 
2025         for (i = 0; i < imc->imc_nsockets; i++) {
2026                 uint_t j;
2027                 imc_socket_t *sock = &imc->imc_sockets[i];
2028 
2029                 for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2030                         uint_t k;
2031                         imc_mc_t *mc = &sock->isock_imcs[j];
2032 
2033                         for (k = 0; k < mc->icn_nchannels; k++) {
2034                                 imc_channel_t *chan = &mc->icn_channels[k];
2035                                 imc_tad_read_interleave(imc, chan);
2036                         }
2037                 }
2038         }
2039 }
2040 
/*
 * Read the rank interleave offset registers for interleave rule rirno into
 * rank->irle_entries. contig selects between the two register layouts
 * described below.
 */
static void
imc_rir_read_ileave_offsets(imc_t *imc, imc_channel_t *chan,
    imc_rank_ileave_t *rank, uint_t rirno, boolean_t contig)
{
        uint_t i;
        off_t off, incr;

        /*
         * Rank interleave offset registers come in two forms. Either they are
         * contiguous for a given wayness, meaning that all of the entries for
         * wayness zero are contiguous, or they are sparse, meaning that there
         * is a bank for entry zero for all wayness, then entry one for all
         * wayness, etc.
         */
        if (contig) {
                /* Contiguous: stride one register within this rule's bank. */
                off = imc->imc_gen_data->igd_rir_ileave_offset +
                    (rirno * imc->imc_gen_data->igd_rir_nileaves *
                    sizeof (uint32_t));
                incr = sizeof (uint32_t);
        } else {
                /* Sparse: stride a full bank of registers between entries. */
                off = imc->imc_gen_data->igd_rir_ileave_offset +
                    (rirno * sizeof (uint32_t));
                incr = imc->imc_gen_data->igd_rir_nileaves * sizeof (uint32_t);
        }
        for (i = 0; i < rank->irle_nentries; i++, off += incr) {
                uint32_t val;
                uint64_t offset;
                imc_rank_ileave_entry_t *ent = &rank->irle_entries[i];

                val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
                if (val == PCI_EINVAL32) {
                        /* Stop at the first failed config space read. */
                        chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
                        return;
                }

                /* Broadwell encodes the target field differently. */
                switch (imc->imc_gen) {
                case IMC_GEN_BROADWELL:
                        ent->irle_target = IMC_RIR_OFFSET_TARGET_BRD(val);
                        break;
                default:
                        ent->irle_target = IMC_RIR_OFFSET_TARGET(val);
                        break;
                }
                /* The offset field differs starting with Haswell. */
                if (imc->imc_gen >= IMC_GEN_HASWELL) {
                        offset = IMC_RIR_OFFSET_OFFSET_HAS_SKX(val);
                } else {
                        offset = IMC_RIR_OFFSET_OFFSET_SNB_IVB(val);
                }
                ent->irle_offset = offset << IMC_RIR_OFFSET_SHIFT;
        }
}
2092 
/*
 * Read each rank interleave rule for this channel: its enable bit, wayness,
 * exclusive limit, and the per-entry interleave offsets.
 */
static void
imc_rir_read_wayness(imc_t *imc, imc_channel_t *chan)
{
        uint_t i;
        off_t off;

        chan->ich_nrankileaves = imc->imc_gen_data->igd_rir_nways;
        for (i = 0, off = imc->imc_gen_data->igd_rir_way_offset;
            i < chan->ich_nrankileaves; i++, off += sizeof (uint32_t)) {
                uint32_t val;
                uint64_t lim;
                imc_rank_ileave_t *ent = &chan->ich_rankileaves[i];

                val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
                if (val == PCI_EINVAL32) {
                        /* Stop at the first failed config space read. */
                        chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
                        return;
                }

                ent->irle_raw = val;
                ent->irle_enabled = IMC_RIR_WAYNESS_ENABLED(val) != 0;
                /* Wayness is stored as a power-of-two exponent. */
                ent->irle_nways = 1 << IMC_RIR_WAYNESS_WAY(val);
                ent->irle_nwaysbits = IMC_RIR_WAYNESS_WAY(val);
                /* The limit field differs starting with Haswell. */
                if (imc->imc_gen >= IMC_GEN_HASWELL) {
                        lim = IMC_RIR_LIMIT_HAS_SKX(val);
                } else {
                        lim = IMC_RIR_LIMIT_SNB_IVB(val);
                }

                /* Convert to an exclusive upper bound. */
                ent->irle_limit = (lim << IMC_RIR_LIMIT_SHIFT) +
                    IMC_RIR_LIMIT_EXCLUSIVE;

                ent->irle_nentries = imc->imc_gen_data->igd_rir_nileaves;
                /* Skylake uses the sparse offset layout; earlier, contig. */
                if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
                        imc_rir_read_ileave_offsets(imc, chan, ent, i, B_FALSE);
                } else {
                        imc_rir_read_ileave_offsets(imc, chan, ent, i, B_TRUE);
                }
        }
}
2133 
2134 static void
2135 imc_decoder_init_rir(imc_t *imc)
2136 {
2137         uint_t i;
2138 
2139         for (i = 0; i < imc->imc_nsockets; i++) {
2140                 uint_t j;
2141                 imc_socket_t *sock = &imc->imc_sockets[i];
2142 
2143                 for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2144                         uint_t k;
2145                         imc_mc_t *mc = &sock->isock_imcs[j];
2146 
2147                         for (k = 0; k < mc->icn_nchannels; k++) {
2148                                 imc_channel_t *chan = &mc->icn_channels[k];
2149                                 imc_rir_read_wayness(imc, chan);
2150                         }
2151                 }
2152         }
2153 }
2154 
/*
 * CMI entry point: translate a physical address into a memory unum. The
 * valid_hi, valid_lo, synd, and syndtype arguments are part of the
 * cmi_mc_patounum contract but are not used by this implementation.
 */
static cmi_errno_t
imc_mc_patounum(void *arg, uint64_t pa, uint8_t valid_hi, uint8_t valid_lo,
    uint32_t synd, int syndtype, mc_unum_t *unump)
{
        imc_t *imc = arg;
        uint_t i;
        imc_decode_state_t dec;

        bzero(&dec, sizeof (dec));
        if (!imc_decode_pa(imc, pa, &dec)) {
                /* Map decode failures onto the closest CMI error codes. */
                switch (dec.ids_fail) {
                case IMC_DECODE_F_LEGACY_RANGE:
                case IMC_DECODE_F_OUTSIDE_DRAM:
                        return (CMIERR_MC_NOTDIMMADDR);
                default:
                        return (CMIERR_MC_BADSTATE);
                }
        }

        unump->unum_board = 0;
        /*
         * The chip id needs to be in the order that the OS expects it, which
         * may not be our order.
         */
        for (i = 0; i < imc->imc_nsockets; i++) {
                if (imc->imc_spointers[i] == dec.ids_socket)
                        break;
        }
        if (i == imc->imc_nsockets) {
                /* The decoded socket isn't one that we know about. */
                return (CMIERR_MC_BADSTATE);
        }
        unump->unum_chip = i;
        unump->unum_mc = dec.ids_tadid;
        unump->unum_chan = dec.ids_channelid;
        unump->unum_cs = dec.ids_dimmid;
        unump->unum_rank = dec.ids_rankid;
        unump->unum_offset = dec.ids_rankaddr;
        /* All unum_dimms entries are set to the invalid marker. */
        for (i = 0; i < MC_UNUM_NDIMM; i++) {
                unump->unum_dimms[i] = MC_INVALNUM;
        }

        return (CMI_SUCCESS);
}
2198 
/*
 * CMI entry point for the reverse translation (unum -> physical address).
 * Memory encoding is not implemented by this driver, so always report that
 * the answer is unknown.
 */
static cmi_errno_t
imc_mc_unumtopa(void *arg, mc_unum_t *unum, nvlist_t *nvl, uint64_t *pa)
{
	return (CMIERR_UNKNOWN);
}
2204 
/*
 * Memory-controller operations vector handed to the CMI subsystem via
 * cmi_mc_register_global() in imc_attach_complete().
 */
static const cmi_mc_ops_t imc_mc_ops = {
	.cmi_mc_patounum = imc_mc_patounum,
	.cmi_mc_unumtopa = imc_mc_unumtopa
};
2209 
2210 /*
2211  * This is where we really finish attaching and become open for business. This
2212  * occurs once we have all of the expected stubs attached. Here's where all of
2213  * the real fun begins.
2214  */
2215 static void
2216 imc_attach_complete(void *arg)
2217 {
2218         imc_t *imc = arg;
2219         cmi_errno_t err;
2220 
2221         imc_set_gen_data(imc);
2222 
2223         /*
2224          * On SKX and newer, we can fail to map PCI buses at this point due to
2225          * bad PCIe reads.
2226          */
2227         if (!imc_map_stubs(imc)) {
2228                 goto done;
2229         }
2230 
2231         if (!imc_validate_stubs(imc)) {
2232                 imc->imc_flags |= IMC_F_VALIDATE_FAILED;
2233                 goto done;
2234         }
2235 
2236         imc_fixup_stubs(imc);
2237         imc_map_sockets(imc);
2238 
2239         if (!imc_create_minors(imc)) {
2240                 goto done;
2241         }
2242 
2243         imc_fill_data(imc);
2244         imc_nvl_create(imc);
2245 
2246         /*
2247          * Gather additional information that we need so that we can properly
2248          * initialize the memory decoder and encoder.
2249          */
2250         imc_decoder_init_sad(imc);
2251         imc_decoder_init_tad(imc);
2252         imc_decoder_init_rir(imc);
2253 
2254         /*
2255          * Register decoder functions. This may fail. If so, try and complain
2256          * loudly, but stay active to allow other data to be useful. Register a
2257          * global handle.
2258          */
2259         if ((err = cmi_mc_register_global(&imc_mc_ops, imc)) != CMI_SUCCESS) {
2260                 imc->imc_flags |= IMC_F_MCREG_FAILED;
2261                 dev_err(imc->imc_dip, CE_WARN, "failed to register memory "
2262                     "decoding operations: 0x%x", err);
2263         }
2264 
2265 done:
2266         mutex_enter(&imc->imc_lock);
2267         imc->imc_flags &= IMC_F_ATTACH_DISPATCHED;
2268         imc->imc_flags |= IMC_F_ATTACH_COMPLETE;
2269         mutex_exit(&imc->imc_lock);
2270 }
2271 
2272 static int
2273 imc_stub_comparator(const void *l, const void *r)
2274 {
2275         const imc_stub_t *sl = l, *sr = r;
2276         if (sl->istub_bus > sr->istub_bus)
2277                 return (1);
2278         if (sl->istub_bus < sr->istub_bus)
2279                 return (-1);
2280         if (sl->istub_dev > sr->istub_dev)
2281                 return (1);
2282         if (sl->istub_dev < sr->istub_dev)
2283                 return (-1);
2284         if (sl->istub_func > sr->istub_func)
2285                 return (1);
2286         if (sl->istub_func < sr->istub_func)
2287                 return (-1);
2288         return (0);
2289 }
2290 
2291 static int
2292 imc_stub_scan_cb(dev_info_t *dip, void *arg)
2293 {
2294         int vid, did;
2295         const imc_stub_table_t *table;
2296         imc_t *imc = arg;
2297         int *regs;
2298         uint_t i, nregs;
2299 
2300         if (dip == ddi_root_node()) {
2301                 return (DDI_WALK_CONTINUE);
2302         }
2303 
2304         /*
2305          * Get the dev info name. PCI devices will always be children of PCI
2306          * devices today on x86. If we reach something that has a device name
2307          * that's not PCI, then we can prune it's children.
2308          */
2309         if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2310                 return (DDI_WALK_PRUNECHILD);
2311         }
2312 
2313         /*
2314          * Get the device and vendor ID and see if this is something the imc
2315          * knows about or cares about.
2316          */
2317         vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2318             "vendor-id", PCI_EINVAL16);
2319         did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2320             "device-id", PCI_EINVAL16);
2321         if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2322                 return (DDI_WALK_CONTINUE);
2323         }
2324 
2325         if (vid != IMC_PCI_VENDOR_INTC) {
2326                 return (DDI_WALK_PRUNECHILD);
2327         }
2328 
2329         if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2330             "reg", &regs, &nregs) != DDI_PROP_SUCCESS) {
2331                 return (DDI_WALK_CONTINUE);
2332         }
2333 
2334         if (nregs == 0) {
2335                 ddi_prop_free(regs);
2336                 return (DDI_WALK_CONTINUE);
2337         }
2338 
2339 
2340         table = NULL;
2341         for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2342                 if (imc_stub_table[i].imcs_devid == did &&
2343                     imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2344                     imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2345                         table = &imc_stub_table[i];
2346                         break;
2347                 }
2348         }
2349         ddi_prop_free(regs);
2350 
2351         /*
2352          * Not a match, not interesting.
2353          */
2354         if (table == NULL) {
2355                 return (DDI_WALK_CONTINUE);
2356         }
2357 
2358         mutex_enter(&imc->imc_lock);
2359         imc->imc_nscanned++;
2360         mutex_exit(&imc->imc_lock);
2361 
2362         return (DDI_WALK_CONTINUE);
2363 }
2364 
2365 /*
2366  * From here, go through and see how many of the devices that we know about.
2367  */
2368 static void
2369 imc_stub_scan(void *arg)
2370 {
2371         imc_t *imc = arg;
2372         boolean_t dispatch = B_FALSE;
2373 
2374         /*
2375          * Zero out the scan results in case we've been detached and reattached.
2376          */
2377         mutex_enter(&imc->imc_lock);
2378         imc->imc_nscanned = 0;
2379         mutex_exit(&imc->imc_lock);
2380 
2381         ddi_walk_devs(ddi_root_node(), imc_stub_scan_cb, imc);
2382 
2383         mutex_enter(&imc->imc_lock);
2384         imc->imc_flags |= IMC_F_SCAN_COMPLETE;
2385         imc->imc_flags &= ~IMC_F_SCAN_DISPATCHED;
2386 
2387         /*
2388          * If the scan found no nodes, then that means that we're on a hardware
2389          * platform that we don't support. Therefore, there's no reason to do
2390          * anything here.
2391          */
2392         if (imc->imc_nscanned == 0) {
2393                 imc->imc_flags |= IMC_F_UNSUP_PLATFORM;
2394                 mutex_exit(&imc->imc_lock);
2395                 return;
2396         }
2397 
2398         if (avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2399                 imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2400                 dispatch = B_TRUE;
2401         }
2402 
2403         mutex_exit(&imc->imc_lock);
2404 
2405         if (dispatch) {
2406                 (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2407                     imc, DDI_SLEEP);
2408         }
2409 }
2410 
2411 /*
2412  * By default, refuse to allow stubs to detach.
2413  */
2414 int
2415 imc_detach_stub(dev_info_t *dip, ddi_detach_cmd_t cmd)
2416 {
2417         imc_stub_t *stub;
2418         imc_t *imc = imc_data;
2419 
2420         mutex_enter(&imc->imc_lock);
2421 
2422         /*
2423          * By default, we do not allow stubs to detach. However, if the driver
2424          * has attached to devices on a platform it doesn't recognize or
2425          * support or if the override flag has been set, then allow detach to
2426          * proceed.
2427          */
2428         if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) == 0 &&
2429             imc_allow_detach == 0) {
2430                 mutex_exit(&imc->imc_lock);
2431                 return (DDI_FAILURE);
2432         }
2433 
2434         for (stub = avl_first(&imc->imc_stubs); stub != NULL;
2435             stub = AVL_NEXT(&imc->imc_stubs, stub)) {
2436                 if (stub->istub_dip == dip) {
2437                         break;
2438                 }
2439         }
2440 
2441         /*
2442          * A device was attached to us that we somehow don't know about. Allow
2443          * this to proceed.
2444          */
2445         if (stub == NULL) {
2446                 mutex_exit(&imc->imc_lock);
2447                 return (DDI_SUCCESS);
2448         }
2449 
2450         pci_config_teardown(&stub->istub_cfgspace);
2451         avl_remove(&imc->imc_stubs, stub);
2452         kmem_free(stub, sizeof (imc_stub_t));
2453         mutex_exit(&imc->imc_lock);
2454 
2455         return (DDI_SUCCESS);
2456 }
2457 
2458 int
2459 imc_attach_stub(dev_info_t *dip, ddi_attach_cmd_t cmd)
2460 {
2461         imc_stub_t *stub, *lookup;
2462         int did, vid, *regs;
2463         uint_t i, nregs;
2464         const imc_stub_table_t *table;
2465         avl_index_t idx;
2466         boolean_t dispatch = B_FALSE;
2467         imc_t *imc = imc_data;
2468 
2469         if (cmd != DDI_ATTACH) {
2470                 return (DDI_FAILURE);
2471         }
2472 
2473         /*
2474          * We've been asked to attach a stub. First, determine if this is even a
2475          * PCI device that we should care about. Then, append it to our global
2476          * list and kick off the configuration task. Note that we do this
2477          * configuration task in a taskq so that we don't interfere with the
2478          * normal attach / detach path processing.
2479          */
2480         if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2481                 return (DDI_FAILURE);
2482         }
2483 
2484         /*
2485          * Get the device and vendor ID and see if this is something the imc
2486          * knows about or cares about.
2487          */
2488         vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2489             "vendor-id", PCI_EINVAL16);
2490         did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2491             "device-id", PCI_EINVAL16);
2492         if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2493                 return (DDI_FAILURE);
2494         }
2495 
2496         /*
2497          * Only accept INTC parts on the imc driver.
2498          */
2499         if (vid != IMC_PCI_VENDOR_INTC) {
2500                 return (DDI_FAILURE);
2501         }
2502 
2503         if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2504             "reg", &regs, &nregs) != DDI_PROP_SUCCESS) {
2505                 return (DDI_FAILURE);
2506         }
2507 
2508         if (nregs == 0) {
2509                 ddi_prop_free(regs);
2510                 return (DDI_FAILURE);
2511         }
2512 
2513         /*
2514          * Determine if this matches a known device.
2515          */
2516         table = NULL;
2517         for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2518                 if (imc_stub_table[i].imcs_devid == did &&
2519                     imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2520                     imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2521                         table = &imc_stub_table[i];
2522                         break;
2523                 }
2524         }
2525 
2526         if (i == ARRAY_SIZE(imc_stub_table)) {
2527                 ddi_prop_free(regs);
2528                 return (DDI_FAILURE);
2529         }
2530 
2531         /*
2532          * We've found something. Make sure the generation matches our current
2533          * one. If it does, construct the entry and append it to the list.
2534          */
2535         mutex_enter(&imc->imc_lock);
2536         if (imc->imc_gen != IMC_GEN_UNKNOWN && imc->imc_gen !=
2537             table->imcs_gen) {
2538                 mutex_exit(&imc->imc_lock);
2539                 ddi_prop_free(regs);
2540                 dev_err(dip, CE_WARN, "Encountered IMC stub device (%u/%u) "
2541                     "that has different hardware generation (%u) from current "
2542                     "generation (%u)", vid, did, table->imcs_gen, imc->imc_gen);
2543                 return (DDI_FAILURE);
2544         } else {
2545                 imc->imc_gen = table->imcs_gen;
2546         }
2547         mutex_exit(&imc->imc_lock);
2548 
2549         stub = kmem_zalloc(sizeof (imc_stub_t), KM_SLEEP);
2550         stub->istub_dip = dip;
2551         stub->istub_vid = vid;
2552         stub->istub_did = did;
2553         stub->istub_bus = PCI_REG_BUS_G(regs[0]);
2554         stub->istub_dev = PCI_REG_DEV_G(regs[0]);
2555         stub->istub_func = PCI_REG_FUNC_G(regs[0]);
2556         ddi_prop_free(regs);
2557         stub->istub_table = table;
2558 
2559         if (pci_config_setup(dip, &stub->istub_cfgspace) != DDI_SUCCESS) {
2560                 kmem_free(stub, sizeof (stub));
2561                 dev_err(dip, CE_WARN, "Failed to set up PCI config space "
2562                     "for IMC stub device %s (%u/%u)", ddi_node_name(dip),
2563                     vid, did);
2564                 return (DDI_FAILURE);
2565         }
2566 
2567         mutex_enter(&imc->imc_lock);
2568         if ((lookup = avl_find(&imc->imc_stubs, stub, &idx)) != NULL) {
2569                 dev_err(dip, CE_WARN, "IMC stub %s (%u/%u) has duplicate "
2570                     "bdf %u/%u/%u with %s (%u/%u), not attaching",
2571                     ddi_node_name(imc->imc_dip), vid, did,
2572                     stub->istub_bus, stub->istub_dev, stub->istub_func,
2573                     ddi_node_name(lookup->istub_dip), lookup->istub_vid,
2574                     lookup->istub_did);
2575                 mutex_exit(&imc->imc_lock);
2576                 pci_config_teardown(&stub->istub_cfgspace);
2577                 kmem_free(stub, sizeof (stub));
2578 
2579                 return (DDI_FAILURE);
2580         }
2581         avl_insert(&imc->imc_stubs, stub, idx);
2582 
2583         if ((imc->imc_flags & IMC_F_ALL_FLAGS) == IMC_F_SCAN_COMPLETE &&
2584             avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2585                 imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2586                 dispatch = B_TRUE;
2587         }
2588         mutex_exit(&imc->imc_lock);
2589 
2590         if (dispatch) {
2591                 (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2592                     imc, DDI_SLEEP);
2593         }
2594 
2595         return (DDI_SUCCESS);
2596 }
2597 
2598 static int
2599 imc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2600 {
2601         imc_t *imc = imc_data;
2602 
2603         if ((flag & (FEXCL | FNDELAY)) != 0)
2604                 return (EINVAL);
2605 
2606         if (otyp != OTYP_CHR)
2607                 return (EINVAL);
2608 
2609         mutex_enter(&imc->imc_lock);
2610 
2611         if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) != 0) {
2612                 mutex_exit(&imc->imc_lock);
2613                 return (ENOTSUP);
2614         }
2615 
2616         /*
2617          * It's possible that someone has come in during the window between when
2618          * we've created the minor node and when we've finished doing work.
2619          */
2620         if ((imc->imc_flags & IMC_F_ATTACH_COMPLETE) == 0) {
2621                 mutex_exit(&imc->imc_lock);
2622                 return (EAGAIN);
2623         }
2624 
2625         /*
2626          * It's not clear how someone would get a minor that we didn't create.
2627          * But be paranoid and make sure.
2628          */
2629         if (getminor(*devp) >= imc->imc_nsockets) {
2630                 mutex_exit(&imc->imc_lock);
2631                 return (EINVAL);
2632         }
2633 
2634         /*
2635          * Make sure this socket entry has been filled in.
2636          */
2637         if (imc->imc_spointers[getminor(*devp)] == NULL) {
2638                 mutex_exit(&imc->imc_lock);
2639                 return (EINVAL);
2640         }
2641 
2642         mutex_exit(&imc->imc_lock);
2643 
2644         return (0);
2645 }
2646 
2647 static void
2648 imc_ioctl_decode(imc_t *imc, mc_encode_ioc_t *encode)
2649 {
2650         imc_decode_state_t dec;
2651         uint_t i;
2652 
2653         bzero(&dec, sizeof (dec));
2654         if (!imc_decode_pa(imc, encode->mcei_pa, &dec)) {
2655                 encode->mcei_err = (uint32_t)dec.ids_fail;
2656                 encode->mcei_errdata = dec.ids_fail_data;
2657                 return;
2658         }
2659 
2660         encode->mcei_errdata = 0;
2661         encode->mcei_err = 0;
2662         encode->mcei_board = 0;
2663         for (i = 0; i < imc->imc_nsockets; i++) {
2664                 if (imc->imc_spointers[i] == dec.ids_socket)
2665                         break;
2666         }
2667         encode->mcei_chip = i;
2668         encode->mcei_mc = dec.ids_tadid;
2669         encode->mcei_chan = dec.ids_channelid;
2670         encode->mcei_dimm = dec.ids_dimmid;
2671         encode->mcei_rank_addr = dec.ids_rankaddr;
2672         encode->mcei_rank = dec.ids_rankid;
2673         encode->mcei_row = UINT32_MAX;
2674         encode->mcei_column = UINT32_MAX;
2675         encode->mcei_pad = 0;
2676 }
2677 
/*
 * ioctl(9E) entry point. The minor number selects the socket. Supports the
 * snapshot ioctls used by mcadm-style consumers and a privileged
 * physical-address decode ioctl. All processing runs with imc_lock held.
 */
static int
imc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	int ret;
	minor_t m;
	mc_snapshot_info_t info;
	mc_encode_ioc_t encode;
	imc_t *imc = imc_data;
	imc_socket_t *sock;

	mutex_enter(&imc->imc_lock);
	m = getminor(dev);
	if (m >= imc->imc_nsockets) {
		ret = EINVAL;
		goto done;
	}
	sock = imc->imc_spointers[m];
	if (sock == NULL) {
		ret = EINVAL;
		goto done;
	}

	/*
	 * Note, other memory controller drivers don't check mode for reading
	 * data nor do they care who can read it from a credential perspective.
	 * As such we don't either at this time.
	 */
	switch (cmd) {
	case MC_IOC_SNAPSHOT_INFO:
		/* Report size and generation of this socket's nvlist pack. */
		imc_nvl_pack(sock, B_FALSE);
		if (sock->isock_buf == NULL) {
			ret = EIO;
			break;
		}

		info.mcs_size = sock->isock_buflen;
		info.mcs_gen = sock->isock_gen;

		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
			ret = EFAULT;
			break;
		}

		ret = 0;
		break;
	case MC_IOC_SNAPSHOT:
		/* Copy out the packed per-socket snapshot itself. */
		imc_nvl_pack(sock, B_FALSE);
		if (sock->isock_buf == NULL) {
			ret = EIO;
			break;
		}

		if (ddi_copyout(sock->isock_buf, (void *)arg,
		    sock->isock_buflen, mode) != 0) {
			ret = EFAULT;
			break;
		}

		ret = 0;
		break;
	case MC_IOC_DECODE_SNAPSHOT_INFO:
		/* Report size/generation of the global decoder snapshot. */
		imc_decoder_pack(imc);
		if (imc->imc_decoder_buf == NULL) {
			ret = EIO;
			break;
		}

		info.mcs_size = imc->imc_decoder_len;
		/*
		 * NOTE(review): the decoder snapshot borrows socket 0's
		 * generation; this assumes at least one socket entry is
		 * populated by this point -- confirm against the attach path.
		 */
		info.mcs_gen = imc->imc_spointers[0]->isock_gen;

		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
			ret = EFAULT;
			break;
		}

		ret = 0;
		break;
	case MC_IOC_DECODE_SNAPSHOT:
		/* Copy out the packed decoder snapshot. */
		imc_decoder_pack(imc);
		if (imc->imc_decoder_buf == NULL) {
			ret = EIO;
			break;
		}

		if (ddi_copyout(imc->imc_decoder_buf, (void *)arg,
		    imc->imc_decoder_len, mode) != 0) {
			ret = EFAULT;
			break;
		}

		ret = 0;
		break;
	case MC_IOC_DECODE_PA:
		/* Privileged: only the global zone with drv_priv may decode. */
		if (crgetzoneid(credp) != GLOBAL_ZONEID ||
		    drv_priv(credp) != 0) {
			ret = EPERM;
			break;
		}

		/*
		 * NOTE(review): copyin/copyout failures here report EPERM
		 * rather than the conventional EFAULT -- confirm this is
		 * intended before changing consumer-visible errno behavior.
		 */
		if (ddi_copyin((void *)arg, &encode, sizeof (encode),
		    mode & FKIOCTL) != 0) {
			ret = EPERM;
			break;
		}

		imc_ioctl_decode(imc, &encode);
		ret = 0;

		if (ddi_copyout(&encode, (void *)arg, sizeof (encode),
		    mode & FKIOCTL) != 0) {
			ret = EPERM;
			break;
		}
		break;
	default:
		ret = EINVAL;
		goto done;
	}

done:
	mutex_exit(&imc->imc_lock);
	return (ret);
}
2802 
/*
 * close(9E) entry point. No per-open state is kept, so there is nothing to
 * tear down.
 */
static int
imc_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	return (0);
}
2808 
2809 static int
2810 imc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2811 {
2812         if (cmd != DDI_ATTACH) {
2813                 return (DDI_FAILURE);
2814         }
2815 
2816         if (imc_data == NULL || imc_data->imc_dip != NULL) {
2817                 return (DDI_FAILURE);
2818         }
2819 
2820         mutex_enter(&imc_data->imc_lock);
2821         if ((imc_data->imc_taskq = ddi_taskq_create(dip, "imc", 1,
2822             TASKQ_DEFAULTPRI, 0)) == NULL) {
2823                 mutex_exit(&imc_data->imc_lock);
2824                 return (DDI_FAILURE);
2825         }
2826 
2827         imc_data->imc_dip = dip;
2828         imc_data->imc_flags |= IMC_F_SCAN_DISPATCHED;
2829         mutex_exit(&imc_data->imc_lock);
2830 
2831         (void) ddi_taskq_dispatch(imc_data->imc_taskq, imc_stub_scan, imc_data,
2832             DDI_SLEEP);
2833 
2834         return (DDI_SUCCESS);
2835 }
2836 
2837 /*
2838  * We only export a single instance.
2839  */
2840 static int
2841 imc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
2842 {
2843         /*
2844          * getinfo(9E) shouldn't be called if we're not attached. But be
2845          * paranoid.
2846          */
2847         if (imc_data == NULL || imc_data->imc_dip == NULL) {
2848                 return (DDI_FAILURE);
2849         }
2850 
2851         switch (infocmd) {
2852         case DDI_INFO_DEVT2DEVINFO:
2853                 *resultp = imc_data->imc_dip;
2854                 break;
2855         case DDI_INFO_DEVT2INSTANCE:
2856                 *resultp = (void *)0;
2857                 break;
2858         default:
2859                 return (DDI_FAILURE);
2860         }
2861 
2862         return (DDI_SUCCESS);
2863 }
2864 
2865 static int
2866 imc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2867 {
2868         if (cmd != DDI_DETACH) {
2869                 return (DDI_FAILURE);
2870         }
2871 
2872         if (imc_data == NULL || imc_data->imc_dip) {
2873                 return (DDI_FAILURE);
2874         }
2875 
2876         mutex_enter(&imc_data->imc_lock);
2877 
2878         /*
2879          * While a scan or attach is outstanding, don't allow us to detach.
2880          */
2881         if ((imc_data->imc_flags &
2882             (IMC_F_SCAN_DISPATCHED | IMC_F_ATTACH_DISPATCHED)) != 0) {
2883                 mutex_exit(&imc_data->imc_lock);
2884                 return (DDI_FAILURE);
2885         }
2886 
2887         /*
2888          * Because the stub driver depends on the imc driver, we shouldn't be
2889          * able to have any entries in this list when we detach. However, we
2890          * check just to make sure.
2891          */
2892         if (!avl_is_empty(&imc_data->imc_stubs)) {
2893                 mutex_exit(&imc_data->imc_lock);
2894                 return (DDI_FAILURE);
2895         }
2896 
2897         nvlist_free(imc_data->imc_decoder_dump);
2898         imc_data->imc_decoder_dump = NULL;
2899         if (imc_data->imc_decoder_buf != NULL) {
2900                 kmem_free(imc_data->imc_decoder_buf, imc_data->imc_decoder_len);
2901                 imc_data->imc_decoder_buf = NULL;
2902                 imc_data->imc_decoder_len = 0;
2903         }
2904 
2905         ddi_remove_minor_node(imc_data->imc_dip, NULL);
2906         imc_data->imc_dip = NULL;
2907         mutex_exit(&imc_data->imc_lock);
2908 
2909         ddi_taskq_wait(imc_data->imc_taskq);
2910         ddi_taskq_destroy(imc_data->imc_taskq);
2911         imc_data->imc_taskq = NULL;
2912 
2913         return (DDI_SUCCESS);
2914 }
2915 
/*
 * Tear down the global driver state created by imc_alloc(). The stub tree
 * must already be empty (stubs hold pointers into this state).
 */
static void
imc_free(void)
{
	if (imc_data == NULL) {
		return;
	}

	VERIFY(avl_is_empty(&imc_data->imc_stubs));
	avl_destroy(&imc_data->imc_stubs);
	mutex_destroy(&imc_data->imc_lock);
	kmem_free(imc_data, sizeof (imc_t));
	imc_data = NULL;
}
2929 
/*
 * Allocate and initialize the single global imc_t, including its lock and
 * the AVL tree of stub devices (keyed by PCI bus/device/function).
 */
static void
imc_alloc(void)
{
	imc_data = kmem_zalloc(sizeof (imc_t), KM_SLEEP);

	mutex_init(&imc_data->imc_lock, NULL, MUTEX_DRIVER, NULL);
	avl_create(&imc_data->imc_stubs, imc_stub_comparator,
	    sizeof (imc_stub_t), offsetof(imc_stub_t, istub_link));
}
2939 
/*
 * Character device entry points. Only open/close/ioctl are meaningful; all
 * other operations are explicitly unsupported.
 */
static struct cb_ops imc_cb_ops = {
	.cb_open = imc_open,
	.cb_close = imc_close,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = imc_ioctl,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};

/*
 * Device operations for the imc pseudo-device.
 */
static struct dev_ops imc_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = imc_getinfo,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = imc_attach,
	.devo_detach = imc_detach,
	.devo_reset = nodev,
	.devo_cb_ops = &imc_cb_ops,
	.devo_quiesce = ddi_quiesce_not_needed
};

/*
 * Module linkage for the loadable module framework.
 */
static struct modldrv imc_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Intel Integrated Memory Controller Driver",
	.drv_dev_ops = &imc_dev_ops
};

static struct modlinkage imc_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &imc_modldrv, NULL }
};
2983 
2984 int
2985 _init(void)
2986 {
2987         int ret;
2988 
2989         if ((ret = mod_install(&imc_modlinkage)) == 0) {
2990                 imc_alloc();
2991         }
2992 
2993         return (ret);
2994 }
2995 
/*
 * Module information entry point; delegates to the linkage structure.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&imc_modlinkage, modinfop));
}
3001 
3002 int
3003 _fini(void)
3004 {
3005         int ret;
3006 
3007         if ((ret = mod_remove(&imc_modlinkage)) == 0) {
3008                 imc_free();
3009         }
3010         return (ret);
3011 }