5513 KM_NORMALPRI should be documented in kmem_alloc(9f) and kmem_cache_create(9f) man pages
14465 Present KM_NOSLEEP_LAZY as documented interface
Change-Id: I002ec28ddf390650f1fcba1ca94f6abfdb241439
--- old/usr/src/uts/intel/io/imc/imc.c
+++ new/usr/src/uts/intel/io/imc/imc.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 13 * Copyright 2019 Joyent, Inc.
14 14 */
15 15
16 16 /*
17 17 * Generic Intel Integrated Memory Controller (IMC) Driver
18 18 *
19 19 * This driver talks to the CPU's IMC to understand the detailed topology of the
20 20 * processor and to determine how to map between physical addresses to the
21 21 * corresponding DIMM. This driver supports the following generations of Intel
22 22 * chips:
23 23 *
24 24 * - Sandy Bridge
25 25 * - Ivy Bridge
26 26 * - Haswell
27 27 * - Broadwell
28 28 * - Skylake / Cascade Lake
29 29 *
30 30 * Memory Decoding
31 31 * ---------------
32 32 *
33 33 * For more detailed summaries of the memory decoding process, please refer to
34 34 * the Intel External Design Specifications for the corresponding processor.
35 35 * What follows is a rough overview of how the memory decoding system works.
36 36 *
37 37 * First, we'd like to define the following concepts:
38 38 *
39 39 * SYSTEM ADDRESS
40 40 *
41 41 * This is a physical address that the operating system normally uses. This
42 42 * address may refer to DRAM, it may refer to memory mapped PCI
43 43 * configuration space or device registers, or it may refer to other parts
44 44 * of the system's memory map, such as the extended advanced programmable
45 45 * interrupt controller (xAPIC), etc.
46 46 *
47 47 * DIMM
48 48 *
49 49 * Dual-inline memory module. This refers to a physical stick of volatile
50 50 * memory that is inserted into a slot on the motherboard.
51 51 *
52 52 * RANK
53 53 *
54 54 * A potential sub-division of a DIMM. A DIMM's memory capacity is divided
55 55 *     into a number of equal-sized ranks. For example, an 8 GiB DIMM may have
56 56 *     one 8 GiB rank, two 4 GiB ranks, or four 2 GiB ranks.
57 57 *
58 58 * RANK ADDRESS
59 59 *
60 60 * An address that exists in the context of a given rank on a DIMM. All
61 61 * ranks have overlapping addresses, so the address 0x400 exists on all
62 62 * ranks on a given DIMM.
63 63 *
64 64 * CHANNEL
65 65 *
66 66 * Multiple DIMMs may be combined into a single channel. The channel
67 67 * represents the combined memory of all the DIMMs. A given channel only
68 68 * ever exists on a socket and is bound to a single memory controller.
69 69 *
70 70 * CHANNEL ADDRESS
71 71 *
72 72 * This is an address that exists logically on a channel. Each address on a
73 73 * channel maps to a corresponding DIMM that exists on that channel. The
74 74 * address space on one channel is independent from that on another. This
75 75 * means that address 0x1000 can exist on each memory channel in the
76 76 * system.
77 77 *
78 78 * INTERLEAVE
79 79 *
80 80 * There are several different cases where interleaving occurs on the
81 81 * system. For example, addresses may be interleaved across sockets,
82 82 * memory channels, or DIMM ranks. When addresses are interleaved, then
83 83 * some number of bits in an address are used to select which target to go
84 84 * to (usually through a look up table). The effect of interleaving is that
85 85 * addresses that are next to one another may not all go to the same
86 86 * device. The following image shows a non-interleaving case.
87 87 *
88 88 * 0x0fff +-----+ +-----+ 0x7ff
89 89 * | |\___________/| |
90 90 * | | __________ | (b) |
91 91 * | | / \| |
92 92 * 0x0800 |=====|= +-----+ 0x000 +-----+ 0x7ff
93 93 * | | \______________________________/| |
94 94 * | | _______________________________ | (a) |
95 95 * | |/ \| |
96 96 * 0x0000 +-----+ +-----+ 0x000
97 97 *
98 98 *     In this example of non-interleaving, addresses 0x0000 to 0x07ff go to
99 99 *     device (a), while addresses 0x0800 to 0x0fff go to device (b).
100 100 *     However, each range is divided into the same number of components.
101 101 *
102 102 *     If instead we were to look at that with interleaving, then rather than
103 103 *     splitting the range in half, we might say that if the address has bit
104 104 *     8 set (0x100), then it goes to (b), otherwise it goes to (a). This
105 105 *     means that addresses 0x000 to 0x0ff would go to (a), 0x100 to 0x1ff
106 106 *     would go to (b), 0x200 to 0x2ff would go back to (a) again, and then
107 107 *     0x300 to 0x3ff would go back to (b). This would continue for a while.
108 108 *     This would instead look something more like:
109 109 *
110 110 *
111 111 * 0x0fff +-----+ A: 0x7ff +---------+ B: 0x7ff +---------+
112 112 * | (b) | | e00-eff | | f00-fff |
113 113 * 0x0f00 |-----| 0x700 +---------+ 0x700 +---------+
114 114 * | (a) | | c00-cff | | d00-dff |
115 115 * 0x0e00 ~~~~~~~ 0x600 +---------+ 0x600 +---------+
116 116 * *** | a00-aff | | b00-bff |
117 117 * 0x0400 ~~~~~~~ 0x500 +---------+ 0x500 +---------+
118 118 * | (b) | | 800-8ff | | 900-9ff |
119 119 * 0x0300 |-----| 0x400 +---------+ 0x400 +---------+
120 120 * | (a) | | 600-6ff | | 700-7ff |
121 121 * 0x0200 |-----| 0x300 +---------+ 0x300 +---------+
122 122 * | (b) | | 400-4ff | | 500-5ff |
123 123 * 0x0100 |-----| 0x200 +---------+ 0x200 +---------+
124 124 * | (a) | | 200-2ff | | 300-3ff |
125 125 * 0x0000 +-----+ 0x100 +---------+ 0x100 +---------+
126 126 * | 000-0ff | | 100-1ff |
127 127 * 0x000 +---------+ 0x000 +---------+
128 128 *
129 129 * In this example we've performed two-way interleaving. The number of ways
130 130 * that something can interleave varies based on what we're interleaving
131 131 * between.
132 132 *
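 *     As a rough, illustrative sketch of the two-way example above (this is
 *     not the driver's actual decode logic, which lives in
 *     common/imc/imc_decode.c), picking the target and forming its address
 *     amounts to:
 *
 *         target = (addr >> 8) & 0x1;     select device (a) or (b)
 *         taddr = ((addr >> 9) << 8) |    squeeze out bit 8 to form
 *             (addr & 0xff);              the device-local address
 *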
133 133 * MEMORY CONTROLLER
134 134 *
135 135 *     A given processor die (see uts/i86pc/os/cpuid.c) contains a number of
136 136 *     memory controllers, usually one or two. Each memory controller supports
137 137 *     a given number of DIMMs, which are divided across multiple channels.
138 138 *
139 139 * TARGET ADDRESS DECODER
140 140 *
141 141 * The target address decoder (TAD) is responsible for taking a system
142 142 * address and transforming it into a channel address based on the rules
143 143 * that are present. Each memory controller has a corresponding TAD. The
144 144 * TAD is often contained in a device called a 'Home Agent'.
145 145 *
146 146 * SYSTEM ADDRESS DECODER
147 147 *
148 148 * The system address decoder (SAD) is responsible for taking a system
149 149 * address and directing it to the right place, whether this be memory or
150 150 *     otherwise. There is a single system address decoder per socket (see
151 151 *     uts/i86pc/os/cpuid.c), which is currently shared between all the cores.
152 152 *
153 153 * NODE IDENTIFIER
154 154 *
155 155 * The node identifier is used to uniquely identify an element in the
156 156 * various routing topologies on the die (see uts/i86pc/os/cpuid.c for the
157 157 * definition of 'die'). One can roughly think about this as a unique
158 158 * identifier for the socket itself. In general, the primary node ID for a
159 159 * socket should map to the socket APIC ID.
160 160 *
161 161 * Finding Devices
162 162 * ---------------
163 163 *
164 164 * There is a bit of a chicken and egg problem on Intel systems and in the
165 165 * device driver interface. The information that we need in the system is spread
166 166 * out amongst a large number of different PCI devices that the processor
167 167 * exposes. The number of such devices can vary based on the processor
168 168 * generation and the specific SKU in the processor. To deal with this, we break
169 169 * the driver into two different components: a stub driver and the full driver.
170 170 *
171 171 * The stub driver has aliases for all known PCI devices that we might attach to
172 172 * in a given generation on the system. This driver is called 'imcstub'. When a
173 173 * stub attaches, it just registers itself with the main driver, upon which it
174 174 * has a module dependency.
175 175 *
176 176 * The main driver, 'imc', is a pseudo-device driver. When it first attaches, it
177 177 * kicks off a scan of the device tree which takes place in a task queue. Once
178 178 * there, it determines the number of devices that it expects to exist by
179 179 * walking the tree and comparing it against the generation-specific table.
180 180 *
181 181 * If all devices are found, we'll go ahead and read through all the devices and
182 182 * build a map of all the information we need to understand the topology of the
183 183 * system and to be able to decode addresses. We do this here, because we can be
184 184 * asked to perform decoding in dangerous contexts (after taking an MCE, panic,
185 185 * etc) where we don't want to have to rely on the broader kernel functioning at
186 186 * this point in time.
187 187 *
188 188 * Once our topology is built, we'll create minor nodes which are used by the
189 189 * fault management architecture to query for information and register our
190 190 * decoding functionality with the kernel.
191 191 *
192 192 * PCI Numbering
193 193 * -------------
194 194 *
195 195 * For each device that we care about, Intel defines the device and function
196 196 * at which we can expect to find the information and PCI configuration space
197 197 * registers that we care about. However, the PCI bus number is not well
198 198 * defined. Devices that are on the same socket use the same set of bus
199 199 * numbers; however, some sockets have multiple bus numbers that they'll use
200 200 * to represent different classes. These bus numbers are programmed by system
201 201 * firmware as part of powering on the system. This means that we need the
202 202 * ability to map these disparate ranges together ourselves.
203 203 *
204 204 * There is a device called a utility box (UBOX), which exists per-socket and
205 205 * maps the different sockets together. We use this to determine which devices
206 206 * correspond to which sockets.
207 207 *
208 208 * Mapping Sockets
209 209 * ---------------
210 210 *
211 211 * Another wrinkle is that the way that the OS sees the numbering of the CPUs is
212 212 * generally based on the APIC ID (see uts/i86pc/os/cpuid.c for more
213 213 * information). However, to map to the corresponding socket, we need to look at
214 214 * the socket's node ID. The order of PCI buses in the system is not required to
215 215 * have any relation to the socket ID. Therefore, we have to have yet another
216 216 * indirection table in the imc_t.
217 217 *
218 218 * Exposing Data
219 219 * -------------
220 220 *
221 221 * We expose topology data to FMA using the OS-private memory controller
222 222 * interfaces. By creating minor nodes of type 'ddi_mem_ctrl', there are a
223 223 * number of specific interfaces that we can then implement. The ioctl API asks
224 224 * us for a snapshot of data, which basically has us go through and send an
225 225 * nvlist_t to userland. This nvlist_t is constructed as part of the scan
226 226 * process. This nvlist uses the version 1 format, which more explicitly encodes
227 227 * the topology in a series of nested nvlists.
228 228 *
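 * As a rough sketch (the authoritative key names are the
 * MCINTEL_NVLIST_V1_* definitions in sys/mc_intel.h), the nesting looks
 * something like:
 *
 *	{ version, nmc, mcs[] -> { nchan, ecc, chan-mode, policy,
 *	    channels[] -> { ndpc, dimms[] -> { present, size, ... } } } }
 *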
229 229 * In addition, the tool /usr/lib/fm/fmd/mcdecode can be used to query the
230 230 * decoder and ask it to perform decoding.
231 231 *
232 232 * Decoding Addresses
233 233 * ------------------
234 234 *
235 235 * The decoding logic can be found in common/imc/imc_decode.c. This file is
236 236 * shared between the kernel and userland to allow for easier testing and
237 237 * additional flexibility in operation. The decoding process happens in a few
238 238 * different phases.
239 239 *
240 240 * The first phase is to determine which memory controller on which socket is
241 241 * responsible for this data. To determine this, we use the system address
242 242 * decoder and walk the rules, looking for the correct target. There are various
243 243 * manipulations to the address that exist which are used to determine which
244 244 * index we use. The way that we interpret the output of the rule varies
245 245 * somewhat based on the generation. Sandy Bridge just has a node ID which
246 246 * points us to the socket with its single IMC. On Ivy Bridge through Broadwell,
247 247 * the memory controller to use is also encoded in part of the node ID. Finally,
248 248 * on Skylake, the SAD tells us which socket to look at. The socket in question
249 249 * then has a routing table which tells us which channel on which of its
250 250 * local memory controllers to use.
251 251 *
252 252 * Once we have the target memory controller, we walk the list of target address
253 253 * decoder rules. These rules can help tell us which channel we care about
254 254 * (which is required on Sandy Bridge through Broadwell) and then describe some
255 255 * amount of the interleaving rules which are used to turn the system address
256 256 * into a channel address.
257 257 *
258 258 * Once we know the channel and the channel address, we walk the rank interleave
259 259 * rules which help us determine which DIMM and the corresponding rank on it
260 260 * that the corresponding channel address is on. It also has logic that we need
261 261 * to use to determine how to transform a channel address into an address on
262 262 * that specific rank. Once we have that, then the initial decoding is done.
263 263 *
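 * Taken together, the phases look roughly like:
 *
 *	system address --(SAD)--> socket + memory controller
 *	               --(TAD)--> channel + channel address
 *	               --(RIR)--> DIMM + rank + rank address
 *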
264 264 * The logic in imc_decode.c is abstracted away from the broader kernel CMI
265 265 * logic. This is on purpose and not only makes unit testing the logic
266 266 * easier, but also allows us to express higher-fidelity errors that are
267 267 * then translated into a much smaller subset. This logic is exercised in
268 268 * the 'imc_test' program, which is built in 'test/os-tests/tests/imc'.
269 269 *
270 270 * Limitations
271 271 * -----------
272 272 *
273 273 * Currently, this driver has the following limitations:
274 274 *
275 275 * o It doesn't decode the row and column addresses.
276 276 * o It doesn't encode from a DIMM address to a system address.
277 277 * o It doesn't properly support lockstep and mirroring modes on Sandy Bridge -
278 278 * Broadwell platforms.
279 279 * o It doesn't support virtual lockstep and adaptive mirroring on Purley
280 280 * platforms.
281 281 * o It doesn't properly handle Intel Optane (3D XPoint) NVDIMMs.
282 282 * o It doesn't know how to decode three-way channel interleaving.
283 283 *
284 284 * None of these are intrinsic problems with the driver; it's mostly a matter
285 285 * of having proper documentation and testing.
286 286 */
287 287
288 288 #include <sys/modctl.h>
289 289 #include <sys/conf.h>
290 290 #include <sys/devops.h>
291 291 #include <sys/ddi.h>
292 292 #include <sys/sunddi.h>
293 293 #include <sys/types.h>
294 294 #include <sys/file.h>
295 295 #include <sys/errno.h>
296 296 #include <sys/open.h>
297 297 #include <sys/cred.h>
298 298 #include <sys/pci.h>
299 299 #include <sys/sysmacros.h>
300 300 #include <sys/avl.h>
301 301 #include <sys/stat.h>
302 302 #include <sys/policy.h>
303 303
304 304 #include <sys/cpu_module.h>
305 305 #include <sys/mc.h>
306 306 #include <sys/mc_intel.h>
307 307
308 308 #include "imc.h"
309 309
310 310 /*
311 311 * These tables contain generational data that varies between processor
312 312 * generations, such as the maximum number of sockets, memory controllers, and the
313 313 * offsets of the various registers.
314 314 */
315 315
316 316 static const imc_gen_data_t imc_gen_data_snb = {
317 317 .igd_max_sockets = 4,
318 318 .igd_max_imcs = 2,
319 319 .igd_max_channels = 4,
320 320 .igd_max_dimms = 3,
321 321 .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
322 322 .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
323 323 IMC_REG_MC_MTR2 },
324 324 .igd_mcmtr_offset = 0x7c,
325 325 .igd_tolm_offset = 0x80,
326 326 .igd_tohm_low_offset = 0x84,
327 327 .igd_sad_dram_offset = 0x80,
328 328 .igd_sad_ndram_rules = 10,
329 329 .igd_sad_nodeid_offset = 0x40,
330 330 .igd_tad_nrules = 12,
331 331 .igd_tad_rule_offset = 0x40,
332 332 .igd_tad_chan_offset = 0x90,
333 333 .igd_tad_sysdef = 0x80,
334 334 .igd_tad_sysdef2 = 0x84,
335 335 .igd_mc_mirror = 0xac,
336 336 .igd_rir_nways = 5,
337 337 .igd_rir_way_offset = 0x108,
338 338 .igd_rir_nileaves = 8,
339 339 .igd_rir_ileave_offset = 0x120,
340 340 .igd_ubox_cpubusno_offset = 0xd0,
341 341 };
342 342
343 343 static const imc_gen_data_t imc_gen_data_ivb = {
344 344 .igd_max_sockets = 4,
345 345 .igd_max_imcs = 2,
346 346 .igd_max_channels = 4,
347 347 .igd_max_dimms = 3,
348 348 .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
349 349 .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
350 350 IMC_REG_MC_MTR2 },
351 351 .igd_mcmtr_offset = 0x7c,
352 352 .igd_tolm_offset = 0x80,
353 353 .igd_tohm_low_offset = 0x84,
354 354 .igd_sad_dram_offset = 0x60,
355 355 .igd_sad_ndram_rules = 20,
356 356 .igd_sad_nodeid_offset = 0x40,
357 357 .igd_tad_nrules = 12,
358 358 .igd_tad_rule_offset = 0x40,
359 359 .igd_tad_chan_offset = 0x90,
360 360 .igd_tad_sysdef = 0x80,
361 361 .igd_tad_sysdef2 = 0x84,
362 362 .igd_mc_mirror = 0xac,
363 363 .igd_rir_nways = 5,
364 364 .igd_rir_way_offset = 0x108,
365 365 .igd_rir_nileaves = 8,
366 366 .igd_rir_ileave_offset = 0x120,
367 367 .igd_ubox_cpubusno_offset = 0xd0,
368 368 };
369 369
370 370 static const imc_gen_data_t imc_gen_data_has_brd = {
371 371 .igd_max_sockets = 4,
372 372 .igd_max_imcs = 2,
373 373 .igd_max_channels = 4,
374 374 .igd_max_dimms = 3,
375 375 .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX,
376 376 .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
377 377 IMC_REG_MC_MTR2 },
378 378 .igd_mcmtr_offset = 0x7c,
379 379 .igd_tolm_offset = 0xd0,
380 380 .igd_tohm_low_offset = 0xd4,
381 381 .igd_tohm_hi_offset = 0xd8,
382 382 .igd_sad_dram_offset = 0x60,
383 383 .igd_sad_ndram_rules = 20,
384 384 .igd_sad_nodeid_offset = 0x40,
385 385 .igd_tad_nrules = 12,
386 386 .igd_tad_rule_offset = 0x40,
387 387 .igd_tad_chan_offset = 0x90,
388 388 .igd_tad_sysdef = 0x80,
389 389 .igd_tad_sysdef2 = 0x84,
390 390 .igd_mc_mirror = 0xac,
391 391 .igd_rir_nways = 5,
392 392 .igd_rir_way_offset = 0x108,
393 393 .igd_rir_nileaves = 8,
394 394 .igd_rir_ileave_offset = 0x120,
395 395 .igd_ubox_cpubusno_offset = 0xd0,
396 396 };
397 397
398 398 static const imc_gen_data_t imc_gen_data_skx = {
399 399 .igd_max_sockets = 8,
400 400 .igd_max_imcs = 2,
401 401 .igd_max_channels = 3,
402 402 .igd_max_dimms = 2,
403 403 .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
404 404 .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1 },
405 405 .igd_mcmtr_offset = 0x87c,
406 406 .igd_topo_offset = 0x88,
407 407 .igd_tolm_offset = 0xd0,
408 408 .igd_tohm_low_offset = 0xd4,
409 409 .igd_tohm_hi_offset = 0xd8,
410 410 .igd_sad_dram_offset = 0x60,
411 411 .igd_sad_ndram_rules = 24,
412 412 .igd_sad_nodeid_offset = 0xc0,
413 413 .igd_tad_nrules = 8,
414 414 .igd_tad_rule_offset = 0x850,
415 415 .igd_tad_chan_offset = 0x90,
416 416 .igd_rir_nways = 4,
417 417 .igd_rir_way_offset = 0x108,
418 418 .igd_rir_nileaves = 4,
419 419 .igd_rir_ileave_offset = 0x120,
420 420 .igd_ubox_cpubusno_offset = 0xcc,
421 421 };
422 422
423 423 /*
424 424 * This table contains all of the devices that we're looking for from a stub
425 425 * perspective. These are organized by generation. Different generations behave
426 426 * in slightly different ways. For example, Sandy Bridge through Broadwell use
427 427 * unique PCI IDs for each PCI device/function combination that appears, whereas
428 428 * Skylake-based systems use the same PCI ID; different device/function
429 429 * values indicate that the ID is used for different purposes.
430 430 */
431 431 /* BEGIN CSTYLED */
432 432 static const imc_stub_table_t imc_stub_table[] = {
433 433 /* Sandy Bridge */
434 434 { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN0, 0x3ca8, 15, 0, "IMC 0 Main 0" },
435 435 { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN1, 0x3c71, 15, 1, "IMC 0 Main 1" },
436 436 { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL0, 0x3caa, 15, 2, "IMC 0 Channel 0 Info" },
437 437 { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL1, 0x3cab, 15, 3, "IMC 0 Channel 1 Info" },
438 438 { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL2, 0x3cac, 15, 4, "IMC 0 Channel 2 Info" },
439 439 { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL3, 0x3cad, 15, 5, "IMC 0 Channel 3 Info" },
440 440 { IMC_GEN_SANDY, IMC_TYPE_SAD_DRAM, 0x3cf4, 12, 6, "SAD DRAM Rules" },
441 441 { IMC_GEN_SANDY, IMC_TYPE_SAD_MMIO, 0x3cf5, 13, 6, "SAD MMIO Rules" },
442 442 { IMC_GEN_SANDY, IMC_TYPE_SAD_MISC, 0x3cf6, 12, 7, "SAD Memory Map" },
443 443 { IMC_GEN_SANDY, IMC_TYPE_UBOX, 0x3ce0, 11, 0, "UBox" },
444 444 { IMC_GEN_SANDY, IMC_TYPE_UBOX_CPUBUSNO, 0x3ce3, 11, 3, "UBox Scratch" },
445 445 { IMC_GEN_SANDY, IMC_TYPE_HA0, 0x3ca0, 14, 0, "Home Agent" },
446 446 /* Ivy Bridge */
447 447 { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN0, 0x0ea8, 15, 0, "IMC 0 Main 0" },
448 448 { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN1, 0x0e71, 15, 1, "IMC 0 Main 1" },
449 449 { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL0, 0x0eaa, 15, 2, "IMC 0 Channel 0 Info" },
450 450 { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL1, 0x0eab, 15, 3, "IMC 0 Channel 1 Info" },
451 451 { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL2, 0x0eac, 15, 4, "IMC 0 Channel 2 Info" },
452 452 { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL3, 0x0ead, 15, 5, "IMC 0 Channel 3 Info" },
453 453 { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN0, 0x0e68, 29, 0, "IMC 1 Main 0" },
454 454 { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN1, 0x0e79, 29, 1, "IMC 1 Main 1" },
455 455 { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL0, 0x0e6a, 15, 2, "IMC 1 Channel 0 Info" },
456 456 { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL1, 0x0e6b, 15, 3, "IMC 1 Channel 1 Info" },
457 457 { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL2, 0x0e6c, 15, 4, "IMC 1 Channel 2 Info" },
458 458 { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL3, 0x0e6d, 15, 5, "IMC 1 Channel 3 Info" },
459 459 { IMC_GEN_IVY, IMC_TYPE_SAD_DRAM, 0x0ec8, 22, 0, "SAD DRAM Rules" },
460 460 { IMC_GEN_IVY, IMC_TYPE_SAD_MMIO, 0x0ec9, 22, 1, "SAD MMIO Rules" },
461 461 { IMC_GEN_IVY, IMC_TYPE_SAD_MISC, 0x0eca, 22, 2, "SAD Memory Map" },
462 462 { IMC_GEN_IVY, IMC_TYPE_UBOX, 0x0e1e, 11, 0, "UBox" },
463 463 { IMC_GEN_IVY, IMC_TYPE_UBOX_CPUBUSNO, 0x0e1f, 11, 3, "UBox Scratch" },
464 464 { IMC_GEN_IVY, IMC_TYPE_HA0, 0x0ea0, 14, 0, "Home Agent 0" },
465 465 { IMC_GEN_IVY, IMC_TYPE_HA1, 0x0e60, 28, 0, "Home Agent 1" },
466 466 /* Haswell */
467 467 { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN0, 0x2fa8, 19, 0, "IMC 0 Main 0" },
468 468 { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN1, 0x2f71, 19, 1, "IMC 0 Main 1" },
469 469 { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL0, 0x2faa, 19, 2, "IMC 0 Channel 0 Info" },
470 470 { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL1, 0x2fab, 19, 3, "IMC 0 Channel 1 Info" },
471 471 { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL2, 0x2fac, 19, 4, "IMC 0 Channel 2 Info" },
472 472 { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL3, 0x2fad, 19, 5, "IMC 0 Channel 3 Info" },
473 473 { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN0, 0x2f68, 22, 0, "IMC 1 Main 0" },
474 474 { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN1, 0x2f79, 22, 1, "IMC 1 Main 1" },
475 475 { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL0, 0x2f6a, 22, 2, "IMC 1 Channel 0 Info" },
476 476 { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL1, 0x2f6b, 22, 3, "IMC 1 Channel 1 Info" },
477 477 { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL2, 0x2f6c, 22, 4, "IMC 1 Channel 2 Info" },
478 478 { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL3, 0x2f6d, 22, 5, "IMC 1 Channel 3 Info" },
479 479 { IMC_GEN_HASWELL, IMC_TYPE_SAD_DRAM, 0x2ffc, 15, 4, "SAD DRAM Rules" },
480 480 { IMC_GEN_HASWELL, IMC_TYPE_SAD_MMIO, 0x2ffd, 15, 5, "SAD MMIO Rules" },
481 481 { IMC_GEN_HASWELL, IMC_TYPE_VTD_MISC, 0x2f28, 5, 0, "Misc. Virtualization" },
482 482 { IMC_GEN_HASWELL, IMC_TYPE_UBOX, 0x2f1e, 16, 5, "UBox" },
483 483 { IMC_GEN_HASWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x2f1f, 16, 7, "UBox Scratch" },
484 484 { IMC_GEN_HASWELL, IMC_TYPE_HA0, 0x2fa0, 18, 0, "Home Agent 0" },
485 485 { IMC_GEN_HASWELL, IMC_TYPE_HA1, 0x2f60, 18, 4, "Home Agent 1" },
486 486 /* Broadwell Devices */
487 487 { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN0, 0x6fa8, 19, 0, "IMC 0 Main 0" },
488 488 { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN1, 0x6f71, 19, 1, "IMC 0 Main 1" },
489 489 { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL0, 0x6faa, 19, 2, "IMC 0 Channel 0 Info" },
490 490 { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL1, 0x6fab, 19, 3, "IMC 0 Channel 1 Info" },
491 491 { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL2, 0x6fac, 19, 4, "IMC 0 Channel 2 Info" },
492 492 { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL3, 0x6fad, 19, 5, "IMC 0 Channel 3 Info" },
493 493 { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN0, 0x6f68, 22, 0, "IMC 1 Main 0" },
494 494 { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN1, 0x6f79, 22, 1, "IMC 1 Main 1" },
495 495 { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL0, 0x6f6a, 22, 2, "IMC 1 Channel 0 Info" },
496 496 { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL1, 0x6f6b, 22, 3, "IMC 1 Channel 1 Info" },
497 497 { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL2, 0x6f6c, 22, 4, "IMC 1 Channel 2 Info" },
498 498 { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL3, 0x6f6d, 22, 5, "IMC 1 Channel 3 Info" },
499 499 { IMC_GEN_BROADWELL, IMC_TYPE_SAD_DRAM, 0x6ffc, 15, 4, "SAD DRAM Rules" },
500 500 { IMC_GEN_BROADWELL, IMC_TYPE_SAD_MMIO, 0x6ffd, 15, 5, "SAD MMIO Rules" },
501 501 { IMC_GEN_BROADWELL, IMC_TYPE_VTD_MISC, 0x6f28, 5, 0, "Misc. Virtualization" },
502 502 { IMC_GEN_BROADWELL, IMC_TYPE_UBOX, 0x6f1e, 16, 5, "UBox" },
503 503 { IMC_GEN_BROADWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x6f1f, 16, 7, "UBox Scratch" },
504 504 { IMC_GEN_BROADWELL, IMC_TYPE_HA0, 0x6fa0, 18, 0, "Home Agent 0" },
505 505 { IMC_GEN_BROADWELL, IMC_TYPE_HA1, 0x6f60, 18, 4, "Home Agent 1" },
506 506 /* Skylake and Cascade Lake Devices */
507 507 { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_M2M, 0x2066, 8, 0, "IMC 0 M2M" },
508 508 { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_M2M, 0x2066, 9, 0, "IMC 1 M2M" },
509 509 { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_MAIN0, 0x2040, 10, 0, "IMC 0 Main / Channel 0" },
510 510 { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_MAIN0, 0x2040, 12, 0, "IMC 1 Main / Channel 0" },
511 511 { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL1, 0x2044, 10, 4, "IMC 0 Channel 1" },
512 512 { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL2, 0x2048, 11, 0, "IMC 0 Channel 2" },
513 513 { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL1, 0x2044, 12, 4, "IMC 1 Channel 1" },
514 514 { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL2, 0x2048, 13, 0, "IMC 1 Channel 2" },
515 515 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_DRAM, 0x2054, 29, 0, "SAD DRAM Rules" },
516 516 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MMIO, 0x2055, 29, 1, "SAD MMIO Rules" },
517 517 { IMC_GEN_SKYLAKE, IMC_TYPE_VTD_MISC, 0x2024, 5, 0, "Misc. Virtualization" },
518 518
519 519 /*
520 520 * There is one SAD MC Route type device per core! Because of this, a
521 521 * wide range of device and function numbers is allocated. For now, we
522 522 * list all 32 of them out.
523 523 */
524 524 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 0, "Per-Core SAD" },
525 525 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 1, "Per-Core SAD" },
526 526 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 2, "Per-Core SAD" },
527 527 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 3, "Per-Core SAD" },
528 528 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 4, "Per-Core SAD" },
529 529 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 5, "Per-Core SAD" },
530 530 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 6, "Per-Core SAD" },
531 531 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 7, "Per-Core SAD" },
532 532 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 0, "Per-Core SAD" },
533 533 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 1, "Per-Core SAD" },
534 534 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 2, "Per-Core SAD" },
535 535 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 3, "Per-Core SAD" },
536 536 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 4, "Per-Core SAD" },
537 537 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 5, "Per-Core SAD" },
538 538 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 6, "Per-Core SAD" },
539 539 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 7, "Per-Core SAD" },
540 540 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 0, "Per-Core SAD" },
541 541 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 1, "Per-Core SAD" },
542 542 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 2, "Per-Core SAD" },
543 543 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 3, "Per-Core SAD" },
544 544 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 4, "Per-Core SAD" },
545 545 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 5, "Per-Core SAD" },
546 546 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 6, "Per-Core SAD" },
547 547 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 7, "Per-Core SAD" },
548 548 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 0, "Per-Core SAD" },
549 549 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 1, "Per-Core SAD" },
550 550 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 2, "Per-Core SAD" },
551 551 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 3, "Per-Core SAD" },
552 552 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 4, "Per-Core SAD" },
553 553 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 5, "Per-Core SAD" },
554 554 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 6, "Per-Core SAD" },
555 555 { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 7, "Per-Core SAD" },
556 556
557 557 { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX, 0x2014, 8, 0, "UBox" },
558 558 { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX_CPUBUSNO, 0x2016, 8, 2, "DECS" },
559 559 };
560 560 /* END CSTYLED */
561 561
562 562 #define IMC_PCI_VENDOR_INTC 0x8086
563 563
564 564 /*
565 565 * Our IMC data is global and statically set up during a combination of
566 566 * _init(9E) and attach(9E). While we have a module dependency between the PCI
567 567 * stub driver, imcstub, and this pseudo-driver, imc, the dependencies don't
568 568 * guarantee that the imc driver has finished attaching. As such, we make sure
569 569 * that the imc driver can operate even before it has attached.
570 570 */
571 571 static imc_t *imc_data = NULL;
572 572
573 573 /*
574 574 * By default we should not allow the stubs to detach as we don't have a good
575 575 * way of forcing them to attach again. This is provided in case someone does
576 576 * want to allow the driver to unload.
577 577 */
578 578 int imc_allow_detach = 0;
579 579
580 580 static void
581 581 imc_set_gen_data(imc_t *imc)
582 582 {
583 583 switch (imc->imc_gen) {
584 584 case IMC_GEN_SANDY:
585 585 imc->imc_gen_data = &imc_gen_data_snb;
586 586 break;
587 587 case IMC_GEN_IVY:
588 588 imc->imc_gen_data = &imc_gen_data_ivb;
589 589 break;
590 590 case IMC_GEN_HASWELL:
591 591 case IMC_GEN_BROADWELL:
592 592 imc->imc_gen_data = &imc_gen_data_has_brd;
593 593 break;
594 594 case IMC_GEN_SKYLAKE:
595 595 imc->imc_gen_data = &imc_gen_data_skx;
596 596 break;
597 597 default:
598 598 dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
599 599 "set to unknown generation: %u", imc->imc_gen);
600 600 }
601 601 }
602 602
603 603 /*
604 604 * If our device (dev_info_t) does not have a non-zero unit address, then
605 605 * devfsadmd will not pay attention to us at all. Therefore we need to set the
606 606 * unit address below, before we create minor nodes.
607 607 *
608 608 * The rest of the system expects us to have one minor node per socket. The
609 609 * minor node ID should be the ID of the socket.
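 *
 * As an illustrative example (path details assumed, not taken from this
 * file), socket 0 would show up as a minor node named "mc-imc-0" on the
 * imc@1 pseudo-device.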
610 610 */
611 611 static boolean_t
612 612 imc_create_minors(imc_t *imc)
613 613 {
614 614 uint_t i;
615 615
616 616 ddi_set_name_addr(imc->imc_dip, "1");
617 617 for (i = 0; i < imc->imc_nsockets; i++) {
618 618 char buf[MAXNAMELEN];
619 619
620 620 if (snprintf(buf, sizeof (buf), "mc-imc-%u", i) >=
621 621 sizeof (buf)) {
622 622 goto fail;
623 623 }
624 624
625 625 if (ddi_create_minor_node(imc->imc_dip, buf, S_IFCHR, i,
626 626 "ddi_mem_ctrl", 0) != DDI_SUCCESS) {
627 627 dev_err(imc->imc_dip, CE_WARN, "failed to create "
628 628 "minor node %u: %s", i, buf);
629 629 goto fail;
630 630 }
631 631 }
632 632 return (B_TRUE);
633 633
634 634 fail:
635 635 ddi_remove_minor_node(imc->imc_dip, NULL);
636 636 return (B_FALSE);
637 637 }
638 638
639 639 /*
640 640 * Check the current MC route value for this SAD. On Skylake systems there is
641 641 * one per core, and every core should agree. If they do not, we will not
642 642 * trust the SAD MCROUTE values, and system address decoding will fail on
643 643 * Skylake.
644 644 */
645 645 static void
646 646 imc_mcroute_check(imc_t *imc, imc_sad_t *sad, imc_stub_t *stub)
647 647 {
648 648 uint32_t val;
649 649
650 650 val = pci_config_get32(stub->istub_cfgspace,
651 651 IMC_REG_SKX_SAD_MC_ROUTE_TABLE);
652 652 if (val == PCI_EINVAL32) {
653 653 sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
654 654 return;
655 655 }
656 656
657 657 if ((sad->isad_flags & IMC_SAD_MCROUTE_VALID) == 0 && val != 0) {
658 658 sad->isad_flags |= IMC_SAD_MCROUTE_VALID;
659 659 sad->isad_mcroute.ismc_raw_mcroute = val;
660 660 return;
661 661 }
662 662
663 663 /*
664 664 * Occasionally we see MC ROUTE table entries with a value of zero.
665 665 * We should ignore those for now.
666 666 */
667 667 if (val != sad->isad_mcroute.ismc_raw_mcroute && val != 0) {
668 668 		dev_err(imc->imc_dip, CE_WARN, "SAD MC_ROUTE_TABLE mismatch "
669 669 		    "with socket. SAD has val 0x%x, system has 0x%x",
670 670 		    val, sad->isad_mcroute.ismc_raw_mcroute);
671 671 sad->isad_valid |= IMC_SAD_V_BAD_MCROUTE;
672 672 }
673 673 }
674 674
675 675 /*
676 676 * On Skylake, many of the devices that we care about are on separate PCI Buses.
677 677 * These can be mapped together by the DECS register. However, we need to know
678 678 * how to map different buses together so that we can more usefully associate
679 679 * information. The set of buses is all present in the DECS register. We'll
680 680 * effectively assign sockets to buses. This also comes up on pre-Skylake
681 681 * systems.
682 682 */
683 683 static boolean_t
684 684 imc_map_buses(imc_t *imc)
685 685 {
686 686 imc_stub_t *stub;
687 687 uint_t nsock;
688 688
689 689 /*
690 690 * Find the UBOX_DECS registers so we can establish socket mappings. On
691 691 * Skylake, there are three different sets of buses that we need to
692 692 * cover all of our devices, while there are only two before that.
693 693 */
694 694 for (nsock = 0, stub = avl_first(&imc->imc_stubs); stub != NULL;
695 695 stub = AVL_NEXT(&imc->imc_stubs, stub)) {
696 696 uint32_t busno;
697 697
698 698 if (stub->istub_table->imcs_type != IMC_TYPE_UBOX_CPUBUSNO) {
699 699 continue;
700 700 }
701 701
702 702 busno = pci_config_get32(stub->istub_cfgspace,
703 703 imc->imc_gen_data->igd_ubox_cpubusno_offset);
704 704 if (busno == PCI_EINVAL32) {
705 705 dev_err(imc->imc_dip, CE_WARN, "failed to read "
706 706 "UBOX_DECS CPUBUSNO0: invalid PCI read");
707 707 return (B_FALSE);
708 708 }
709 709
710 710 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
711 711 imc->imc_sockets[nsock].isock_nbus = 3;
712 712 imc->imc_sockets[nsock].isock_bus[0] =
713 713 IMC_UBOX_CPUBUSNO_0(busno);
714 714 imc->imc_sockets[nsock].isock_bus[1] =
715 715 IMC_UBOX_CPUBUSNO_1(busno);
716 716 imc->imc_sockets[nsock].isock_bus[2] =
717 717 IMC_UBOX_CPUBUSNO_2(busno);
718 718 } else {
719 719 imc->imc_sockets[nsock].isock_bus[0] =
720 720 IMC_UBOX_CPUBUSNO_0(busno);
721 721 imc->imc_sockets[nsock].isock_bus[1] =
722 722 IMC_UBOX_CPUBUSNO_1(busno);
723 723 imc->imc_sockets[nsock].isock_nbus = 2;
724 724 }
725 725 nsock++;
726 726 }
727 727 imc->imc_nsockets = nsock;
728 728
729 729 return (B_TRUE);
730 730 }
731 731
732 732 /*
733 733 * For a given stub that we've found, map it to its corresponding socket based
734 734 * on the PCI bus that it has.
735 735 */
736 736 static imc_socket_t *
737 737 imc_map_find_socket(imc_t *imc, imc_stub_t *stub)
738 738 {
739 739 uint_t i;
740 740
741 741 for (i = 0; i < imc->imc_nsockets; i++) {
742 742 uint_t bus;
743 743
744 744 for (bus = 0; bus < imc->imc_sockets[i].isock_nbus; bus++) {
745 745 if (imc->imc_sockets[i].isock_bus[bus] ==
746 746 stub->istub_bus) {
747 747 return (&imc->imc_sockets[i]);
748 748 }
749 749 }
750 750 }
751 751
752 752 return (NULL);
753 753 }
754 754
755 755 static boolean_t
756 756 imc_map_stubs(imc_t *imc)
757 757 {
758 758 imc_stub_t *stub;
759 759
760 760 if (!imc_map_buses(imc)) {
761 761 return (B_FALSE);
762 762 }
763 763
765 765 for (stub = avl_first(&imc->imc_stubs); stub != NULL;
766 766 stub = AVL_NEXT(&imc->imc_stubs, stub)) {
767 767 imc_socket_t *sock = imc_map_find_socket(imc, stub);
768 768
769 769 if (sock == NULL) {
770 770 dev_err(imc->imc_dip, CE_WARN, "found stub type %u "
771 771 "PCI%x,%x with bdf %u/%u/%u that does not match a "
772 772 "known PCI bus for any of %u sockets",
773 773 stub->istub_table->imcs_type, stub->istub_vid,
774 774 stub->istub_did, stub->istub_bus, stub->istub_dev,
775 775 stub->istub_func, imc->imc_nsockets);
776 776 continue;
777 777 }
778 778
779 779 		/*
780 780 		 * We don't have to worry about duplicates here, as we ensure
781 781 		 * that each stub has a unique bdf.
782 782 		 */
783 783 switch (stub->istub_table->imcs_type) {
784 784 case IMC_TYPE_MC0_M2M:
785 785 sock->isock_imcs[0].icn_m2m = stub;
786 786 break;
787 787 case IMC_TYPE_MC1_M2M:
788 788 sock->isock_imcs[1].icn_m2m = stub;
789 789 break;
790 790 case IMC_TYPE_MC0_MAIN0:
791 791 sock->isock_nimc++;
792 792 sock->isock_imcs[0].icn_main0 = stub;
793 793
794 794 /*
795 795 * On Skylake, the MAIN0 does double duty as channel
796 796 * zero and as the TAD.
797 797 */
798 798 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
799 799 sock->isock_imcs[0].icn_nchannels++;
800 800 sock->isock_imcs[0].icn_channels[0].ich_desc =
801 801 stub;
802 802 sock->isock_tad[0].itad_stub = stub;
803 803 sock->isock_ntad++;
804 804 }
805 805 break;
806 806 case IMC_TYPE_MC0_MAIN1:
807 807 sock->isock_imcs[0].icn_main1 = stub;
808 808 break;
809 809 case IMC_TYPE_MC1_MAIN0:
810 810 sock->isock_nimc++;
811 811 sock->isock_imcs[1].icn_main0 = stub;
812 812
813 813 /*
814 814 * On Skylake, the MAIN0 does double duty as channel
815 815 * zero and as the TAD.
816 816 */
817 817 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
818 818 sock->isock_imcs[1].icn_nchannels++;
819 819 sock->isock_imcs[1].icn_channels[0].ich_desc =
820 820 stub;
821 821 sock->isock_tad[1].itad_stub = stub;
822 822 sock->isock_ntad++;
823 823 }
824 824 break;
825 825 case IMC_TYPE_MC1_MAIN1:
826 826 sock->isock_imcs[1].icn_main1 = stub;
827 827 break;
828 828 case IMC_TYPE_MC0_CHANNEL0:
829 829 sock->isock_imcs[0].icn_nchannels++;
830 830 sock->isock_imcs[0].icn_channels[0].ich_desc = stub;
831 831 break;
832 832 case IMC_TYPE_MC0_CHANNEL1:
833 833 sock->isock_imcs[0].icn_nchannels++;
834 834 sock->isock_imcs[0].icn_channels[1].ich_desc = stub;
835 835 break;
836 836 case IMC_TYPE_MC0_CHANNEL2:
837 837 sock->isock_imcs[0].icn_nchannels++;
838 838 sock->isock_imcs[0].icn_channels[2].ich_desc = stub;
839 839 break;
840 840 case IMC_TYPE_MC0_CHANNEL3:
841 841 sock->isock_imcs[0].icn_nchannels++;
842 842 sock->isock_imcs[0].icn_channels[3].ich_desc = stub;
843 843 break;
844 844 case IMC_TYPE_MC1_CHANNEL0:
845 845 sock->isock_imcs[1].icn_nchannels++;
846 846 sock->isock_imcs[1].icn_channels[0].ich_desc = stub;
847 847 break;
848 848 case IMC_TYPE_MC1_CHANNEL1:
849 849 sock->isock_imcs[1].icn_nchannels++;
850 850 sock->isock_imcs[1].icn_channels[1].ich_desc = stub;
851 851 break;
852 852 case IMC_TYPE_MC1_CHANNEL2:
853 853 sock->isock_imcs[1].icn_nchannels++;
854 854 sock->isock_imcs[1].icn_channels[2].ich_desc = stub;
855 855 break;
856 856 case IMC_TYPE_MC1_CHANNEL3:
857 857 sock->isock_imcs[1].icn_nchannels++;
858 858 sock->isock_imcs[1].icn_channels[3].ich_desc = stub;
859 859 break;
860 860 case IMC_TYPE_SAD_DRAM:
861 861 sock->isock_sad.isad_dram = stub;
862 862 break;
863 863 case IMC_TYPE_SAD_MMIO:
864 864 sock->isock_sad.isad_mmio = stub;
865 865 break;
866 866 case IMC_TYPE_SAD_MISC:
867 867 sock->isock_sad.isad_tolh = stub;
868 868 break;
869 869 case IMC_TYPE_VTD_MISC:
870 870 /*
871 871 * Some systems have multiple VT-D Misc. entry points
872 872 * in the system. In this case, only use the first one
873 873 * we find.
874 874 */
875 875 if (imc->imc_gvtd_misc == NULL) {
876 876 imc->imc_gvtd_misc = stub;
877 877 }
878 878 break;
879 879 case IMC_TYPE_SAD_MCROUTE:
880 880 ASSERT3U(imc->imc_gen, >=, IMC_GEN_SKYLAKE);
881 881 imc_mcroute_check(imc, &sock->isock_sad, stub);
882 882 break;
883 883 case IMC_TYPE_UBOX:
884 884 sock->isock_ubox = stub;
885 885 break;
886 886 case IMC_TYPE_HA0:
887 887 sock->isock_ntad++;
888 888 sock->isock_tad[0].itad_stub = stub;
889 889 break;
890 890 case IMC_TYPE_HA1:
891 891 sock->isock_ntad++;
892 892 sock->isock_tad[1].itad_stub = stub;
893 893 break;
894 894 case IMC_TYPE_UBOX_CPUBUSNO:
895 895 sock->isock_cpubusno = stub;
896 896 break;
897 897 default:
898 898 /*
899 899 * Attempt to still attach if we can.
900 900 */
901 901 dev_err(imc->imc_dip, CE_WARN, "Encountered unknown "
902 902 "IMC type (%u) on PCI %x,%x",
903 903 stub->istub_table->imcs_type,
904 904 stub->istub_vid, stub->istub_did);
905 905 break;
906 906 }
907 907 }
908 908
909 909 return (B_TRUE);
910 910 }
911 911
912 912 /*
913 913 * Go through and fix up various aspects of the stub mappings on systems. The
914 914 * following is a list of what we need to fix up:
915 915 *
916 916 * 1. On Haswell and newer systems, there is only one global VT-d device. We
917 917 * need to go back and map that to all of the per-socket imc_sad_t entries.
918 918 */
919 919 static void
920 920 imc_fixup_stubs(imc_t *imc)
921 921 {
922 922 if (imc->imc_gen >= IMC_GEN_HASWELL) {
923 923 uint_t i;
924 924
925 925 for (i = 0; i < imc->imc_nsockets; i++) {
926 926 ASSERT3P(imc->imc_sockets[i].isock_sad.isad_tolh,
927 927 ==, NULL);
928 928 imc->imc_sockets[i].isock_sad.isad_tolh =
929 929 imc->imc_gvtd_misc;
930 930 }
931 931 }
932 932 }
933 933
934 934 /*
935 935 * In the wild we've hit a few odd cases where firmware does not expose all of
936 936 * the devices that we might expect. In particular, we've seen and validated
937 937 * the following cases:
938 938 *
939 939 * o We don't find all of the channel devices that we expect, e.g. we have the
940 940 * stubs for channels 1-3, but not 0. That has been seen on an Intel S2600CW
941 941 * with an E5-2630v3.
942 942 */
943 943 static boolean_t
944 944 imc_validate_stubs(imc_t *imc)
945 945 {
946 946 for (uint_t sock = 0; sock < imc->imc_nsockets; sock++) {
947 947 imc_socket_t *socket = &imc->imc_sockets[sock];
948 948
949 949 for (uint_t mc = 0; mc < socket->isock_nimc; mc++) {
950 950 imc_mc_t *mcp = &socket->isock_imcs[mc];
951 951
952 952 for (uint_t chan = 0; chan < mcp->icn_nchannels;
953 953 chan++) {
954 954 if (mcp->icn_channels[chan].ich_desc == NULL) {
955 955 dev_err(imc->imc_dip, CE_WARN,
956 956 "!missing device for socket %u/"
957 957 "imc %u/channel %u", sock, mc,
958 958 chan);
959 959 return (B_FALSE);
960 960 }
961 961 }
962 962 }
963 963 }
964 964
965 965 return (B_TRUE);
966 966 }
967 967
968 968 /*
969 969 * Attempt to map all of the discovered sockets to the corresponding APIC-based
970 970 * socket. We do these mappings by getting the node ID of the socket and
971 971 * masking off the home agent bits that may be present in it. We use the
972 972 * UBOX's copy to avoid the home agent related bits that are present in other
973 973 * registers.
974 974 */
975 975 static void
976 976 imc_map_sockets(imc_t *imc)
977 977 {
978 978 uint_t i;
979 979
980 980 for (i = 0; i < imc->imc_nsockets; i++) {
981 981 uint32_t nodeid;
982 982 ddi_acc_handle_t h;
983 983
984 984 h = imc->imc_sockets[i].isock_ubox->istub_cfgspace;
985 985 nodeid = pci_config_get32(h,
986 986 imc->imc_gen_data->igd_sad_nodeid_offset);
987 987 if (nodeid == PCI_EINVAL32) {
988 988 imc->imc_sockets[i].isock_valid |=
989 989 IMC_SOCKET_V_BAD_NODEID;
990 990 continue;
991 991 }
992 992
993 993 imc->imc_sockets[i].isock_nodeid = IMC_NODEID_UBOX_MASK(nodeid);
994 994 imc->imc_spointers[nodeid] = &imc->imc_sockets[i];
995 995 }
996 996 }
997 997
998 998 /*
999 999 * Decode the MTR, accounting for variances between processor generations.
1000 1000 */
1001 1001 static void
1002 1002 imc_decode_mtr(imc_t *imc, imc_mc_t *icn, imc_dimm_t *dimm, uint32_t mtr)
1003 1003 {
1004 1004 uint8_t disable;
1005 1005
1006 1006 /*
1007 1007 * Check present first, before worrying about anything else.
1008 1008 */
1009 1009 if (imc->imc_gen < IMC_GEN_SKYLAKE &&
1010 1010 IMC_MTR_PRESENT_SNB_BRD(mtr) == 0) {
1011 1011 dimm->idimm_present = B_FALSE;
1012 1012 return;
1013 1013 } else if (imc->imc_gen >= IMC_GEN_SKYLAKE &&
1014 1014 IMC_MTR_PRESENT_SKYLAKE(mtr) == 0) {
1015 1015 dimm->idimm_present = B_FALSE;
1016 1016 return;
1017 1017 }
1018 1018
1019 1019 dimm->idimm_present = B_TRUE;
1020 1020 dimm->idimm_ncolumns = IMC_MTR_CA_WIDTH(mtr) + IMC_MTR_CA_BASE;
1021 1021 if (dimm->idimm_ncolumns < IMC_MTR_CA_MIN ||
1022 1022 dimm->idimm_ncolumns > IMC_MTR_CA_MAX) {
1023 1023 dimm->idimm_valid |= IMC_DIMM_V_BAD_COLUMNS;
1024 1024 }
1025 1025
1026 1026 dimm->idimm_nrows = IMC_MTR_RA_WIDTH(mtr) + IMC_MTR_RA_BASE;
1027 1027 if (dimm->idimm_nrows < IMC_MTR_RA_MIN ||
1028 1028 dimm->idimm_nrows > IMC_MTR_RA_MAX) {
1029 1029 dimm->idimm_valid |= IMC_DIMM_V_BAD_ROWS;
1030 1030 }
1031 1031
1032 1032 /*
1033 1033 	 * Determine the density; this information is not present on Sandy Bridge.
1034 1034 */
1035 1035 switch (imc->imc_gen) {
1036 1036 case IMC_GEN_IVY:
1037 1037 dimm->idimm_density = 1U << IMC_MTR_DENSITY_IVY_BRD(mtr);
1038 1038 break;
1039 1039 case IMC_GEN_HASWELL:
1040 1040 case IMC_GEN_BROADWELL:
1041 1041 switch (IMC_MTR_DENSITY_IVY_BRD(mtr)) {
1042 1042 case 0:
1043 1043 default:
1044 1044 dimm->idimm_density = 0;
1045 1045 dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
1046 1046 break;
1047 1047 case 1:
1048 1048 dimm->idimm_density = 2;
1049 1049 break;
1050 1050 case 2:
1051 1051 dimm->idimm_density = 4;
1052 1052 break;
1053 1053 case 3:
1054 1054 dimm->idimm_density = 8;
1055 1055 break;
1056 1056 }
1057 1057 break;
1058 1058 case IMC_GEN_SKYLAKE:
1059 1059 switch (IMC_MTR_DENSITY_SKX(mtr)) {
1060 1060 case 0:
1061 1061 default:
1062 1062 dimm->idimm_density = 0;
1063 1063 dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
1064 1064 break;
1065 1065 case 1:
1066 1066 dimm->idimm_density = 2;
1067 1067 break;
1068 1068 case 2:
1069 1069 dimm->idimm_density = 4;
1070 1070 break;
1071 1071 case 3:
1072 1072 dimm->idimm_density = 8;
1073 1073 break;
1074 1074 case 4:
1075 1075 dimm->idimm_density = 16;
1076 1076 break;
1077 1077 case 5:
1078 1078 dimm->idimm_density = 12;
1079 1079 break;
1080 1080 }
1081 1081 break;
1082 1082 case IMC_GEN_UNKNOWN:
1083 1083 case IMC_GEN_SANDY:
1084 1084 dimm->idimm_density = 0;
1085 1085 break;
1086 1086 }
1087 1087
1088 1088 /*
1089 1089 * The values of width are the same on IVY->SKX, but the bits are
1090 1090 * different. This doesn't exist on SNB.
1091 1091 */
1092 1092 if (imc->imc_gen > IMC_GEN_SANDY) {
1093 1093 uint8_t width;
1094 1094
1095 1095 if (imc->imc_gen >= IMC_GEN_BROADWELL) {
1096 1096 width = IMC_MTR_WIDTH_BRD_SKX(mtr);
1097 1097 } else {
1098 1098 width = IMC_MTR_WIDTH_IVB_HAS(mtr);
1099 1099 }
1100 1100 switch (width) {
1101 1101 case 0:
1102 1102 dimm->idimm_width = 4;
1103 1103 break;
1104 1104 case 1:
1105 1105 dimm->idimm_width = 8;
1106 1106 break;
1107 1107 case 2:
1108 1108 dimm->idimm_width = 16;
1109 1109 break;
1110 1110 default:
1111 1111 dimm->idimm_width = 0;
1112 1112 dimm->idimm_valid |= IMC_DIMM_V_BAD_WIDTH;
1113 1113 break;
1114 1114 }
1115 1115 } else {
1116 1116 dimm->idimm_width = 0;
1117 1117 }
1118 1118
1119 1119 dimm->idimm_nranks = 1 << IMC_MTR_DDR_RANKS(mtr);
1120 1120 switch (imc->imc_gen) {
1121 1121 case IMC_GEN_HASWELL:
1122 1122 case IMC_GEN_BROADWELL:
1123 1123 case IMC_GEN_SKYLAKE:
1124 1124 if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX_HAS_SKX) {
1125 1125 dimm->idimm_nranks = 0;
1126 1126 dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
1127 1127 }
1128 1128 break;
1129 1129 default:
1130 1130 if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX) {
1131 1131 dimm->idimm_nranks = 0;
1132 1132 dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
1133 1133 }
1134 1134 }
1135 1135
1136 1136 disable = IMC_MTR_RANK_DISABLE(mtr);
1137 1137 dimm->idimm_ranks_disabled[0] = (disable & 0x1) != 0;
1138 1138 dimm->idimm_ranks_disabled[1] = (disable & 0x2) != 0;
1139 1139 dimm->idimm_ranks_disabled[2] = (disable & 0x4) != 0;
1140 1140 dimm->idimm_ranks_disabled[3] = (disable & 0x8) != 0;
1141 1141
1142 1142 /*
1143 1143 * Only Haswell and later have this information.
1144 1144 */
1145 1145 if (imc->imc_gen >= IMC_GEN_HASWELL) {
1146 1146 dimm->idimm_hdrl = IMC_MTR_HDRL_HAS_SKX(mtr) != 0;
1147 1147 dimm->idimm_hdrl_parity = IMC_MTR_HDRL_PARITY_HAS_SKX(mtr) != 0;
1148 1148 dimm->idimm_3dsranks = IMC_MTR_3DSRANKS_HAS_SKX(mtr);
1149 1149 if (dimm->idimm_3dsranks != 0) {
1150 1150 dimm->idimm_3dsranks = 1 << dimm->idimm_3dsranks;
1151 1151 }
1152 1152 }
1153 1153
1154 1154
1155 1155 if (icn->icn_dimm_type == IMC_DIMM_DDR4) {
1156 1156 dimm->idimm_nbanks = 16;
1157 1157 } else {
1158 1158 dimm->idimm_nbanks = 8;
1159 1159 }
1160 1160
1161 1161 /*
1162 1162 	 * To calculate the DIMM size, we first take the number of rows and
1163 1163 	 * columns. This gives us the number of slots per chip. In a given rank
1164 1164 	 * there are nbanks of these, and there are nranks of those. Each of
1165 1165 	 * these slots can fit a byte.
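	 *
	 * As an illustrative example (editor-chosen values): 10 column bits
	 * and 15 row bits give 2^25 slots per bank; 16 banks * 2 ranks * 8 *
	 * 2^25 then yields 2^33 bytes, i.e. an 8 GiB DIMM.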
1166 1166 */
1167 1167 dimm->idimm_size = dimm->idimm_nbanks * dimm->idimm_nranks * 8 *
1168 1168 (1ULL << (dimm->idimm_ncolumns + dimm->idimm_nrows));
1169 1169 }
1170 1170
1171 1171 static void
1172 1172 imc_fill_dimms(imc_t *imc, imc_mc_t *icn, imc_channel_t *chan)
1173 1173 {
1174 1174 uint_t i;
1175 1175
1176 1176 /*
1177 1177 	 * There's one register for each DIMM that might be present; we always
1178 1178 	 * read each one to determine information about the DIMMs.
1179 1179 */
1180 1180 chan->ich_ndimms = imc->imc_gen_data->igd_max_dimms;
1181 1181 for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
1182 1182 uint32_t mtr;
1183 1183 imc_dimm_t *dimm = &chan->ich_dimms[i];
1184 1184
1185 1185 bzero(dimm, sizeof (imc_dimm_t));
1186 1186 mtr = pci_config_get32(chan->ich_desc->istub_cfgspace,
1187 1187 imc->imc_gen_data->igd_mtr_offsets[i]);
1188 1188 dimm->idimm_mtr = mtr;
1189 1189 /*
1190 1190 * We don't really expect to get a bad PCIe read. However, if we
1191 1191 * do, treat that for the moment as though the DIMM is bad.
1192 1192 */
1193 1193 if (mtr == PCI_EINVAL32) {
1194 1194 dimm->idimm_valid |= IMC_DIMM_V_BAD_PCI_READ;
1195 1195 continue;
1196 1196 }
1197 1197
1198 1198 imc_decode_mtr(imc, icn, dimm, mtr);
1199 1199 }
1200 1200 }
1201 1201
1202 1202 static boolean_t
1203 1203 imc_fill_controller(imc_t *imc, imc_mc_t *icn)
1204 1204 {
1205 1205 uint32_t mcmtr;
1206 1206
1207 1207 mcmtr = pci_config_get32(icn->icn_main0->istub_cfgspace,
1208 1208 imc->imc_gen_data->igd_mcmtr_offset);
1209 1209 if (mcmtr == PCI_EINVAL32) {
1210 1210 icn->icn_invalid = B_TRUE;
1211 1211 return (B_FALSE);
1212 1212 }
1213 1213
1214 1214 icn->icn_closed = IMC_MCMTR_CLOSED_PAGE(mcmtr) != 0;
1215 1215 if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1216 1216 icn->icn_lockstep = IMC_MCMTR_LOCKSTEP(mcmtr) != 0;
1217 1217 } else {
1218 1218 icn->icn_lockstep = B_FALSE;
1219 1219 }
1220 1220
1221 1221 icn->icn_ecc = IMC_MCMTR_ECC_ENABLED(mcmtr) != 0;
1222 1222
1223 1223 /*
1224 1224 	 * SNB and IVB only support DDR3. Haswell and Broadwell may support
1225 1225 	 * DDR4, depending on the SKU. Skylake only supports DDR4.
1226 1226 */
1227 1227 switch (imc->imc_gen) {
1228 1228 case IMC_GEN_SANDY:
1229 1229 case IMC_GEN_IVY:
1230 1230 icn->icn_dimm_type = IMC_DIMM_DDR3;
1231 1231 break;
1232 1232 case IMC_GEN_HASWELL:
1233 1233 case IMC_GEN_BROADWELL:
1234 1234 if (IMC_MCMTR_DDR4_HAS_BRD(mcmtr)) {
1235 1235 icn->icn_dimm_type = IMC_DIMM_DDR4;
1236 1236 } else {
1237 1237 icn->icn_dimm_type = IMC_DIMM_DDR3;
1238 1238 }
1239 1239 break;
1240 1240 default:
1241 1241 /*
1242 1242 * Skylake and on are all DDR4.
1243 1243 */
1244 1244 icn->icn_dimm_type = IMC_DIMM_DDR4;
1245 1245 break;
1246 1246 }
1247 1247
1248 1248 if (imc->imc_gen >= IMC_GEN_SKYLAKE && icn->icn_m2m != NULL) {
1249 1249 icn->icn_topo = pci_config_get32(icn->icn_m2m->istub_cfgspace,
1250 1250 imc->imc_gen_data->igd_topo_offset);
1251 1251 }
1252 1252
1253 1253 return (B_TRUE);
1254 1254 }
1255 1255
1256 1256 /*
1257 1257 * Walk the IMC data and fill in the information on DIMMs and the memory
1258 1258 * controller configurations.
1259 1259 */
1260 1260 static void
1261 1261 imc_fill_data(imc_t *imc)
1262 1262 {
1263 1263 uint_t csock, cmc, cchan;
1264 1264
1265 1265 for (csock = 0; csock < imc->imc_nsockets; csock++) {
1266 1266 imc_socket_t *sock = &imc->imc_sockets[csock];
1267 1267
1268 1268 for (cmc = 0; cmc < sock->isock_nimc; cmc++) {
1269 1269 imc_mc_t *icn = &sock->isock_imcs[cmc];
1270 1270
1271 1271 if (!imc_fill_controller(imc, icn))
1272 1272 continue;
1273 1273
1274 1274 for (cchan = 0; cchan < icn->icn_nchannels; cchan++) {
1275 1275 imc_fill_dimms(imc, icn,
1276 1276 &icn->icn_channels[cchan]);
1277 1277 }
1278 1278 }
1279 1279 }
1280 1280 }
1281 1281
1282 1282 static nvlist_t *
1283 1283 imc_nvl_create_dimm(imc_t *imc, imc_dimm_t *dimm)
1284 1284 {
1285 1285 nvlist_t *nvl;
1286 1286
1287 1287 nvl = fnvlist_alloc();
1288 1288 fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_PRESENT,
1289 1289 dimm->idimm_present);
1290 1290 if (!dimm->idimm_present) {
1291 1291 return (nvl);
1292 1292 }
1293 1293
1294 1294 fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_SIZE, dimm->idimm_size);
1295 1295 fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NCOLS,
1296 1296 dimm->idimm_ncolumns);
1297 1297 fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NROWS,
1298 1298 dimm->idimm_nrows);
1299 1299
1300 1300 if (imc->imc_gen > IMC_GEN_SANDY) {
1301 1301 fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_DENSITY,
1302 1302 dimm->idimm_density * (1ULL << 30));
1303 1303 fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_WIDTH,
1304 1304 dimm->idimm_width);
1305 1305 }
1306 1306 fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_RANKS,
1307 1307 dimm->idimm_nranks);
1308 1308 fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_BANKS,
1309 1309 dimm->idimm_nbanks);
1310 1310 fnvlist_add_boolean_array(nvl, MCINTEL_NVLIST_V1_DIMM_RDIS,
1311 1311 dimm->idimm_ranks_disabled, IMC_MAX_RANK_DISABLE);
1312 1312
1313 1313 if (imc->imc_gen >= IMC_GEN_HASWELL) {
1314 1314 fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRL,
1315 1315 dimm->idimm_hdrl);
1316 1316 fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRLP,
1317 1317 dimm->idimm_hdrl_parity);
1318 1318 if (dimm->idimm_3dsranks > 0) {
1319 1319 fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_3DRANK,
1320 1320 dimm->idimm_3dsranks);
1321 1321 }
1322 1322 }
1323 1323
1324 1324 return (nvl);
1325 1325 }
1326 1326
1327 1327 static nvlist_t *
1328 1328 imc_nvl_create_channel(imc_t *imc, imc_channel_t *chan)
1329 1329 {
1330 1330 nvlist_t *nvl;
1331 1331 nvlist_t *dimms[IMC_MAX_DIMMPERCHAN];
1332 1332 uint_t i;
1333 1333
1334 1334 nvl = fnvlist_alloc();
1335 1335 fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_CHAN_NDPC,
1336 1336 imc->imc_gen_data->igd_max_dimms);
1337 1337 for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
1338 1338 dimms[i] = imc_nvl_create_dimm(imc, &chan->ich_dimms[i]);
1339 1339 }
1340 1340
1341 1341 fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_CHAN_DIMMS,
1342 1342 dimms, i);
1343 1343
1344 1344 for (; i > 0; i--) {
1345 1345 nvlist_free(dimms[i-1]);
1346 1346 }
1347 1347
1348 1348 return (nvl);
1349 1349 }
1350 1350
1351 1351 static nvlist_t *
1352 1352 imc_nvl_create_mc(imc_t *imc, imc_mc_t *icn)
1353 1353 {
1354 1354 nvlist_t *nvl;
1355 1355 nvlist_t *channels[IMC_MAX_CHANPERMC];
1356 1356 uint_t i;
1357 1357
1358 1358 nvl = fnvlist_alloc();
1359 1359 fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_MC_NCHAN, icn->icn_nchannels);
1360 1360 fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_MC_ECC,
1361 1361 icn->icn_ecc);
1362 1362 if (icn->icn_lockstep) {
1363 1363 fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
1364 1364 MCINTEL_NVLIST_V1_MC_CHAN_MODE_LOCK);
1365 1365 } else {
1366 1366 fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
1367 1367 MCINTEL_NVLIST_V1_MC_CHAN_MODE_INDEP);
1368 1368
1369 1369 }
1370 1370
1371 1371 if (icn->icn_closed) {
1372 1372 fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
1373 1373 MCINTEL_NVLIST_V1_MC_POLICY_CLOSED);
1374 1374 } else {
1375 1375 fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
1376 1376 MCINTEL_NVLIST_V1_MC_POLICY_OPEN);
1377 1377 }
1378 1378
1379 1379 for (i = 0; i < icn->icn_nchannels; i++) {
1380 1380 channels[i] = imc_nvl_create_channel(imc,
1381 1381 &icn->icn_channels[i]);
1382 1382 }
1383 1383 fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MC_CHANNELS,
1384 1384 channels, icn->icn_nchannels);
1385 1385 for (i = 0; i < icn->icn_nchannels; i++) {
1386 1386 nvlist_free(channels[i]);
1387 1387 }
1388 1388
1389 1389 return (nvl);
1390 1390 }
1391 1391
1392 1392 static void
1393 1393 imc_nvl_pack(imc_socket_t *sock, boolean_t sleep)
1394 1394 {
1395 1395 char *buf = NULL;
1396 1396 size_t len = 0;
1397 1397 int kmflag;
1398 1398
1399 1399 if (sock->isock_nvl == NULL)
1400 1400 return;
1401 1401
1402 1402 if (sock->isock_buf != NULL)
1403 1403 return;
1404 1404
1405 1405 if (sleep) {
1406 1406 kmflag = KM_SLEEP;
1407 1407 } else {
1408 - kmflag = KM_NOSLEEP | KM_NORMALPRI;
1408 + kmflag = KM_NOSLEEP_LAZY;
1409 1409 }
1410 1410
1411 1411 if (nvlist_pack(sock->isock_nvl, &buf, &len, NV_ENCODE_XDR,
1412 1412 kmflag) != 0) {
1413 1413 return;
1414 1414 }
1415 1415
1416 1416 sock->isock_buf = buf;
1417 1417 sock->isock_buflen = len;
1418 1418 sock->isock_gen++;
1419 1419 }
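The substitution above is cosmetic rather than behavioral: KM_NOSLEEP_LAZY is defined in sys/kmem.h as (KM_NOSLEEP | KM_NORMALPRI), so nvlist_pack() sees exactly the flag value it did before; the change is that callers now spell the documented name. A minimal sketch of the same pattern around kmem_alloc(9F); example_alloc() is illustrative only, not part of this driver:

	static void *
	example_alloc(size_t len, boolean_t can_sleep)
	{
		/*
		 * KM_NOSLEEP_LAZY fails quickly when memory is tight rather
		 * than making the aggressive reclaim effort that a bare
		 * KM_NOSLEEP would; callers must be prepared for NULL.
		 */
		return (kmem_alloc(len, can_sleep ? KM_SLEEP : KM_NOSLEEP_LAZY));
	}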
1420 1420
1421 1421 static void
1422 1422 imc_decoder_pack(imc_t *imc)
1423 1423 {
1424 1424 char *buf = NULL;
1425 1425 size_t len = 0;
1426 1426
1427 1427 if (imc->imc_decoder_buf != NULL)
1428 1428 return;
1429 1429
1430 1430 if (imc->imc_decoder_dump == NULL) {
1431 1431 imc->imc_decoder_dump = imc_dump_decoder(imc);
1432 1432 }
1433 1433
1434 1434 if (nvlist_pack(imc->imc_decoder_dump, &buf, &len, NV_ENCODE_XDR,
1435 - KM_NOSLEEP | KM_NORMALPRI) != 0) {
1435 + KM_NOSLEEP_LAZY) != 0) {
1436 1436 return;
1437 1437 }
1438 1438
1439 1439 imc->imc_decoder_buf = buf;
1440 1440 imc->imc_decoder_len = len;
1441 1441 }
1442 1442
1443 1443 static void
1444 1444 imc_nvl_create(imc_t *imc)
1445 1445 {
1446 1446 uint_t csock;
1447 1447 for (csock = 0; csock < imc->imc_nsockets; csock++) {
1448 1448 uint_t i;
1449 1449 nvlist_t *nvl;
1450 1450 nvlist_t *mcs[IMC_MAX_IMCPERSOCK];
1451 1451 imc_socket_t *sock = &imc->imc_sockets[csock];
1452 1452
1453 1453 nvl = fnvlist_alloc();
1454 1454 fnvlist_add_uint8(nvl, MCINTEL_NVLIST_VERSTR,
1455 1455 MCINTEL_NVLIST_VERS1);
1456 1456 fnvlist_add_uint8(nvl, MCINTEL_NVLIST_V1_NMC,
1457 1457 sock->isock_nimc);
1458 1458
1459 1459 for (i = 0; i < sock->isock_nimc; i++) {
1460 1460 mcs[i] = imc_nvl_create_mc(imc, &sock->isock_imcs[i]);
1461 1461 }
1462 1462
1463 1463 fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MCS,
1464 1464 mcs, sock->isock_nimc);
1465 1465
1466 1466 for (i = 0; i < sock->isock_nimc; i++) {
1467 1467 nvlist_free(mcs[i]);
1468 1468 }
1469 1469
1470 1470 sock->isock_nvl = nvl;
1471 1471 imc_nvl_pack(sock, B_TRUE);
1472 1472 }
1473 1473 }
1474 1474
1475 1475 /*
1476 1476 * Determine the top of low and high memory. These determine whether transaction
1477 1477 * addresses target main memory or not. Unfortunately, the way that these are
1478 1478 * stored and fetched changes with different generations.
1479 1479 */
1480 1480 static void
1481 1481 imc_sad_read_tohm(imc_t *imc, imc_sad_t *sad)
1482 1482 {
1483 1483 uint32_t tolm, tohm_low, tohm_hi;
1484 1484
1485 1485 tolm = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1486 1486 imc->imc_gen_data->igd_tolm_offset);
1487 1487 tohm_low = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1488 1488 imc->imc_gen_data->igd_tohm_low_offset);
1489 1489 if (imc->imc_gen_data->igd_tohm_hi_offset != 0) {
1490 1490 tohm_hi = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1491 1491 imc->imc_gen_data->igd_tohm_hi_offset);
1492 1492 } else {
1493 1493 tohm_hi = 0;
1494 1494 }
1495 1495
1496 1496 if (tolm == PCI_EINVAL32 || tohm_low == PCI_EINVAL32 ||
1497 1497 tohm_hi == PCI_EINVAL32) {
1498 1498 sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1499 1499 return;
1500 1500 }
1501 1501
1502 1502 switch (imc->imc_gen) {
1503 1503 case IMC_GEN_SANDY:
1504 1504 case IMC_GEN_IVY:
1505 1505 sad->isad_tolm = ((uint64_t)tolm & IMC_TOLM_SNB_IVY_MASK) <<
1506 1506 IMC_TOLM_SNB_IVY_SHIFT;
1507 1507 sad->isad_tohm = ((uint64_t)tohm_low & IMC_TOHM_SNB_IVY_MASK) <<
1508 1508 IMC_TOLM_SNB_IVY_SHIFT;
1509 1509 break;
1510 1510 case IMC_GEN_HASWELL:
1511 1511 case IMC_GEN_BROADWELL:
1512 1512 case IMC_GEN_SKYLAKE:
1513 1513 sad->isad_tolm = (uint64_t)tolm & IMC_TOLM_HAS_SKX_MASK;
1514 1514 sad->isad_tohm = ((uint64_t)tohm_low &
1515 1515 IMC_TOHM_LOW_HAS_SKX_MASK) | ((uint64_t)tohm_hi << 32);
1516 1516
1517 1517 /*
1518 1518 * Adjust the values to turn them into an exclusive range.
1519 1519 */
1520 1520 sad->isad_tolm += IMC_TOLM_HAS_SKY_EXCL;
1521 1521 sad->isad_tohm += IMC_TOHM_HAS_SKY_EXCL;
1522 1522 break;
1523 1523 default:
1524 1524 dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
1525 1525 "set to unknown generation: %u", imc->imc_gen);
1526 1526 return;
1527 1527 }
1528 1528 }
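The exclusive adjustments convert register values that name the last valid granule into bounds that name the first byte past each region; presumably this lets later range checks be written as plain pa < isad_tolm and pa < isad_tohm comparisons rather than inclusive ones.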
1529 1529
1530 1530 static void
1531 1531 imc_sad_fill_rule(imc_t *imc, imc_sad_t *sad, imc_sad_rule_t *rule,
1532 1532 uint32_t raw)
1533 1533 {
1534 1534 uint_t attr;
1535 1535 uint64_t limit;
1536 1536 bzero(rule, sizeof (imc_sad_rule_t));
1537 1537
1538 1538 rule->isr_raw_dram = raw;
1539 1539 rule->isr_enable = IMC_SAD_DRAM_RULE_ENABLE(raw) != 0;
1540 1540 if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1541 1541 switch (IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(raw)) {
1542 1542 case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6:
1543 1543 rule->isr_imode = IMC_SAD_IMODE_8t6;
1544 1544 break;
1545 1545 case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR:
1546 1546 rule->isr_imode = IMC_SAD_IMODE_8t6XOR;
1547 1547 break;
1548 1548 }
1549 1549 } else {
1550 1550 switch (IMC_SAD_DRAM_INTERLEAVE_SKX(raw)) {
1551 1551 case IMC_SAD_DRAM_INTERLEAVE_SKX_8t6:
1552 1552 rule->isr_imode = IMC_SAD_IMODE_8t6;
1553 1553 break;
1554 1554 case IMC_SAD_DRAM_INTERLEAVE_SKX_10t8:
1555 1555 rule->isr_imode = IMC_SAD_IMODE_10t8;
1556 1556 break;
1557 1557 case IMC_SAD_DRAM_INTERLEAVE_SKX_14t12:
1558 1558 rule->isr_imode = IMC_SAD_IMODE_14t12;
1559 1559 break;
1560 1560 case IMC_SAD_DRAM_INTERLEAVE_SKX_32t30:
1561 1561 rule->isr_imode = IMC_SAD_IMODE_32t30;
1562 1562 break;
1563 1563 }
1564 1564 }
1565 1565
1566 1566 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1567 1567 attr = IMC_SAD_DRAM_ATTR_SKX(raw);
1568 1568 } else {
1569 1569 attr = IMC_SAD_DRAM_ATTR_SNB_BRD(raw);
1570 1570 }
1571 1571
1572 1572 switch (attr) {
1573 1573 case IMC_SAD_DRAM_ATTR_DRAM:
1574 1574 rule->isr_type = IMC_SAD_TYPE_DRAM;
1575 1575 break;
1576 1576 case IMC_SAD_DRAM_ATTR_MMCFG:
1577 1577 rule->isr_type = IMC_SAD_TYPE_MMCFG;
1578 1578 break;
1579 1579 case IMC_SAD_DRAM_ATTR_NXM:
1580 1580 if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1581 1581 sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
1582 1582 }
1583 1583 rule->isr_type = IMC_SAD_TYPE_NXM;
1584 1584 break;
1585 1585 default:
1586 1586 sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
1587 1587 break;
1588 1588 }
1589 1589
1590 1590 /*
1591 1591 * Fetch the limit which represents bits 45:26 and then adjust this so
1592 1592 * that it is exclusive.
1593 1593 */
1594 1594 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1595 1595 limit = IMC_SAD_DRAM_LIMIT_SKX(raw);
1596 1596 } else {
1597 1597 limit = IMC_SAD_DRAM_LIMIT_SNB_BRD(raw);
1598 1598 }
1599 1599 rule->isr_limit = (limit << IMC_SAD_DRAM_LIMIT_SHIFT) +
1600 1600 IMC_SAD_DRAM_LIMIT_EXCLUSIVE;
1601 1601
1602 1602 /*
1603 1603 * The rest of this does not apply to Sandy Bridge.
1604 1604 */
1605 1605 if (imc->imc_gen == IMC_GEN_SANDY)
1606 1606 return;
1607 1607
1608 1608 if (imc->imc_gen >= IMC_GEN_IVY && imc->imc_gen < IMC_GEN_SKYLAKE) {
1609 1609 rule->isr_a7mode = IMC_SAD_DRAM_A7_IVB_BRD(raw) != 0;
1610 1610 return;
1611 1611 }
1612 1612
1613 1613 switch (IMC_SAD_DRAM_MOD23_SKX(raw)) {
1614 1614 case IMC_SAD_DRAM_MOD23_MOD3:
1615 1615 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD3;
1616 1616 break;
1617 1617 case IMC_SAD_DRAM_MOD23_MOD2_C01:
1618 1618 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_01;
1619 1619 break;
1620 1620 case IMC_SAD_DRAM_MOD23_MOD2_C12:
1621 1621 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_12;
1622 1622 break;
1623 1623 case IMC_SAD_DRAM_MOD23_MOD2_C02:
1624 1624 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_02;
1625 1625 break;
1626 1626 }
1627 1627
1628 1628 rule->isr_need_mod3 = IMC_SAD_DRAM_MOD3_SKX(raw) != 0;
1629 1629 switch (IMC_SAD_DRAM_MOD3_SKX(raw)) {
1630 1630 case IMC_SAD_DRAM_MOD3_MODE_45t6:
1631 1631 rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t6;
1632 1632 break;
1633 1633 case IMC_SAD_DRAM_MOD3_MODE_45t8:
1634 1634 rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t8;
1635 1635 break;
1636 1636 case IMC_SAD_DRAM_MOD3_MODE_45t12:
1637 1637 rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t12;
1638 1638 break;
1639 1639 default:
1640 1640 sad->isad_valid |= IMC_SAD_V_BAD_MOD3;
1641 1641 break;
1642 1642 }
1643 1643 }
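As a worked example of the limit decoding above: because the limit field occupies bits 45:26, each increment of the field represents one 64 MiB granule. Assuming IMC_SAD_DRAM_LIMIT_EXCLUSIVE is a single granule (1 << 26), a raw field value of 4 decodes to (4 << 26) + (1 << 26) = 320 MiB, meaning the rule covers system addresses strictly below 320 MiB.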
1644 1644
1645 1645 static void
1646 1646 imc_sad_fill_rule_interleave(imc_t *imc, imc_sad_rule_t *rule, uint32_t raw)
1647 1647 {
1648 1648 uint_t i;
1649 1649 uint32_t mlen, mbase, skipbits, skipafter;
1650 1650
1651 1651 rule->isr_raw_interleave = raw;
1652 1652
1653 1653 /*
1654 1654 * Right now all architectures always have the maximum number of SAD
1655 1655 * interleave targets.
1656 1656 */
1657 1657 rule->isr_ntargets = IMC_MAX_SAD_INTERLEAVE;
1658 1658
1659 1659 /*
1660 1660 * Sandy Bridge has a gap in the interleave list due to the fact that it
1661 1661 * uses a smaller length.
1662 1662 */
1663 1663 if (imc->imc_gen > IMC_GEN_SANDY) {
1664 1664 mlen = IMC_SAD_ILEAVE_IVB_SKX_LEN;
1665 1665 mbase = IMC_SAD_ILEAVE_IVB_SKX_MASK;
1666 1666 skipbits = skipafter = 0;
1667 1667 } else {
1668 1668 mlen = IMC_SAD_ILEAVE_SNB_LEN;
1669 1669 mbase = IMC_SAD_ILEAVE_SNB_MASK;
1670 1670 skipbits = 2;
1671 1671 skipafter = 4;
1672 1672 }
1673 1673
1674 1674 for (i = 0; i < rule->isr_ntargets; i++) {
1675 1675 uint32_t mask, shift;
1676 1676
1677 1677 shift = i * mlen;
1678 1678 if (i >= skipafter)
1679 1679 shift += skipbits;
1680 1680 mask = mbase << shift;
1681 1681 rule->isr_targets[i] = (raw & mask) >> shift;
1682 1682 }
1683 1683 }
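To make the Sandy Bridge gap concrete: with skipbits = 2 and skipafter = 4, targets 0 through 3 are extracted at bit position i * mlen as usual, while target 4 and beyond are extracted at i * mlen + 2, stepping over the two unused bits in the middle of the register. On Ivy Bridge and later the target list is dense, so no skip is applied.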
1684 1684
1685 1685 static void
1686 1686 imc_sad_read_dram_rules(imc_t *imc, imc_sad_t *sad)
1687 1687 {
1688 1688 uint_t i;
1689 1689 off_t off;
1690 1690
1691 1691 sad->isad_nrules = imc->imc_gen_data->igd_sad_ndram_rules;
1692 1692 for (i = 0, off = imc->imc_gen_data->igd_sad_dram_offset;
1693 1693 i < sad->isad_nrules; i++, off += sizeof (uint64_t)) {
1694 1694 uint32_t dram, interleave;
1695 1695 imc_sad_rule_t *rule = &sad->isad_rules[i];
1696 1696
1697 1697 dram = pci_config_get32(sad->isad_dram->istub_cfgspace, off);
1698 1698 interleave = pci_config_get32(sad->isad_dram->istub_cfgspace,
1699 1699 off + 4);
1700 1700
1701 1701 if (dram == PCI_EINVAL32 || interleave == PCI_EINVAL32) {
1702 1702 sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1703 1703 return;
1704 1704 }
1705 1705
1706 1706 imc_sad_fill_rule(imc, sad, rule, dram);
1707 1707 imc_sad_fill_rule_interleave(imc, rule, interleave);
1708 1708 }
1709 1709 }
1710 1710
1711 1711 static void
1712 1712 imc_sad_decode_mcroute(imc_t *imc, imc_sad_t *sad)
1713 1713 {
1714 1714 uint_t i;
1715 1715 imc_sad_mcroute_table_t *mc = &sad->isad_mcroute;
1716 1716
1717 1717 if (imc->imc_gen < IMC_GEN_SKYLAKE)
1718 1718 return;
1719 1719 if (sad->isad_valid != 0)
1720 1720 return;
1721 1721
1722 1722 mc->ismc_nroutes = IMC_MAX_SAD_MCROUTES;
1723 1723 for (i = 0; i < IMC_MAX_SAD_MCROUTES; i++) {
1724 1724 uint_t chanoff, ringoff;
1725 1725
1726 1726 ringoff = i * IMC_MC_ROUTE_RING_BITS;
1727 1727 chanoff = i * IMC_MC_ROUTE_CHAN_BITS + IMC_MC_ROUTE_CHAN_OFFSET;
1728 1728
1729 1729 mc->ismc_mcroutes[i].ismce_imc = (mc->ismc_raw_mcroute >>
1730 1730 ringoff) & IMC_MC_ROUTE_RING_MASK;
1731 1731 mc->ismc_mcroutes[i].ismce_pchannel = (mc->ismc_raw_mcroute >>
1732 1732 chanoff) & IMC_MC_ROUTE_CHAN_MASK;
1733 1733 }
1734 1734 }
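The loop above implies the following packing of the raw route register; this is just a restatement of the extraction logic, not an additional interface:

	/*
	 * For route entry i (0 <= i < IMC_MAX_SAD_MCROUTES):
	 *
	 *   target IMC:       (raw >> (i * IMC_MC_ROUTE_RING_BITS)) &
	 *                         IMC_MC_ROUTE_RING_MASK
	 *   physical channel: (raw >> (i * IMC_MC_ROUTE_CHAN_BITS +
	 *                         IMC_MC_ROUTE_CHAN_OFFSET)) &
	 *                         IMC_MC_ROUTE_CHAN_MASK
	 */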
1735 1735
1736 1736 /*
1737 1737 * Initialize the SAD. To do this we have to do a few different things:
1738 1738 *
1739 1739 * 1. Determine where the top of low and high memory is.
1740 1740 * 2. Read and decode all of the rules for the SAD
1741 1741 * 3. On systems with a route table, decode the raw routes
1742 1742 *
1743 1743  * At this point in time, we treat TOLM and TOHM as a per-socket construct,
1744 1744  * even though it really should be global; this just makes life a bit simpler.
1745 1745 */
1746 1746 static void
1747 1747 imc_decoder_init_sad(imc_t *imc)
1748 1748 {
1749 1749 uint_t i;
1750 1750
1751 1751 for (i = 0; i < imc->imc_nsockets; i++) {
1752 1752 imc_sad_read_tohm(imc, &imc->imc_sockets[i].isock_sad);
1753 1753 imc_sad_read_dram_rules(imc, &imc->imc_sockets[i].isock_sad);
1754 1754 imc_sad_decode_mcroute(imc, &imc->imc_sockets[i].isock_sad);
1755 1755 }
1756 1756 }
1757 1757
1758 1758 static void
1759 1759 imc_tad_fill_rule(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *prev,
1760 1760 imc_tad_rule_t *rule, uint32_t val)
1761 1761 {
1762 1762 uint64_t limit;
1763 1763
1764 1764 limit = IMC_TAD_LIMIT(val);
1765 1765 rule->itr_limit = (limit << IMC_TAD_LIMIT_SHIFT) +
1766 1766 IMC_TAD_LIMIT_EXCLUSIVE;
1767 1767 rule->itr_raw = val;
1768 1768
1769 1769 switch (IMC_TAD_SOCK_WAY(val)) {
1770 1770 case IMC_TAD_SOCK_WAY_1:
1771 1771 rule->itr_sock_way = 1;
1772 1772 break;
1773 1773 case IMC_TAD_SOCK_WAY_2:
1774 1774 rule->itr_sock_way = 2;
1775 1775 break;
1776 1776 case IMC_TAD_SOCK_WAY_4:
1777 1777 rule->itr_sock_way = 4;
1778 1778 break;
1779 1779 case IMC_TAD_SOCK_WAY_8:
1780 1780 rule->itr_sock_way = 8;
1781 1781 break;
1782 1782 }
1783 1783
1784 1784 rule->itr_chan_way = IMC_TAD_CHAN_WAY(val) + 1;
1785 1785 rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1786 1786 rule->itr_chan_gran = IMC_TAD_GRAN_64B;
1787 1787
1788 1788 /*
1789 1789 * Starting with Skylake the targets that are used are no longer part of
1790 1790 * the TAD. Those come from the IMC route table.
1791 1791 */
1792 1792 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1793 1793 rule->itr_ntargets = 0;
1794 1794 return;
1795 1795 }
1796 1796
1797 1797 rule->itr_ntargets = IMC_TAD_SNB_BRD_NTARGETS;
1798 1798 rule->itr_targets[0] = IMC_TAD_TARG0(val);
1799 1799 rule->itr_targets[1] = IMC_TAD_TARG1(val);
1800 1800 rule->itr_targets[2] = IMC_TAD_TARG2(val);
1801 1801 rule->itr_targets[3] = IMC_TAD_TARG3(val);
1802 1802
1803 1803 if (prev == NULL) {
1804 1804 rule->itr_base = 0;
1805 1805 } else {
1806 1806 rule->itr_base = prev->itr_limit + 1;
1807 1807 }
1808 1808 }
1809 1809
1810 1810 static void
1811 1811 imc_tad_fill_skx(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *rule,
1812 1812 uint32_t val)
1813 1813 {
1814 1814 uint64_t base;
1815 1815
1816 1816 rule->itr_raw_gran = val;
1817 1817 base = IMC_TAD_BASE_BASE(val);
1818 1818 rule->itr_base = base << IMC_TAD_BASE_SHIFT;
1819 1819
1820 1820 switch (IMC_TAD_BASE_CHAN_GRAN(val)) {
1821 1821 case IMC_TAD_BASE_CHAN_GRAN_64B:
1822 1822 		rule->itr_chan_gran = IMC_TAD_GRAN_64B;
1823 1823 break;
1824 1824 case IMC_TAD_BASE_CHAN_GRAN_256B:
1825 1825 		rule->itr_chan_gran = IMC_TAD_GRAN_256B;
1826 1826 break;
1827 1827 case IMC_TAD_BASE_CHAN_GRAN_4KB:
1828 1828 		rule->itr_chan_gran = IMC_TAD_GRAN_4KB;
1829 1829 break;
1830 1830 default:
1831 1831 tad->itad_valid |= IMC_TAD_V_BAD_CHAN_GRAN;
1832 1832 return;
1833 1833 }
1834 1834
1835 1835 switch (IMC_TAD_BASE_SOCK_GRAN(val)) {
1836 1836 case IMC_TAD_BASE_SOCK_GRAN_64B:
1837 1837 rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1838 1838 break;
1839 1839 case IMC_TAD_BASE_SOCK_GRAN_256B:
1840 1840 rule->itr_sock_gran = IMC_TAD_GRAN_256B;
1841 1841 break;
1842 1842 case IMC_TAD_BASE_SOCK_GRAN_4KB:
1843 1843 rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
1844 1844 break;
1845 1845 case IMC_TAD_BASE_SOCK_GRAN_1GB:
1846 1846 rule->itr_sock_gran = IMC_TAD_GRAN_1GB;
1847 1847 break;
1848 1848 }
1849 1849 }
1850 1850
1851 1851 /*
1852 1852 * When mirroring is enabled, at least in Sandy Bridge to Broadwell, it's
1853 1853 * suggested that the channel wayness will take this into account and therefore
1854 1854 * should be accurately reflected.
1855 1855 */
1856 1856 static void
1857 1857 imc_tad_read_rules(imc_t *imc, imc_tad_t *tad)
1858 1858 {
1859 1859 uint_t i;
1860 1860 off_t baseoff;
1861 1861 imc_tad_rule_t *prev;
1862 1862
1863 1863 tad->itad_nrules = imc->imc_gen_data->igd_tad_nrules;
1864 1864 for (i = 0, baseoff = imc->imc_gen_data->igd_tad_rule_offset,
1865 1865 prev = NULL; i < tad->itad_nrules;
1866 1866 i++, baseoff += sizeof (uint32_t)) {
1867 1867 uint32_t val;
1868 1868 off_t off;
1869 1869 imc_tad_rule_t *rule = &tad->itad_rules[i];
1870 1870
1871 1871 /*
1872 1872 * On Skylake, the TAD rules are split among two registers. The
1873 1873 * latter set mimics what exists on pre-Skylake.
1874 1874 */
1875 1875 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1876 1876 off = baseoff + IMC_SKX_WAYNESS_OFFSET;
1877 1877 } else {
1878 1878 off = baseoff;
1879 1879 }
1880 1880
1881 1881 val = pci_config_get32(tad->itad_stub->istub_cfgspace, off);
1882 1882 if (val == PCI_EINVAL32) {
1883 1883 tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1884 1884 return;
1885 1885 }
1886 1886
1887 1887 imc_tad_fill_rule(imc, tad, prev, rule, val);
1888 1888 prev = rule;
1889 1889 if (imc->imc_gen < IMC_GEN_SKYLAKE)
1890 1890 continue;
1891 1891
1892 1892 val = pci_config_get32(tad->itad_stub->istub_cfgspace, baseoff);
1893 1893 if (val == PCI_EINVAL32) {
1894 1894 tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1895 1895 return;
1896 1896 }
1897 1897
1898 1898 imc_tad_fill_skx(imc, tad, rule, val);
1899 1899 }
1900 1900 }
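On Skylake, then, each rule is assembled from two reads: the wayness and limit register at baseoff + IMC_SKX_WAYNESS_OFFSET, parsed by imc_tad_fill_rule(), and the base and granularity register at baseoff, parsed by imc_tad_fill_skx().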
1901 1901
1902 1902 /*
1903 1903 * Check for features which change how decoding works.
1904 1904 */
1905 1905 static void
1906 1906 imc_tad_read_features(imc_t *imc, imc_tad_t *tad, imc_mc_t *mc)
1907 1907 {
1908 1908 uint32_t val;
1909 1909
1910 1910 /*
1911 1911 * Determine whether or not lockstep mode or mirroring are enabled.
1912 1912 * These change the behavior of how we're supposed to interpret channel
1913 1913 * wayness. Lockstep is available in the TAD's features. Mirroring is
1914 1914  * available in the IMC's features. Neither is present in Skylake+. On
1915 1915  * Skylake, mirroring is a property of the SAD rule and there is no
1916 1916 * lockstep.
1917 1917 */
1918 1918 switch (imc->imc_gen) {
1919 1919 case IMC_GEN_SANDY:
1920 1920 case IMC_GEN_IVY:
1921 1921 case IMC_GEN_HASWELL:
1922 1922 case IMC_GEN_BROADWELL:
1923 1923 val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1924 1924 imc->imc_gen_data->igd_tad_sysdef);
1925 1925 if (val == PCI_EINVAL32) {
1926 1926 tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1927 1927 return;
1928 1928 }
1929 1929 if (IMC_TAD_SYSDEF_LOCKSTEP(val)) {
1930 1930 tad->itad_flags |= IMC_TAD_FLAG_LOCKSTEP;
1931 1931 }
1932 1932
1933 1933 val = pci_config_get32(mc->icn_main1->istub_cfgspace,
1934 1934 imc->imc_gen_data->igd_mc_mirror);
1935 1935 if (val == PCI_EINVAL32) {
1936 1936 tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1937 1937 return;
1938 1938 }
1939 1939 if (IMC_MC_MIRROR_SNB_BRD(val)) {
1940 1940 tad->itad_flags |= IMC_TAD_FLAG_MIRROR;
1941 1941 }
1942 1942 break;
1943 1943 default:
1944 1944 break;
1945 1945 }
1946 1946
1947 1947 /*
1948 1948 * Now, go through and look at values that'll change how we do the
1949 1949  * channel index and address calculation. These are only present
1950 1950 * between Ivy Bridge and Broadwell. They don't exist on Sandy Bridge
1951 1951 * and they don't exist on Skylake+.
1952 1952 */
1953 1953 switch (imc->imc_gen) {
1954 1954 case IMC_GEN_IVY:
1955 1955 case IMC_GEN_HASWELL:
1956 1956 case IMC_GEN_BROADWELL:
1957 1957 val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1958 1958 imc->imc_gen_data->igd_tad_sysdef2);
1959 1959 if (val == PCI_EINVAL32) {
1960 1960 tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1961 1961 return;
1962 1962 }
1963 1963 if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
1964 1964 tad->itad_flags |= IMC_TAD_FLAG_CHANSHIFT;
1965 1965 }
1966 1966 		if (IMC_TAD_SYSDEF2_CHANHASH(val)) {
1967 1967 tad->itad_flags |= IMC_TAD_FLAG_CHANHASH;
1968 1968 }
1969 1969 break;
1970 1970 default:
1971 1971 break;
1972 1972 }
1973 1973 }
1974 1974
1975 1975 /*
1976 1976 * Read the IMC channel interleave records
1977 1977 */
1978 1978 static void
1979 1979 imc_tad_read_interleave(imc_t *imc, imc_channel_t *chan)
1980 1980 {
1981 1981 uint_t i;
1982 1982 off_t off;
1983 1983
1984 1984 chan->ich_ntad_offsets = imc->imc_gen_data->igd_tad_nrules;
1985 1985 for (i = 0, off = imc->imc_gen_data->igd_tad_chan_offset;
1986 1986 i < chan->ich_ntad_offsets; i++, off += sizeof (uint32_t)) {
1987 1987 uint32_t val;
1988 1988 uint64_t offset;
1989 1989
1990 1990 val = pci_config_get32(chan->ich_desc->istub_cfgspace,
1991 1991 off);
1992 1992 if (val == PCI_EINVAL32) {
1993 1993 chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
1994 1994 return;
1995 1995 }
1996 1996
1997 1997 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1998 1998 offset = IMC_TADCHAN_OFFSET_SKX(val);
1999 1999 } else {
2000 2000 offset = IMC_TADCHAN_OFFSET_SNB_BRD(val);
2001 2001 }
2002 2002
2003 2003 chan->ich_tad_offsets[i] = offset << IMC_TADCHAN_OFFSET_SHIFT;
2004 2004 chan->ich_tad_offsets_raw[i] = val;
2005 2005 }
2006 2006 }
2007 2007
2008 2008 static void
2009 2009 imc_decoder_init_tad(imc_t *imc)
2010 2010 {
2011 2011 uint_t i;
2012 2012
2013 2013 for (i = 0; i < imc->imc_nsockets; i++) {
2014 2014 uint_t j;
2015 2015
2016 2016 for (j = 0; j < imc->imc_sockets[i].isock_ntad; j++) {
2017 2017 imc_tad_read_features(imc,
2018 2018 &imc->imc_sockets[i].isock_tad[j],
2019 2019 &imc->imc_sockets[i].isock_imcs[j]);
2020 2020 imc_tad_read_rules(imc,
2021 2021 &imc->imc_sockets[i].isock_tad[j]);
2022 2022 }
2023 2023 }
2024 2024
2025 2025 for (i = 0; i < imc->imc_nsockets; i++) {
2026 2026 uint_t j;
2027 2027 imc_socket_t *sock = &imc->imc_sockets[i];
2028 2028
2029 2029 for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2030 2030 uint_t k;
2031 2031 imc_mc_t *mc = &sock->isock_imcs[j];
2032 2032
2033 2033 for (k = 0; k < mc->icn_nchannels; k++) {
2034 2034 imc_channel_t *chan = &mc->icn_channels[k];
2035 2035 imc_tad_read_interleave(imc, chan);
2036 2036 }
2037 2037 }
2038 2038 }
2039 2039 }
2040 2040
2041 2041 static void
2042 2042 imc_rir_read_ileave_offsets(imc_t *imc, imc_channel_t *chan,
2043 2043 imc_rank_ileave_t *rank, uint_t rirno, boolean_t contig)
2044 2044 {
2045 2045 uint_t i;
2046 2046 off_t off, incr;
2047 2047
2048 2048 /*
2049 2049 * Rank interleave offset registers come in two forms. Either they are
2050 2050 * contiguous for a given wayness, meaning that all of the entries for
2051 2051 * wayness zero are contiguous, or they are sparse, meaning that there
2052 2052 * is a bank for entry zero for all wayness, then entry one for all
2053 2053 * wayness, etc.
2054 2054 */
2055 2055 if (contig) {
2056 2056 off = imc->imc_gen_data->igd_rir_ileave_offset +
2057 2057 (rirno * imc->imc_gen_data->igd_rir_nileaves *
2058 2058 sizeof (uint32_t));
2059 2059 incr = sizeof (uint32_t);
2060 2060 } else {
2061 2061 off = imc->imc_gen_data->igd_rir_ileave_offset +
2062 2062 (rirno * sizeof (uint32_t));
2063 2063 incr = imc->imc_gen_data->igd_rir_nileaves * sizeof (uint32_t);
2064 2064 }
2065 2065 for (i = 0; i < rank->irle_nentries; i++, off += incr) {
2066 2066 uint32_t val;
2067 2067 uint64_t offset;
2068 2068 imc_rank_ileave_entry_t *ent = &rank->irle_entries[i];
2069 2069
2070 2070 val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
2071 2071 if (val == PCI_EINVAL32) {
2072 2072 chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
2073 2073 return;
2074 2074 }
2075 2075
2076 2076 switch (imc->imc_gen) {
2077 2077 case IMC_GEN_BROADWELL:
2078 2078 ent->irle_target = IMC_RIR_OFFSET_TARGET_BRD(val);
2079 2079 break;
2080 2080 default:
2081 2081 ent->irle_target = IMC_RIR_OFFSET_TARGET(val);
2082 2082 break;
2083 2083 }
2084 2084 if (imc->imc_gen >= IMC_GEN_HASWELL) {
2085 2085 offset = IMC_RIR_OFFSET_OFFSET_HAS_SKX(val);
2086 2086 } else {
2087 2087 offset = IMC_RIR_OFFSET_OFFSET_SNB_IVB(val);
2088 2088 }
2089 2089 ent->irle_offset = offset << IMC_RIR_OFFSET_SHIFT;
2090 2090 }
2091 2091 }
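The two register layouts described in the comment reduce to the following offset formulas, writing W for igd_rir_nileaves and using 4-byte registers:

	/*
	 * contiguous: off(rirno, entry) = base + (rirno * W + entry) * 4
	 * sparse:     off(rirno, entry) = base + (entry * W + rirno) * 4
	 */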
2092 2092
2093 2093 static void
2094 2094 imc_rir_read_wayness(imc_t *imc, imc_channel_t *chan)
2095 2095 {
2096 2096 uint_t i;
2097 2097 off_t off;
2098 2098
2099 2099 chan->ich_nrankileaves = imc->imc_gen_data->igd_rir_nways;
2100 2100 for (i = 0, off = imc->imc_gen_data->igd_rir_way_offset;
2101 2101 i < chan->ich_nrankileaves; i++, off += sizeof (uint32_t)) {
2102 2102 uint32_t val;
2103 2103 uint64_t lim;
2104 2104 imc_rank_ileave_t *ent = &chan->ich_rankileaves[i];
2105 2105
2106 2106 val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
2107 2107 if (val == PCI_EINVAL32) {
2108 2108 chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
2109 2109 return;
2110 2110 }
2111 2111
2112 2112 ent->irle_raw = val;
2113 2113 ent->irle_enabled = IMC_RIR_WAYNESS_ENABLED(val) != 0;
2114 2114 ent->irle_nways = 1 << IMC_RIR_WAYNESS_WAY(val);
2115 2115 ent->irle_nwaysbits = IMC_RIR_WAYNESS_WAY(val);
2116 2116 if (imc->imc_gen >= IMC_GEN_HASWELL) {
2117 2117 lim = IMC_RIR_LIMIT_HAS_SKX(val);
2118 2118 } else {
2119 2119 lim = IMC_RIR_LIMIT_SNB_IVB(val);
2120 2120 }
2121 2121
2122 2122 ent->irle_limit = (lim << IMC_RIR_LIMIT_SHIFT) +
2123 2123 IMC_RIR_LIMIT_EXCLUSIVE;
2124 2124
2125 2125 ent->irle_nentries = imc->imc_gen_data->igd_rir_nileaves;
2126 2126 if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
2127 2127 imc_rir_read_ileave_offsets(imc, chan, ent, i, B_FALSE);
2128 2128 } else {
2129 2129 imc_rir_read_ileave_offsets(imc, chan, ent, i, B_TRUE);
2130 2130 }
2131 2131 }
2132 2132 }
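For example, a raw wayness field of 2 yields irle_nwaysbits = 2 and irle_nways = 4. As with the SAD and TAD rules, the stored limit is converted to an exclusive bound by adding IMC_RIR_LIMIT_EXCLUSIVE after the shift.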
2133 2133
2134 2134 static void
2135 2135 imc_decoder_init_rir(imc_t *imc)
2136 2136 {
2137 2137 uint_t i;
2138 2138
2139 2139 for (i = 0; i < imc->imc_nsockets; i++) {
2140 2140 uint_t j;
2141 2141 imc_socket_t *sock = &imc->imc_sockets[i];
2142 2142
2143 2143 for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2144 2144 uint_t k;
2145 2145 imc_mc_t *mc = &sock->isock_imcs[j];
2146 2146
2147 2147 for (k = 0; k < mc->icn_nchannels; k++) {
2148 2148 imc_channel_t *chan = &mc->icn_channels[k];
2149 2149 imc_rir_read_wayness(imc, chan);
2150 2150 }
2151 2151 }
2152 2152 }
2153 2153 }
2154 2154
2155 2155 static cmi_errno_t
2156 2156 imc_mc_patounum(void *arg, uint64_t pa, uint8_t valid_hi, uint8_t valid_lo,
2157 2157 uint32_t synd, int syndtype, mc_unum_t *unump)
2158 2158 {
2159 2159 imc_t *imc = arg;
2160 2160 uint_t i;
2161 2161 imc_decode_state_t dec;
2162 2162
2163 2163 bzero(&dec, sizeof (dec));
2164 2164 if (!imc_decode_pa(imc, pa, &dec)) {
2165 2165 switch (dec.ids_fail) {
2166 2166 case IMC_DECODE_F_LEGACY_RANGE:
2167 2167 case IMC_DECODE_F_OUTSIDE_DRAM:
2168 2168 return (CMIERR_MC_NOTDIMMADDR);
2169 2169 default:
2170 2170 return (CMIERR_MC_BADSTATE);
2171 2171 }
2172 2172 }
2173 2173
2174 2174 unump->unum_board = 0;
2175 2175 /*
2176 2176 * The chip id needs to be in the order that the OS expects it, which
2177 2177 * may not be our order.
2178 2178 */
2179 2179 for (i = 0; i < imc->imc_nsockets; i++) {
2180 2180 if (imc->imc_spointers[i] == dec.ids_socket)
2181 2181 break;
2182 2182 }
2183 2183 if (i == imc->imc_nsockets) {
2184 2184 return (CMIERR_MC_BADSTATE);
2185 2185 }
2186 2186 unump->unum_chip = i;
2187 2187 unump->unum_mc = dec.ids_tadid;
2188 2188 unump->unum_chan = dec.ids_channelid;
2189 2189 unump->unum_cs = dec.ids_dimmid;
2190 2190 unump->unum_rank = dec.ids_rankid;
2191 2191 unump->unum_offset = dec.ids_rankaddr;
2192 2192 for (i = 0; i < MC_UNUM_NDIMM; i++) {
2193 2193 unump->unum_dimms[i] = MC_INVALNUM;
2194 2194 }
2195 2195
2196 2196 return (CMI_SUCCESS);
2197 2197 }
2198 2198
2199 2199 static cmi_errno_t
2200 2200 imc_mc_unumtopa(void *arg, mc_unum_t *unum, nvlist_t *nvl, uint64_t *pa)
2201 2201 {
2202 2202 return (CMIERR_UNKNOWN);
2203 2203 }
2204 2204
2205 2205 static const cmi_mc_ops_t imc_mc_ops = {
2206 2206 .cmi_mc_patounum = imc_mc_patounum,
2207 2207 .cmi_mc_unumtopa = imc_mc_unumtopa
2208 2208 };
2209 2209
2210 2210 /*
2211 2211 * This is where we really finish attaching and become open for business. This
2212 2212 * occurs once we have all of the expected stubs attached. Here's where all of
2213 2213 * the real fun begins.
2214 2214 */
2215 2215 static void
2216 2216 imc_attach_complete(void *arg)
2217 2217 {
2218 2218 imc_t *imc = arg;
2219 2219 cmi_errno_t err;
2220 2220
2221 2221 imc_set_gen_data(imc);
2222 2222
2223 2223 /*
2224 2224 * On SKX and newer, we can fail to map PCI buses at this point due to
2225 2225 * bad PCIe reads.
2226 2226 */
2227 2227 if (!imc_map_stubs(imc)) {
2228 2228 goto done;
2229 2229 }
2230 2230
2231 2231 if (!imc_validate_stubs(imc)) {
2232 2232 imc->imc_flags |= IMC_F_VALIDATE_FAILED;
2233 2233 goto done;
2234 2234 }
2235 2235
2236 2236 imc_fixup_stubs(imc);
2237 2237 imc_map_sockets(imc);
2238 2238
2239 2239 if (!imc_create_minors(imc)) {
2240 2240 goto done;
2241 2241 }
2242 2242
2243 2243 imc_fill_data(imc);
2244 2244 imc_nvl_create(imc);
2245 2245
2246 2246 /*
2247 2247 * Gather additional information that we need so that we can properly
2248 2248 * initialize the memory decoder and encoder.
2249 2249 */
2250 2250 imc_decoder_init_sad(imc);
2251 2251 imc_decoder_init_tad(imc);
2252 2252 imc_decoder_init_rir(imc);
2253 2253
2254 2254 /*
2255 2255 * Register decoder functions. This may fail. If so, try and complain
2256 2256 * loudly, but stay active to allow other data to be useful. Register a
2257 2257 * global handle.
2258 2258 */
2259 2259 if ((err = cmi_mc_register_global(&imc_mc_ops, imc)) != CMI_SUCCESS) {
2260 2260 imc->imc_flags |= IMC_F_MCREG_FAILED;
2261 2261 dev_err(imc->imc_dip, CE_WARN, "failed to register memory "
2262 2262 "decoding operations: 0x%x", err);
2263 2263 }
2264 2264
2265 2265 done:
2266 2266 mutex_enter(&imc->imc_lock);
2267 2267 	imc->imc_flags &= ~IMC_F_ATTACH_DISPATCHED;
2268 2268 imc->imc_flags |= IMC_F_ATTACH_COMPLETE;
2269 2269 mutex_exit(&imc->imc_lock);
2270 2270 }
2271 2271
2272 2272 static int
2273 2273 imc_stub_comparator(const void *l, const void *r)
2274 2274 {
2275 2275 const imc_stub_t *sl = l, *sr = r;
2276 2276 if (sl->istub_bus > sr->istub_bus)
2277 2277 return (1);
2278 2278 if (sl->istub_bus < sr->istub_bus)
2279 2279 return (-1);
2280 2280 if (sl->istub_dev > sr->istub_dev)
2281 2281 return (1);
2282 2282 if (sl->istub_dev < sr->istub_dev)
2283 2283 return (-1);
2284 2284 if (sl->istub_func > sr->istub_func)
2285 2285 return (1);
2286 2286 if (sl->istub_func < sr->istub_func)
2287 2287 return (-1);
2288 2288 return (0);
2289 2289 }
2290 2290
2291 2291 static int
2292 2292 imc_stub_scan_cb(dev_info_t *dip, void *arg)
2293 2293 {
2294 2294 int vid, did;
2295 2295 const imc_stub_table_t *table;
2296 2296 imc_t *imc = arg;
2297 2297 int *regs;
2298 2298 uint_t i, nregs;
2299 2299
2300 2300 if (dip == ddi_root_node()) {
2301 2301 return (DDI_WALK_CONTINUE);
2302 2302 }
2303 2303
2304 2304 /*
2305 2305 * Get the dev info name. PCI devices will always be children of PCI
2306 2306 * devices today on x86. If we reach something that has a device name
2307 2307  * that's not PCI, then we can prune its children.
2308 2308 */
2309 2309 if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2310 2310 return (DDI_WALK_PRUNECHILD);
2311 2311 }
2312 2312
2313 2313 /*
2314 2314 * Get the device and vendor ID and see if this is something the imc
2315 2315 * knows about or cares about.
2316 2316 */
2317 2317 vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2318 2318 "vendor-id", PCI_EINVAL16);
2319 2319 did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2320 2320 "device-id", PCI_EINVAL16);
2321 2321 if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2322 2322 return (DDI_WALK_CONTINUE);
2323 2323 }
2324 2324
2325 2325 if (vid != IMC_PCI_VENDOR_INTC) {
2326 2326 return (DDI_WALK_PRUNECHILD);
2327 2327 }
2328 2328
2329 2329 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2330 2330 "reg", ®s, &nregs) != DDI_PROP_SUCCESS) {
2331 2331 return (DDI_WALK_CONTINUE);
2332 2332 }
2333 2333
2334 2334 if (nregs == 0) {
2335 2335 ddi_prop_free(regs);
2336 2336 return (DDI_WALK_CONTINUE);
2337 2337 }
2338 2338
2339 2339
2340 2340 table = NULL;
2341 2341 for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2342 2342 if (imc_stub_table[i].imcs_devid == did &&
2343 2343 imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2344 2344 imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2345 2345 table = &imc_stub_table[i];
2346 2346 break;
2347 2347 }
2348 2348 }
2349 2349 ddi_prop_free(regs);
2350 2350
2351 2351 /*
2352 2352 * Not a match, not interesting.
2353 2353 */
2354 2354 if (table == NULL) {
2355 2355 return (DDI_WALK_CONTINUE);
2356 2356 }
2357 2357
2358 2358 mutex_enter(&imc->imc_lock);
2359 2359 imc->imc_nscanned++;
2360 2360 mutex_exit(&imc->imc_lock);
2361 2361
2362 2362 return (DDI_WALK_CONTINUE);
2363 2363 }
2364 2364
2365 2365 /*
2366 2366  * From here, walk the device tree and count how many of the devices that we know about are present.
2367 2367 */
2368 2368 static void
2369 2369 imc_stub_scan(void *arg)
2370 2370 {
2371 2371 imc_t *imc = arg;
2372 2372 boolean_t dispatch = B_FALSE;
2373 2373
2374 2374 /*
2375 2375 * Zero out the scan results in case we've been detached and reattached.
2376 2376 */
2377 2377 mutex_enter(&imc->imc_lock);
2378 2378 imc->imc_nscanned = 0;
2379 2379 mutex_exit(&imc->imc_lock);
2380 2380
2381 2381 ddi_walk_devs(ddi_root_node(), imc_stub_scan_cb, imc);
2382 2382
2383 2383 mutex_enter(&imc->imc_lock);
2384 2384 imc->imc_flags |= IMC_F_SCAN_COMPLETE;
2385 2385 imc->imc_flags &= ~IMC_F_SCAN_DISPATCHED;
2386 2386
2387 2387 /*
2388 2388 * If the scan found no nodes, then that means that we're on a hardware
2389 2389 * platform that we don't support. Therefore, there's no reason to do
2390 2390 * anything here.
2391 2391 */
2392 2392 if (imc->imc_nscanned == 0) {
2393 2393 imc->imc_flags |= IMC_F_UNSUP_PLATFORM;
2394 2394 mutex_exit(&imc->imc_lock);
2395 2395 return;
2396 2396 }
2397 2397
2398 2398 if (avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2399 2399 imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2400 2400 dispatch = B_TRUE;
2401 2401 }
2402 2402
2403 2403 mutex_exit(&imc->imc_lock);
2404 2404
2405 2405 if (dispatch) {
2406 2406 (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2407 2407 imc, DDI_SLEEP);
2408 2408 }
2409 2409 }
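Note the handshake between the scan and stub attach paths: imc_attach_complete() is dispatched by whichever side finishes last. If every stub attached before the scan completed, the dispatch happens here; if the scan completed first, it happens in imc_attach_stub(). Both sides test the same condition, that the number of attached stubs equals imc_nscanned, while holding imc_lock.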
2410 2410
2411 2411 /*
2412 2412 * By default, refuse to allow stubs to detach.
2413 2413 */
2414 2414 int
2415 2415 imc_detach_stub(dev_info_t *dip, ddi_detach_cmd_t cmd)
2416 2416 {
2417 2417 imc_stub_t *stub;
2418 2418 imc_t *imc = imc_data;
2419 2419
2420 2420 mutex_enter(&imc->imc_lock);
2421 2421
2422 2422 /*
2423 2423 * By default, we do not allow stubs to detach. However, if the driver
2424 2424 * has attached to devices on a platform it doesn't recognize or
2425 2425 * support or if the override flag has been set, then allow detach to
2426 2426 * proceed.
2427 2427 */
2428 2428 if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) == 0 &&
2429 2429 imc_allow_detach == 0) {
2430 2430 mutex_exit(&imc->imc_lock);
2431 2431 return (DDI_FAILURE);
2432 2432 }
2433 2433
2434 2434 for (stub = avl_first(&imc->imc_stubs); stub != NULL;
2435 2435 stub = AVL_NEXT(&imc->imc_stubs, stub)) {
2436 2436 if (stub->istub_dip == dip) {
2437 2437 break;
2438 2438 }
2439 2439 }
2440 2440
2441 2441 /*
2442 2442 * A device was attached to us that we somehow don't know about. Allow
2443 2443 * this to proceed.
2444 2444 */
2445 2445 if (stub == NULL) {
2446 2446 mutex_exit(&imc->imc_lock);
2447 2447 return (DDI_SUCCESS);
2448 2448 }
2449 2449
2450 2450 pci_config_teardown(&stub->istub_cfgspace);
2451 2451 avl_remove(&imc->imc_stubs, stub);
2452 2452 kmem_free(stub, sizeof (imc_stub_t));
2453 2453 mutex_exit(&imc->imc_lock);
2454 2454
2455 2455 return (DDI_SUCCESS);
2456 2456 }
2457 2457
2458 2458 int
2459 2459 imc_attach_stub(dev_info_t *dip, ddi_attach_cmd_t cmd)
2460 2460 {
2461 2461 imc_stub_t *stub, *lookup;
2462 2462 int did, vid, *regs;
2463 2463 uint_t i, nregs;
2464 2464 const imc_stub_table_t *table;
2465 2465 avl_index_t idx;
2466 2466 boolean_t dispatch = B_FALSE;
2467 2467 imc_t *imc = imc_data;
2468 2468
2469 2469 if (cmd != DDI_ATTACH) {
2470 2470 return (DDI_FAILURE);
2471 2471 }
2472 2472
2473 2473 /*
2474 2474 * We've been asked to attach a stub. First, determine if this is even a
2475 2475 * PCI device that we should care about. Then, append it to our global
2476 2476 * list and kick off the configuration task. Note that we do this
2477 2477 * configuration task in a taskq so that we don't interfere with the
2478 2478 * normal attach / detach path processing.
2479 2479 */
2480 2480 if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2481 2481 return (DDI_FAILURE);
2482 2482 }
2483 2483
2484 2484 /*
2485 2485 * Get the device and vendor ID and see if this is something the imc
2486 2486 * knows about or cares about.
2487 2487 */
2488 2488 vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2489 2489 "vendor-id", PCI_EINVAL16);
2490 2490 did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2491 2491 "device-id", PCI_EINVAL16);
2492 2492 if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2493 2493 return (DDI_FAILURE);
2494 2494 }
2495 2495
2496 2496 /*
2497 2497 * Only accept INTC parts on the imc driver.
2498 2498 */
2499 2499 if (vid != IMC_PCI_VENDOR_INTC) {
2500 2500 return (DDI_FAILURE);
2501 2501 }
2502 2502
2503 2503 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2504 2504 "reg", ®s, &nregs) != DDI_PROP_SUCCESS) {
2505 2505 return (DDI_FAILURE);
2506 2506 }
2507 2507
2508 2508 if (nregs == 0) {
2509 2509 ddi_prop_free(regs);
2510 2510 return (DDI_FAILURE);
2511 2511 }
2512 2512
2513 2513 /*
2514 2514 * Determine if this matches a known device.
2515 2515 */
2516 2516 table = NULL;
2517 2517 for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2518 2518 if (imc_stub_table[i].imcs_devid == did &&
2519 2519 imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2520 2520 imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2521 2521 table = &imc_stub_table[i];
2522 2522 break;
2523 2523 }
2524 2524 }
2525 2525
2526 2526 if (i == ARRAY_SIZE(imc_stub_table)) {
2527 2527 ddi_prop_free(regs);
2528 2528 return (DDI_FAILURE);
2529 2529 }
2530 2530
2531 2531 /*
2532 2532 * We've found something. Make sure the generation matches our current
2533 2533 * one. If it does, construct the entry and append it to the list.
2534 2534 */
2535 2535 mutex_enter(&imc->imc_lock);
2536 2536 if (imc->imc_gen != IMC_GEN_UNKNOWN && imc->imc_gen !=
2537 2537 table->imcs_gen) {
2538 2538 mutex_exit(&imc->imc_lock);
2539 2539 ddi_prop_free(regs);
2540 2540 dev_err(dip, CE_WARN, "Encountered IMC stub device (%u/%u) "
2541 2541 "that has different hardware generation (%u) from current "
2542 2542 "generation (%u)", vid, did, table->imcs_gen, imc->imc_gen);
2543 2543 return (DDI_FAILURE);
2544 2544 } else {
2545 2545 imc->imc_gen = table->imcs_gen;
2546 2546 }
2547 2547 mutex_exit(&imc->imc_lock);
2548 2548
2549 2549 stub = kmem_zalloc(sizeof (imc_stub_t), KM_SLEEP);
2550 2550 stub->istub_dip = dip;
2551 2551 stub->istub_vid = vid;
2552 2552 stub->istub_did = did;
2553 2553 stub->istub_bus = PCI_REG_BUS_G(regs[0]);
2554 2554 stub->istub_dev = PCI_REG_DEV_G(regs[0]);
2555 2555 stub->istub_func = PCI_REG_FUNC_G(regs[0]);
2556 2556 ddi_prop_free(regs);
2557 2557 stub->istub_table = table;
2558 2558
2559 2559 if (pci_config_setup(dip, &stub->istub_cfgspace) != DDI_SUCCESS) {
2560 2560 		kmem_free(stub, sizeof (imc_stub_t));
2561 2561 dev_err(dip, CE_WARN, "Failed to set up PCI config space "
2562 2562 "for IMC stub device %s (%u/%u)", ddi_node_name(dip),
2563 2563 vid, did);
2564 2564 return (DDI_FAILURE);
2565 2565 }
2566 2566
2567 2567 mutex_enter(&imc->imc_lock);
2568 2568 if ((lookup = avl_find(&imc->imc_stubs, stub, &idx)) != NULL) {
2569 2569 dev_err(dip, CE_WARN, "IMC stub %s (%u/%u) has duplicate "
2570 2570 "bdf %u/%u/%u with %s (%u/%u), not attaching",
2571 2571 ddi_node_name(imc->imc_dip), vid, did,
2572 2572 stub->istub_bus, stub->istub_dev, stub->istub_func,
2573 2573 ddi_node_name(lookup->istub_dip), lookup->istub_vid,
2574 2574 lookup->istub_did);
2575 2575 mutex_exit(&imc->imc_lock);
2576 2576 pci_config_teardown(&stub->istub_cfgspace);
2577 2577 		kmem_free(stub, sizeof (imc_stub_t));
2578 2578
2579 2579 return (DDI_FAILURE);
2580 2580 }
2581 2581 avl_insert(&imc->imc_stubs, stub, idx);
2582 2582
2583 2583 if ((imc->imc_flags & IMC_F_ALL_FLAGS) == IMC_F_SCAN_COMPLETE &&
2584 2584 avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2585 2585 imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2586 2586 dispatch = B_TRUE;
2587 2587 }
2588 2588 mutex_exit(&imc->imc_lock);
2589 2589
2590 2590 if (dispatch) {
2591 2591 (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2592 2592 imc, DDI_SLEEP);
2593 2593 }
2594 2594
2595 2595 return (DDI_SUCCESS);
2596 2596 }
2597 2597
2598 2598 static int
2599 2599 imc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2600 2600 {
2601 2601 imc_t *imc = imc_data;
2602 2602
2603 2603 if ((flag & (FEXCL | FNDELAY)) != 0)
2604 2604 return (EINVAL);
2605 2605
2606 2606 if (otyp != OTYP_CHR)
2607 2607 return (EINVAL);
2608 2608
2609 2609 mutex_enter(&imc->imc_lock);
2610 2610
2611 2611 if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) != 0) {
2612 2612 mutex_exit(&imc->imc_lock);
2613 2613 return (ENOTSUP);
2614 2614 }
2615 2615
2616 2616 /*
2617 2617 * It's possible that someone has come in during the window between when
2618 2618 * we've created the minor node and when we've finished doing work.
2619 2619 */
2620 2620 if ((imc->imc_flags & IMC_F_ATTACH_COMPLETE) == 0) {
2621 2621 mutex_exit(&imc->imc_lock);
2622 2622 return (EAGAIN);
2623 2623 }
2624 2624
2625 2625 /*
2626 2626 * It's not clear how someone would get a minor that we didn't create.
2627 2627 * But be paranoid and make sure.
2628 2628 */
2629 2629 if (getminor(*devp) >= imc->imc_nsockets) {
2630 2630 mutex_exit(&imc->imc_lock);
2631 2631 return (EINVAL);
2632 2632 }
2633 2633
2634 2634 /*
2635 2635 * Make sure this socket entry has been filled in.
2636 2636 */
2637 2637 if (imc->imc_spointers[getminor(*devp)] == NULL) {
2638 2638 mutex_exit(&imc->imc_lock);
2639 2639 return (EINVAL);
2640 2640 }
2641 2641
2642 2642 mutex_exit(&imc->imc_lock);
2643 2643
2644 2644 return (0);
2645 2645 }
2646 2646
2647 2647 static void
2648 2648 imc_ioctl_decode(imc_t *imc, mc_encode_ioc_t *encode)
2649 2649 {
2650 2650 imc_decode_state_t dec;
2651 2651 uint_t i;
2652 2652
2653 2653 bzero(&dec, sizeof (dec));
2654 2654 if (!imc_decode_pa(imc, encode->mcei_pa, &dec)) {
2655 2655 encode->mcei_err = (uint32_t)dec.ids_fail;
2656 2656 encode->mcei_errdata = dec.ids_fail_data;
2657 2657 return;
2658 2658 }
2659 2659
2660 2660 encode->mcei_errdata = 0;
2661 2661 encode->mcei_err = 0;
2662 2662 encode->mcei_board = 0;
2663 2663 for (i = 0; i < imc->imc_nsockets; i++) {
2664 2664 if (imc->imc_spointers[i] == dec.ids_socket)
2665 2665 break;
2666 2666 }
2667 2667 encode->mcei_chip = i;
2668 2668 encode->mcei_mc = dec.ids_tadid;
2669 2669 encode->mcei_chan = dec.ids_channelid;
2670 2670 encode->mcei_dimm = dec.ids_dimmid;
2671 2671 encode->mcei_rank_addr = dec.ids_rankaddr;
2672 2672 encode->mcei_rank = dec.ids_rankid;
2673 2673 encode->mcei_row = UINT32_MAX;
2674 2674 encode->mcei_column = UINT32_MAX;
2675 2675 encode->mcei_pad = 0;
2676 2676 }
2677 2677
2678 2678 static int
2679 2679 imc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2680 2680 int *rvalp)
2681 2681 {
2682 2682 int ret;
2683 2683 minor_t m;
2684 2684 mc_snapshot_info_t info;
2685 2685 mc_encode_ioc_t encode;
2686 2686 imc_t *imc = imc_data;
2687 2687 imc_socket_t *sock;
2688 2688
2689 2689 mutex_enter(&imc->imc_lock);
2690 2690 m = getminor(dev);
2691 2691 if (m >= imc->imc_nsockets) {
2692 2692 ret = EINVAL;
2693 2693 goto done;
2694 2694 }
2695 2695 sock = imc->imc_spointers[m];
2696 2696 if (sock == NULL) {
2697 2697 ret = EINVAL;
2698 2698 goto done;
2699 2699 }
2700 2700
2701 2701 /*
2702 2702 * Note, other memory controller drivers don't check mode for reading
2703 2703 * data nor do they care who can read it from a credential perspective.
2704 2704 * As such we don't either at this time.
2705 2705 */
2706 2706 switch (cmd) {
2707 2707 case MC_IOC_SNAPSHOT_INFO:
2708 2708 imc_nvl_pack(sock, B_FALSE);
2709 2709 if (sock->isock_buf == NULL) {
2710 2710 ret = EIO;
2711 2711 break;
2712 2712 }
2713 2713
2714 2714 info.mcs_size = sock->isock_buflen;
2715 2715 info.mcs_gen = sock->isock_gen;
2716 2716
2717 2717 if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2718 2718 ret = EFAULT;
2719 2719 break;
2720 2720 }
2721 2721
2722 2722 ret = 0;
2723 2723 break;
2724 2724 case MC_IOC_SNAPSHOT:
2725 2725 imc_nvl_pack(sock, B_FALSE);
2726 2726 if (sock->isock_buf == NULL) {
2727 2727 ret = EIO;
2728 2728 break;
2729 2729 }
2730 2730
2731 2731 if (ddi_copyout(sock->isock_buf, (void *)arg,
2732 2732 sock->isock_buflen, mode) != 0) {
2733 2733 ret = EFAULT;
2734 2734 break;
2735 2735 }
2736 2736
2737 2737 ret = 0;
2738 2738 break;
2739 2739 case MC_IOC_DECODE_SNAPSHOT_INFO:
2740 2740 imc_decoder_pack(imc);
2741 2741 if (imc->imc_decoder_buf == NULL) {
2742 2742 ret = EIO;
2743 2743 break;
2744 2744 }
2745 2745
2746 2746 info.mcs_size = imc->imc_decoder_len;
2747 2747 info.mcs_gen = imc->imc_spointers[0]->isock_gen;
2748 2748
2749 2749 if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2750 2750 ret = EFAULT;
2751 2751 break;
2752 2752 }
2753 2753
2754 2754 ret = 0;
2755 2755 break;
2756 2756 case MC_IOC_DECODE_SNAPSHOT:
2757 2757 imc_decoder_pack(imc);
2758 2758 if (imc->imc_decoder_buf == NULL) {
2759 2759 ret = EIO;
2760 2760 break;
2761 2761 }
2762 2762
2763 2763 if (ddi_copyout(imc->imc_decoder_buf, (void *)arg,
2764 2764 imc->imc_decoder_len, mode) != 0) {
2765 2765 ret = EFAULT;
2766 2766 break;
2767 2767 }
2768 2768
2769 2769 ret = 0;
2770 2770 break;
2771 2771 case MC_IOC_DECODE_PA:
2772 2772 if (crgetzoneid(credp) != GLOBAL_ZONEID ||
2773 2773 drv_priv(credp) != 0) {
2774 2774 ret = EPERM;
2775 2775 break;
2776 2776 }
2777 2777
2778 2778 if (ddi_copyin((void *)arg, &encode, sizeof (encode),
2779 2779 mode & FKIOCTL) != 0) {
2780 2780 ret = EPERM;
2781 2781 break;
2782 2782 }
2783 2783
2784 2784 imc_ioctl_decode(imc, &encode);
2785 2785 ret = 0;
2786 2786
2787 2787 if (ddi_copyout(&encode, (void *)arg, sizeof (encode),
2788 2788 mode & FKIOCTL) != 0) {
2789 2789 ret = EPERM;
2790 2790 break;
2791 2791 }
2792 2792 break;
2793 2793 default:
2794 2794 ret = EINVAL;
2795 2795 goto done;
2796 2796 }
2797 2797
2798 2798 done:
2799 2799 mutex_exit(&imc->imc_lock);
2800 2800 return (ret);
2801 2801 }
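The snapshot ioctls follow the usual two-step protocol: a consumer first asks for the size and generation, then allocates a buffer and fetches the packed nvlist. A minimal userland sketch, assuming only the structures visible above (device open and full error handling elided):

	mc_snapshot_info_t info;
	char *buf;

	if (ioctl(fd, MC_IOC_SNAPSHOT_INFO, &info) != 0)
		err(EXIT_FAILURE, "failed to get snapshot info");
	if ((buf = malloc(info.mcs_size)) == NULL)
		err(EXIT_FAILURE, "failed to allocate snapshot buffer");
	if (ioctl(fd, MC_IOC_SNAPSHOT, buf) != 0)
		err(EXIT_FAILURE, "failed to read snapshot");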
2802 2802
2803 2803 static int
2804 2804 imc_close(dev_t dev, int flag, int otyp, cred_t *credp)
2805 2805 {
2806 2806 return (0);
2807 2807 }
2808 2808
2809 2809 static int
2810 2810 imc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2811 2811 {
2812 2812 if (cmd != DDI_ATTACH) {
2813 2813 return (DDI_FAILURE);
2814 2814 }
2815 2815
2816 2816 if (imc_data == NULL || imc_data->imc_dip != NULL) {
2817 2817 return (DDI_FAILURE);
2818 2818 }
2819 2819
2820 2820 mutex_enter(&imc_data->imc_lock);
2821 2821 if ((imc_data->imc_taskq = ddi_taskq_create(dip, "imc", 1,
2822 2822 TASKQ_DEFAULTPRI, 0)) == NULL) {
2823 2823 mutex_exit(&imc_data->imc_lock);
2824 2824 return (DDI_FAILURE);
2825 2825 }
2826 2826
2827 2827 imc_data->imc_dip = dip;
2828 2828 imc_data->imc_flags |= IMC_F_SCAN_DISPATCHED;
2829 2829 mutex_exit(&imc_data->imc_lock);
2830 2830
2831 2831 (void) ddi_taskq_dispatch(imc_data->imc_taskq, imc_stub_scan, imc_data,
2832 2832 DDI_SLEEP);
2833 2833
2834 2834 return (DDI_SUCCESS);
2835 2835 }
2836 2836
2837 2837 /*
2838 2838 * We only export a single instance.
2839 2839 */
2840 2840 static int
2841 2841 imc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
2842 2842 {
2843 2843 /*
2844 2844 * getinfo(9E) shouldn't be called if we're not attached. But be
2845 2845 * paranoid.
2846 2846 */
2847 2847 if (imc_data == NULL || imc_data->imc_dip == NULL) {
2848 2848 return (DDI_FAILURE);
2849 2849 }
2850 2850
2851 2851 switch (infocmd) {
2852 2852 case DDI_INFO_DEVT2DEVINFO:
2853 2853 *resultp = imc_data->imc_dip;
2854 2854 break;
2855 2855 case DDI_INFO_DEVT2INSTANCE:
2856 2856 *resultp = (void *)0;
2857 2857 break;
2858 2858 default:
2859 2859 return (DDI_FAILURE);
2860 2860 }
2861 2861
2862 2862 return (DDI_SUCCESS);
2863 2863 }
2864 2864
2865 2865 static int
2866 2866 imc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2867 2867 {
2868 2868 if (cmd != DDI_DETACH) {
2869 2869 return (DDI_FAILURE);
2870 2870 }
2871 2871
2872 2872 	if (imc_data == NULL || imc_data->imc_dip == NULL) {
2873 2873 return (DDI_FAILURE);
2874 2874 }
2875 2875
2876 2876 mutex_enter(&imc_data->imc_lock);
2877 2877
2878 2878 /*
2879 2879 * While a scan or attach is outstanding, don't allow us to detach.
2880 2880 */
2881 2881 if ((imc_data->imc_flags &
2882 2882 (IMC_F_SCAN_DISPATCHED | IMC_F_ATTACH_DISPATCHED)) != 0) {
2883 2883 mutex_exit(&imc_data->imc_lock);
2884 2884 return (DDI_FAILURE);
2885 2885 }
2886 2886
2887 2887 /*
2888 2888 * Because the stub driver depends on the imc driver, we shouldn't be
2889 2889 * able to have any entries in this list when we detach. However, we
2890 2890 * check just to make sure.
2891 2891 */
2892 2892 if (!avl_is_empty(&imc_data->imc_stubs)) {
2893 2893 mutex_exit(&imc_data->imc_lock);
2894 2894 return (DDI_FAILURE);
2895 2895 }
2896 2896
2897 2897 nvlist_free(imc_data->imc_decoder_dump);
2898 2898 imc_data->imc_decoder_dump = NULL;
2899 2899 if (imc_data->imc_decoder_buf != NULL) {
2900 2900 kmem_free(imc_data->imc_decoder_buf, imc_data->imc_decoder_len);
2901 2901 imc_data->imc_decoder_buf = NULL;
2902 2902 imc_data->imc_decoder_len = 0;
2903 2903 }
2904 2904
2905 2905 ddi_remove_minor_node(imc_data->imc_dip, NULL);
2906 2906 imc_data->imc_dip = NULL;
2907 2907 mutex_exit(&imc_data->imc_lock);
2908 2908
2909 2909 ddi_taskq_wait(imc_data->imc_taskq);
2910 2910 ddi_taskq_destroy(imc_data->imc_taskq);
2911 2911 imc_data->imc_taskq = NULL;
2912 2912
2913 2913 return (DDI_SUCCESS);
2914 2914 }
2915 2915
2916 2916 static void
2917 2917 imc_free(void)
2918 2918 {
2919 2919 if (imc_data == NULL) {
2920 2920 return;
2921 2921 }
2922 2922
2923 2923 VERIFY(avl_is_empty(&imc_data->imc_stubs));
2924 2924 avl_destroy(&imc_data->imc_stubs);
2925 2925 mutex_destroy(&imc_data->imc_lock);
2926 2926 kmem_free(imc_data, sizeof (imc_t));
2927 2927 imc_data = NULL;
2928 2928 }
2929 2929
2930 2930 static void
2931 2931 imc_alloc(void)
2932 2932 {
2933 2933 imc_data = kmem_zalloc(sizeof (imc_t), KM_SLEEP);
2934 2934
2935 2935 mutex_init(&imc_data->imc_lock, NULL, MUTEX_DRIVER, NULL);
2936 2936 avl_create(&imc_data->imc_stubs, imc_stub_comparator,
2937 2937 sizeof (imc_stub_t), offsetof(imc_stub_t, istub_link));
2938 2938 }
2939 2939
2940 2940 static struct cb_ops imc_cb_ops = {
2941 2941 .cb_open = imc_open,
2942 2942 .cb_close = imc_close,
2943 2943 .cb_strategy = nodev,
2944 2944 .cb_print = nodev,
2945 2945 .cb_dump = nodev,
2946 2946 .cb_read = nodev,
2947 2947 .cb_write = nodev,
2948 2948 .cb_ioctl = imc_ioctl,
2949 2949 .cb_devmap = nodev,
2950 2950 .cb_mmap = nodev,
2951 2951 .cb_segmap = nodev,
2952 2952 .cb_chpoll = nochpoll,
2953 2953 .cb_prop_op = ddi_prop_op,
2954 2954 .cb_flag = D_MP,
2955 2955 .cb_rev = CB_REV,
2956 2956 .cb_aread = nodev,
2957 2957 .cb_awrite = nodev
2958 2958 };
2959 2959
2960 2960 static struct dev_ops imc_dev_ops = {
2961 2961 .devo_rev = DEVO_REV,
2962 2962 .devo_refcnt = 0,
2963 2963 .devo_getinfo = imc_getinfo,
2964 2964 .devo_identify = nulldev,
2965 2965 .devo_probe = nulldev,
2966 2966 .devo_attach = imc_attach,
2967 2967 .devo_detach = imc_detach,
2968 2968 .devo_reset = nodev,
2969 2969 .devo_cb_ops = &imc_cb_ops,
2970 2970 .devo_quiesce = ddi_quiesce_not_needed
2971 2971 };
2972 2972
2973 2973 static struct modldrv imc_modldrv = {
2974 2974 .drv_modops = &mod_driverops,
2975 2975 .drv_linkinfo = "Intel Integrated Memory Controller Driver",
2976 2976 .drv_dev_ops = &imc_dev_ops
2977 2977 };
2978 2978
2979 2979 static struct modlinkage imc_modlinkage = {
2980 2980 .ml_rev = MODREV_1,
2981 2981 .ml_linkage = { &imc_modldrv, NULL }
2982 2982 };
2983 2983
2984 2984 int
2985 2985 _init(void)
2986 2986 {
2987 2987 int ret;
2988 2988
2989 2989 if ((ret = mod_install(&imc_modlinkage)) == 0) {
2990 2990 imc_alloc();
2991 2991 }
2992 2992
2993 2993 return (ret);
2994 2994 }
2995 2995
2996 2996 int
2997 2997 _info(struct modinfo *modinfop)
2998 2998 {
2999 2999 return (mod_info(&imc_modlinkage, modinfop));
3000 3000 }
3001 3001
3002 3002 int
3003 3003 _fini(void)
3004 3004 {
3005 3005 int ret;
3006 3006
3007 3007 if ((ret = mod_remove(&imc_modlinkage)) == 0) {
3008 3008 imc_free();
3009 3009 }
3010 3010 return (ret);
3011 3011 }