1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
14 * Copyright 2019 Joyent, Inc.
15 * Copyright 2017 Tegile Systems, Inc. All rights reserved.
16 * Copyright 2020 RackTop Systems, Inc.
17 * Copyright 2020 Ryan Zezeski
18 */
19
20 /*
21 * i40e - Intel 10/40 Gb Ethernet driver
22 *
23 * The i40e driver is the main software device driver for the Intel 40 Gb family
24 * of devices. Note that these devices come in many flavors with both 40 GbE
25 * ports and 10 GbE ports. This device is the successor to the 82599 family of
26 * devices (ixgbe).
27 *
28 * Unlike previous generations of Intel 1 GbE and 10 GbE devices, the 40 GbE
29 * devices defined in the XL710 controller (previously known as Fortville) are a
30 * rather different beast and have a small switch embedded inside of them. In
31 * addition, the way that most of the programming is done has been overhauled.
32 * As opposed to just using PCIe memory mapped registers, it also has an
33 * administrative queue which is used to communicate with firmware running on
34 * the chip.
35 *
36 * Each physical function in the hardware shows up as a device that this driver
37 * will bind to. The hardware splits many resources evenly across all of the
38 * physical functions present on the device, while other resources are instead
 * shared across the entire card and it's up to the device driver to
40 * intelligently partition them.
41 *
42 * ------------
43 * Organization
44 * ------------
45 *
46 * This driver is made up of several files which have their own theory
47 * statements spread across them. We'll touch on the high level purpose of each
48 * file here, and then we'll get into more discussion on how the device is
49 * generally modelled with respect to the interfaces in illumos.
50 *
51 * i40e_gld.c: This file contains all of the bindings to MAC and the networking
52 * stack.
53 *
54 * i40e_intr.c: This file contains all of the interrupt service routines and
55 * contains logic to enable and disable interrupts on the hardware.
56 * It also contains the logic to map hardware resources such as the
57 * rings to and from interrupts and controls their ability to fire.
58 *
59 * There is a big theory statement on interrupts present there.
60 *
61 * i40e_main.c: The file that you're currently in. It interfaces with the
62 * traditional OS DDI interfaces and is in charge of configuring
63 * the device.
64 *
65 * i40e_osdep.[ch]: These files contain interfaces and definitions needed to
66 * work with Intel's common code for the device.
67 *
68 * i40e_stats.c: This file contains the general work and logic around our
69 * kstats. A theory statement on their organization and use of the
70 * hardware exists there.
71 *
72 * i40e_sw.h: This header file contains all of the primary structure definitions
73 * and constants that are used across the entire driver.
74 *
75 * i40e_transceiver.c: This file contains all of the logic for sending and
76 * receiving data. It contains all of the ring and DMA
77 * allocation logic, as well as, the actual interfaces to
78 * send and receive data.
79 *
80 * A big theory statement on ring management, descriptors,
81 * and how it ties into the OS is present there.
82 *
83 * --------------
84 * General Design
85 * --------------
86 *
87 * Before we go too far into the general way we've laid out data structures and
88 * the like, it's worth taking some time to explain how the hardware is
89 * organized. This organization informs a lot of how we do things at this time
90 * in the driver.
91 *
 * Each physical device consists of one or more ports, which are considered
 * physical functions in the PCI sense and thus each get enumerated by the
 * system, resulting in an instance being created and attached to. While
 * there are many resources that are unique to each physical function, i.e.
 * instance of the device, there are many that are shared across all of them.
97 * Several resources have an amount reserved for each Virtual Station Interface
98 * (VSI) and then a static pool of resources, available for all functions on the
99 * card.
100 *
 * The most important resources in hardware are its transmit and receive queue
102 * pairs (i40e_trqpair_t). These should be thought of as rings in GLDv3
103 * parlance. There are a set number of these on each device; however, they are
104 * statically partitioned among all of the different physical functions.
105 *
106 * 'Fortville' (the code name for this device family) is basically a switch. To
107 * map MAC addresses and other things to queues, we end up having to create
108 * Virtual Station Interfaces (VSIs) and establish forwarding rules that direct
109 * traffic to a queue. A VSI owns a collection of queues and has a series of
110 * forwarding rules that point to it. One way to think of this is to treat it
111 * like MAC does a VNIC. When MAC refers to a group, a collection of rings and
112 * classification resources, that is a VSI in i40e.
113 *
 * The set of VSIs is shared across the entire device, though there may be some
115 * amount that are reserved to each PF. Because the GLDv3 does not let us change
116 * the number of groups dynamically, we instead statically divide this amount
117 * evenly between all the functions that exist. In addition, we have the same
118 * problem with the mac address forwarding rules. There are a static number that
119 * exist shared across all the functions.
120 *
121 * To handle both of these resources, what we end up doing is going through and
122 * determining which functions belong to the same device. Nominally one might do
123 * this by having a nexus driver; however, a prime requirement for a nexus
124 * driver is identifying the various children and activating them. While it is
125 * possible to get this information from NVRAM, we would end up duplicating a
126 * lot of the PCI enumeration logic. Really, at the end of the day, the device
127 * doesn't give us the traditional identification properties we want from a
128 * nexus driver.
129 *
130 * Instead, we rely on some properties that are guaranteed to be unique. While
131 * it might be tempting to leverage the PBA or serial number of the device from
132 * NVRAM, there is nothing that says that two devices can't be mis-programmed to
133 * have the same values in NVRAM. Instead, we uniquely identify a group of
 * functions based on their parent in the /devices tree, their PCI bus and PCI
 * device identifiers. Using any of these on its own may not be sufficient.
136 *
137 * For each unique PCI device that we encounter, we'll create a i40e_device_t.
138 * From there, because we don't have a good way to tell the GLDv3 about sharing
139 * resources between everything, we'll end up just dividing the resources
140 * evenly between all of the functions. Longer term, if we don't have to declare
141 * to the GLDv3 that these resources are shared, then we'll maintain a pool and
142 * have each PF allocate from the pool in the device, thus if only two of four
143 * ports are being used, for example, then all of the resources can still be
144 * used.
145 *
146 * -------------------------------------------
147 * Transmit and Receive Queue Pair Allocations
148 * -------------------------------------------
149 *
150 * NVRAM ends up assigning each PF its own share of the transmit and receive LAN
 * queue pairs; we have no way of modifying it, only observing it. From there,
152 * it's up to us to map these queues to VSIs and VFs. Since we don't support any
153 * VFs at this time, we only focus on assignments to VSIs.
154 *
 * At the moment, we use a static mapping of transmit/receive queue pairs to a
156 * given VSI (eg. rings to a group). Though in the fullness of time, we want to
157 * make this something which is fully dynamic and take advantage of documented,
158 * but not yet available functionality for adding filters based on VXLAN and
159 * other encapsulation technologies.
160 *
161 * -------------------------------------
162 * Broadcast, Multicast, and Promiscuous
163 * -------------------------------------
164 *
165 * As part of the GLDv3, we need to make sure that we can handle receiving
 * broadcast and multicast traffic, as well as enabling promiscuous mode when
167 * requested. GLDv3 requires that all broadcast and multicast traffic be
168 * retrieved by the default group, eg. the first one. This is the same thing as
169 * the default VSI.
170 *
 * To receive broadcast traffic, we enable it through the admin queue, rather
172 * than use one of our filters for it. For multicast traffic, we reserve a
173 * certain number of the hash filters and assign them to a given PF. When we
174 * exceed those, we then switch to using promiscuous mode for multicast traffic.
175 *
176 * More specifically, once we exceed the number of filters (indicated because
177 * the i40e_t`i40e_resources.ifr_nmcastfilt ==
178 * i40e_t`i40e_resources.ifr_nmcastfilt_used), we then instead need to toggle
179 * promiscuous mode. If promiscuous mode is toggled then we keep track of the
180 * number of MACs added to it by incrementing i40e_t`i40e_mcast_promisc_count.
181 * That will stay enabled until that count reaches zero indicating that we have
182 * only added multicast addresses that we have a corresponding entry for.
183 *
184 * Because MAC itself wants to toggle promiscuous mode, which includes both
185 * unicast and multicast traffic, we go through and keep track of that
186 * ourselves. That is maintained through the use of the i40e_t`i40e_promisc_on
187 * member.
188 *
189 * --------------
190 * VSI Management
191 * --------------
192 *
193 * The PFs share 384 VSIs. The firmware creates one VSI per PF by default.
194 * During chip start we retrieve the SEID of this VSI and assign it as the
195 * default VSI for our VEB (one VEB per PF). We then add additional VSIs to
196 * the VEB up to the determined number of rx groups: i40e_t`i40e_num_rx_groups.
197 * We currently cap this number to I40E_GROUP_MAX to a) make sure all PFs can
198 * allocate the same number of VSIs, and b) to keep the interrupt multiplexing
199 * under control. In the future, when we improve the interrupt allocation, we
200 * may want to revisit this cap to make better use of the available VSIs. The
201 * VSI allocation and configuration can be found in i40e_chip_start().
202 *
203 * ----------------
204 * Structure Layout
205 * ----------------
206 *
 * The following image relates the core data structures together. The primary
208 * structure in the system is the i40e_t. It itself contains multiple rings,
209 * i40e_trqpair_t's which contain the various transmit and receive data. The
210 * receive data is stored outside of the i40e_trqpair_t and instead in the
211 * i40e_rx_data_t. The i40e_t has a corresponding i40e_device_t which keeps
212 * track of per-physical device state. Finally, for every active descriptor,
213 * there is a corresponding control block, which is where the
214 * i40e_rx_control_block_t and the i40e_tx_control_block_t come from.
215 *
216 * +-----------------------+ +-----------------------+
217 * | Global i40e_t list | | Global Device list |
218 * | | +--| |
219 * | i40e_glist | | | i40e_dlist |
220 * +-----------------------+ | +-----------------------+
221 * | v
222 * | +------------------------+ +-----------------------+
223 * | | Device-wide Structure |----->| Device-wide Structure |--> ...
224 * | | i40e_device_t | | i40e_device_t |
225 * | | | +-----------------------+
226 * | | dev_info_t * ------+--> Parent in devices tree.
227 * | | uint_t ------+--> PCI bus number
228 * | | uint_t ------+--> PCI device number
229 * | | uint_t ------+--> Number of functions
230 * | | i40e_switch_rsrcs_t ---+--> Captured total switch resources
231 * | | list_t ------+-------------+
232 * | +------------------------+ |
233 * | ^ |
234 * | +--------+ |
235 * | | v
236 * | +---------------------------+ | +-------------------+
237 * +->| GLDv3 Device, per PF |-----|-->| GLDv3 Device (PF) |--> ...
238 * | i40e_t | | | i40e_t |
239 * | **Primary Structure** | | +-------------------+
240 * | | |
241 * | i40e_device_t * --+-----+
242 * | i40e_state_t --+---> Device State
243 * | i40e_hw_t --+---> Intel common code structure
244 * | mac_handle_t --+---> GLDv3 handle to MAC
245 * | ddi_periodic_t --+---> Link activity timer
246 * | i40e_vsi_t * --+---> Array of VSIs
247 * | i40e_func_rsrc_t --+---> Available hardware resources
248 * | i40e_switch_rsrc_t * --+---> Switch resource snapshot
249 * | i40e_sdu --+---> Current MTU
250 * | i40e_frame_max --+---> Current HW frame size
251 * | i40e_uaddr_t * --+---> Array of assigned unicast MACs
252 * | i40e_maddr_t * --+---> Array of assigned multicast MACs
253 * | i40e_mcast_promisccount --+---> Active multicast state
254 * | i40e_promisc_on --+---> Current promiscuous mode state
255 * | uint_t --+---> Number of transmit/receive pairs
256 * | i40e_rx_group_t * --+---> Array of Rx groups
257 * | kstat_t * --+---> PF kstats
258 * | i40e_pf_stats_t --+---> PF kstat backing data
259 * | i40e_trqpair_t * --+---------+
260 * +---------------------------+ |
261 * |
262 * v
263 * +-------------------------------+ +-----------------------------+
264 * | Transmit/Receive Queue Pair |-------| Transmit/Receive Queue Pair |->...
265 * | i40e_trqpair_t | | i40e_trqpair_t |
266 * + Ring Data Structure | +-----------------------------+
267 * | |
268 * | mac_ring_handle_t +--> MAC RX ring handle
269 * | mac_ring_handle_t +--> MAC TX ring handle
270 * | i40e_rxq_stat_t --+--> RX Queue stats
271 * | i40e_txq_stat_t --+--> TX Queue stats
272 * | uint32_t (tx ring size) +--> TX Ring Size
273 * | uint32_t (tx free list size) +--> TX Free List Size
274 * | i40e_dma_buffer_t --------+--> TX Descriptor ring DMA
275 * | i40e_tx_desc_t * --------+--> TX descriptor ring
 * | volatile uint32_t * +--> TX Write back head
277 * | uint32_t -------+--> TX ring head
278 * | uint32_t -------+--> TX ring tail
279 * | uint32_t -------+--> Num TX desc free
280 * | i40e_tx_control_block_t * --+--> TX control block array ---+
281 * | i40e_tx_control_block_t ** --+--> TCB work list ----+
282 * | i40e_tx_control_block_t ** --+--> TCB free list ---+
283 * | uint32_t -------+--> Free TCB count |
284 * | i40e_rx_data_t * -------+--+ v
285 * +-------------------------------+ | +---------------------------+
286 * | | Per-TX Frame Metadata |
287 * | | i40e_tx_control_block_t |
288 * +--------------------+ | |
289 * | mblk to transmit <--+--- mblk_t * |
290 * | type of transmit <--+--- i40e_tx_type_t |
291 * | TX DMA handle <--+--- ddi_dma_handle_t |
292 * v TX DMA buffer <--+--- i40e_dma_buffer_t |
293 * +------------------------------+ +---------------------------+
294 * | Core Receive Data |
295 * | i40e_rx_data_t |
296 * | |
297 * | i40e_dma_buffer_t --+--> RX descriptor DMA Data
298 * | i40e_rx_desc_t --+--> RX descriptor ring
299 * | uint32_t --+--> Next free desc.
300 * | i40e_rx_control_block_t * --+--> RX Control Block Array ---+
301 * | i40e_rx_control_block_t ** --+--> RCB work list ---+
302 * | i40e_rx_control_block_t ** --+--> RCB free list ---+
303 * +------------------------------+ |
304 * ^ |
305 * | +---------------------------+ |
306 * | | Per-RX Frame Metadata |<---------------+
307 * | | i40e_rx_control_block_t |
308 * | | |
309 * | | mblk_t * ----+--> Received mblk_t data
310 * | | uint32_t ----+--> Reference count
311 * | | i40e_dma_buffer_t ----+--> Receive data DMA info
312 * | | frtn_t ----+--> mblk free function info
313 * +-----+-- i40e_rx_data_t * |
314 * +---------------------------+
315 *
316 * -------------
317 * Lock Ordering
318 * -------------
319 *
320 * In order to ensure that we don't deadlock, the following represents the
321 * lock order being used. When grabbing locks, follow the following order. Lower
322 * numbers are more important. Thus, the i40e_glock which is number 0, must be
323 * taken before any other locks in the driver. On the other hand, the
324 * i40e_t`i40e_stat_lock, has the highest number because it's the least
325 * important lock. Note, that just because one lock is higher than another does
326 * not mean that all intermediary locks are required.
327 *
328 * 0) i40e_glock
329 * 1) i40e_t`i40e_general_lock
330 *
331 * 2) i40e_trqpair_t`itrq_rx_lock
332 * 3) i40e_trqpair_t`itrq_tx_lock
333 * 4) i40e_trqpair_t`itrq_intr_lock
334 * 5) i40e_t`i40e_rx_pending_lock
335 * 6) i40e_trqpair_t`itrq_tcb_lock
336 *
337 * 7) i40e_t`i40e_stat_lock
338 *
339 * Rules and expectations:
340 *
 * 1) A thread holding locks belonging to one PF should not hold locks
 * belonging to a second. If for some reason this becomes necessary, locks
 * should be grabbed based on the list order in the i40e_device_t, which
 * implies that the i40e_glock is held.
345 *
346 * 2) When grabbing locks between multiple transmit and receive queues, the
347 * locks for the lowest number transmit/receive queue should be grabbed first.
348 *
349 * 3) When grabbing both the transmit and receive lock for a given queue, always
350 * grab i40e_trqpair_t`itrq_rx_lock before the i40e_trqpair_t`itrq_tx_lock.
351 *
352 * 4) The following pairs of locks are not expected to be held at the same time:
353 *
354 * o i40e_t`i40e_rx_pending_lock and i40e_trqpair_t`itrq_tcb_lock
355 * o i40e_trqpair_t`itrq_intr_lock is not expected to be held with any
356 * other lock except i40e_t`i40e_general_lock in mc_start(9E) and
 * mc_stop(9E).
358 *
359 * -----------
360 * Future Work
361 * -----------
362 *
 * At the moment the i40e driver is rather bare bones, allowing us to start
364 * getting data flowing and folks using it while we develop additional features.
365 * While bugs have been filed to cover this future work, the following gives an
366 * overview of expected work:
367 *
368 * o DMA binding and breaking up the locking in ring recycling.
369 * o Enhanced detection of device errors
370 * o Participation in IRM
371 * o FMA device reset
372 * o Stall detection, temperature error detection, etc.
373 * o More dynamic resource pools
374 */
375
376 #include "i40e_sw.h"
377
/*
 * Description and version string for the driver.
 */
static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.3";

/*
 * The i40e_glock primarily protects the lists below and the i40e_device_t
 * structures. i40e_glist is the global list of i40e_t instances and
 * i40e_dlist is the global list of i40e_device_t structures.
 */
static kmutex_t i40e_glock;
static list_t i40e_glist;
static list_t i40e_dlist;
387
388 /*
389 * Access attributes for register mapping.
390 */
391 static ddi_device_acc_attr_t i40e_regs_acc_attr = {
392 DDI_DEVICE_ATTR_V1,
393 DDI_STRUCTURE_LE_ACC,
394 DDI_STRICTORDER_ACC,
395 DDI_FLAGERR_ACC
396 };
397
398 /*
399 * Logging function for this driver.
400 */
401 static void
402 i40e_dev_err(i40e_t *i40e, int level, boolean_t console, const char *fmt,
403 va_list ap)
404 {
405 char buf[1024];
406
407 (void) vsnprintf(buf, sizeof (buf), fmt, ap);
408
409 if (i40e == NULL) {
410 cmn_err(level, (console) ? "%s: %s" : "!%s: %s",
411 I40E_MODULE_NAME, buf);
412 } else {
413 dev_err(i40e->i40e_dip, level, (console) ? "%s" : "!%s",
414 buf);
415 }
416 }
417
418 /*
419 * Because there's the stupid trailing-comma problem with the C preprocessor
420 * and variable arguments, I need to instantiate these. Pardon the redundant
421 * code.
422 */
423 /*PRINTFLIKE2*/
424 void
425 i40e_error(i40e_t *i40e, const char *fmt, ...)
426 {
427 va_list ap;
428
429 va_start(ap, fmt);
430 i40e_dev_err(i40e, CE_WARN, B_FALSE, fmt, ap);
431 va_end(ap);
432 }
433
434 /*PRINTFLIKE2*/
435 void
436 i40e_log(i40e_t *i40e, const char *fmt, ...)
437 {
438 va_list ap;
439
440 va_start(ap, fmt);
441 i40e_dev_err(i40e, CE_NOTE, B_FALSE, fmt, ap);
442 va_end(ap);
443 }
444
445 /*PRINTFLIKE2*/
446 void
447 i40e_notice(i40e_t *i40e, const char *fmt, ...)
448 {
449 va_list ap;
450
451 va_start(ap, fmt);
452 i40e_dev_err(i40e, CE_NOTE, B_TRUE, fmt, ap);
453 va_end(ap);
454 }
455
456 /*
457 * Various parts of the driver need to know if the controller is from the X722
458 * family, which has a few additional capabilities and different programming
459 * means. We don't consider virtual functions as part of this as they are quite
460 * different and will require substantially more work.
461 */
462 static boolean_t
463 i40e_is_x722(i40e_t *i40e)
464 {
465 return (i40e->i40e_hw_space.mac.type == I40E_MAC_X722);
466 }
467
468 static void
469 i40e_device_rele(i40e_t *i40e)
470 {
471 i40e_device_t *idp = i40e->i40e_device;
472
473 if (idp == NULL)
474 return;
475
476 mutex_enter(&i40e_glock);
477 VERIFY(idp->id_nreg > 0);
478 list_remove(&idp->id_i40e_list, i40e);
479 idp->id_nreg--;
480 if (idp->id_nreg == 0) {
481 list_remove(&i40e_dlist, idp);
482 list_destroy(&idp->id_i40e_list);
483 kmem_free(idp->id_rsrcs, sizeof (i40e_switch_rsrc_t) *
484 idp->id_rsrcs_alloc);
485 kmem_free(idp, sizeof (i40e_device_t));
486 }
487 i40e->i40e_device = NULL;
488 mutex_exit(&i40e_glock);
489 }
490
491 static i40e_device_t *
492 i40e_device_find(i40e_t *i40e, dev_info_t *parent, uint_t bus, uint_t device)
493 {
494 i40e_device_t *idp;
495 mutex_enter(&i40e_glock);
496 for (idp = list_head(&i40e_dlist); idp != NULL;
497 idp = list_next(&i40e_dlist, idp)) {
498 if (idp->id_parent == parent && idp->id_pci_bus == bus &&
499 idp->id_pci_device == device) {
500 break;
501 }
502 }
503
504 if (idp != NULL) {
505 VERIFY(idp->id_nreg < idp->id_nfuncs);
506 idp->id_nreg++;
507 } else {
508 i40e_hw_t *hw = &i40e->i40e_hw_space;
509 ASSERT(hw->num_ports > 0);
510 ASSERT(hw->num_partitions > 0);
511
512 /*
513 * The Intel common code doesn't exactly keep the number of PCI
514 * functions. But it calculates it during discovery of
515 * partitions and ports. So what we do is undo the calculation
516 * that it does originally, as functions are evenly spread
517 * across ports in the rare case of partitions.
518 */
519 idp = kmem_alloc(sizeof (i40e_device_t), KM_SLEEP);
520 idp->id_parent = parent;
521 idp->id_pci_bus = bus;
522 idp->id_pci_device = device;
523 idp->id_nfuncs = hw->num_ports * hw->num_partitions;
524 idp->id_nreg = 1;
525 idp->id_rsrcs_alloc = i40e->i40e_switch_rsrc_alloc;
526 idp->id_rsrcs_act = i40e->i40e_switch_rsrc_actual;
527 idp->id_rsrcs = kmem_alloc(sizeof (i40e_switch_rsrc_t) *
528 idp->id_rsrcs_alloc, KM_SLEEP);
529 bcopy(i40e->i40e_switch_rsrcs, idp->id_rsrcs,
530 sizeof (i40e_switch_rsrc_t) * idp->id_rsrcs_alloc);
531 list_create(&idp->id_i40e_list, sizeof (i40e_t),
532 offsetof(i40e_t, i40e_dlink));
533
534 list_insert_tail(&i40e_dlist, idp);
535 }
536
537 list_insert_tail(&idp->id_i40e_list, i40e);
538 mutex_exit(&i40e_glock);
539
540 return (idp);
541 }
542
543 static void
544 i40e_link_state_set(i40e_t *i40e, link_state_t state)
545 {
546 if (i40e->i40e_link_state == state)
547 return;
548
549 i40e->i40e_link_state = state;
550 mac_link_update(i40e->i40e_mac_hdl, i40e->i40e_link_state);
551 }
552
553 /*
554 * This is a basic link check routine. Mostly we're using this just to see
555 * if we can get any accurate information about the state of the link being
556 * up or down, as well as updating the link state, speed, etc. information.
557 */
558 void
559 i40e_link_check(i40e_t *i40e)
560 {
561 i40e_hw_t *hw = &i40e->i40e_hw_space;
562 boolean_t ls;
563 int ret;
564
565 ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
566
567 hw->phy.get_link_info = B_TRUE;
568 if ((ret = i40e_get_link_status(hw, &ls)) != I40E_SUCCESS) {
569 i40e->i40e_s_link_status_errs++;
570 i40e->i40e_s_link_status_lasterr = ret;
571 return;
572 }
573
574 /*
575 * Firmware abstracts all of the mac and phy information for us, so we
576 * can use i40e_get_link_status to determine the current state.
577 */
578 if (ls == B_TRUE) {
579 enum i40e_aq_link_speed speed;
580
581 speed = i40e_get_link_speed(hw);
582
583 /*
584 * Translate from an i40e value to a value in Mbits/s.
585 */
586 switch (speed) {
587 case I40E_LINK_SPEED_100MB:
588 i40e->i40e_link_speed = 100;
589 break;
590 case I40E_LINK_SPEED_1GB:
591 i40e->i40e_link_speed = 1000;
592 break;
593 case I40E_LINK_SPEED_10GB:
594 i40e->i40e_link_speed = 10000;
595 break;
596 case I40E_LINK_SPEED_20GB:
597 i40e->i40e_link_speed = 20000;
598 break;
599 case I40E_LINK_SPEED_40GB:
600 i40e->i40e_link_speed = 40000;
601 break;
602 case I40E_LINK_SPEED_25GB:
603 i40e->i40e_link_speed = 25000;
604 break;
605 default:
606 i40e->i40e_link_speed = 0;
607 break;
608 }
609
610 /*
611 * At this time, hardware does not support half-duplex
612 * operation, hence why we don't ask the hardware about our
613 * current speed.
614 */
615 i40e->i40e_link_duplex = LINK_DUPLEX_FULL;
616 i40e_link_state_set(i40e, LINK_STATE_UP);
617 } else {
618 i40e->i40e_link_speed = 0;
619 i40e->i40e_link_duplex = 0;
620 i40e_link_state_set(i40e, LINK_STATE_DOWN);
621 }
622 }
623
624 static void
625 i40e_rem_intrs(i40e_t *i40e)
626 {
627 int i, rc;
628
629 for (i = 0; i < i40e->i40e_intr_count; i++) {
630 rc = ddi_intr_free(i40e->i40e_intr_handles[i]);
631 if (rc != DDI_SUCCESS) {
632 i40e_log(i40e, "failed to free interrupt %d: %d",
633 i, rc);
634 }
635 }
636
637 kmem_free(i40e->i40e_intr_handles, i40e->i40e_intr_size);
638 i40e->i40e_intr_handles = NULL;
639 }
640
641 static void
642 i40e_rem_intr_handlers(i40e_t *i40e)
643 {
644 int i, rc;
645
646 for (i = 0; i < i40e->i40e_intr_count; i++) {
647 rc = ddi_intr_remove_handler(i40e->i40e_intr_handles[i]);
648 if (rc != DDI_SUCCESS) {
649 i40e_log(i40e, "failed to remove interrupt %d: %d",
650 i, rc);
651 }
652 }
653 }
654
655 /*
656 * illumos Fault Management Architecture (FMA) support.
657 */
658
659 int
660 i40e_check_acc_handle(ddi_acc_handle_t handle)
661 {
662 ddi_fm_error_t de;
663
664 ddi_fm_acc_err_get(handle, &de, DDI_FME_VERSION);
665 ddi_fm_acc_err_clear(handle, DDI_FME_VERSION);
666 return (de.fme_status);
667 }
668
669 int
670 i40e_check_dma_handle(ddi_dma_handle_t handle)
671 {
672 ddi_fm_error_t de;
673
674 ddi_fm_dma_err_get(handle, &de, DDI_FME_VERSION);
675 return (de.fme_status);
676 }
677
678 /*
679 * Fault service error handling callback function.
680 */
681 /* ARGSUSED */
682 static int
683 i40e_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data)
684 {
685 pci_ereport_post(dip, err, NULL);
686 return (err->fme_status);
687 }
688
689 static void
690 i40e_fm_init(i40e_t *i40e)
691 {
692 ddi_iblock_cookie_t iblk;
693
694 i40e->i40e_fm_capabilities = ddi_prop_get_int(DDI_DEV_T_ANY,
695 i40e->i40e_dip, DDI_PROP_DONTPASS, "fm_capable",
696 DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
697 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE);
698
699 if (i40e->i40e_fm_capabilities < 0) {
700 i40e->i40e_fm_capabilities = 0;
701 } else if (i40e->i40e_fm_capabilities > 0xf) {
702 i40e->i40e_fm_capabilities = DDI_FM_EREPORT_CAPABLE |
703 DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE |
704 DDI_FM_ERRCB_CAPABLE;
705 }
706
707 /*
708 * Only register with IO Fault Services if we have some capability
709 */
710 if (i40e->i40e_fm_capabilities & DDI_FM_ACCCHK_CAPABLE) {
711 i40e_regs_acc_attr.devacc_attr_access = DDI_FLAGERR_ACC;
712 } else {
713 i40e_regs_acc_attr.devacc_attr_access = DDI_DEFAULT_ACC;
714 }
715
716 if (i40e->i40e_fm_capabilities) {
717 ddi_fm_init(i40e->i40e_dip, &i40e->i40e_fm_capabilities, &iblk);
718
719 if (DDI_FM_EREPORT_CAP(i40e->i40e_fm_capabilities) ||
720 DDI_FM_ERRCB_CAP(i40e->i40e_fm_capabilities)) {
721 pci_ereport_setup(i40e->i40e_dip);
722 }
723
724 if (DDI_FM_ERRCB_CAP(i40e->i40e_fm_capabilities)) {
725 ddi_fm_handler_register(i40e->i40e_dip,
726 i40e_fm_error_cb, (void*)i40e);
727 }
728 }
729
730 if (i40e->i40e_fm_capabilities & DDI_FM_DMACHK_CAPABLE) {
731 i40e_init_dma_attrs(i40e, B_TRUE);
732 } else {
733 i40e_init_dma_attrs(i40e, B_FALSE);
734 }
735 }
736
737 static void
738 i40e_fm_fini(i40e_t *i40e)
739 {
740 if (i40e->i40e_fm_capabilities) {
741
742 if (DDI_FM_EREPORT_CAP(i40e->i40e_fm_capabilities) ||
743 DDI_FM_ERRCB_CAP(i40e->i40e_fm_capabilities))
744 pci_ereport_teardown(i40e->i40e_dip);
745
746 if (DDI_FM_ERRCB_CAP(i40e->i40e_fm_capabilities))
747 ddi_fm_handler_unregister(i40e->i40e_dip);
748
749 ddi_fm_fini(i40e->i40e_dip);
750 }
751 }
752
753 void
754 i40e_fm_ereport(i40e_t *i40e, char *detail)
755 {
756 uint64_t ena;
757 char buf[FM_MAX_CLASS];
758
759 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
760 ena = fm_ena_generate(0, FM_ENA_FMT1);
761 if (DDI_FM_EREPORT_CAP(i40e->i40e_fm_capabilities)) {
762 ddi_fm_ereport_post(i40e->i40e_dip, buf, ena, DDI_NOSLEEP,
763 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, NULL);
764 }
765 }
766
767 /*
768 * Here we're trying to set the SEID of the default VSI. In general,
769 * when we come through and look at this shortly after attach, we
770 * expect there to only be a single element present, which is the
771 * default VSI. Importantly, each PF seems to not see any other
772 * devices, in part because of the simple switch mode that we're
773 * using. If for some reason, we see more artifacts, we'll need to
774 * revisit what we're doing here.
775 */
776 static boolean_t
777 i40e_set_def_vsi_seid(i40e_t *i40e)
778 {
779 i40e_hw_t *hw = &i40e->i40e_hw_space;
780 struct i40e_aqc_get_switch_config_resp *sw_config;
781 uint8_t aq_buf[I40E_AQ_LARGE_BUF];
782 uint16_t next = 0;
783 int rc;
784
785 /* LINTED: E_BAD_PTR_CAST_ALIGN */
786 sw_config = (struct i40e_aqc_get_switch_config_resp *)aq_buf;
787 rc = i40e_aq_get_switch_config(hw, sw_config, sizeof (aq_buf), &next,
788 NULL);
789 if (rc != I40E_SUCCESS) {
790 i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d",
791 rc, hw->aq.asq_last_status);
792 return (B_FALSE);
793 }
794
795 if (LE_16(sw_config->header.num_reported) != 1) {
796 i40e_error(i40e, "encountered multiple (%d) switching units "
797 "during attach, not proceeding",
798 LE_16(sw_config->header.num_reported));
799 return (B_FALSE);
800 }
801
802 I40E_DEF_VSI_SEID(i40e) = sw_config->element[0].seid;
803 return (B_TRUE);
804 }
805
806 /*
807 * Get the SEID of the uplink MAC.
808 */
809 static int
810 i40e_get_mac_seid(i40e_t *i40e)
811 {
812 i40e_hw_t *hw = &i40e->i40e_hw_space;
813 struct i40e_aqc_get_switch_config_resp *sw_config;
814 uint8_t aq_buf[I40E_AQ_LARGE_BUF];
815 uint16_t next = 0;
816 int rc;
817
818 /* LINTED: E_BAD_PTR_CAST_ALIGN */
819 sw_config = (struct i40e_aqc_get_switch_config_resp *)aq_buf;
820 rc = i40e_aq_get_switch_config(hw, sw_config, sizeof (aq_buf), &next,
821 NULL);
822 if (rc != I40E_SUCCESS) {
823 i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d",
824 rc, hw->aq.asq_last_status);
825 return (-1);
826 }
827
828 return (LE_16(sw_config->element[0].uplink_seid));
829 }
830
831 /*
832 * We need to fill the i40e_hw_t structure with the capabilities of this PF. We
833 * must also provide the memory for it; however, we don't need to keep it around
834 * to the call to the common code. It takes it and parses it into an internal
835 * structure.
836 */
837 static boolean_t
838 i40e_get_hw_capabilities(i40e_t *i40e, i40e_hw_t *hw)
839 {
840 struct i40e_aqc_list_capabilities_element_resp *buf;
841 int rc;
842 size_t len;
843 uint16_t needed;
844 int nelems = I40E_HW_CAP_DEFAULT;
845
846 len = nelems * sizeof (*buf);
847
848 for (;;) {
849 ASSERT(len > 0);
850 buf = kmem_alloc(len, KM_SLEEP);
851 rc = i40e_aq_discover_capabilities(hw, buf, len,
852 &needed, i40e_aqc_opc_list_func_capabilities, NULL);
853 kmem_free(buf, len);
854
855 if (hw->aq.asq_last_status == I40E_AQ_RC_ENOMEM &&
856 nelems == I40E_HW_CAP_DEFAULT) {
857 if (nelems == needed) {
858 i40e_error(i40e, "Capability discovery failed "
859 "due to byzantine common code");
860 return (B_FALSE);
861 }
862 len = needed;
863 continue;
864 } else if (rc != I40E_SUCCESS ||
865 hw->aq.asq_last_status != I40E_AQ_RC_OK) {
866 i40e_error(i40e, "Capability discovery failed: %d", rc);
867 return (B_FALSE);
868 }
869
870 break;
871 }
872
873 return (B_TRUE);
874 }
875
876 /*
877 * Obtain the switch's capabilities as seen by this PF and keep it around for
878 * our later use.
879 */
880 static boolean_t
881 i40e_get_switch_resources(i40e_t *i40e)
882 {
883 i40e_hw_t *hw = &i40e->i40e_hw_space;
884 uint8_t cnt = 2;
885 uint8_t act;
886 size_t size;
887 i40e_switch_rsrc_t *buf;
888
889 for (;;) {
890 enum i40e_status_code ret;
891 size = cnt * sizeof (i40e_switch_rsrc_t);
892 ASSERT(size > 0);
893 if (size > UINT16_MAX)
894 return (B_FALSE);
895 buf = kmem_alloc(size, KM_SLEEP);
896
897 ret = i40e_aq_get_switch_resource_alloc(hw, &act, buf,
898 cnt, NULL);
899 if (ret == I40E_ERR_ADMIN_QUEUE_ERROR &&
900 hw->aq.asq_last_status == I40E_AQ_RC_EINVAL) {
901 kmem_free(buf, size);
902 cnt += I40E_SWITCH_CAP_DEFAULT;
903 continue;
904 } else if (ret != I40E_SUCCESS) {
905 kmem_free(buf, size);
906 i40e_error(i40e,
907 "failed to retrieve switch statistics: %d", ret);
908 return (B_FALSE);
909 }
910
911 break;
912 }
913
914 i40e->i40e_switch_rsrc_alloc = cnt;
915 i40e->i40e_switch_rsrc_actual = act;
916 i40e->i40e_switch_rsrcs = buf;
917
918 return (B_TRUE);
919 }
920
921 static void
922 i40e_cleanup_resources(i40e_t *i40e)
923 {
924 if (i40e->i40e_uaddrs != NULL) {
925 kmem_free(i40e->i40e_uaddrs, sizeof (i40e_uaddr_t) *
926 i40e->i40e_resources.ifr_nmacfilt);
927 i40e->i40e_uaddrs = NULL;
928 }
929
930 if (i40e->i40e_maddrs != NULL) {
931 kmem_free(i40e->i40e_maddrs, sizeof (i40e_maddr_t) *
932 i40e->i40e_resources.ifr_nmcastfilt);
933 i40e->i40e_maddrs = NULL;
934 }
935
936 if (i40e->i40e_switch_rsrcs != NULL) {
937 size_t sz = sizeof (i40e_switch_rsrc_t) *
938 i40e->i40e_switch_rsrc_alloc;
939 ASSERT(sz > 0);
940 kmem_free(i40e->i40e_switch_rsrcs, sz);
941 i40e->i40e_switch_rsrcs = NULL;
942 }
943
944 if (i40e->i40e_device != NULL)
945 i40e_device_rele(i40e);
946 }
947
948 static boolean_t
949 i40e_get_available_resources(i40e_t *i40e)
950 {
951 dev_info_t *parent;
952 uint16_t bus, device, func;
953 uint_t nregs;
954 int *regs, i;
955 i40e_device_t *idp;
956 i40e_hw_t *hw = &i40e->i40e_hw_space;
957
958 parent = ddi_get_parent(i40e->i40e_dip);
959
960 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, i40e->i40e_dip, 0, "reg",
961 ®s, &nregs) != DDI_PROP_SUCCESS) {
962 return (B_FALSE);
963 }
964
965 if (nregs < 1) {
966 ddi_prop_free(regs);
967 return (B_FALSE);
968 }
969
970 bus = PCI_REG_BUS_G(regs[0]);
971 device = PCI_REG_DEV_G(regs[0]);
972 func = PCI_REG_FUNC_G(regs[0]);
973 ddi_prop_free(regs);
974
975 i40e->i40e_hw_space.bus.func = func;
976 i40e->i40e_hw_space.bus.device = device;
977
978 if (i40e_get_switch_resources(i40e) == B_FALSE) {
979 return (B_FALSE);
980 }
981
982 /*
983 * To calculate the total amount of a resource we have available, we
984 * need to add how many our i40e_t thinks it has guaranteed, if any, and
985 * then we need to go through and divide the number of available on the
986 * device, which was snapshotted before anyone should have allocated
987 * anything, and use that to derive how many are available from the
988 * pool. Longer term, we may want to turn this into something that's
989 * more of a pool-like resource that everything can share (though that
990 * may require some more assistance from MAC).
991 *
992 * Though for transmit and receive queue pairs, we just have to ask
993 * firmware instead.
994 */
995 idp = i40e_device_find(i40e, parent, bus, device);
996 i40e->i40e_device = idp;
997 i40e->i40e_resources.ifr_nvsis = 0;
998 i40e->i40e_resources.ifr_nvsis_used = 0;
999 i40e->i40e_resources.ifr_nmacfilt = 0;
1000 i40e->i40e_resources.ifr_nmacfilt_used = 0;
1001 i40e->i40e_resources.ifr_nmcastfilt = 0;
1002 i40e->i40e_resources.ifr_nmcastfilt_used = 0;
1003
1004 for (i = 0; i < i40e->i40e_switch_rsrc_actual; i++) {
1005 i40e_switch_rsrc_t *srp = &i40e->i40e_switch_rsrcs[i];
1006
1007 switch (srp->resource_type) {
1008 case I40E_AQ_RESOURCE_TYPE_VSI:
1009 i40e->i40e_resources.ifr_nvsis +=
1010 LE_16(srp->guaranteed);
1011 i40e->i40e_resources.ifr_nvsis_used = LE_16(srp->used);
1012 break;
1013 case I40E_AQ_RESOURCE_TYPE_MACADDR:
1014 i40e->i40e_resources.ifr_nmacfilt +=
1015 LE_16(srp->guaranteed);
1016 i40e->i40e_resources.ifr_nmacfilt_used =
1017 LE_16(srp->used);
1018 break;
1019 case I40E_AQ_RESOURCE_TYPE_MULTICAST_HASH:
1020 i40e->i40e_resources.ifr_nmcastfilt +=
1021 LE_16(srp->guaranteed);
1022 i40e->i40e_resources.ifr_nmcastfilt_used =
1023 LE_16(srp->used);
1024 break;
1025 default:
1026 break;
1027 }
1028 }
1029
1030 for (i = 0; i < idp->id_rsrcs_act; i++) {
1031 i40e_switch_rsrc_t *srp = &i40e->i40e_switch_rsrcs[i];
1032 switch (srp->resource_type) {
1033 case I40E_AQ_RESOURCE_TYPE_VSI:
1034 i40e->i40e_resources.ifr_nvsis +=
1035 LE_16(srp->total_unalloced) / idp->id_nfuncs;
1036 break;
1037 case I40E_AQ_RESOURCE_TYPE_MACADDR:
1038 i40e->i40e_resources.ifr_nmacfilt +=
1039 LE_16(srp->total_unalloced) / idp->id_nfuncs;
1040 break;
1041 case I40E_AQ_RESOURCE_TYPE_MULTICAST_HASH:
1042 i40e->i40e_resources.ifr_nmcastfilt +=
1043 LE_16(srp->total_unalloced) / idp->id_nfuncs;
1044 default:
1045 break;
1046 }
1047 }
1048
1049 i40e->i40e_resources.ifr_nrx_queue = hw->func_caps.num_rx_qp;
1050 i40e->i40e_resources.ifr_ntx_queue = hw->func_caps.num_tx_qp;
1051
1052 i40e->i40e_uaddrs = kmem_zalloc(sizeof (i40e_uaddr_t) *
1053 i40e->i40e_resources.ifr_nmacfilt, KM_SLEEP);
1054 i40e->i40e_maddrs = kmem_zalloc(sizeof (i40e_maddr_t) *
1055 i40e->i40e_resources.ifr_nmcastfilt, KM_SLEEP);
1056
1057 /*
1058 * Initialize these as multicast addresses to indicate it's invalid for
1059 * sanity purposes. Think of it like 0xdeadbeef.
1060 */
1061 for (i = 0; i < i40e->i40e_resources.ifr_nmacfilt; i++)
1062 i40e->i40e_uaddrs[i].iua_mac[0] = 0x01;
1063
1064 return (B_TRUE);
1065 }
1066
1067 static boolean_t
1068 i40e_enable_interrupts(i40e_t *i40e)
1069 {
1070 int i, rc;
1071
1072 if (i40e->i40e_intr_cap & DDI_INTR_FLAG_BLOCK) {
1073 rc = ddi_intr_block_enable(i40e->i40e_intr_handles,
1074 i40e->i40e_intr_count);
1075 if (rc != DDI_SUCCESS) {
1076 i40e_error(i40e, "Interrupt block-enable failed: %d",
1077 rc);
1078 return (B_FALSE);
1079 }
1080 } else {
1081 for (i = 0; i < i40e->i40e_intr_count; i++) {
1082 rc = ddi_intr_enable(i40e->i40e_intr_handles[i]);
1083 if (rc != DDI_SUCCESS) {
1084 i40e_error(i40e,
1085 "Failed to enable interrupt %d: %d", i, rc);
1086 while (--i >= 0) {
1087 (void) ddi_intr_disable(
1088 i40e->i40e_intr_handles[i]);
1089 }
1090 return (B_FALSE);
1091 }
1092 }
1093 }
1094
1095 return (B_TRUE);
1096 }
1097
1098 static boolean_t
1099 i40e_disable_interrupts(i40e_t *i40e)
1100 {
1101 int i, rc;
1102
1103 if (i40e->i40e_intr_cap & DDI_INTR_FLAG_BLOCK) {
1104 rc = ddi_intr_block_disable(i40e->i40e_intr_handles,
1105 i40e->i40e_intr_count);
1106 if (rc != DDI_SUCCESS) {
1107 i40e_error(i40e,
1108 "Interrupt block-disabled failed: %d", rc);
1109 return (B_FALSE);
1110 }
1111 } else {
1112 for (i = 0; i < i40e->i40e_intr_count; i++) {
1113 rc = ddi_intr_disable(i40e->i40e_intr_handles[i]);
1114 if (rc != DDI_SUCCESS) {
1115 i40e_error(i40e,
1116 "Failed to disable interrupt %d: %d",
1117 i, rc);
1118 return (B_FALSE);
1119 }
1120 }
1121 }
1122
1123 return (B_TRUE);
1124 }
1125
1126 /*
1127 * Free receive & transmit rings.
1128 */
1129 static void
1130 i40e_free_trqpairs(i40e_t *i40e)
1131 {
1132 i40e_trqpair_t *itrq;
1133
1134 if (i40e->i40e_rx_groups != NULL) {
1135 kmem_free(i40e->i40e_rx_groups,
1136 sizeof (i40e_rx_group_t) * i40e->i40e_num_rx_groups);
1137 i40e->i40e_rx_groups = NULL;
1138 }
1139
1140 if (i40e->i40e_trqpairs != NULL) {
1141 for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) {
1142 itrq = &i40e->i40e_trqpairs[i];
1143 mutex_destroy(&itrq->itrq_intr_lock);
1144 mutex_destroy(&itrq->itrq_rx_lock);
1145 mutex_destroy(&itrq->itrq_tx_lock);
1146 mutex_destroy(&itrq->itrq_tcb_lock);
1147 cv_destroy(&itrq->itrq_intr_cv);
1148 cv_destroy(&itrq->itrq_tx_cv);
1149
1150 i40e_stats_trqpair_fini(itrq);
1151 }
1152
1153 kmem_free(i40e->i40e_trqpairs,
1154 sizeof (i40e_trqpair_t) * i40e->i40e_num_trqpairs);
1155 i40e->i40e_trqpairs = NULL;
1156 }
1157
1158 cv_destroy(&i40e->i40e_rx_pending_cv);
1159 mutex_destroy(&i40e->i40e_rx_pending_lock);
1160 mutex_destroy(&i40e->i40e_general_lock);
1161 }
1162
1163 /*
1164 * Allocate transmit and receive rings, as well as other data structures that we
1165 * need.
1166 */
1167 static boolean_t
1168 i40e_alloc_trqpairs(i40e_t *i40e)
1169 {
1170 void *mutexpri = DDI_INTR_PRI(i40e->i40e_intr_pri);
1171
1172 /*
1173 * Now that we have the priority for the interrupts, initialize
1174 * all relevant locks.
1175 */
1176 mutex_init(&i40e->i40e_general_lock, NULL, MUTEX_DRIVER, mutexpri);
1177 mutex_init(&i40e->i40e_rx_pending_lock, NULL, MUTEX_DRIVER, mutexpri);
1178 cv_init(&i40e->i40e_rx_pending_cv, NULL, CV_DRIVER, NULL);
1179
1180 i40e->i40e_trqpairs = kmem_zalloc(sizeof (i40e_trqpair_t) *
1181 i40e->i40e_num_trqpairs, KM_SLEEP);
1182 for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) {
1183 i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i];
1184
1185 itrq->itrq_i40e = i40e;
1186 mutex_init(&itrq->itrq_intr_lock, NULL, MUTEX_DRIVER, mutexpri);
1187 mutex_init(&itrq->itrq_rx_lock, NULL, MUTEX_DRIVER, mutexpri);
1188 mutex_init(&itrq->itrq_tx_lock, NULL, MUTEX_DRIVER, mutexpri);
1189 mutex_init(&itrq->itrq_tcb_lock, NULL, MUTEX_DRIVER, mutexpri);
1190 cv_init(&itrq->itrq_intr_cv, NULL, CV_DRIVER, NULL);
1191 cv_init(&itrq->itrq_tx_cv, NULL, CV_DRIVER, NULL);
1192 itrq->itrq_index = i;
1193 itrq->itrq_intr_quiesce = B_TRUE;
1194 itrq->itrq_tx_quiesce = B_TRUE;
1195 }
1196
1197 for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) {
1198 /*
1199 * Keeping this in a separate iteration makes the
1200 * clean up path safe.
1201 */
1202 if (!i40e_stats_trqpair_init(&i40e->i40e_trqpairs[i])) {
1203 i40e_free_trqpairs(i40e);
1204 return (B_FALSE);
1205 }
1206 }
1207
1208 i40e->i40e_rx_groups = kmem_zalloc(sizeof (i40e_rx_group_t) *
1209 i40e->i40e_num_rx_groups, KM_SLEEP);
1210
1211 for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) {
1212 i40e_rx_group_t *rxg = &i40e->i40e_rx_groups[i];
1213
1214 rxg->irg_index = i;
1215 rxg->irg_i40e = i40e;
1216 }
1217
1218 return (B_TRUE);
1219 }
1220
1221
1222
1223 /*
1224 * Unless a .conf file already overrode i40e_t structure values, they will
1225 * be 0, and need to be set in conjunction with the now-available HW report.
1226 */
1227 /* ARGSUSED */
1228 static void
1229 i40e_hw_to_instance(i40e_t *i40e, i40e_hw_t *hw)
1230 {
1231 if (i40e->i40e_num_trqpairs_per_vsi == 0) {
1232 if (i40e_is_x722(i40e)) {
1233 i40e->i40e_num_trqpairs_per_vsi =
1234 I40E_722_MAX_TC_QUEUES;
1235 } else {
1236 i40e->i40e_num_trqpairs_per_vsi =
1237 I40E_710_MAX_TC_QUEUES;
1238 }
1239 }
1240
1241 if (i40e->i40e_num_rx_groups == 0) {
1242 i40e->i40e_num_rx_groups = I40E_DEF_NUM_RX_GROUPS;
1243 }
1244 }
1245
1246 /*
1247 * Free any resources required by, or setup by, the Intel common code.
1248 */
1249 static void
1250 i40e_common_code_fini(i40e_t *i40e)
1251 {
1252 i40e_hw_t *hw = &i40e->i40e_hw_space;
1253 int rc;
1254
1255 rc = i40e_shutdown_lan_hmc(hw);
1256 if (rc != I40E_SUCCESS)
1257 i40e_error(i40e, "failed to shutdown LAN hmc: %d", rc);
1258
1259 rc = i40e_shutdown_adminq(hw);
1260 if (rc != I40E_SUCCESS)
1261 i40e_error(i40e, "failed to shutdown admin queue: %d", rc);
1262 }
1263
1264 /*
1265 * Initialize and call Intel common-code routines, includes some setup
1266 * the common code expects from the driver. Also prints on failure, so
1267 * the caller doesn't have to.
1268 */
1269 static boolean_t
1270 i40e_common_code_init(i40e_t *i40e, i40e_hw_t *hw)
1271 {
1272 int rc;
1273
1274 i40e_clear_hw(hw);
1275 rc = i40e_pf_reset(hw);
1276 if (rc != 0) {
1277 i40e_error(i40e, "failed to reset hardware: %d", rc);
1278 i40e_fm_ereport(i40e, DDI_FM_DEVICE_NO_RESPONSE);
1279 return (B_FALSE);
1280 }
1281
1282 rc = i40e_init_shared_code(hw);
1283 if (rc != 0) {
1284 i40e_error(i40e, "failed to initialize i40e core: %d", rc);
1285 return (B_FALSE);
1286 }
1287
1288 hw->aq.num_arq_entries = I40E_DEF_ADMINQ_SIZE;
1289 hw->aq.num_asq_entries = I40E_DEF_ADMINQ_SIZE;
1290 hw->aq.arq_buf_size = I40E_ADMINQ_BUFSZ;
1291 hw->aq.asq_buf_size = I40E_ADMINQ_BUFSZ;
1292
1293 rc = i40e_init_adminq(hw);
1294 if (rc != 0) {
1295 i40e_error(i40e, "failed to initialize firmware admin queue: "
1296 "%d, potential firmware version mismatch", rc);
1297 i40e_fm_ereport(i40e, DDI_FM_DEVICE_INVAL_STATE);
1298 return (B_FALSE);
1299 }
1300
1301 if (hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR &&
1302 hw->aq.api_min_ver > I40E_FW_MINOR_VERSION(hw)) {
1303 i40e_log(i40e, "The driver for the device detected a newer "
1304 "version of the NVM image (%d.%d) than expected (%d.%d).\n"
1305 "Please install the most recent version of the network "
1306 "driver.\n", hw->aq.api_maj_ver, hw->aq.api_min_ver,
1307 I40E_FW_API_VERSION_MAJOR, I40E_FW_MINOR_VERSION(hw));
1308 } else if (hw->aq.api_maj_ver < I40E_FW_API_VERSION_MAJOR ||
1309 hw->aq.api_min_ver < (I40E_FW_MINOR_VERSION(hw) - 1)) {
1310 i40e_log(i40e, "The driver for the device detected an older"
1311 " version of the NVM image (%d.%d) than expected (%d.%d)."
1312 "\nPlease update the NVM image.\n",
1313 hw->aq.api_maj_ver, hw->aq.api_min_ver,
1314 I40E_FW_API_VERSION_MAJOR, I40E_FW_MINOR_VERSION(hw) - 1);
1315 }
1316
1317 i40e_clear_pxe_mode(hw);
1318
1319 /*
1320 * We need to call this so that the common code can discover
1321 * capabilities of the hardware, which it uses throughout the rest.
1322 */
1323 if (!i40e_get_hw_capabilities(i40e, hw)) {
1324 i40e_error(i40e, "failed to obtain hardware capabilities");
1325 return (B_FALSE);
1326 }
1327
1328 if (i40e_get_available_resources(i40e) == B_FALSE) {
1329 i40e_error(i40e, "failed to obtain hardware resources");
1330 return (B_FALSE);
1331 }
1332
1333 i40e_hw_to_instance(i40e, hw);
1334
1335 rc = i40e_init_lan_hmc(hw, hw->func_caps.num_tx_qp,
1336 hw->func_caps.num_rx_qp, 0, 0);
1337 if (rc != 0) {
1338 i40e_error(i40e, "failed to initialize hardware memory cache: "
1339 "%d", rc);
1340 return (B_FALSE);
1341 }
1342
1343 rc = i40e_configure_lan_hmc(hw, I40E_HMC_MODEL_DIRECT_ONLY);
1344 if (rc != 0) {
1345 i40e_error(i40e, "failed to configure hardware memory cache: "
1346 "%d", rc);
1347 return (B_FALSE);
1348 }
1349
1350 (void) i40e_aq_stop_lldp(hw, TRUE, NULL);
1351
1352 rc = i40e_get_mac_addr(hw, hw->mac.addr);
1353 if (rc != I40E_SUCCESS) {
1354 i40e_error(i40e, "failed to retrieve hardware mac address: %d",
1355 rc);
1356 return (B_FALSE);
1357 }
1358
1359 rc = i40e_validate_mac_addr(hw->mac.addr);
1360 if (rc != 0) {
1361 i40e_error(i40e, "failed to validate internal mac address: "
1362 "%d", rc);
1363 return (B_FALSE);
1364 }
1365 bcopy(hw->mac.addr, hw->mac.perm_addr, ETHERADDRL);
1366 if ((rc = i40e_get_port_mac_addr(hw, hw->mac.port_addr)) !=
1367 I40E_SUCCESS) {
1368 i40e_error(i40e, "failed to retrieve port mac address: %d",
1369 rc);
1370 return (B_FALSE);
1371 }
1372
1373 /*
1374 * We need to obtain the Default Virtual Station SEID (VSI)
1375 * before we can perform other operations on the device.
1376 */
1377 if (!i40e_set_def_vsi_seid(i40e)) {
1378 i40e_error(i40e, "failed to obtain Default VSI SEID");
1379 return (B_FALSE);
1380 }
1381
1382 return (B_TRUE);
1383 }
1384
1385 static void
1386 i40e_unconfigure(dev_info_t *devinfo, i40e_t *i40e)
1387 {
1388 int rc;
1389
1390 if (i40e->i40e_attach_progress & I40E_ATTACH_ENABLE_INTR)
1391 (void) i40e_disable_interrupts(i40e);
1392
1393 if ((i40e->i40e_attach_progress & I40E_ATTACH_LINK_TIMER) &&
1394 i40e->i40e_periodic_id != 0) {
1395 ddi_periodic_delete(i40e->i40e_periodic_id);
1396 i40e->i40e_periodic_id = 0;
1397 }
1398
1399 if (i40e->i40e_attach_progress & I40E_ATTACH_UFM_INIT)
1400 ddi_ufm_fini(i40e->i40e_ufmh);
1401
1402 if (i40e->i40e_attach_progress & I40E_ATTACH_MAC) {
1403 rc = mac_unregister(i40e->i40e_mac_hdl);
1404 if (rc != 0) {
1405 i40e_error(i40e, "failed to unregister from mac: %d",
1406 rc);
1407 }
1408 }
1409
1410 if (i40e->i40e_attach_progress & I40E_ATTACH_STATS) {
1411 i40e_stats_fini(i40e);
1412 }
1413
1414 if (i40e->i40e_attach_progress & I40E_ATTACH_ADD_INTR)
1415 i40e_rem_intr_handlers(i40e);
1416
1417 if (i40e->i40e_attach_progress & I40E_ATTACH_ALLOC_RINGSLOCKS)
1418 i40e_free_trqpairs(i40e);
1419
1420 if (i40e->i40e_attach_progress & I40E_ATTACH_ALLOC_INTR)
1421 i40e_rem_intrs(i40e);
1422
1423 if (i40e->i40e_attach_progress & I40E_ATTACH_COMMON_CODE)
1424 i40e_common_code_fini(i40e);
1425
1426 i40e_cleanup_resources(i40e);
1427
1428 if (i40e->i40e_attach_progress & I40E_ATTACH_PROPS)
1429 (void) ddi_prop_remove_all(devinfo);
1430
1431 if (i40e->i40e_attach_progress & I40E_ATTACH_REGS_MAP &&
1432 i40e->i40e_osdep_space.ios_reg_handle != NULL) {
1433 ddi_regs_map_free(&i40e->i40e_osdep_space.ios_reg_handle);
1434 i40e->i40e_osdep_space.ios_reg_handle = NULL;
1435 }
1436
1437 if ((i40e->i40e_attach_progress & I40E_ATTACH_PCI_CONFIG) &&
1438 i40e->i40e_osdep_space.ios_cfg_handle != NULL) {
1439 pci_config_teardown(&i40e->i40e_osdep_space.ios_cfg_handle);
1440 i40e->i40e_osdep_space.ios_cfg_handle = NULL;
1441 }
1442
1443 if (i40e->i40e_attach_progress & I40E_ATTACH_FM_INIT)
1444 i40e_fm_fini(i40e);
1445
1446 kmem_free(i40e->i40e_aqbuf, I40E_ADMINQ_BUFSZ);
1447 kmem_free(i40e, sizeof (i40e_t));
1448
1449 ddi_set_driver_private(devinfo, NULL);
1450 }
1451
1452 static boolean_t
1453 i40e_final_init(i40e_t *i40e)
1454 {
1455 i40e_hw_t *hw = &i40e->i40e_hw_space;
1456 struct i40e_osdep *osdep = OS_DEP(hw);
1457 uint8_t pbanum[I40E_PBANUM_STRLEN];
1458 enum i40e_status_code irc;
1459 char buf[I40E_DDI_PROP_LEN];
1460
1461 pbanum[0] = '\0';
1462 irc = i40e_read_pba_string(hw, pbanum, sizeof (pbanum));
1463 if (irc != I40E_SUCCESS) {
1464 i40e_log(i40e, "failed to read PBA string: %d", irc);
1465 } else {
1466 (void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
1467 "printed-board-assembly", (char *)pbanum);
1468 }
1469
1470 #ifdef DEBUG
1471 ASSERT(snprintf(NULL, 0, "%d.%d", hw->aq.fw_maj_ver,
1472 hw->aq.fw_min_ver) < sizeof (buf));
1473 ASSERT(snprintf(NULL, 0, "%x", hw->aq.fw_build) < sizeof (buf));
1474 ASSERT(snprintf(NULL, 0, "%d.%d", hw->aq.api_maj_ver,
1475 hw->aq.api_min_ver) < sizeof (buf));
1476 #endif
1477
1478 (void) snprintf(buf, sizeof (buf), "%d.%d", hw->aq.fw_maj_ver,
1479 hw->aq.fw_min_ver);
1480 (void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
1481 "firmware-version", buf);
1482 (void) snprintf(buf, sizeof (buf), "%x", hw->aq.fw_build);
1483 (void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
1484 "firmware-build", buf);
1485 (void) snprintf(buf, sizeof (buf), "%d.%d", hw->aq.api_maj_ver,
1486 hw->aq.api_min_ver);
1487 (void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
1488 "api-version", buf);
1489
1490 if (!i40e_set_hw_bus_info(hw))
1491 return (B_FALSE);
1492
1493 if (i40e_check_acc_handle(osdep->ios_reg_handle) != DDI_FM_OK) {
1494 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
1495 return (B_FALSE);
1496 }
1497
1498 return (B_TRUE);
1499 }
1500
1501 static void
1502 i40e_identify_hardware(i40e_t *i40e)
1503 {
1504 i40e_hw_t *hw = &i40e->i40e_hw_space;
1505 struct i40e_osdep *osdep = &i40e->i40e_osdep_space;
1506
1507 hw->vendor_id = pci_config_get16(osdep->ios_cfg_handle, PCI_CONF_VENID);
1508 hw->device_id = pci_config_get16(osdep->ios_cfg_handle, PCI_CONF_DEVID);
1509 hw->revision_id = pci_config_get8(osdep->ios_cfg_handle,
1510 PCI_CONF_REVID);
1511 hw->subsystem_device_id =
1512 pci_config_get16(osdep->ios_cfg_handle, PCI_CONF_SUBSYSID);
1513 hw->subsystem_vendor_id =
1514 pci_config_get16(osdep->ios_cfg_handle, PCI_CONF_SUBVENID);
1515
1516 /*
1517 * Note that we set the hardware's bus information later on, in
1518 * i40e_get_available_resources(). The common code doesn't seem to
1519 * require that it be set in any ways, it seems to be mostly for
1520 * book-keeping.
1521 */
1522 }
1523
1524 static boolean_t
1525 i40e_regs_map(i40e_t *i40e)
1526 {
1527 dev_info_t *devinfo = i40e->i40e_dip;
1528 i40e_hw_t *hw = &i40e->i40e_hw_space;
1529 struct i40e_osdep *osdep = &i40e->i40e_osdep_space;
1530 off_t memsize;
1531 int ret;
1532
1533 if (ddi_dev_regsize(devinfo, I40E_ADAPTER_REGSET, &memsize) !=
1534 DDI_SUCCESS) {
1535 i40e_error(i40e, "Used invalid register set to map PCIe regs");
1536 return (B_FALSE);
1537 }
1538
1539 if ((ret = ddi_regs_map_setup(devinfo, I40E_ADAPTER_REGSET,
1540 (caddr_t *)&hw->hw_addr, 0, memsize, &i40e_regs_acc_attr,
1541 &osdep->ios_reg_handle)) != DDI_SUCCESS) {
1542 i40e_error(i40e, "failed to map device registers: %d", ret);
1543 return (B_FALSE);
1544 }
1545
1546 osdep->ios_reg_size = memsize;
1547 return (B_TRUE);
1548 }
1549
1550 /*
1551 * Update parameters required when a new MTU has been configured. Calculate the
1552 * maximum frame size, as well as, size our DMA buffers which we size in
1553 * increments of 1K.
1554 */
1555 void
1556 i40e_update_mtu(i40e_t *i40e)
1557 {
1558 uint32_t rx, tx;
1559
1560 i40e->i40e_frame_max = i40e->i40e_sdu +
1561 sizeof (struct ether_vlan_header) + ETHERFCSL;
1562
1563 rx = i40e->i40e_frame_max + I40E_BUF_IPHDR_ALIGNMENT;
1564 i40e->i40e_rx_buf_size = ((rx >> 10) +
1565 ((rx & (((uint32_t)1 << 10) -1)) > 0 ? 1 : 0)) << 10;
1566
1567 tx = i40e->i40e_frame_max;
1568 i40e->i40e_tx_buf_size = ((tx >> 10) +
1569 ((tx & (((uint32_t)1 << 10) -1)) > 0 ? 1 : 0)) << 10;
1570 }
1571
1572 static int
1573 i40e_get_prop(i40e_t *i40e, char *prop, int min, int max, int def)
1574 {
1575 int val;
1576
1577 val = ddi_prop_get_int(DDI_DEV_T_ANY, i40e->i40e_dip, DDI_PROP_DONTPASS,
1578 prop, def);
1579 if (val > max)
1580 val = max;
1581 if (val < min)
1582 val = min;
1583 return (val);
1584 }
1585
1586 static void
1587 i40e_init_properties(i40e_t *i40e)
1588 {
1589 i40e->i40e_sdu = i40e_get_prop(i40e, "default_mtu",
1590 I40E_MIN_MTU, I40E_MAX_MTU, I40E_DEF_MTU);
1591
1592 i40e->i40e_intr_force = i40e_get_prop(i40e, "intr_force",
1593 I40E_INTR_NONE, I40E_INTR_LEGACY, I40E_INTR_NONE);
1594
1595 i40e->i40e_mr_enable = i40e_get_prop(i40e, "mr_enable",
1596 B_FALSE, B_TRUE, B_TRUE);
1597
1598 i40e->i40e_tx_ring_size = i40e_get_prop(i40e, "tx_ring_size",
1599 I40E_MIN_TX_RING_SIZE, I40E_MAX_TX_RING_SIZE,
1600 I40E_DEF_TX_RING_SIZE);
1601 if ((i40e->i40e_tx_ring_size % I40E_DESC_ALIGN) != 0) {
1602 i40e->i40e_tx_ring_size = P2ROUNDUP(i40e->i40e_tx_ring_size,
1603 I40E_DESC_ALIGN);
1604 }
1605
1606 i40e->i40e_tx_block_thresh = i40e_get_prop(i40e, "tx_resched_threshold",
1607 I40E_MIN_TX_BLOCK_THRESH,
1608 i40e->i40e_tx_ring_size - I40E_TX_MAX_COOKIE,
1609 I40E_DEF_TX_BLOCK_THRESH);
1610
1611 i40e->i40e_num_rx_groups = i40e_get_prop(i40e, "rx_num_groups",
1612 I40E_MIN_NUM_RX_GROUPS, I40E_MAX_NUM_RX_GROUPS,
1613 I40E_DEF_NUM_RX_GROUPS);
1614
1615 i40e->i40e_rx_ring_size = i40e_get_prop(i40e, "rx_ring_size",
1616 I40E_MIN_RX_RING_SIZE, I40E_MAX_RX_RING_SIZE,
1617 I40E_DEF_RX_RING_SIZE);
1618 if ((i40e->i40e_rx_ring_size % I40E_DESC_ALIGN) != 0) {
1619 i40e->i40e_rx_ring_size = P2ROUNDUP(i40e->i40e_rx_ring_size,
1620 I40E_DESC_ALIGN);
1621 }
1622
1623 i40e->i40e_rx_limit_per_intr = i40e_get_prop(i40e, "rx_limit_per_intr",
1624 I40E_MIN_RX_LIMIT_PER_INTR, I40E_MAX_RX_LIMIT_PER_INTR,
1625 I40E_DEF_RX_LIMIT_PER_INTR);
1626
1627 i40e->i40e_tx_hcksum_enable = i40e_get_prop(i40e, "tx_hcksum_enable",
1628 B_FALSE, B_TRUE, B_TRUE);
1629
1630 i40e->i40e_tx_lso_enable = i40e_get_prop(i40e, "tx_lso_enable",
1631 B_FALSE, B_TRUE, B_TRUE);
1632
1633 i40e->i40e_rx_hcksum_enable = i40e_get_prop(i40e, "rx_hcksum_enable",
1634 B_FALSE, B_TRUE, B_TRUE);
1635
1636 i40e->i40e_rx_dma_min = i40e_get_prop(i40e, "rx_dma_threshold",
1637 I40E_MIN_RX_DMA_THRESH, I40E_MAX_RX_DMA_THRESH,
1638 I40E_DEF_RX_DMA_THRESH);
1639
1640 i40e->i40e_tx_dma_min = i40e_get_prop(i40e, "tx_dma_threshold",
1641 I40E_MIN_TX_DMA_THRESH, I40E_MAX_TX_DMA_THRESH,
1642 I40E_DEF_TX_DMA_THRESH);
1643
1644 i40e->i40e_tx_itr = i40e_get_prop(i40e, "tx_intr_throttle",
1645 I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_TX_ITR);
1646
1647 i40e->i40e_rx_itr = i40e_get_prop(i40e, "rx_intr_throttle",
1648 I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_RX_ITR);
1649
1650 i40e->i40e_other_itr = i40e_get_prop(i40e, "other_intr_throttle",
1651 I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_OTHER_ITR);
1652
1653 if (!i40e->i40e_mr_enable) {
1654 i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX;
1655 i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX;
1656 }
1657
1658 i40e_update_mtu(i40e);
1659 }
1660
1661 /*
1662 * There are a few constraints on interrupts that we're currently imposing, some
1663 * of which are restrictions from hardware. For a fuller treatment, see
1664 * i40e_intr.c.
1665 *
1666 * Currently, to use MSI-X we require two interrupts be available though in
1667 * theory we should participate in IRM and happily use more interrupts.
1668 *
1669 * Hardware only supports a single MSI being programmed and therefore if we
1670 * don't have MSI-X interrupts available at this time, then we ratchet down the
1671 * number of rings and groups available. Obviously, we only bother with a single
1672 * fixed interrupt.
1673 */
1674 static boolean_t
1675 i40e_alloc_intr_handles(i40e_t *i40e, dev_info_t *devinfo, int intr_type)
1676 {
1677 i40e_hw_t *hw = &i40e->i40e_hw_space;
1678 ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle;
1679 int request, count, actual, rc, min;
1680 uint32_t reg;
1681
1682 switch (intr_type) {
1683 case DDI_INTR_TYPE_FIXED:
1684 case DDI_INTR_TYPE_MSI:
1685 request = 1;
1686 min = 1;
1687 break;
1688 case DDI_INTR_TYPE_MSIX:
1689 min = 2;
1690 if (!i40e->i40e_mr_enable) {
1691 request = 2;
1692 break;
1693 }
1694 reg = I40E_READ_REG(hw, I40E_GLPCI_CNF2);
1695 /*
1696 * Should this read fail, we will drop back to using
1697 * MSI or fixed interrupts.
1698 */
1699 if (i40e_check_acc_handle(rh) != DDI_FM_OK) {
1700 ddi_fm_service_impact(i40e->i40e_dip,
1701 DDI_SERVICE_DEGRADED);
1702 return (B_FALSE);
1703 }
1704 request = (reg & I40E_GLPCI_CNF2_MSI_X_PF_N_MASK) >>
1705 I40E_GLPCI_CNF2_MSI_X_PF_N_SHIFT;
1706 request++; /* the register value is n - 1 */
1707 break;
1708 default:
1709 panic("bad interrupt type passed to i40e_alloc_intr_handles: "
1710 "%d", intr_type);
1711 }
1712
1713 rc = ddi_intr_get_nintrs(devinfo, intr_type, &count);
1714 if (rc != DDI_SUCCESS || count < min) {
1715 i40e_log(i40e, "Get interrupt number failed, "
1716 "returned %d, count %d", rc, count);
1717 return (B_FALSE);
1718 }
1719
1720 rc = ddi_intr_get_navail(devinfo, intr_type, &count);
1721 if (rc != DDI_SUCCESS || count < min) {
1722 i40e_log(i40e, "Get AVAILABLE interrupt number failed, "
1723 "returned %d, count %d", rc, count);
1724 return (B_FALSE);
1725 }
1726
1727 actual = 0;
1728 i40e->i40e_intr_count = 0;
1729 i40e->i40e_intr_count_max = 0;
1730 i40e->i40e_intr_count_min = 0;
1731
1732 i40e->i40e_intr_size = request * sizeof (ddi_intr_handle_t);
1733 ASSERT(i40e->i40e_intr_size != 0);
1734 i40e->i40e_intr_handles = kmem_alloc(i40e->i40e_intr_size, KM_SLEEP);
1735
1736 rc = ddi_intr_alloc(devinfo, i40e->i40e_intr_handles, intr_type, 0,
1737 min(request, count), &actual, DDI_INTR_ALLOC_NORMAL);
1738 if (rc != DDI_SUCCESS) {
1739 i40e_log(i40e, "Interrupt allocation failed with %d.", rc);
1740 goto alloc_handle_fail;
1741 }
1742
1743 i40e->i40e_intr_count = actual;
1744 i40e->i40e_intr_count_max = request;
1745 i40e->i40e_intr_count_min = min;
1746
1747 if (actual < min) {
1748 i40e_log(i40e, "actual (%d) is less than minimum (%d).",
1749 actual, min);
1750 goto alloc_handle_fail;
1751 }
1752
1753 /*
1754 * Record the priority and capabilities for our first vector. Once
1755 * we have it, that's our priority until detach time. Even if we
1756 * eventually participate in IRM, our priority shouldn't change.
1757 */
1758 rc = ddi_intr_get_pri(i40e->i40e_intr_handles[0], &i40e->i40e_intr_pri);
1759 if (rc != DDI_SUCCESS) {
1760 i40e_log(i40e,
1761 "Getting interrupt priority failed with %d.", rc);
1762 goto alloc_handle_fail;
1763 }
1764
1765 rc = ddi_intr_get_cap(i40e->i40e_intr_handles[0], &i40e->i40e_intr_cap);
1766 if (rc != DDI_SUCCESS) {
1767 i40e_log(i40e,
1768 "Getting interrupt capabilities failed with %d.", rc);
1769 goto alloc_handle_fail;
1770 }
1771
1772 i40e->i40e_intr_type = intr_type;
1773 return (B_TRUE);
1774
1775 alloc_handle_fail:
1776
1777 i40e_rem_intrs(i40e);
1778 return (B_FALSE);
1779 }
1780
1781 static boolean_t
1782 i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo)
1783 {
1784 i40e_hw_t *hw = &i40e->i40e_hw_space;
1785 int intr_types, rc;
1786 uint_t max_trqpairs;
1787
1788 if (i40e_is_x722(i40e)) {
1789 max_trqpairs = I40E_722_MAX_TC_QUEUES;
1790 } else {
1791 max_trqpairs = I40E_710_MAX_TC_QUEUES;
1792 }
1793
1794 rc = ddi_intr_get_supported_types(devinfo, &intr_types);
1795 if (rc != DDI_SUCCESS) {
1796 i40e_error(i40e, "failed to get supported interrupt types: %d",
1797 rc);
1798 return (B_FALSE);
1799 }
1800
1801 i40e->i40e_intr_type = 0;
1802
1803 /*
1804 * We need to determine the number of queue pairs per traffic
1805 * class. We only have one traffic class (TC0), so we'll base
1806 * this off the number of interrupts provided. Furthermore,
1807 * since we only use one traffic class, the number of queues
1808 * per traffic class and per VSI are the same.
1809 */
1810 if ((intr_types & DDI_INTR_TYPE_MSIX) &&
1811 (i40e->i40e_intr_force <= I40E_INTR_MSIX) &&
1812 (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSIX))) {
1813 uint32_t n, qp_cap, num_trqpairs;
1814
1815 /*
1816 * While we want the number of queue pairs to match
1817 * the number of interrupts, we must keep stay in
1818 * bounds of the maximum number of queues per traffic
1819 * class. We subtract one from i40e_intr_count to
1820 * account for interrupt zero; which is currently
1821 * restricted to admin queue commands and other
1822 * interrupt causes.
1823 */
1824 n = MIN(i40e->i40e_intr_count - 1, max_trqpairs);
1825 ASSERT3U(n, >, 0);
1826
1827 /*
1828 * Round up to the nearest power of two to ensure that
1829 * the QBASE aligns with the TC size which must be
1830 * programmed as a power of two. See the queue mapping
1831 * description in section 7.4.9.5.5.1.
1832 *
1833 * If i40e_intr_count - 1 is not a power of two then
1834 * some queue pairs on the same VSI will have to share
1835 * an interrupt.
1836 *
1837 * We may want to revisit this logic in a future where
1838 * we have more interrupts and more VSIs. Otherwise,
1839 * each VSI will use as many interrupts as possible.
1840 * Using more QPs per VSI means better RSS for each
1841 * group, but at the same time may require more
1842 * sharing of interrupts across VSIs. This may be a
1843 * good candidate for a .conf tunable.
1844 */
1845 n = 0x1 << ddi_fls(n);
1846 i40e->i40e_num_trqpairs_per_vsi = n;
1847
1848 /*
1849 * Make sure the number of tx/rx qpairs does not exceed
1850 * the device's capabilities.
1851 */
1852 ASSERT3U(i40e->i40e_num_rx_groups, >, 0);
1853 qp_cap = MIN(hw->func_caps.num_rx_qp, hw->func_caps.num_tx_qp);
1854 num_trqpairs = i40e->i40e_num_trqpairs_per_vsi *
1855 i40e->i40e_num_rx_groups;
1856 if (num_trqpairs > qp_cap) {
1857 i40e->i40e_num_rx_groups = MAX(1, qp_cap /
1858 i40e->i40e_num_trqpairs_per_vsi);
1859 num_trqpairs = i40e->i40e_num_trqpairs_per_vsi *
1860 i40e->i40e_num_rx_groups;
1861 i40e_log(i40e, "Rx groups restricted to %u",
1862 i40e->i40e_num_rx_groups);
1863 }
1864 ASSERT3U(num_trqpairs, >, 0);
1865 i40e->i40e_num_trqpairs = num_trqpairs;
1866 return (B_TRUE);
1867 }
1868
1869 /*
1870 * We only use multiple transmit/receive pairs when MSI-X interrupts are
1871 * available due to the fact that the device basically only supports a
1872 * single MSI interrupt.
1873 */
1874 i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX;
1875 i40e->i40e_num_trqpairs_per_vsi = i40e->i40e_num_trqpairs;
1876 i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX;
1877
1878 if ((intr_types & DDI_INTR_TYPE_MSI) &&
1879 (i40e->i40e_intr_force <= I40E_INTR_MSI)) {
1880 if (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSI))
1881 return (B_TRUE);
1882 }
1883
1884 if (intr_types & DDI_INTR_TYPE_FIXED) {
1885 if (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_FIXED))
1886 return (B_TRUE);
1887 }
1888
1889 return (B_FALSE);
1890 }
1891
1892 /*
1893 * Map different interrupts to MSI-X vectors.
1894 */
1895 static boolean_t
1896 i40e_map_intrs_to_vectors(i40e_t *i40e)
1897 {
1898 if (i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) {
1899 return (B_TRUE);
1900 }
1901
1902 /*
1903 * Each queue pair is mapped to a single interrupt, so
1904 * transmit and receive interrupts for a given queue share the
1905 * same vector. Vector zero is reserved for the admin queue.
1906 */
1907 for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) {
1908 uint_t vector = i % (i40e->i40e_intr_count - 1);
1909
1910 i40e->i40e_trqpairs[i].itrq_rx_intrvec = vector + 1;
1911 i40e->i40e_trqpairs[i].itrq_tx_intrvec = vector + 1;
1912 }
1913
1914 return (B_TRUE);
1915 }
1916
1917 static boolean_t
1918 i40e_add_intr_handlers(i40e_t *i40e)
1919 {
1920 int rc, vector;
1921
1922 switch (i40e->i40e_intr_type) {
1923 case DDI_INTR_TYPE_MSIX:
1924 for (vector = 0; vector < i40e->i40e_intr_count; vector++) {
1925 rc = ddi_intr_add_handler(
1926 i40e->i40e_intr_handles[vector],
1927 (ddi_intr_handler_t *)i40e_intr_msix, i40e,
1928 (void *)(uintptr_t)vector);
1929 if (rc != DDI_SUCCESS) {
1930 i40e_log(i40e, "Add interrupt handler (MSI-X) "
1931 "failed: return %d, vector %d", rc, vector);
1932 for (vector--; vector >= 0; vector--) {
1933 (void) ddi_intr_remove_handler(
1934 i40e->i40e_intr_handles[vector]);
1935 }
1936 return (B_FALSE);
1937 }
1938 }
1939 break;
1940 case DDI_INTR_TYPE_MSI:
1941 rc = ddi_intr_add_handler(i40e->i40e_intr_handles[0],
1942 (ddi_intr_handler_t *)i40e_intr_msi, i40e, NULL);
1943 if (rc != DDI_SUCCESS) {
1944 i40e_log(i40e, "Add interrupt handler (MSI) failed: "
1945 "return %d", rc);
1946 return (B_FALSE);
1947 }
1948 break;
1949 case DDI_INTR_TYPE_FIXED:
1950 rc = ddi_intr_add_handler(i40e->i40e_intr_handles[0],
1951 (ddi_intr_handler_t *)i40e_intr_legacy, i40e, NULL);
1952 if (rc != DDI_SUCCESS) {
1953 i40e_log(i40e, "Add interrupt handler (legacy) failed:"
1954 " return %d", rc);
1955 return (B_FALSE);
1956 }
1957 break;
1958 default:
1959 /* Cast to pacify lint */
1960 panic("i40e_intr_type %p contains an unknown type: %d",
1961 (void *)i40e, i40e->i40e_intr_type);
1962 }
1963
1964 return (B_TRUE);
1965 }
1966
1967 /*
1968 * Perform periodic checks. Longer term, we should be thinking about additional
1969 * things here:
1970 *
1971 * o Stall Detection
1972 * o Temperature sensor detection
1973 * o Device resetting
1974 * o Statistics updating to avoid wraparound
1975 */
1976 static void
1977 i40e_timer(void *arg)
1978 {
1979 i40e_t *i40e = arg;
1980
1981 mutex_enter(&i40e->i40e_general_lock);
1982 i40e_link_check(i40e);
1983 mutex_exit(&i40e->i40e_general_lock);
1984 }
1985
1986 /*
1987 * Get the hardware state, and scribble away anything that needs scribbling.
1988 */
1989 static void
1990 i40e_get_hw_state(i40e_t *i40e, i40e_hw_t *hw)
1991 {
1992 int rc;
1993
1994 ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
1995
1996 (void) i40e_aq_get_link_info(hw, TRUE, NULL, NULL);
1997 i40e_link_check(i40e);
1998
1999 /*
2000 * Try and determine our PHY. Note that we may have to retry to and
2001 * delay to detect fiber correctly.
2002 */
2003 rc = i40e_aq_get_phy_capabilities(hw, B_FALSE, B_TRUE, &i40e->i40e_phy,
2004 NULL);
2005 if (rc == I40E_ERR_UNKNOWN_PHY) {
2006 i40e_msec_delay(200);
2007 rc = i40e_aq_get_phy_capabilities(hw, B_FALSE, B_TRUE,
2008 &i40e->i40e_phy, NULL);
2009 }
2010
2011 if (rc != I40E_SUCCESS) {
2012 if (rc == I40E_ERR_UNKNOWN_PHY) {
2013 i40e_error(i40e, "encountered unknown PHY type, "
2014 "not attaching.");
2015 } else {
2016 i40e_error(i40e, "error getting physical capabilities: "
2017 "%d, %d", rc, hw->aq.asq_last_status);
2018 }
2019 }
2020
2021 rc = i40e_update_link_info(hw);
2022 if (rc != I40E_SUCCESS) {
2023 i40e_error(i40e, "failed to update link information: %d", rc);
2024 }
2025
2026 /*
2027 * In general, we don't want to mask off (as in stop from being a cause)
2028 * any of the interrupts that the phy might be able to generate.
2029 */
2030 rc = i40e_aq_set_phy_int_mask(hw, 0, NULL);
2031 if (rc != I40E_SUCCESS) {
2032 i40e_error(i40e, "failed to update phy link mask: %d", rc);
2033 }
2034 }
2035
2036 /*
2037 * Go through and re-initialize any existing filters that we may have set up for
2038 * this device. Note that we would only expect them to exist if hardware had
2039 * already been initialized and we had just reset it. While we're not
2040 * implementing this yet, we're keeping this around for when we add reset
2041 * capabilities, so this isn't forgotten.
2042 */
2043 /* ARGSUSED */
2044 static void
2045 i40e_init_macaddrs(i40e_t *i40e, i40e_hw_t *hw)
2046 {
2047 }
2048
2049 /*
2050 * Set the properties which have common values across all the VSIs.
2051 * Consult the "Add VSI" command section (7.4.9.5.5.1) for a
2052 * complete description of these properties.
2053 */
2054 static void
2055 i40e_set_shared_vsi_props(i40e_t *i40e,
2056 struct i40e_aqc_vsi_properties_data *info, uint_t vsi_idx)
2057 {
2058 uint_t tc_queues;
2059 uint16_t vsi_qp_base;
2060
2061 /*
2062 * It's important that we use bitwise-OR here; callers to this
2063 * function might enable other sections before calling this
2064 * function.
2065 */
2066 info->valid_sections |= LE_16(I40E_AQ_VSI_PROP_QUEUE_MAP_VALID |
2067 I40E_AQ_VSI_PROP_VLAN_VALID);
2068
2069 /*
2070 * Calculate the starting QP index for this VSI. This base is
2071 * relative to the PF queue space; so a value of 0 for PF#1
2072 * represents the absolute index PFLAN_QALLOC_FIRSTQ for PF#1.
2073 */
2074 vsi_qp_base = vsi_idx * i40e->i40e_num_trqpairs_per_vsi;
2075 info->mapping_flags = LE_16(I40E_AQ_VSI_QUE_MAP_CONTIG);
2076 info->queue_mapping[0] =
2077 LE_16((vsi_qp_base << I40E_AQ_VSI_QUEUE_SHIFT) &
2078 I40E_AQ_VSI_QUEUE_MASK);
2079
2080 /*
2081 * tc_queues determines the size of the traffic class, where
2082 * the size is 2^^tc_queues to a maximum of 64 for the X710
2083 * and 128 for the X722.
2084 *
2085 * Some examples:
2086 * i40e_num_trqpairs_per_vsi == 1 => tc_queues = 0, 2^^0 = 1.
2087 * i40e_num_trqpairs_per_vsi == 7 => tc_queues = 3, 2^^3 = 8.
2088 * i40e_num_trqpairs_per_vsi == 8 => tc_queues = 3, 2^^3 = 8.
2089 * i40e_num_trqpairs_per_vsi == 9 => tc_queues = 4, 2^^4 = 16.
2090 * i40e_num_trqpairs_per_vsi == 17 => tc_queues = 5, 2^^5 = 32.
2091 * i40e_num_trqpairs_per_vsi == 64 => tc_queues = 6, 2^^6 = 64.
2092 */
2093 tc_queues = ddi_fls(i40e->i40e_num_trqpairs_per_vsi - 1);
2094
2095 /*
2096 * The TC queue mapping is in relation to the VSI queue space.
2097 * Since we are only using one traffic class (TC0) we always
2098 * start at queue offset 0.
2099 */
2100 info->tc_mapping[0] =
2101 LE_16(((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) &
2102 I40E_AQ_VSI_TC_QUE_OFFSET_MASK) |
2103 ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) &
2104 I40E_AQ_VSI_TC_QUE_NUMBER_MASK));
2105
2106 /*
2107 * I40E_AQ_VSI_PVLAN_MODE_ALL ("VLAN driver insertion mode")
2108 *
2109 * Allow tagged and untagged packets to be sent to this
2110 * VSI from the host.
2111 *
2112 * I40E_AQ_VSI_PVLAN_EMOD_NOTHING ("VLAN and UP expose mode")
2113 *
2114 * Leave the tag on the frame and place no VLAN
2115 * information in the descriptor. We want this mode
2116 * because our MAC layer will take care of the VLAN tag,
2117 * if there is one.
2118 */
2119 info->port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL |
2120 I40E_AQ_VSI_PVLAN_EMOD_NOTHING;
2121 }
2122
2123 /*
2124 * Delete the VSI at this index, if one exists. We assume there is no
2125 * action we can take if this command fails but to log the failure.
2126 */
2127 static void
2128 i40e_delete_vsi(i40e_t *i40e, uint_t idx)
2129 {
2130 i40e_hw_t *hw = &i40e->i40e_hw_space;
2131 uint16_t seid = i40e->i40e_vsis[idx].iv_seid;
2132
2133 if (seid != 0) {
2134 int rc;
2135
2136 rc = i40e_aq_delete_element(hw, seid, NULL);
2137
2138 if (rc != I40E_SUCCESS) {
2139 i40e_error(i40e, "Failed to delete VSI %d: %d",
2140 rc, hw->aq.asq_last_status);
2141 }
2142
2143 i40e->i40e_vsis[idx].iv_seid = 0;
2144 }
2145 }
2146
2147 /*
2148 * Add a new VSI.
2149 */
2150 static boolean_t
2151 i40e_add_vsi(i40e_t *i40e, i40e_hw_t *hw, uint_t idx)
2152 {
2153 struct i40e_vsi_context ctx;
2154 i40e_rx_group_t *rxg;
2155 int rc;
2156
2157 /*
2158 * The default VSI is created by the controller. This function
2159 * creates new, non-defualt VSIs only.
2160 */
2161 ASSERT3U(idx, !=, 0);
2162
2163 bzero(&ctx, sizeof (struct i40e_vsi_context));
2164 ctx.uplink_seid = i40e->i40e_veb_seid;
2165 ctx.pf_num = hw->pf_id;
2166 ctx.flags = I40E_AQ_VSI_TYPE_PF;
2167 ctx.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL;
2168 i40e_set_shared_vsi_props(i40e, &ctx.info, idx);
2169
2170 rc = i40e_aq_add_vsi(hw, &ctx, NULL);
2171 if (rc != I40E_SUCCESS) {
2172 i40e_error(i40e, "i40e_aq_add_vsi() failed %d: %d", rc,
2173 hw->aq.asq_last_status);
2174 return (B_FALSE);
2175 }
2176
2177 rxg = &i40e->i40e_rx_groups[idx];
2178 rxg->irg_vsi_seid = ctx.seid;
2179 i40e->i40e_vsis[idx].iv_number = ctx.vsi_number;
2180 i40e->i40e_vsis[idx].iv_seid = ctx.seid;
2181 i40e->i40e_vsis[idx].iv_stats_id = LE_16(ctx.info.stat_counter_idx);
2182
2183 if (i40e_stat_vsi_init(i40e, idx) == B_FALSE)
2184 return (B_FALSE);
2185
2186 return (B_TRUE);
2187 }
2188
2189 /*
2190 * Configure the hardware for the Default Virtual Station Interface (VSI).
2191 */
2192 static boolean_t
2193 i40e_config_def_vsi(i40e_t *i40e, i40e_hw_t *hw)
2194 {
2195 struct i40e_vsi_context ctx;
2196 i40e_rx_group_t *def_rxg;
2197 int err;
2198 struct i40e_aqc_remove_macvlan_element_data filt;
2199
2200 bzero(&ctx, sizeof (struct i40e_vsi_context));
2201 ctx.seid = I40E_DEF_VSI_SEID(i40e);
2202 ctx.pf_num = hw->pf_id;
2203 err = i40e_aq_get_vsi_params(hw, &ctx, NULL);
2204 if (err != I40E_SUCCESS) {
2205 i40e_error(i40e, "get VSI params failed with %d", err);
2206 return (B_FALSE);
2207 }
2208
2209 ctx.info.valid_sections = 0;
2210 i40e->i40e_vsis[0].iv_number = ctx.vsi_number;
2211 i40e->i40e_vsis[0].iv_stats_id = LE_16(ctx.info.stat_counter_idx);
2212 if (i40e_stat_vsi_init(i40e, 0) == B_FALSE)
2213 return (B_FALSE);
2214
2215 i40e_set_shared_vsi_props(i40e, &ctx.info, I40E_DEF_VSI_IDX);
2216
2217 err = i40e_aq_update_vsi_params(hw, &ctx, NULL);
2218 if (err != I40E_SUCCESS) {
2219 i40e_error(i40e, "Update VSI params failed with %d", err);
2220 return (B_FALSE);
2221 }
2222
2223 def_rxg = &i40e->i40e_rx_groups[0];
2224 def_rxg->irg_vsi_seid = I40E_DEF_VSI_SEID(i40e);
2225
2226 /*
2227 * We have seen three different behaviors in regards to the
2228 * Default VSI and its implicit L2 MAC+VLAN filter.
2229 *
2230 * 1. It has an implicit filter for the factory MAC address
2231 * and this filter counts against 'ifr_nmacfilt_used'.
2232 *
2233 * 2. It has an implicit filter for the factory MAC address
2234 * and this filter DOES NOT count against 'ifr_nmacfilt_used'.
2235 *
2236 * 3. It DOES NOT have an implicit filter.
2237 *
2238 * All three of these cases are accounted for below. If we
2239 * fail to remove the L2 filter (ENOENT) then we assume there
2240 * wasn't one. Otherwise, if we successfully remove the
2241 * filter, we make sure to update the 'ifr_nmacfilt_used'
2242 * count accordingly.
2243 *
2244 * We remove this filter to prevent duplicate delivery of
2245 * packets destined for the primary MAC address as DLS will
2246 * create the same filter on a non-default VSI for the primary
2247 * MAC client.
2248 *
2249 * If you change the following code please test it across as
2250 * many X700 series controllers and firmware revisions as you
2251 * can.
2252 */
2253 bzero(&filt, sizeof (filt));
2254 bcopy(hw->mac.port_addr, filt.mac_addr, ETHERADDRL);
2255 filt.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH;
2256 filt.vlan_tag = 0;
2257
2258 ASSERT3U(i40e->i40e_resources.ifr_nmacfilt_used, <=, 1);
2259 i40e_log(i40e, "Num L2 filters: %u",
2260 i40e->i40e_resources.ifr_nmacfilt_used);
2261
2262 err = i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1,
2263 NULL);
2264 if (err == I40E_SUCCESS) {
2265 i40e_log(i40e,
2266 "Removed L2 filter from Default VSI with SEID %u",
2267 I40E_DEF_VSI_SEID(i40e));
2268 } else if (hw->aq.asq_last_status == ENOENT) {
2269 i40e_log(i40e,
2270 "No L2 filter for Default VSI with SEID %u",
2271 I40E_DEF_VSI_SEID(i40e));
2272 } else {
2273 i40e_error(i40e, "Failed to remove L2 filter from"
2274 " Default VSI with SEID %u: %d (%d)",
2275 I40E_DEF_VSI_SEID(i40e), err, hw->aq.asq_last_status);
2276
2277 return (B_FALSE);
2278 }
2279
2280 /*
2281 * As mentioned above, the controller created an implicit L2
2282 * filter for the primary MAC. We want to remove both the
2283 * filter and decrement the filter count. However, not all
2284 * controllers count this implicit filter against the total
2285 * MAC filter count. So here we are making sure it is either
2286 * one or zero. If it is one, then we know it is for the
2287 * implicit filter and we should decrement since we just
2288 * removed the filter above. If it is zero then we know the
2289 * controller that does not count the implicit filter, and it
2290 * was enough to just remove it; we leave the count alone.
2291 * But if it is neither, then we have never seen a controller
2292 * like this before and we should fail to attach.
2293 *
2294 * It is unfortunate that this code must exist but the
2295 * behavior of this implicit L2 filter and its corresponding
2296 * count were dicovered through empirical testing. The
2297 * programming manuals hint at this filter but do not
2298 * explicitly call out the exact behavior.
2299 */
2300 if (i40e->i40e_resources.ifr_nmacfilt_used == 1) {
2301 i40e->i40e_resources.ifr_nmacfilt_used--;
2302 } else {
2303 if (i40e->i40e_resources.ifr_nmacfilt_used != 0) {
2304 i40e_error(i40e, "Unexpected L2 filter count: %u"
2305 " (expected 0)",
2306 i40e->i40e_resources.ifr_nmacfilt_used);
2307 return (B_FALSE);
2308 }
2309 }
2310
2311 return (B_TRUE);
2312 }
2313
2314 static boolean_t
2315 i40e_config_rss_key_x722(i40e_t *i40e, i40e_hw_t *hw)
2316 {
2317 for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) {
2318 uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1];
2319 struct i40e_aqc_get_set_rss_key_data key;
2320 const char *u8seed;
2321 enum i40e_status_code status;
2322 uint16_t vsi_number = i40e->i40e_vsis[i].iv_number;
2323
2324 (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed));
2325 u8seed = (char *)seed;
2326
2327 CTASSERT(sizeof (key) >= (sizeof (key.standard_rss_key) +
2328 sizeof (key.extended_hash_key)));
2329
2330 bcopy(u8seed, key.standard_rss_key,
2331 sizeof (key.standard_rss_key));
2332 bcopy(&u8seed[sizeof (key.standard_rss_key)],
2333 key.extended_hash_key, sizeof (key.extended_hash_key));
2334
2335 ASSERT3U(vsi_number, !=, 0);
2336 status = i40e_aq_set_rss_key(hw, vsi_number, &key);
2337
2338 if (status != I40E_SUCCESS) {
2339 i40e_error(i40e, "failed to set RSS key for VSI %u: %d",
2340 vsi_number, status);
2341 return (B_FALSE);
2342 }
2343 }
2344
2345 return (B_TRUE);
2346 }
2347
2348 /*
2349 * Configure the RSS key. For the X710 controller family, this is set on a
2350 * per-PF basis via registers. For the X722, this is done on a per-VSI basis
2351 * through the admin queue.
2352 */
2353 static boolean_t
2354 i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw)
2355 {
2356 if (i40e_is_x722(i40e)) {
2357 if (!i40e_config_rss_key_x722(i40e, hw))
2358 return (B_FALSE);
2359 } else {
2360 uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1];
2361
2362 (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed));
2363 for (uint_t i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++)
2364 i40e_write_rx_ctl(hw, I40E_PFQF_HKEY(i), seed[i]);
2365 }
2366
2367 return (B_TRUE);
2368 }
2369
2370 /*
2371 * Populate the LUT. The size of each entry in the LUT depends on the controller
2372 * family, with the X722 using a known 7-bit width. On the X710 controller, this
2373 * is programmed through its control registers where as on the X722 this is
2374 * configured through the admin queue. Also of note, the X722 allows the LUT to
2375 * be set on a per-PF or VSI basis. At this time we use the PF setting. If we
2376 * decide to use the per-VSI LUT in the future, then we will need to modify the
2377 * i40e_add_vsi() function to set the RSS LUT bits in the queueing section.
2378 *
2379 * We populate the LUT in a round robin fashion with the rx queue indices from 0
2380 * to i40e_num_trqpairs_per_vsi - 1.
2381 */
2382 static boolean_t
2383 i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw)
2384 {
2385 uint32_t *hlut;
2386 uint8_t lut_mask;
2387 uint_t i;
2388 boolean_t ret = B_FALSE;
2389
2390 /*
2391 * We always configure the PF with a table size of 512 bytes in
2392 * i40e_chip_start().
2393 */
2394 hlut = kmem_alloc(I40E_HLUT_TABLE_SIZE, KM_NOSLEEP);
2395 if (hlut == NULL) {
2396 i40e_error(i40e, "i40e_config_rss() buffer allocation failed");
2397 return (B_FALSE);
2398 }
2399
2400 /*
2401 * The width of the X722 is apparently defined to be 7 bits, regardless
2402 * of the capability.
2403 */
2404 if (i40e_is_x722(i40e)) {
2405 lut_mask = (1 << 7) - 1;
2406 } else {
2407 lut_mask = (1 << hw->func_caps.rss_table_entry_width) - 1;
2408 }
2409
2410 for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) {
2411 ((uint8_t *)hlut)[i] =
2412 (i % i40e->i40e_num_trqpairs_per_vsi) & lut_mask;
2413 }
2414
2415 if (i40e_is_x722(i40e)) {
2416 enum i40e_status_code status;
2417
2418 status = i40e_aq_set_rss_lut(hw, 0, B_TRUE, (uint8_t *)hlut,
2419 I40E_HLUT_TABLE_SIZE);
2420
2421 if (status != I40E_SUCCESS) {
2422 i40e_error(i40e, "failed to set RSS LUT %d: %d",
2423 status, hw->aq.asq_last_status);
2424 goto out;
2425 }
2426 } else {
2427 for (i = 0; i < I40E_HLUT_TABLE_SIZE >> 2; i++) {
2428 I40E_WRITE_REG(hw, I40E_PFQF_HLUT(i), hlut[i]);
2429 }
2430 }
2431 ret = B_TRUE;
2432 out:
2433 kmem_free(hlut, I40E_HLUT_TABLE_SIZE);
2434 return (ret);
2435 }
2436
2437 /*
2438 * Set up RSS.
2439 * 1. Seed the hash key.
2440 * 2. Enable PCTYPEs for the hash filter.
2441 * 3. Populate the LUT.
2442 */
2443 static boolean_t
2444 i40e_config_rss(i40e_t *i40e, i40e_hw_t *hw)
2445 {
2446 uint64_t hena;
2447
2448 /*
2449 * 1. Seed the hash key
2450 */
2451 if (!i40e_config_rss_key(i40e, hw))
2452 return (B_FALSE);
2453
2454 /*
2455 * 2. Configure PCTYPES
2456 */
2457 hena = (1ULL << I40E_FILTER_PCTYPE_NONF_IPV4_OTHER) |
2458 (1ULL << I40E_FILTER_PCTYPE_NONF_IPV4_TCP) |
2459 (1ULL << I40E_FILTER_PCTYPE_NONF_IPV4_SCTP) |
2460 (1ULL << I40E_FILTER_PCTYPE_NONF_IPV4_UDP) |
2461 (1ULL << I40E_FILTER_PCTYPE_FRAG_IPV4) |
2462 (1ULL << I40E_FILTER_PCTYPE_NONF_IPV6_OTHER) |
2463 (1ULL << I40E_FILTER_PCTYPE_NONF_IPV6_TCP) |
2464 (1ULL << I40E_FILTER_PCTYPE_NONF_IPV6_SCTP) |
2465 (1ULL << I40E_FILTER_PCTYPE_NONF_IPV6_UDP) |
2466 (1ULL << I40E_FILTER_PCTYPE_FRAG_IPV6) |
2467 (1ULL << I40E_FILTER_PCTYPE_L2_PAYLOAD);
2468
2469 /*
2470 * Add additional types supported by the X722 controller.
2471 */
2472 if (i40e_is_x722(i40e)) {
2473 hena |= (1ULL << I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP) |
2474 (1ULL << I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP) |
2475 (1ULL << I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK) |
2476 (1ULL << I40E_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) |
2477 (1ULL << I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP) |
2478 (1ULL << I40E_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK);
2479 }
2480
2481 i40e_write_rx_ctl(hw, I40E_PFQF_HENA(0), (uint32_t)hena);
2482 i40e_write_rx_ctl(hw, I40E_PFQF_HENA(1), (uint32_t)(hena >> 32));
2483
2484 /*
2485 * 3. Populate LUT
2486 */
2487 return (i40e_config_rss_hlut(i40e, hw));
2488 }
2489
2490 /*
2491 * Wrapper to kick the chipset on.
2492 */
2493 static boolean_t
2494 i40e_chip_start(i40e_t *i40e)
2495 {
2496 i40e_hw_t *hw = &i40e->i40e_hw_space;
2497 struct i40e_filter_control_settings filter;
2498 int rc;
2499 uint8_t err;
2500
2501 if (((hw->aq.fw_maj_ver == 4) && (hw->aq.fw_min_ver < 33)) ||
2502 (hw->aq.fw_maj_ver < 4)) {
2503 i40e_msec_delay(75);
2504 if (i40e_aq_set_link_restart_an(hw, TRUE, NULL) !=
2505 I40E_SUCCESS) {
2506 i40e_error(i40e, "failed to restart link: admin queue "
2507 "error: %d", hw->aq.asq_last_status);
2508 return (B_FALSE);
2509 }
2510 }
2511
2512 /* Determine hardware state */
2513 i40e_get_hw_state(i40e, hw);
2514
2515 /* For now, we always disable Ethernet Flow Control. */
2516 hw->fc.requested_mode = I40E_FC_NONE;
2517 rc = i40e_set_fc(hw, &err, B_TRUE);
2518 if (rc != I40E_SUCCESS) {
2519 i40e_error(i40e, "Setting flow control failed, returned %d"
2520 " with error: 0x%x", rc, err);
2521 return (B_FALSE);
2522 }
2523
2524 /* Initialize mac addresses. */
2525 i40e_init_macaddrs(i40e, hw);
2526
2527 /*
2528 * Set up the filter control. If the hash lut size is changed from
2529 * I40E_HASH_LUT_SIZE_512 then I40E_HLUT_TABLE_SIZE and
2530 * i40e_config_rss_hlut() will need to be updated.
2531 */
2532 bzero(&filter, sizeof (filter));
2533 filter.enable_ethtype = TRUE;
2534 filter.enable_macvlan = TRUE;
2535 filter.hash_lut_size = I40E_HASH_LUT_SIZE_512;
2536
2537 rc = i40e_set_filter_control(hw, &filter);
2538 if (rc != I40E_SUCCESS) {
2539 i40e_error(i40e, "i40e_set_filter_control() returned %d", rc);
2540 return (B_FALSE);
2541 }
2542
2543 i40e_intr_chip_init(i40e);
2544
2545 rc = i40e_get_mac_seid(i40e);
2546 if (rc == -1) {
2547 i40e_error(i40e, "failed to obtain MAC Uplink SEID");
2548 return (B_FALSE);
2549 }
2550 i40e->i40e_mac_seid = (uint16_t)rc;
2551
2552 /*
2553 * Create a VEB in order to support multiple VSIs. Each VSI
2554 * functions as a MAC group. This call sets the PF's MAC as
2555 * the uplink port and the PF's default VSI as the default
2556 * downlink port.
2557 */
2558 rc = i40e_aq_add_veb(hw, i40e->i40e_mac_seid, I40E_DEF_VSI_SEID(i40e),
2559 0x1, B_TRUE, &i40e->i40e_veb_seid, B_FALSE, NULL);
2560 if (rc != I40E_SUCCESS) {
2561 i40e_error(i40e, "i40e_aq_add_veb() failed %d: %d", rc,
2562 hw->aq.asq_last_status);
2563 return (B_FALSE);
2564 }
2565
2566 if (!i40e_config_def_vsi(i40e, hw))
2567 return (B_FALSE);
2568
2569 for (uint_t i = 1; i < i40e->i40e_num_rx_groups; i++) {
2570 if (!i40e_add_vsi(i40e, hw, i))
2571 return (B_FALSE);
2572 }
2573
2574 if (!i40e_config_rss(i40e, hw))
2575 return (B_FALSE);
2576
2577 i40e_flush(hw);
2578
2579 return (B_TRUE);
2580 }
2581
2582 /*
2583 * Take care of tearing down the rx ring. See 8.3.3.1.2 for more information.
2584 */
2585 static void
2586 i40e_shutdown_rx_ring(i40e_trqpair_t *itrq)
2587 {
2588 i40e_t *i40e = itrq->itrq_i40e;
2589 i40e_hw_t *hw = &i40e->i40e_hw_space;
2590 uint32_t reg;
2591
2592 /*
2593 * Step 1. 8.3.3.1.2 suggests the interrupt is removed from the
2594 * hardware interrupt linked list (see i40e_intr.c) but for
2595 * simplicity we keep this list immutable until the device
2596 * (distinct from an individual ring) is stopped.
2597 */
2598
2599 /*
2600 * Step 2. Request the queue by clearing QENA_REQ. It may not be
2601 * set due to unwinding from failures and a partially enabled
2602 * ring set.
2603 */
2604 reg = I40E_READ_REG(hw, I40E_QRX_ENA(itrq->itrq_index));
2605 if (!(reg & I40E_QRX_ENA_QENA_REQ_MASK))
2606 return;
2607 VERIFY((reg & I40E_QRX_ENA_QENA_REQ_MASK) ==
2608 I40E_QRX_ENA_QENA_REQ_MASK);
2609 reg &= ~I40E_QRX_ENA_QENA_REQ_MASK;
2610 I40E_WRITE_REG(hw, I40E_QRX_ENA(itrq->itrq_index), reg);
2611
2612 /*
2613 * Step 3. Wait for the disable to take, by having QENA_STAT in the FPM
2614 * be cleared. Note that we could still receive data in the queue during
2615 * this time. We don't actually wait for this now and instead defer this
2616 * to i40e_shutdown_ring_wait(), after we've interleaved disabling the
2617 * TX queue as well.
2618 */
2619 }
2620
2621 static void
2622 i40e_shutdown_tx_ring(i40e_trqpair_t *itrq)
2623 {
2624 i40e_t *i40e = itrq->itrq_i40e;
2625 i40e_hw_t *hw = &i40e->i40e_hw_space;
2626 uint32_t reg;
2627
2628 /*
2629 * Step 2. Set the SET_QDIS flag for the queue.
2630 */
2631 i40e_pre_tx_queue_cfg(hw, itrq->itrq_index, B_FALSE);
2632
2633 /*
2634 * Step 3. Wait at least 400 usec.
2635 */
2636 drv_usecwait(500);
2637
2638 /*
2639 * Step 4. Clear the QENA_REQ flag which tells hardware to
2640 * quiesce. If QENA_REQ is not already set then that means that
2641 * we likely already tried to disable this queue.
2642 */
2643 reg = I40E_READ_REG(hw, I40E_QTX_ENA(itrq->itrq_index));
2644 if ((reg & I40E_QTX_ENA_QENA_REQ_MASK) != 0) {
2645 reg &= ~I40E_QTX_ENA_QENA_REQ_MASK;
2646 I40E_WRITE_REG(hw, I40E_QTX_ENA(itrq->itrq_index), reg);
2647 }
2648
2649 /*
2650 * Step 5. Wait for the drain to finish. This will be done by the
2651 * hardware removing the QENA_STAT flag from the queue. Rather than
2652 * waiting here, we interleave it with the receive shutdown in
2653 * i40e_shutdown_ring_wait().
2654 */
2655 }
2656
2657 /*
2658 * Wait for a ring to be shut down. e.g. Steps 2 and 5 from the above
2659 * functions.
2660 */
2661 static boolean_t
2662 i40e_shutdown_ring_wait(i40e_trqpair_t *itrq)
2663 {
2664 i40e_t *i40e = itrq->itrq_i40e;
2665 i40e_hw_t *hw = &i40e->i40e_hw_space;
2666 uint32_t reg;
2667 int try;
2668
2669 for (try = 0; try < I40E_RING_WAIT_NTRIES; try++) {
2670 reg = I40E_READ_REG(hw, I40E_QRX_ENA(itrq->itrq_index));
2671 if ((reg & I40E_QRX_ENA_QENA_STAT_MASK) == 0)
2672 break;
2673 i40e_msec_delay(I40E_RING_WAIT_PAUSE);
2674 }
2675
2676 if ((reg & I40E_QRX_ENA_QENA_STAT_MASK) != 0) {
2677 i40e_error(i40e, "timed out disabling rx queue %d",
2678 itrq->itrq_index);
2679 return (B_FALSE);
2680 }
2681
2682 for (try = 0; try < I40E_RING_WAIT_NTRIES; try++) {
2683 reg = I40E_READ_REG(hw, I40E_QTX_ENA(itrq->itrq_index));
2684 if ((reg & I40E_QTX_ENA_QENA_STAT_MASK) == 0)
2685 break;
2686 i40e_msec_delay(I40E_RING_WAIT_PAUSE);
2687 }
2688
2689 if ((reg & I40E_QTX_ENA_QENA_STAT_MASK) != 0) {
2690 i40e_error(i40e, "timed out disabling tx queue %d",
2691 itrq->itrq_index);
2692 return (B_FALSE);
2693 }
2694
2695 return (B_TRUE);
2696 }
2697
2698
2699 /*
2700 * Shutdown an individual ring and release any memory.
2701 */
2702 boolean_t
2703 i40e_shutdown_ring(i40e_trqpair_t *itrq)
2704 {
2705 boolean_t rv = B_TRUE;
2706
2707 /*
2708 * Tell transmit path to quiesce, and wait until done.
2709 */
2710 if (i40e_ring_tx_quiesce(itrq)) {
2711 /* Already quiesced. */
2712 return (B_TRUE);
2713 }
2714
2715 i40e_shutdown_rx_ring(itrq);
2716 i40e_shutdown_tx_ring(itrq);
2717 if (!i40e_shutdown_ring_wait(itrq))
2718 rv = B_FALSE;
2719
2720 /*
2721 * After the ring has stopped, we need to wait 50ms before
2722 * programming it again. Rather than wait here, we'll record
2723 * the time the ring was stopped. When the ring is started, we'll
2724 * check if enough time has expired and then wait if necessary.
2725 */
2726 itrq->irtq_time_stopped = gethrtime();
2727
2728 /*
2729 * The rings have been stopped in the hardware, now wait for
2730 * a possibly active interrupt thread.
2731 */
2732 i40e_intr_quiesce(itrq);
2733
2734 mutex_enter(&itrq->itrq_tx_lock);
2735 i40e_tx_cleanup_ring(itrq);
2736 mutex_exit(&itrq->itrq_tx_lock);
2737
2738 i40e_free_ring_mem(itrq, B_FALSE);
2739
2740 return (rv);
2741 }
2742
2743 /*
2744 * Shutdown all the rings.
2745 * Called from i40e_stop(), and hopefully the mac layer has already
2746 * called ring stop for each ring, which would make this almost a no-op.
2747 */
2748 static boolean_t
2749 i40e_shutdown_rings(i40e_t *i40e)
2750 {
2751 boolean_t rv = B_TRUE;
2752 int i;
2753
2754 for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
2755 if (!i40e_shutdown_ring(&i40e->i40e_trqpairs[i]))
2756 rv = B_FALSE;
2757 }
2758
2759 return (rv);
2760 }
2761
2762 static void
2763 i40e_setup_rx_descs(i40e_trqpair_t *itrq)
2764 {
2765 int i;
2766 i40e_rx_data_t *rxd = itrq->itrq_rxdata;
2767
2768 for (i = 0; i < rxd->rxd_ring_size; i++) {
2769 i40e_rx_control_block_t *rcb;
2770 i40e_rx_desc_t *rdesc;
2771
2772 rcb = rxd->rxd_work_list[i];
2773 rdesc = &rxd->rxd_desc_ring[i];
2774
2775 rdesc->read.pkt_addr =
2776 CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address);
2777 rdesc->read.hdr_addr = 0;
2778 }
2779 }
2780
2781 static boolean_t
2782 i40e_setup_rx_hmc(i40e_trqpair_t *itrq)
2783 {
2784 i40e_rx_data_t *rxd = itrq->itrq_rxdata;
2785 i40e_t *i40e = itrq->itrq_i40e;
2786 i40e_hw_t *hw = &i40e->i40e_hw_space;
2787
2788 struct i40e_hmc_obj_rxq rctx;
2789 int err;
2790
2791 bzero(&rctx, sizeof (struct i40e_hmc_obj_rxq));
2792 rctx.base = rxd->rxd_desc_area.dmab_dma_address /
2793 I40E_HMC_RX_CTX_UNIT;
2794 rctx.qlen = rxd->rxd_ring_size;
2795 VERIFY(i40e->i40e_rx_buf_size >= I40E_HMC_RX_DBUFF_MIN);
2796 VERIFY(i40e->i40e_rx_buf_size <= I40E_HMC_RX_DBUFF_MAX);
2797 rctx.dbuff = i40e->i40e_rx_buf_size >> I40E_RXQ_CTX_DBUFF_SHIFT;
2798 rctx.hbuff = 0 >> I40E_RXQ_CTX_HBUFF_SHIFT;
2799 rctx.dtype = I40E_HMC_RX_DTYPE_NOSPLIT;
2800 rctx.dsize = I40E_HMC_RX_DSIZE_32BYTE;
2801 rctx.crcstrip = I40E_HMC_RX_CRCSTRIP_ENABLE;
2802 rctx.fc_ena = I40E_HMC_RX_FC_DISABLE;
2803 rctx.l2tsel = I40E_HMC_RX_L2TAGORDER;
2804 rctx.hsplit_0 = I40E_HMC_RX_HDRSPLIT_DISABLE;
2805 rctx.hsplit_1 = I40E_HMC_RX_HDRSPLIT_DISABLE;
2806 rctx.showiv = I40E_HMC_RX_INVLAN_DONTSTRIP;
2807 rctx.rxmax = i40e->i40e_frame_max;
2808 rctx.tphrdesc_ena = I40E_HMC_RX_TPH_DISABLE;
2809 rctx.tphwdesc_ena = I40E_HMC_RX_TPH_DISABLE;
2810 rctx.tphdata_ena = I40E_HMC_RX_TPH_DISABLE;
2811 rctx.tphhead_ena = I40E_HMC_RX_TPH_DISABLE;
2812 rctx.lrxqthresh = I40E_HMC_RX_LOWRXQ_NOINTR;
2813
2814 /*
2815 * This must be set to 0x1, see Table 8-12 in section 8.3.3.2.2.
2816 */
2817 rctx.prefena = I40E_HMC_RX_PREFENA;
2818
2819 err = i40e_clear_lan_rx_queue_context(hw, itrq->itrq_index);
2820 if (err != I40E_SUCCESS) {
2821 i40e_error(i40e, "failed to clear rx queue %d context: %d",
2822 itrq->itrq_index, err);
2823 return (B_FALSE);
2824 }
2825
2826 err = i40e_set_lan_rx_queue_context(hw, itrq->itrq_index, &rctx);
2827 if (err != I40E_SUCCESS) {
2828 i40e_error(i40e, "failed to set rx queue %d context: %d",
2829 itrq->itrq_index, err);
2830 return (B_FALSE);
2831 }
2832
2833 return (B_TRUE);
2834 }
2835
2836 /*
2837 * Take care of setting up the descriptor ring and actually programming the
2838 * device. See 8.3.3.1.1 for the full list of steps we need to do to enable the
2839 * rx rings.
2840 */
2841 static boolean_t
2842 i40e_setup_rx_ring(i40e_trqpair_t *itrq)
2843 {
2844 i40e_t *i40e = itrq->itrq_i40e;
2845 i40e_hw_t *hw = &i40e->i40e_hw_space;
2846 i40e_rx_data_t *rxd = itrq->itrq_rxdata;
2847 uint32_t reg;
2848 int i;
2849
2850 /*
2851 * Step 1. Program all receive ring descriptors.
2852 */
2853 i40e_setup_rx_descs(itrq);
2854
2855 /*
2856 * Step 2. Program the queue's FPM/HMC context.
2857 */
2858 if (!i40e_setup_rx_hmc(itrq))
2859 return (B_FALSE);
2860
2861 /*
2862 * Step 3. Clear the queue's tail pointer and set it to the end
2863 * of the space.
2864 */
2865 I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), 0);
2866 I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index),
2867 rxd->rxd_ring_size - 1);
2868
2869 /*
2870 * Step 4. Enable the queue via the QENA_REQ.
2871 */
2872 reg = I40E_READ_REG(hw, I40E_QRX_ENA(itrq->itrq_index));
2873 VERIFY0(reg & (I40E_QRX_ENA_QENA_REQ_MASK |
2874 I40E_QRX_ENA_QENA_STAT_MASK));
2875 reg |= I40E_QRX_ENA_QENA_REQ_MASK;
2876 I40E_WRITE_REG(hw, I40E_QRX_ENA(itrq->itrq_index), reg);
2877
2878 /*
2879 * Step 5. Verify that QENA_STAT has been set. It's promised
2880 * that this should occur within about 10 us, but like other
2881 * systems, we give the card a bit more time.
2882 */
2883 for (i = 0; i < I40E_RING_WAIT_NTRIES; i++) {
2884 reg = I40E_READ_REG(hw, I40E_QRX_ENA(itrq->itrq_index));
2885
2886 if (reg & I40E_QRX_ENA_QENA_STAT_MASK)
2887 break;
2888 i40e_msec_delay(I40E_RING_WAIT_PAUSE);
2889 }
2890
2891 if ((reg & I40E_QRX_ENA_QENA_STAT_MASK) == 0) {
2892 i40e_error(i40e, "failed to enable rx queue %d, timed "
2893 "out.", itrq->itrq_index);
2894 return (B_FALSE);
2895 }
2896
2897 return (B_TRUE);
2898 }
2899
2900 static boolean_t
2901 i40e_setup_tx_hmc(i40e_trqpair_t *itrq)
2902 {
2903 i40e_t *i40e = itrq->itrq_i40e;
2904 i40e_hw_t *hw = &i40e->i40e_hw_space;
2905
2906 struct i40e_hmc_obj_txq tctx;
2907 struct i40e_vsi_context context;
2908 int err;
2909
2910 bzero(&tctx, sizeof (struct i40e_hmc_obj_txq));
2911 tctx.new_context = I40E_HMC_TX_NEW_CONTEXT;
2912 tctx.base = itrq->itrq_desc_area.dmab_dma_address /
2913 I40E_HMC_TX_CTX_UNIT;
2914 tctx.fc_ena = I40E_HMC_TX_FC_DISABLE;
2915 tctx.timesync_ena = I40E_HMC_TX_TS_DISABLE;
2916 tctx.fd_ena = I40E_HMC_TX_FD_DISABLE;
2917 tctx.alt_vlan_ena = I40E_HMC_TX_ALT_VLAN_DISABLE;
2918 tctx.head_wb_ena = I40E_HMC_TX_WB_ENABLE;
2919 tctx.qlen = itrq->itrq_tx_ring_size;
2920 tctx.tphrdesc_ena = I40E_HMC_TX_TPH_DISABLE;
2921 tctx.tphrpacket_ena = I40E_HMC_TX_TPH_DISABLE;
2922 tctx.tphwdesc_ena = I40E_HMC_TX_TPH_DISABLE;
2923 tctx.head_wb_addr = itrq->itrq_desc_area.dmab_dma_address +
2924 sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
2925
2926 /*
2927 * This field isn't actually documented, like crc, but it suggests that
2928 * it should be zeroed. We leave both of these here because of that for
2929 * now. We should check with Intel on why these are here even.
2930 */
2931 tctx.crc = 0;
2932 tctx.rdylist_act = 0;
2933
2934 /*
2935 * We're supposed to assign the rdylist field with the value of the
2936 * traffic class index for the first device. We query the VSI parameters
2937 * again to get what the handle is. Note that every queue is always
2938 * assigned to traffic class zero, because we don't actually use them.
2939 */
2940 bzero(&context, sizeof (struct i40e_vsi_context));
2941 context.seid = I40E_DEF_VSI_SEID(i40e);
2942 context.pf_num = hw->pf_id;
2943 err = i40e_aq_get_vsi_params(hw, &context, NULL);
2944 if (err != I40E_SUCCESS) {
2945 i40e_error(i40e, "get VSI params failed with %d", err);
2946 return (B_FALSE);
2947 }
2948 tctx.rdylist = LE_16(context.info.qs_handle[0]);
2949
2950 err = i40e_clear_lan_tx_queue_context(hw, itrq->itrq_index);
2951 if (err != I40E_SUCCESS) {
2952 i40e_error(i40e, "failed to clear tx queue %d context: %d",
2953 itrq->itrq_index, err);
2954 return (B_FALSE);
2955 }
2956
2957 err = i40e_set_lan_tx_queue_context(hw, itrq->itrq_index, &tctx);
2958 if (err != I40E_SUCCESS) {
2959 i40e_error(i40e, "failed to set tx queue %d context: %d",
2960 itrq->itrq_index, err);
2961 return (B_FALSE);
2962 }
2963
2964 return (B_TRUE);
2965 }
2966
2967 /*
2968 * Take care of setting up the descriptor ring and actually programming the
2969 * device. See 8.4.3.1.1 for what we need to do here.
2970 */
2971 static boolean_t
2972 i40e_setup_tx_ring(i40e_trqpair_t *itrq)
2973 {
2974 i40e_t *i40e = itrq->itrq_i40e;
2975 i40e_hw_t *hw = &i40e->i40e_hw_space;
2976 uint32_t reg;
2977 int i;
2978
2979 /*
2980 * Step 1. Clear the queue disable flag and verify that the
2981 * index is set correctly.
2982 */
2983 i40e_pre_tx_queue_cfg(hw, itrq->itrq_index, B_TRUE);
2984
2985 /*
2986 * Step 2. Prepare the queue's FPM/HMC context.
2987 */
2988 if (!i40e_setup_tx_hmc(itrq))
2989 return (B_FALSE);
2990
2991 /*
2992 * Step 3. Verify that it's clear that this PF owns this queue.
2993 */
2994 reg = I40E_QTX_CTL_PF_QUEUE;
2995 reg |= (hw->pf_id << I40E_QTX_CTL_PF_INDX_SHIFT) &
2996 I40E_QTX_CTL_PF_INDX_MASK;
2997 I40E_WRITE_REG(hw, I40E_QTX_CTL(itrq->itrq_index), reg);
2998 i40e_flush(hw);
2999
3000 /*
3001 * Step 4. Set the QENA_REQ flag.
3002 */
3003 reg = I40E_READ_REG(hw, I40E_QTX_ENA(itrq->itrq_index));
3004 VERIFY0(reg & (I40E_QTX_ENA_QENA_REQ_MASK |
3005 I40E_QTX_ENA_QENA_STAT_MASK));
3006 reg |= I40E_QTX_ENA_QENA_REQ_MASK;
3007 I40E_WRITE_REG(hw, I40E_QTX_ENA(itrq->itrq_index), reg);
3008
3009 /*
3010 * Step 5. Verify that QENA_STAT has been set. It's promised
3011 * that this should occur within about 10 us, but like BSD,
3012 * we'll try for up to 100 ms for this queue.
3013 */
3014 for (i = 0; i < I40E_RING_WAIT_NTRIES; i++) {
3015 reg = I40E_READ_REG(hw, I40E_QTX_ENA(itrq->itrq_index));
3016
3017 if (reg & I40E_QTX_ENA_QENA_STAT_MASK)
3018 break;
3019 i40e_msec_delay(I40E_RING_WAIT_PAUSE);
3020 }
3021
3022 if ((reg & I40E_QTX_ENA_QENA_STAT_MASK) == 0) {
3023 i40e_error(i40e, "failed to enable tx queue %d, timed "
3024 "out", itrq->itrq_index);
3025 return (B_FALSE);
3026 }
3027
3028 return (B_TRUE);
3029 }
3030
3031 int
3032 i40e_setup_ring(i40e_trqpair_t *itrq)
3033 {
3034 i40e_t *i40e = itrq->itrq_i40e;
3035 hrtime_t now, gap;
3036
3037 if (!i40e_alloc_ring_mem(itrq)) {
3038 i40e_error(i40e, "Failed to allocate ring memory");
3039 return (ENOMEM);
3040 }
3041
3042 /*
3043 * 8.3.3.1.1 Receive Queue Enable Flow states software should
3044 * wait at least 50ms between ring disable and enable. See how
3045 * long we need to wait, and wait only if required.
3046 */
3047 now = gethrtime();
3048 gap = NSEC2MSEC(now - itrq->irtq_time_stopped);
3049 if (gap < I40E_RING_ENABLE_GAP && gap != 0)
3050 delay(drv_usectohz(gap * 1000));
3051
3052 mutex_enter(&itrq->itrq_intr_lock);
3053 if (!i40e_setup_rx_ring(itrq))
3054 goto failed;
3055
3056 if (!i40e_setup_tx_ring(itrq))
3057 goto failed;
3058
3059 if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
3060 DDI_FM_OK)
3061 goto failed;
3062
3063 itrq->itrq_intr_quiesce = B_FALSE;
3064 mutex_exit(&itrq->itrq_intr_lock);
3065
3066 mutex_enter(&itrq->itrq_tx_lock);
3067 itrq->itrq_tx_quiesce = B_FALSE;
3068 mutex_exit(&itrq->itrq_tx_lock);
3069
3070 return (0);
3071
3072 failed:
3073 mutex_exit(&itrq->itrq_intr_lock);
3074 i40e_free_ring_mem(itrq, B_TRUE);
3075 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
3076
3077 return (EIO);
3078 }
3079
3080 void
3081 i40e_stop(i40e_t *i40e)
3082 {
3083 uint_t i;
3084 i40e_hw_t *hw = &i40e->i40e_hw_space;
3085
3086 ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
3087
3088 /*
3089 * Shutdown and drain the tx and rx pipeline. We do this using the
3090 * following steps.
3091 *
3092 * 1) Shutdown interrupts to all the queues (trying to keep the admin
3093 * queue alive).
3094 *
3095 * 2) Remove all of the interrupt tx and rx causes by setting the
3096 * interrupt linked lists to zero.
3097 *
3098 * 2) Shutdown the tx and rx rings. Because i40e_shutdown_rings() should
3099 * wait for all the queues to be disabled, once we reach that point
3100 * it should be safe to free associated data.
3101 *
3102 * 4) Wait 50ms after all that is done. This ensures that the rings are
3103 * ready for programming again and we don't have to think about this
3104 * in other parts of the driver.
3105 *
3106 * 5) Disable remaining chip interrupts, (admin queue, etc.)
3107 *
3108 * 6) Verify that FM is happy with all the register accesses we
3109 * performed.
3110 */
3111 i40e_intr_io_disable_all(i40e);
3112 i40e_intr_io_clear_cause(i40e);
3113
3114 if (!i40e_shutdown_rings(i40e))
3115 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
3116
3117 /*
3118 * We don't delete the default VSI because it replaces the VEB
3119 * after VEB deletion (see the "Delete Element" section).
3120 * Furthermore, since the default VSI is provided by the
3121 * firmware, we never attempt to delete it.
3122 */
3123 for (i = 1; i < i40e->i40e_num_rx_groups; i++) {
3124 i40e_delete_vsi(i40e, i);
3125 }
3126
3127 if (i40e->i40e_veb_seid != 0) {
3128 int rc = i40e_aq_delete_element(hw, i40e->i40e_veb_seid, NULL);
3129
3130 if (rc != I40E_SUCCESS) {
3131 i40e_error(i40e, "Failed to delete VEB %d: %d", rc,
3132 hw->aq.asq_last_status);
3133 }
3134
3135 i40e->i40e_veb_seid = 0;
3136 }
3137
3138 i40e_intr_chip_fini(i40e);
3139
3140 if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_cfg_handle) !=
3141 DDI_FM_OK) {
3142 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
3143 }
3144
3145 for (i = 0; i < i40e->i40e_num_rx_groups; i++) {
3146 i40e_stat_vsi_fini(i40e, i);
3147 }
3148
3149 i40e->i40e_link_speed = 0;
3150 i40e->i40e_link_duplex = 0;
3151 i40e_link_state_set(i40e, LINK_STATE_UNKNOWN);
3152 }
3153
3154 boolean_t
3155 i40e_start(i40e_t *i40e)
3156 {
3157 i40e_hw_t *hw = &i40e->i40e_hw_space;
3158 boolean_t rc = B_TRUE;
3159 int err;
3160
3161 ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
3162
3163 if (!i40e_chip_start(i40e)) {
3164 i40e_fm_ereport(i40e, DDI_FM_DEVICE_INVAL_STATE);
3165 rc = B_FALSE;
3166 goto done;
3167 }
3168
3169 /*
3170 * Enable broadcast traffic; however, do not enable multicast traffic.
3171 * That's handle exclusively through MAC's mc_multicst routines.
3172 */
3173 err = i40e_aq_set_vsi_broadcast(hw, I40E_DEF_VSI_SEID(i40e), B_TRUE,
3174 NULL);
3175 if (err != I40E_SUCCESS) {
3176 i40e_error(i40e, "failed to set default VSI: %d", err);
3177 rc = B_FALSE;
3178 goto done;
3179 }
3180
3181 err = i40e_aq_set_mac_config(hw, i40e->i40e_frame_max, B_TRUE, 0, NULL);
3182 if (err != I40E_SUCCESS) {
3183 i40e_error(i40e, "failed to set MAC config: %d", err);
3184 rc = B_FALSE;
3185 goto done;
3186 }
3187
3188 /*
3189 * Finally, make sure that we're happy from an FM perspective.
3190 */
3191 if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
3192 DDI_FM_OK) {
3193 rc = B_FALSE;
3194 goto done;
3195 }
3196
3197 /* Clear state bits prior to final interrupt enabling. */
3198 atomic_and_32(&i40e->i40e_state,
3199 ~(I40E_ERROR | I40E_STALL | I40E_OVERTEMP));
3200
3201 i40e_intr_io_enable_all(i40e);
3202
3203 done:
3204 if (rc == B_FALSE) {
3205 i40e_stop(i40e);
3206 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
3207 }
3208
3209 return (rc);
3210 }
3211
3212 /*
3213 * We may have loaned up descriptors to the stack. As such, if we still have
3214 * them outstanding, then we will not continue with detach.
3215 */
3216 static boolean_t
3217 i40e_drain_rx(i40e_t *i40e)
3218 {
3219 mutex_enter(&i40e->i40e_rx_pending_lock);
3220 while (i40e->i40e_rx_pending > 0) {
3221 if (cv_reltimedwait(&i40e->i40e_rx_pending_cv,
3222 &i40e->i40e_rx_pending_lock,
3223 drv_usectohz(I40E_DRAIN_RX_WAIT), TR_CLOCK_TICK) == -1) {
3224 mutex_exit(&i40e->i40e_rx_pending_lock);
3225 return (B_FALSE);
3226 }
3227 }
3228 mutex_exit(&i40e->i40e_rx_pending_lock);
3229
3230 return (B_TRUE);
3231 }
3232
3233 /*
3234 * DDI UFM Callbacks
3235 */
3236 static int
3237 i40e_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
3238 ddi_ufm_image_t *img)
3239 {
3240 if (imgno != 0)
3241 return (EINVAL);
3242
3243 ddi_ufm_image_set_desc(img, "Firmware");
3244 ddi_ufm_image_set_nslots(img, 1);
3245
3246 return (0);
3247 }
3248
3249 static int
3250 i40e_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
3251 uint_t slotno, ddi_ufm_slot_t *slot)
3252 {
3253 i40e_t *i40e = (i40e_t *)arg;
3254 char *fw_ver = NULL, *fw_bld = NULL, *api_ver = NULL;
3255 nvlist_t *misc = NULL;
3256 uint_t flags = DDI_PROP_DONTPASS;
3257 int err;
3258
3259 if (imgno != 0 || slotno != 0 ||
3260 ddi_prop_lookup_string(DDI_DEV_T_ANY, i40e->i40e_dip, flags,
3261 "firmware-version", &fw_ver) != DDI_PROP_SUCCESS ||
3262 ddi_prop_lookup_string(DDI_DEV_T_ANY, i40e->i40e_dip, flags,
3263 "firmware-build", &fw_bld) != DDI_PROP_SUCCESS ||
3264 ddi_prop_lookup_string(DDI_DEV_T_ANY, i40e->i40e_dip, flags,
3265 "api-version", &api_ver) != DDI_PROP_SUCCESS) {
3266 err = EINVAL;
3267 goto err;
3268 }
3269
3270 ddi_ufm_slot_set_attrs(slot, DDI_UFM_ATTR_ACTIVE);
3271 ddi_ufm_slot_set_version(slot, fw_ver);
3272
3273 (void) nvlist_alloc(&misc, NV_UNIQUE_NAME, KM_SLEEP);
3274 if ((err = nvlist_add_string(misc, "firmware-build", fw_bld)) != 0 ||
3275 (err = nvlist_add_string(misc, "api-version", api_ver)) != 0) {
3276 goto err;
3277 }
3278 ddi_ufm_slot_set_misc(slot, misc);
3279
3280 ddi_prop_free(fw_ver);
3281 ddi_prop_free(fw_bld);
3282 ddi_prop_free(api_ver);
3283
3284 return (0);
3285 err:
3286 nvlist_free(misc);
3287 if (fw_ver != NULL)
3288 ddi_prop_free(fw_ver);
3289 if (fw_bld != NULL)
3290 ddi_prop_free(fw_bld);
3291 if (api_ver != NULL)
3292 ddi_prop_free(api_ver);
3293
3294 return (err);
3295 }
3296
/*
 * Report our UFM capabilities. The only capability set is
 * DDI_UFM_CAP_REPORT. This always returns 0; the ufmh and arg arguments are
 * not used.
 */
static int
i40e_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps)
{
	*caps = DDI_UFM_CAP_REPORT;

	return (0);
}
3304
/*
 * The UFM operations vector that i40e_attach() passes to ddi_ufm_init(). The
 * first entry is left NULL; the remaining entries are the three callbacks
 * defined above.
 */
static ddi_ufm_ops_t i40e_ufm_ops = {
	NULL,
	i40e_ufm_fill_image,
	i40e_ufm_fill_slot,
	i40e_ufm_getcaps
};
3311
3312 static int
3313 i40e_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
3314 {
3315 i40e_t *i40e;
3316 struct i40e_osdep *osdep;
3317 i40e_hw_t *hw;
3318 int instance;
3319
3320 if (cmd != DDI_ATTACH)
3321 return (DDI_FAILURE);
3322
3323 instance = ddi_get_instance(devinfo);
3324 i40e = kmem_zalloc(sizeof (i40e_t), KM_SLEEP);
3325
3326 i40e->i40e_aqbuf = kmem_zalloc(I40E_ADMINQ_BUFSZ, KM_SLEEP);
3327 i40e->i40e_instance = instance;
3328 i40e->i40e_dip = devinfo;
3329
3330 hw = &i40e->i40e_hw_space;
3331 osdep = &i40e->i40e_osdep_space;
3332 hw->back = osdep;
3333 osdep->ios_i40e = i40e;
3334
3335 ddi_set_driver_private(devinfo, i40e);
3336
3337 i40e_fm_init(i40e);
3338 i40e->i40e_attach_progress |= I40E_ATTACH_FM_INIT;
3339
3340 if (pci_config_setup(devinfo, &osdep->ios_cfg_handle) != DDI_SUCCESS) {
3341 i40e_error(i40e, "Failed to map PCI configurations.");
3342 goto attach_fail;
3343 }
3344 i40e->i40e_attach_progress |= I40E_ATTACH_PCI_CONFIG;
3345
3346 i40e_identify_hardware(i40e);
3347
3348 if (!i40e_regs_map(i40e)) {
3349 i40e_error(i40e, "Failed to map device registers.");
3350 goto attach_fail;
3351 }
3352 i40e->i40e_attach_progress |= I40E_ATTACH_REGS_MAP;
3353
3354 i40e_init_properties(i40e);
3355 i40e->i40e_attach_progress |= I40E_ATTACH_PROPS;
3356
3357 if (!i40e_common_code_init(i40e, hw))
3358 goto attach_fail;
3359 i40e->i40e_attach_progress |= I40E_ATTACH_COMMON_CODE;
3360
3361 /*
3362 * When we participate in IRM, we should make sure that we register
3363 * ourselves with it before callbacks.
3364 */
3365 if (!i40e_alloc_intrs(i40e, devinfo)) {
3366 i40e_error(i40e, "Failed to allocate interrupts.");
3367 goto attach_fail;
3368 }
3369 i40e->i40e_attach_progress |= I40E_ATTACH_ALLOC_INTR;
3370
3371 if (!i40e_alloc_trqpairs(i40e)) {
3372 i40e_error(i40e,
3373 "Failed to allocate receive & transmit rings.");
3374 goto attach_fail;
3375 }
3376 i40e->i40e_attach_progress |= I40E_ATTACH_ALLOC_RINGSLOCKS;
3377
3378 if (!i40e_map_intrs_to_vectors(i40e)) {
3379 i40e_error(i40e, "Failed to map interrupts to vectors.");
3380 goto attach_fail;
3381 }
3382
3383 if (!i40e_add_intr_handlers(i40e)) {
3384 i40e_error(i40e, "Failed to add the interrupt handlers.");
3385 goto attach_fail;
3386 }
3387 i40e->i40e_attach_progress |= I40E_ATTACH_ADD_INTR;
3388
3389 if (!i40e_final_init(i40e)) {
3390 i40e_error(i40e, "Final initialization failed.");
3391 goto attach_fail;
3392 }
3393 i40e->i40e_attach_progress |= I40E_ATTACH_INIT;
3394
3395 if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_cfg_handle) !=
3396 DDI_FM_OK) {
3397 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
3398 goto attach_fail;
3399 }
3400
3401 if (!i40e_stats_init(i40e)) {
3402 i40e_error(i40e, "Stats initialization failed.");
3403 goto attach_fail;
3404 }
3405 i40e->i40e_attach_progress |= I40E_ATTACH_STATS;
3406
3407 if (!i40e_register_mac(i40e)) {
3408 i40e_error(i40e, "Failed to register to MAC/GLDv3");
3409 goto attach_fail;
3410 }
3411 i40e->i40e_attach_progress |= I40E_ATTACH_MAC;
3412
3413 i40e->i40e_periodic_id = ddi_periodic_add(i40e_timer, i40e,
3414 I40E_CYCLIC_PERIOD, DDI_IPL_0);
3415 if (i40e->i40e_periodic_id == 0) {
3416 i40e_error(i40e, "Failed to add the link-check timer");
3417 goto attach_fail;
3418 }
3419 i40e->i40e_attach_progress |= I40E_ATTACH_LINK_TIMER;
3420
3421 if (!i40e_enable_interrupts(i40e)) {
3422 i40e_error(i40e, "Failed to enable DDI interrupts");
3423 goto attach_fail;
3424 }
3425 i40e->i40e_attach_progress |= I40E_ATTACH_ENABLE_INTR;
3426
3427 if (i40e->i40e_hw_space.bus.func == 0) {
3428 if (ddi_ufm_init(i40e->i40e_dip, DDI_UFM_CURRENT_VERSION,
3429 &i40e_ufm_ops, &i40e->i40e_ufmh, i40e) != 0) {
3430 i40e_error(i40e, "failed to initialize UFM subsystem");
3431 goto attach_fail;
3432 }
3433 ddi_ufm_update(i40e->i40e_ufmh);
3434 i40e->i40e_attach_progress |= I40E_ATTACH_UFM_INIT;
3435 }
3436
3437 atomic_or_32(&i40e->i40e_state, I40E_INITIALIZED);
3438
3439 mutex_enter(&i40e_glock);
3440 list_insert_tail(&i40e_glist, i40e);
3441 mutex_exit(&i40e_glock);
3442
3443 return (DDI_SUCCESS);
3444
3445 attach_fail:
3446 i40e_unconfigure(devinfo, i40e);
3447 return (DDI_FAILURE);
3448 }
3449
3450 static int
3451 i40e_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
3452 {
3453 i40e_t *i40e;
3454
3455 if (cmd != DDI_DETACH)
3456 return (DDI_FAILURE);
3457
3458 i40e = (i40e_t *)ddi_get_driver_private(devinfo);
3459 if (i40e == NULL) {
3460 i40e_log(NULL, "i40e_detach() called with no i40e pointer!");
3461 return (DDI_FAILURE);
3462 }
3463
3464 if (i40e_drain_rx(i40e) == B_FALSE) {
3465 i40e_log(i40e, "timed out draining DMA resources, %d buffers "
3466 "remain", i40e->i40e_rx_pending);
3467 return (DDI_FAILURE);
3468 }
3469
3470 mutex_enter(&i40e_glock);
3471 list_remove(&i40e_glist, i40e);
3472 mutex_exit(&i40e_glock);
3473
3474 i40e_unconfigure(devinfo, i40e);
3475
3476 return (DDI_SUCCESS);
3477 }
3478
/*
 * Character/block entry points, referenced from i40e_dev_ops below. Open and
 * close are nulldev and every other entry point is a nodev-style stub; the
 * only real routine is the generic ddi_prop_op for property operations.
 */
static struct cb_ops i40e_cb_ops = {
	nulldev,		/* cb_open */
	nulldev,		/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	nodev,			/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	NULL,			/* cb_stream */
	D_MP | D_HOTPLUG,	/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* cb_aread */
	nodev			/* cb_awrite */
};
3499
/*
 * Device operations. Attach and detach are handled by i40e_attach() and
 * i40e_detach(); quiesce is explicitly not supported. This structure is also
 * what _init() and _fini() hand to mac_init_ops() and mac_fini_ops().
 */
static struct dev_ops i40e_dev_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	NULL,			/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	i40e_attach,		/* devo_attach */
	i40e_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&i40e_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	nulldev,		/* devo_power */
	ddi_quiesce_not_supported /* devo_quiesce */
};
3514
/*
 * Driver module linkage: ties the identification string i40e_ident and our
 * device operations to mod_driverops.
 */
static struct modldrv i40e_modldrv = {
	&mod_driverops,
	i40e_ident,
	&i40e_dev_ops
};
3520
/*
 * Module linkage, containing only the driver linkage above. This is what
 * _init(), _info() and _fini() pass to mod_install(), mod_info() and
 * mod_remove() respectively.
 */
static struct modlinkage i40e_modlinkage = {
	MODREV_1,
	&i40e_modldrv,
	NULL
};
3526
3527 /*
3528 * Module Initialization Functions.
3529 */
3530 int
3531 _init(void)
3532 {
3533 int status;
3534
3535 list_create(&i40e_glist, sizeof (i40e_t), offsetof(i40e_t, i40e_glink));
3536 list_create(&i40e_dlist, sizeof (i40e_device_t),
3537 offsetof(i40e_device_t, id_link));
3538 mutex_init(&i40e_glock, NULL, MUTEX_DRIVER, NULL);
3539 mac_init_ops(&i40e_dev_ops, I40E_MODULE_NAME);
3540
3541 status = mod_install(&i40e_modlinkage);
3542 if (status != DDI_SUCCESS) {
3543 mac_fini_ops(&i40e_dev_ops);
3544 mutex_destroy(&i40e_glock);
3545 list_destroy(&i40e_dlist);
3546 list_destroy(&i40e_glist);
3547 }
3548
3549 return (status);
3550 }
3551
3552 int
3553 _info(struct modinfo *modinfop)
3554 {
3555 return (mod_info(&i40e_modlinkage, modinfop));
3556 }
3557
3558 int
3559 _fini(void)
3560 {
3561 int status;
3562
3563 status = mod_remove(&i40e_modlinkage);
3564 if (status == DDI_SUCCESS) {
3565 mac_fini_ops(&i40e_dev_ops);
3566 mutex_destroy(&i40e_glock);
3567 list_destroy(&i40e_dlist);
3568 list_destroy(&i40e_glist);
3569 }
3570
3571 return (status);
3572 }