1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
14 * Copyright 2016 Joyent, Inc.
15 */
16
17 #include "i40e_sw.h"
18
19 /*
20 * ---------------------------------------------------------
21 * Buffer and Memory Management, Receiving, and Transmitting
22 * ---------------------------------------------------------
23 *
24 * Each physical function (PF), which is what we think of as an instance of the
25 * device driver, has a series of associated transmit and receive queue pairs.
26 * Effectively, what we think of in MAC as rings. Each of these has their own
27 * ring of descriptors which is used as part of doing DMA activity.
28 *
29 * The transmit ring of descriptors is made up of 16-byte entries which are used
30 * to send packets, program filters, etc. The receive ring descriptors are either
31 * 16 or 32 bytes each. At the moment, we opt to use the larger descriptor
32 * format so that we're in a better position if we ever want to leverage that
33 * information later on.
34 *
43 * we end up allocating. Those are:
44 *
45 * o The size of the ring (controlled through the driver.conf file)
46 *
47 * o The maximum size frame we can receive.
48 *
49 * The size of the rings currently defaults to 1024 descriptors and is stored in
50 * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size.
51 *
52 * While the size of the rings is controlled by the driver.conf, the maximum
53 * size frame is informed primarily through the use of dladm and the setting of
54 * the MTU property on the device. From the MTU, we then go and do some
55 * machinations. The first thing we do is add in space for the
56 * Ethernet header, potentially a VLAN header, and the FCS check. This value is
57 * what's stored as i40e_t`i40e_frame_max and is derived any time
58 * i40e_t`i40e_sdu changes.
59 *
60 * This size is then rounded up to the nearest 1k chunk, which represents the
61 * actual amount of memory that we'll allocate for a single frame.
62 *
63 * Note, that for rx, we do something that might be unexpected. We always add
64 * an extra two bytes to the frame size that we allocate. We then offset the DMA
65 * address that we receive a packet into by two bytes. This ensures that the IP
66 * header will always be 4 byte aligned because the MAC header is either 14 or
67 * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
68 * and MAC's lives easier.
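 *
 * As a rough sketch of the sizing arithmetic described above (the variable
 * names here are illustrative rather than the literal driver code), this
 * works out to something like:
 *
 *	frame_max = sdu + sizeof (struct ether_vlan_header) + ETHERFCSL;
 *	rx_buf_size = P2ROUNDUP(frame_max + I40E_BUF_IPHDR_ALIGNMENT, 1024);
 *
 * where P2ROUNDUP() and ETHERFCSL are the usual illumos helpers and
 * I40E_BUF_IPHDR_ALIGNMENT is the two byte pad applied below in
 * i40e_alloc_rx_dma().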
69 *
70 * Both the rx and tx descriptor rings (which are what we use to communicate
71 * with hardware) are allocated as a single region of DMA memory which is the
72 * size of the descriptor (32 bytes and 16 bytes respectively) times the total
73 * number of descriptors for an rx and tx ring.
74 *
75 * While the rx and tx descriptors are allocated using DMA-based memory, the
76 * control blocks for each of them are allocated using normal kernel memory.
77 * They aren't special from a DMA perspective. We'll go over the design of both
78 * receiving and transmitting separately, as they have slightly different
79 * control blocks and different ways that we manage the relationship between
80 * control blocks and descriptors.
81 *
82 * ---------------------------------
83 * RX Descriptors and Control Blocks
84 * ---------------------------------
85 *
86 * For every descriptor in the ring that the driver has, we need some associated
87 * memory, which means that we need to have the receive specific control block.
88 * We have a couple different, but related goals:
89 *
90 * o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do
91 * not want to do any additional memory allocations or DMA allocations if
92 * we don't have to.
93 *
94 * o We'd like to try and do as much zero-copy as possible, while taking into
95 * account the cost of mapping in DMA resources.
96 *
97 * o We'd like to have every receive descriptor available.
98 *
99 * Now, these rules are a bit in tension with one another. The act of mapping in
100 * is an exercise of trying to find the break-even point between page table
101 * updates and bcopy. We currently start by using the same metrics that ixgbe
102 * used; however, it should be known that this value has effectively been
103 * cargo-culted across to yet another driver, sorry.
104 *
105 * If we receive a packet which is larger than our copy threshold, we'll create
106 * a message block out of the DMA memory via desballoc(9F) and send that up to
107 * MAC that way. This will cause us to be notified when the message block is
108 * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
109 * it's less than the threshold, we'll try to use allocb and bcopy it into the
110 * block, thus allowing us to immediately reuse the DMA resource. Note, on debug
111 * builds, we allow someone to whack the variable i40e_debug_rx_mode to override
112 * the behavior and always do a bcopy or a DMA bind.
113 *
114 * To try and ensure that the device always has blocks that it can receive data
115 * into, we maintain two lists of control blocks, a working list and a free
116 * list. Each list is sized equal to the number of descriptors in the rx ring.
117 * During the GLDv3 mc_start routine, we allocate a number of rx control blocks
118 * equal to twice the number of descriptors in the ring and we assign them
119 * equally to the free list and to the working list. Each control block also has
120 * DMA memory allocated and associated with it, which will be used to receive the
121 * actual packet data. All of a received frame's data will end up in a single
122 * DMA buffer.
123 *
124 * During operation, we always maintain the invariant that each rx descriptor
125 * has an associated rx control block which lives in the working list. If we
126 * feel that we should loan up DMA memory to MAC in the form of a message block,
127 * we can only do so if we can maintain this invariant. To do that, we swap in
128 * one of the buffers from the free list. If none are available, then we resort
129 * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
130 * size.
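 *
 * Pulled together, the per-frame decision sketched above looks roughly
 * like the following (the copy threshold name is illustrative; the real
 * check and its error handling live in the rx processing path):
 *
 *	if (frame_len > copy_threshold &&
 *	    (new_rcb = i40e_rcb_alloc(rxd)) != NULL) {
 *		swap new_rcb into the work list slot to keep the invariant,
 *		then desballoc(9F) the old rcb's buffer and loan it to MAC
 *	} else {
 *		allocb(9F) a fresh mblk_t, bcopy(9F) the frame into it, and
 *		leave the rcb on the work list so it can be reused at once
 *	}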
131 *
132 * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is
133 * called on the block, at which point we restore the rx control block to the
134 * free list and are able to reuse the DMA memory again. While the scheme may
135 * seem odd, it importantly keeps us out of trying to do any DMA allocations in
136 * the normal path of operation, even though we may still have to allocate
137 * message blocks and copy.
138 *
139 * The following state machine describes the lifetime of an rx control block. In
140 * the diagram we abbreviate the rx ring descriptor entry as rxd and the rx
141 * control block entry as rcb.
142 *
143 * | |
144 * * ... 1/2 of all initial rcb's ... *
145 * | |
146 * v v
147 * +------------------+ +------------------+
148 * | rcb on free list |---*---------->| rcb on work list |
149 * +------------------+ . +------------------+
150 * ^ . moved to |
151 * | replace rcb * . . Frame received,
152 * | loaned to | entry on free list
153 * | MAC + co. | available. rcb's
154 * | | memory made into mblk_t
155 * * . freemsg(9F) | and sent up to MAC.
156 * | called on |
157 * | loaned rcb |
158 * | and it is v
159 * | recycled. +-------------------+
160 * +--------------------<-----| rcb loaned to MAC |
161 * +-------------------+
162 *
163 * Finally, note that every rx control block has a reference count on it. One
164 * reference is added as long as the driver has had the GLDv3 mc_start endpoint
165 * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and
166 * no other DLPI consumers remain, then we'll decrement the reference count by
167 * one. Whenever we loan up the rx control block and associated buffer to MAC,
168 * then we bump the reference count again. Even though the device is stopped,
169 * there may still be loaned frames in upper levels that we'll want to account
170 * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure
171 * that it is cleaned up.
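 *
 * The release side of that reference count is roughly the following
 * sketch; the real logic lives in i40e_rx_recycle() and the stop path:
 *
 *	if (atomic_dec_32_nv(&rcb->rcb_ref) == 0) {
 *		last reference dropped: the rcb's DMA buffer can be freed,
 *		and if this was the final outstanding rcb after mc_stop,
 *		the containing i40e_rx_data_t can be torn down as well
 *	}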
172 *
173 * --------------------
174 * Managing the RX Ring
175 * --------------------
176 *
177 * The receive ring descriptors are arranged in a circular buffer with a head
178 * and tail pointer. There are both the conventional head and tail pointers
179 * which are used to partition the ring into two portions, a portion that we,
180 * the operating system, manage and a portion that is managed by hardware. When
181 * hardware owns a descriptor in the ring, it means that it is waiting for data
182 * to be filled in. However, when a portion of the ring is owned by the driver,
183 * then that means that the descriptor has been consumed and we need to go take
184 * a look at it.
185 *
186 * The initial head is configured to be zero by writing it as such in the
187 * receive queue context in the FPM (function private memory from the host). The
188 * initial tail is written to be the last descriptor. This is written to via the
189 * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
190 * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
191 * the only values we ever consult ourselves are the TAIL register and our own
192 * state tracking. Effectively, we cache the HEAD register and then update it
193 * ourselves based on our work.
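 *
 * Concretely, at ring setup time our cached HEAD starts at zero and the
 * tail is primed with a register write along the lines of (a sketch, not
 * the literal setup code):
 *
 *	I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index),
 *	    rxd->rxd_ring_size - 1);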
194 *
195 * When we iterate over the rx descriptors and thus the received frames, we are
196 * either in an interrupt context or we've been asked by MAC to poll on the
197 * ring. If we've been asked to poll on the ring, we have a maximum number of
198 * bytes of mblk_t's to return. If processing an rx descriptor would cause us to
199 * exceed that count, then we do not process it. When in interrupt context, we
200 * don't have a strict byte count. However, to ensure liveness, we limit the
201 * amount of data based on a configuration value
202 * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
203 * is based on similar numbers that are used for ixgbe. After some additional
204 * time in the field, we'll have a sense as to whether or not it should be
205 * changed.
206 *
207 * When processing, we start at our own HEAD pointer
208 * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
209 * processing. Every RX descriptor has what's described as the DD bit. This bit
210 * (the LSB of the second 8-byte word), indicates whether or not the descriptor
211 * is done. When we give descriptors to the hardware, this value is always
212 * zero. When the hardware has finished a descriptor, it will always be one.
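 *
 * Checking that bit amounts to something like the following sketch, using
 * the status/error qword of the write-back descriptor (the constant comes
 * from the Intel common code):
 *
 *	stword = LE_64(rxd->rxd_desc_ring[cur_head].wb.qword1.status_error_len);
 *	if ((stword & (1UL << I40E_RX_DESC_STATUS_DD_SHIFT)) == 0)
 *		stop: hardware still owns this descriptor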
213 *
214 * The first thing that we check is whether the DD bit indicates that the
215 * current HEAD is ready. If it isn't, then we're done. That's the primary
216 * invariant of processing a frame. If it's done, then there are a few other
217 * things that we want to look at. In the same status word as the DD bit, there
218 * are two other important bits:
232 * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information
233 * on how that selection is made.
234 *
235 * Regardless of whether we construct an mblk_t or encounter an error, we end up
236 * resetting the descriptor. This re-arms the descriptor for hardware and in the
237 * process, we may end up assigning it a new receive control block. After we do
238 * this, we always update our HEAD pointer, no matter what.
239 *
240 * Finally, once we've consumed as much as we will in a given window, we go and
241 * update the TAIL register to indicate all the frames we've consumed. We only
242 * do a single bulk write for the ring.
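 *
 * That final update is a single register write, roughly of the form (a
 * sketch; the real write happens at the end of the rx processing loop
 * with the index of the last descriptor we re-armed):
 *
 *	I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), last_rearmed);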
243 *
244 * ---------------------------------
245 * TX Descriptors and Control Blocks
246 * ---------------------------------
247 *
248 * While the transmit path is similar in spirit to the receive path, it works
249 * differently due to the fact that all data is originated by the operating
250 * system and not by the device.
251 *
252 * Like rx, there is both a descriptor ring that we use to communicate with the
253 * hardware and which points to the memory used to transmit a frame. Similarly,
254 * there is a corresponding transmit control block. Each transmit control block
255 * has a region of DMA memory allocated to it; however, the way we use it
256 * varies.
257 *
258 * The driver is asked to process a single frame at a time. That message block
259 * may be made up of multiple fragments linked together by the mblk_t`b_cont
260 * member. The device has a hard limit of up to 8 buffers being allowed for use
261 * for a single logical frame. For each fragment, we'll try and use an entry
262 * from the tx descriptor ring and then we'll allocate a corresponding tx
263 * control block. Depending on the size of the fragment, we may copy it around
264 * or we might instead try to do DMA binding of the fragment.
265 *
266 * If we exceed the number of blocks that fit, we'll try to pull up the block
267 * and then we'll do a DMA bind and send it out.
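 *
 * The shape of that fallback is approximately the following sketch (the
 * copy-only transmit path later in this file does not need it yet):
 *
 *	if (fragment count of mp > I40E_TX_MAX_COOKIE) {
 *		nmp = msgpullup(mp, -1);
 *		freemsg(mp);
 *		continue with nmp, now a single contiguous fragment
 *	}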
268 *
269 * If we don't have enough space in the ring or tx control blocks available,
270 * then we'll return the unprocessed message block to MAC. This will induce flow
271 * control and once we recycle enough entries, we'll once again enable sending
272 * on the ring.
273 *
274 * We size the working list as equal to the number of descriptors in the ring.
275 * We size the free list as equal to 1.5 times the number of descriptors in the
276 * ring. We'll allocate a number of tx control block entries equal to the number
277 * of entries in the free list. By default, all entries are placed in the free
278 * list. As we come along and try to send something, we'll allocate entries from
279 * the free list and add them to the working list, where they'll stay until the
280 * hardware indicates that all of the data has been written back to us. The
281 * reason that we start with 1.5x is to help facilitate having more than one TX
282 * buffer associated with the DMA activity.
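 *
 * For example, with the default ring size of 1024 descriptors this works
 * out to a 1024 entry work list and a 1536 entry free list (and therefore
 * 1536 tx control blocks), mirroring the arithmetic in i40e_alloc_tx_dma()
 * below.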
283 *
284 * --------------------
285 * Managing the TX Ring
286 * --------------------
287 *
288 * The transmit descriptor ring is driven by us. We maintain our own notion of a
289 * HEAD and TAIL register and we update the hardware with updates to the TAIL
290 * register. When the hardware is done writing out data, it updates us by
291 * writing back to a specific address, not by updating the individual
292 * descriptors. That address is a 4-byte region after the main transmit
293 * descriptor ring. This is why the descriptor ring has an extra descriptor's
294 * worth allocated to it.
295 *
296 * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and
308 * has been freed, we'll return it to the list.
309 *
310 * The transmit control block free list is managed by keeping track of the
311 * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to
312 * index into the free list and add things to it. In effect, we always push and
313 * pop from the tail and protect it with a single lock,
314 * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not
315 * stand up to further performance testing; however, it does allow us to get off
316 * the ground with the device driver.
317 *
318 * The following image describes where a given transmit control block lives in
319 * its lifetime:
320 *
321 * |
322 * * ... Initial placement for all tcb's
323 * |
324 * v
325 * +------------------+ +------------------+
326 * | tcb on free list |---*------------------>| tcb on work list |
327 * +------------------+ . +------------------+
328 * ^ . tcb allocated |
329 * | to send frame v
330 * | or fragment on |
331 * | wire, mblk from |
332 * | MAC associated. |
333 * | |
334 * +------*-------------------------------<----+
335 * .
336 * . Hardware indicates
337 * entry transmitted.
338 * tcb recycled, mblk
339 * from MAC freed.
340 *
341 * ------------
342 * Blocking MAC
343 * ------------
344 *
345 * When performing transmit, we can run out of descriptors and ring entries. When
346 * such a case happens, we return the mblk_t to MAC to indicate that we've been
347 * blocked. At that point in time, MAC becomes blocked and will not transmit
348 * anything out that specific ring until we notify MAC. To indicate that we're
349 * in such a situation we set the i40e_trqpair_t`itrq_tx_blocked member to B_TRUE.
350 *
351 * When we recycle tx descriptors then we'll end up signaling MAC by calling
352 * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
353 * start sending frames out to us again.
354 */
355
356 /*
357 * We set our DMA alignment requests based on the smallest supported page size
358 * of the corresponding platform.
359 */
360 #if defined(__sparc)
361 #define I40E_DMA_ALIGNMENT 0x2000ull
362 #elif defined(__x86)
363 #define I40E_DMA_ALIGNMENT 0x1000ull
364 #else
365 #error "unknown architecture for i40e"
366 #endif
367
368 /*
369 * This structure is used to maintain information and flags related to
370 * transmitting a frame. The first member is the set of flags we need to or into
371 * the command word (generally checksumming related). The second member controls
372 * the word offsets which are required for IP and L4 checksumming.
373 */
374 typedef struct i40e_tx_context {
375 enum i40e_tx_desc_cmd_bits itc_cmdflags;
376 uint32_t itc_offsets;
377 } i40e_tx_context_t;
378
379 /*
380 * Toggles on debug builds which can be used to override our RX behaviour based
381 * on thresholds.
382 */
383 #ifdef DEBUG
384 typedef enum {
385 I40E_DEBUG_RX_DEFAULT = 0,
386 I40E_DEBUG_RX_BCOPY = 1,
387 I40E_DEBUG_RX_DMABIND = 2
388 } i40e_debug_rx_t;
389
390 i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
391 #endif /* DEBUG */
392
393 /*
394 * Notes on the following pair of DMA attributes. The first attribute,
395 * i40e_static_dma_attr, is designed to be used for both the descriptor rings
396 * and the static buffers that we associate with control blocks. For this
397 * reason, we force an SGL length of one. While technically the driver supports
398 * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our
399 * management here. In addition, when the Intel common code wants to allocate
400 * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
401 * the static dma attr.
402 *
403 * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're
404 * binding a bunch of mblk_t fragments to go out the door. Note that the main
405 * difference here is that we're allowed a larger SGL length -- eight.
406 *
407 * Note, we default to setting ourselves to be DMA capable here. However,
408 * because we could have multiple instances which have different FMA error
409 * checking capabilities, or end up on different buses, we make these static
410 * and const and copy them into the i40e_t for the given device with the actual
411 * values that reflect the actual capabilities.
412 */
413 static const ddi_dma_attr_t i40e_g_static_dma_attr = {
414 DMA_ATTR_V0, /* version number */
415 0x0000000000000000ull, /* low address */
416 0xFFFFFFFFFFFFFFFFull, /* high address */
417 0x00000000FFFFFFFFull, /* dma counter max */
418 I40E_DMA_ALIGNMENT, /* alignment */
419 0x00000FFF, /* burst sizes */
420 0x00000001, /* minimum transfer size */
421 0x00000000FFFFFFFFull, /* maximum transfer size */
422 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */
423 1, /* scatter/gather list length */
424 0x00000001, /* granularity */
425 DDI_DMA_FLAGERR /* DMA flags */
426 };
427
428 static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
429 DMA_ATTR_V0, /* version number */
430 0x0000000000000000ull, /* low address */
431 0xFFFFFFFFFFFFFFFFull, /* high address */
432 0x00000000FFFFFFFFull, /* dma counter max */
433 I40E_DMA_ALIGNMENT, /* alignment */
434 0x00000FFF, /* burst sizes */
435 0x00000001, /* minimum transfer size */
436 0x00000000FFFFFFFFull, /* maximum transfer size */
437 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */
438 I40E_TX_MAX_COOKIE, /* scatter/gather list length */
439 0x00000001, /* granularity */
440 DDI_DMA_FLAGERR /* DMA flags */
441 };
442
443 /*
444 * Next, we have the attributes for these structures. The descriptor rings are
445 * all strictly little endian, while the data buffers are just arrays of bytes
446 * representing frames. Because of this, we purposefully simplify the driver
447 * programming life by programming the descriptor ring as little endian, while
448 * for the buffer data we keep it as unstructured.
449 *
450 * Note, that to keep the Intel common code operating in a reasonable way, when
451 * we allocate DMA memory for it, we do not use byte swapping and thus use the
452 * standard i40e_buf_acc_attr.
453 */
454 static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
455 DDI_DEVICE_ATTR_V0,
456 DDI_STRUCTURE_LE_ACC,
457 DDI_STRICTORDER_ACC
458 };
459
460 static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
461 DDI_DEVICE_ATTR_V0,
462 DDI_NEVERSWAP_ACC,
651
652 static boolean_t
653 i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
654 {
655 i40e_rx_data_t *rxd;
656
657 rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP);
658 if (rxd == NULL)
659 return (B_FALSE);
660 itrq->itrq_rxdata = rxd;
661 rxd->rxd_i40e = i40e;
662
663 rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
664 rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;
665
666 rxd->rxd_rcb_free = rxd->rxd_free_list_size;
667
668 rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
669 rxd->rxd_ring_size, KM_NOSLEEP);
670 if (rxd->rxd_work_list == NULL) {
671 i40e_error(i40e, "failed to allocate rx work list for a ring "
672 "of %d entries for ring %d", rxd->rxd_ring_size,
673 itrq->itrq_index);
674 goto cleanup;
675 }
676
677 rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
678 rxd->rxd_free_list_size, KM_NOSLEEP);
679 if (rxd->rxd_free_list == NULL) {
680 i40e_error(i40e, "failed to allocate a %d entry rx free list "
681 "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
682 goto cleanup;
683 }
684
685 rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
686 (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
687 if (rxd->rxd_rcb_area == NULL) {
688 i40e_error(i40e, "failed to allocate a %d entry rcb area for "
689 "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
690 itrq->itrq_index);
691 goto cleanup;
692 }
693
694 return (B_TRUE);
695
696 cleanup:
697 i40e_free_rx_data(rxd);
698 itrq->itrq_rxdata = NULL;
699 return (B_FALSE);
700 }
748 atomic_inc_32(&rxd->rxd_rcb_pending);
749 atomic_inc_32(&i40e->i40e_rx_pending);
750 }
751 }
752 mutex_exit(&i40e->i40e_rx_pending_lock);
753 }
754
755 /*
756 * Initialize the DMA memory for the descriptor ring and for each frame in the
757 * control block list.
758 */
759 static boolean_t
760 i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
761 {
762 int i, count;
763 size_t dmasz;
764 i40e_rx_control_block_t *rcb;
765 i40e_t *i40e = rxd->rxd_i40e;
766
767 /*
768 * First allocate the rx descriptor ring.
769 */
770 dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
771 VERIFY(dmasz > 0);
772 if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
773 &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
774 B_TRUE, dmasz) == B_FALSE) {
775 i40e_error(i40e, "failed to allocate DMA resources "
776 "for rx descriptor ring");
777 return (B_FALSE);
778 }
779 rxd->rxd_desc_ring =
780 (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
781 rxd->rxd_desc_next = 0;
782
783 count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
784 rcb = rxd->rxd_rcb_area;
785
786 dmasz = i40e->i40e_rx_buf_size;
787 VERIFY(dmasz > 0);
788 for (i = 0; i < count; i++, rcb++) {
789 i40e_dma_buffer_t *dmap;
790 VERIFY(rcb != NULL);
791
792 if (i < rxd->rxd_ring_size) {
793 rxd->rxd_work_list[i] = rcb;
794 } else {
795 rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb;
796 }
797
798 dmap = &rcb->rcb_dma;
799 if (i40e_alloc_dma_buffer(i40e, dmap,
800 &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
801 B_TRUE, B_FALSE, dmasz) == B_FALSE) {
802 i40e_error(i40e, "failed to allocate rx dma buffer");
803 return (B_FALSE);
804 }
805
806 /*
807 * Initialize the control block and offset the DMA address. See
808 * the note in the big theory statement that explains how this
809 * helps IP deal with alignment. Note, we don't worry about
810 * whether or not we successfully get an mblk_t from desballoc,
811 * it's a common case that we have to handle later on in the
812 * system.
813 */
814 dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT;
815 dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
816 dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
817
818 rcb->rcb_ref = 1;
819 rcb->rcb_rxd = rxd;
820 rcb->rcb_free_rtn.free_func = i40e_rx_recycle;
821 rcb->rcb_free_rtn.free_arg = (caddr_t)rcb;
822 rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address,
824 }
825
826 return (B_TRUE);
827 }
828
829 static void
830 i40e_free_tx_dma(i40e_trqpair_t *itrq)
831 {
832 size_t fsz;
833
834 if (itrq->itrq_tcb_area != NULL) {
835 uint32_t i;
836 i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area;
837
838 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
839 i40e_free_dma_buffer(&tcb->tcb_dma);
840 if (tcb->tcb_dma_handle != NULL) {
841 ddi_dma_free_handle(&tcb->tcb_dma_handle);
842 tcb->tcb_dma_handle = NULL;
843 }
844 }
845
846 fsz = sizeof (i40e_tx_control_block_t) *
847 itrq->itrq_tx_free_list_size;
848 kmem_free(itrq->itrq_tcb_area, fsz);
849 itrq->itrq_tcb_area = NULL;
850 }
851
852 if (itrq->itrq_tcb_free_list != NULL) {
853 fsz = sizeof (i40e_tx_control_block_t *) *
854 itrq->itrq_tx_free_list_size;
855 kmem_free(itrq->itrq_tcb_free_list, fsz);
856 itrq->itrq_tcb_free_list = NULL;
857 }
858
859 if (itrq->itrq_tcb_work_list != NULL) {
860 fsz = sizeof (i40e_tx_control_block_t *) *
861 itrq->itrq_tx_ring_size;
862 kmem_free(itrq->itrq_tcb_work_list, fsz);
863 itrq->itrq_tcb_work_list = NULL;
864 }
865
866 i40e_free_dma_buffer(&itrq->itrq_desc_area);
867 itrq->itrq_desc_ring = NULL;
868
869 }
870
871 static boolean_t
872 i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
873 {
874 int i, ret;
875 size_t dmasz;
876 i40e_tx_control_block_t *tcb;
877 i40e_t *i40e = itrq->itrq_i40e;
878
879 itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
880 itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
881 (i40e->i40e_tx_ring_size >> 1);
882
883 /*
884 * Allocate an additional tx descriptor for the writeback head.
885 */
886 dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
887 dmasz += sizeof (i40e_tx_desc_t);
888
889 VERIFY(dmasz > 0);
890 if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
891 &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
892 B_FALSE, B_TRUE, dmasz) == B_FALSE) {
893 i40e_error(i40e, "failed to allocate DMA resources for tx "
894 "descriptor ring");
895 return (B_FALSE);
896 }
897 itrq->itrq_desc_ring =
898 (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
899 itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
900 itrq->itrq_tx_ring_size);
901 itrq->itrq_desc_head = 0;
902 itrq->itrq_desc_tail = 0;
903 itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
904
905 itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
906 sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
907 if (itrq->itrq_tcb_work_list == NULL) {
908 i40e_error(i40e, "failed to allocate a %d entry tx work list "
909 "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
910 goto cleanup;
911 }
912
913 itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
914 sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
915 if (itrq->itrq_tcb_free_list == NULL) {
916 i40e_error(i40e, "failed to allocate a %d entry tx free list "
917 "for ring %d", itrq->itrq_tx_free_list_size,
918 itrq->itrq_index);
919 goto cleanup;
920 }
921
922 /*
923 * We allocate enough tx control blocks to cover the free list.
924 */
925 itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
926 itrq->itrq_tx_free_list_size, KM_NOSLEEP);
927 if (itrq->itrq_tcb_area == NULL) {
928 i40e_error(i40e, "failed to allocate a %d entry tcb area for "
929 "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
930 goto cleanup;
931 }
932
933 /*
934 * For each tcb, allocate DMA memory.
935 */
936 dmasz = i40e->i40e_tx_buf_size;
937 VERIFY(dmasz > 0);
938 tcb = itrq->itrq_tcb_area;
939 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
940 VERIFY(tcb != NULL);
941
942 /*
943 * Allocate both a DMA buffer which we'll use for when we copy
944 * packets for transmission and allocate a DMA handle which
945 * we'll use when we bind data.
946 */
947 ret = ddi_dma_alloc_handle(i40e->i40e_dip,
948 &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
949 &tcb->tcb_dma_handle);
950 if (ret != DDI_SUCCESS) {
951 i40e_error(i40e, "failed to allocate DMA handle for tx "
952 "data binding on ring %d: %d", itrq->itrq_index,
953 ret);
954 tcb->tcb_dma_handle = NULL;
955 goto cleanup;
956 }
957
958 if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
959 &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
960 B_TRUE, B_FALSE, dmasz) == B_FALSE) {
961 i40e_error(i40e, "failed to allocate %ld bytes of "
962 "DMA for tx data binding on ring %d", dmasz,
963 itrq->itrq_index);
964 goto cleanup;
965 }
966
967 itrq->itrq_tcb_free_list[i] = tcb;
968 }
969
970 itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
971
972 return (B_TRUE);
973
974 cleanup:
975 i40e_free_tx_dma(itrq);
976 return (B_FALSE);
977 }
978
979 /*
980 * Free all memory associated with all of the rings on this i40e instance. Note,
981 * this is done as part of the GLDv3 stop routine.
982 */
983 void
984 i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init)
985 {
986 int i;
987
988 for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
989 i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata;
990
991 /*
992 * Clean up our rx data. We have to free DMA resources first and
993 * then if we have no more pending RCB's, then we'll go ahead
994 * and clean things up. Note, we can't set the stopped flag on
995 * the rx data until after we've done the first pass of the
996 * pending resources. Otherwise we might race with
997 * i40e_rx_recycle on determining who should free the
998 * i40e_rx_data_t above.
999 */
1000 i40e_free_rx_dma(rxd, failed_init);
1001
1002 mutex_enter(&i40e->i40e_rx_pending_lock);
1003 rxd->rxd_shutdown = B_TRUE;
1004 if (rxd->rxd_rcb_pending == 0) {
1005 i40e_free_rx_data(rxd);
1006 i40e->i40e_trqpairs[i].itrq_rxdata = NULL;
1007 }
1008 mutex_exit(&i40e->i40e_rx_pending_lock);
1009
1010 i40e_free_tx_dma(&i40e->i40e_trqpairs[i]);
1011 }
1012 }
1013
1014 /*
1015 * Allocate all of the resources associated with all of the rings on this i40e
1038 return (B_TRUE);
1039
1040 unwind:
1041 i40e_free_ring_mem(i40e, B_TRUE);
1042 return (B_FALSE);
1043 }
1044
1045
1046 /*
1047 * Because every instance of i40e may have different support for FMA
1048 * capabilities, we copy the DMA attributes into the i40e_t and set them that
1049 * way and use them for determining attributes.
1050 */
1051 void
1052 i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
1053 {
1054 bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
1055 sizeof (ddi_dma_attr_t));
1056 bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
1057 sizeof (ddi_dma_attr_t));
1058 bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
1059 sizeof (ddi_device_acc_attr_t));
1060 bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
1061 sizeof (ddi_device_acc_attr_t));
1062
1063 if (fma == B_TRUE) {
1064 i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1065 i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1066 } else {
1067 i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1068 i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1069 }
1070 }
1071
1072 static void
1073 i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
1074 {
1075 mutex_enter(&rxd->rxd_free_lock);
1076 ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
1077 ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL);
1078 rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb;
1079 rxd->rxd_rcb_free++;
1080 mutex_exit(&rxd->rxd_free_lock);
1081 }
1082
1083 static i40e_rx_control_block_t *
1084 i40e_rcb_alloc(i40e_rx_data_t *rxd)
1085 {
1086 i40e_rx_control_block_t *rcb;
1087
1088 mutex_enter(&rxd->rxd_free_lock);
1089 if (rxd->rxd_rcb_free == 0) {
1090 mutex_exit(&rxd->rxd_free_lock);
1091 return (NULL);
1092 }
1093 rxd->rxd_rcb_free--;
1094 rcb = rxd->rxd_free_list[rxd->rxd_rcb_free];
1095 VERIFY(rcb != NULL);
1096 rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
1097 mutex_exit(&rxd->rxd_free_lock);
1098
1099 return (rcb);
1100 }
1101
1102 /*
1103 * This is the callback that we get from the OS when freemsg(9F) has been called
1104 * on a loaned descriptor. In addition, if we take the last reference count
1105 * here, then we have to tear down all of the rx data.
1106 */
1107 void
1108 i40e_rx_recycle(caddr_t arg)
1109 {
1110 uint32_t ref;
1111 i40e_rx_control_block_t *rcb;
1112 i40e_rx_data_t *rxd;
1113 i40e_t *i40e;
1114
1115 /* LINTED: E_BAD_PTR_CAST_ALIGN */
1116 rcb = (i40e_rx_control_block_t *)arg;
1117 rxd = rcb->rcb_rxd;
1118 i40e = rxd->rxd_i40e;
1119
1120 /*
1121 * It's possible for this to be called with a reference count of zero.
1122 * That will happen when we're doing the freemsg after taking the last
1123 * reference because we're tearing down everything and this rcb is not
1124 * outstanding.
1125 */
1751 case IPPROTO_UDP:
1752 l4len = sizeof (struct udphdr);
1753 break;
1754 case IPPROTO_SCTP:
1755 l4len = sizeof (sctp_hdr_t);
1756 break;
1757 default:
1758 return (0);
1759 }
1760
1761 meoi->meoi_l4hlen = l4len;
1762 meoi->meoi_flags |= MEOI_L4INFO_SET;
1763 return (0);
1764 }
1765
1766 /*
1767 * Attempt to put together the information we'll need to feed into a descriptor
1768 * to properly program the hardware for checksum offload as well as the
1769 * generally required flags.
1770 *
1771 * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to or
1772 * into the descriptor based on the checksum flags for this mblk_t and the
1773 * actual information we care about.
1774 */
1775 static int
1776 i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
1777 i40e_tx_context_t *tctx)
1778 {
1779 int ret;
1780 uint32_t flags, start;
1781 mac_ether_offload_info_t meo;
1782 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
1783
1784 bzero(tctx, sizeof (i40e_tx_context_t));
1785
1786 if (i40e->i40e_tx_hcksum_enable != B_TRUE)
1787 return (0);
1788
1789 mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags);
1790 if (flags == 0)
1791 return (0);
1792
1793 if ((ret = mac_ether_offload_info(mp, &meo)) != 0) {
1794 txs->itxs_hck_meoifail.value.ui64++;
1795 return (ret);
1796 }
1797
1798 /*
1799 * Have we been asked to checksum an IPv4 header? If so, verify that we
1800 * have sufficient information and then set the proper fields in the
1801 * command structure.
1802 */
1803 if (flags & HCK_IPV4_HDRCKSUM) {
1804 if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
1805 txs->itxs_hck_nol2info.value.ui64++;
1806 return (-1);
1807 }
1808 if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
1809 txs->itxs_hck_nol3info.value.ui64++;
1810 return (-1);
1811 }
1812 if (meo.meoi_l3proto != ETHERTYPE_IP) {
1813 txs->itxs_hck_badl3.value.ui64++;
1814 return (-1);
1815 }
1816 tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
1817 tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
1818 I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1819 tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
1820 I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1821 }
1822
1823 /*
1824 * We've been asked to provide an L4 checksum. First, set up the IP
1825 * information in the descriptor if we haven't already, before moving
1826 * on to seeing if we have enough information for the L4 checksum
1827 * offload.
1828 */
1829 if (flags & HCK_PARTIALCKSUM) {
1830 if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) {
1831 txs->itxs_hck_nol4info.value.ui64++;
1832 return (-1);
1833 }
1834
1835 if (!(flags & HCK_IPV4_HDRCKSUM)) {
1836 if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
1837 txs->itxs_hck_nol2info.value.ui64++;
1838 return (-1);
1839 }
1840 if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
1841 txs->itxs_hck_nol3info.value.ui64++;
1842 return (-1);
1843 }
1844
1845 if (meo.meoi_l3proto == ETHERTYPE_IP) {
1846 tctx->itc_cmdflags |=
1847 I40E_TX_DESC_CMD_IIPT_IPV4;
1848 } else if (meo.meoi_l3proto == ETHERTYPE_IPV6) {
1849 tctx->itc_cmdflags |=
1850 I40E_TX_DESC_CMD_IIPT_IPV6;
1851 } else {
1852 txs->itxs_hck_badl3.value.ui64++;
1853 return (-1);
1854 }
1855 tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
1856 I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1857 tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
1858 I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1859 }
1860
1861 switch (meo.meoi_l4proto) {
1862 case IPPROTO_TCP:
1863 tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
1864 break;
1865 case IPPROTO_UDP:
1866 tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
1867 break;
1868 case IPPROTO_SCTP:
1869 tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
1870 break;
1871 default:
1872 txs->itxs_hck_badl4.value.ui64++;
1873 return (-1);
1874 }
1875
1876 tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) <<
1877 I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
1878 }
1879
1880 return (0);
1881 }
1882
1883 static void
1884 i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
1885 {
1886 ASSERT(tcb != NULL);
1887
1888 mutex_enter(&itrq->itrq_tcb_lock);
1889 ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
1890 itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb;
1891 itrq->itrq_tcb_free++;
1892 mutex_exit(&itrq->itrq_tcb_lock);
1893 }
1894
1895 static i40e_tx_control_block_t *
1896 i40e_tcb_alloc(i40e_trqpair_t *itrq)
1897 {
1898 i40e_tx_control_block_t *ret;
1899
1908 itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
1909 mutex_exit(&itrq->itrq_tcb_lock);
1910
1911 ASSERT(ret != NULL);
1912 return (ret);
1913 }
1914
1915 /*
1916 * This should be used to free any DMA resources, associated mblk_t's, etc. It's
1917 * used as part of recycling the message blocks when we have either an interrupt
1918 * or other activity that indicates that we need to take a look.
1919 */
1920 static void
1921 i40e_tcb_reset(i40e_tx_control_block_t *tcb)
1922 {
1923 switch (tcb->tcb_type) {
1924 case I40E_TX_COPY:
1925 tcb->tcb_dma.dmab_len = 0;
1926 break;
1927 case I40E_TX_DMA:
1928 (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
1929 break;
1930 case I40E_TX_NONE:
1931 /* Cast to pacify lint */
1932 panic("trying to free tcb %p with bad type none", (void *)tcb);
1933 default:
1934 panic("unknown i40e tcb type: %d", tcb->tcb_type);
1935 }
1936
1937 tcb->tcb_type = I40E_TX_NONE;
1938 freemsg(tcb->tcb_mp);
1939 tcb->tcb_mp = NULL;
1940 tcb->tcb_next = NULL;
1941 }
1942
1943 /*
1944 * This is called as part of shutting down to clean up all outstanding
1945 * descriptors. Similar to recycle, except we don't re-arm anything and instead
1946 * just return control blocks to the free list.
1947 */
1948 void
1949 i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
1950 {
1951 uint32_t index;
1952
1953 ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
1954 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
1955
1956 /*
1957 * Because we should have shut down the chip at this point, it should be
1958 * safe to just clean up all the entries between our head and tail.
1959 */
1960 #ifdef DEBUG
1961 index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
1962 I40E_QTX_ENA(itrq->itrq_index));
1963 VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
1964 I40E_QTX_ENA_QENA_STAT_MASK));
1965 #endif
1966
1967 index = itrq->itrq_desc_head;
1968 while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
1969 i40e_tx_control_block_t *tcb;
1970
1971 tcb = itrq->itrq_tcb_work_list[index];
1972 VERIFY(tcb != NULL);
1973 itrq->itrq_tcb_work_list[index] = NULL;
1974 i40e_tcb_reset(tcb);
1975 i40e_tcb_free(itrq, tcb);
1976
1977 bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
1978 index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
1979 itrq->itrq_desc_free++;
1980 }
1981
1982 ASSERT(index == itrq->itrq_desc_tail);
1983 itrq->itrq_desc_head = index;
1984 }
1985
1986 /*
1987 * We're here either by hook or by crook. We need to see if there are transmit
1988 * descriptors available for us to go and clean up and return to the hardware.
1989 * We may also be blocked, and if so, we should make sure that we let it know
1990 * we're good to go.
1991 */
1992 void
1993 i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
1994 {
1995 uint32_t wbhead, toclean, count;
1996 i40e_tx_control_block_t *tcbhead;
1997 i40e_t *i40e = itrq->itrq_i40e;
1998
1999 mutex_enter(&itrq->itrq_tx_lock);
2000
2001 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2002 if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
2003 if (itrq->itrq_tx_blocked == B_TRUE) {
2004 itrq->itrq_tx_blocked = B_FALSE;
2005 mac_tx_ring_update(i40e->i40e_mac_hdl,
2006 itrq->itrq_mactxring);
2007 itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2008 }
2009 mutex_exit(&itrq->itrq_tx_lock);
2010 return;
2011 }
2012
2013 /*
2014 * Now we need to try and see if there's anything available. The hardware
2015 * will write to the head location and it guarantees that it does not
2016 * use relaxed ordering.
2017 */
2025 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2026 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2027 return;
2028 }
2029
2030 wbhead = *itrq->itrq_desc_wbhead;
2031 toclean = itrq->itrq_desc_head;
2032 count = 0;
2033 tcbhead = NULL;
2034
2035 while (toclean != wbhead) {
2036 i40e_tx_control_block_t *tcb;
2037
2038 tcb = itrq->itrq_tcb_work_list[toclean];
2039 itrq->itrq_tcb_work_list[toclean] = NULL;
2040 ASSERT(tcb != NULL);
2041 tcb->tcb_next = tcbhead;
2042 tcbhead = tcb;
2043
2044 /*
2045 * We zero this out for sanity purposes.
2046 */
2047 bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t));
2048 toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size);
2049 count++;
2050 }
2051
2052 itrq->itrq_desc_head = wbhead;
2053 itrq->itrq_desc_free += count;
2054 itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
2055 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2056
2057 if (itrq->itrq_tx_blocked == B_TRUE &&
2058 itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
2059 itrq->itrq_tx_blocked = B_FALSE;
2060
2061 mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
2062 itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2063 }
2064
2065 mutex_exit(&itrq->itrq_tx_lock);
2066
2067 /*
2068 * Now clean up the tcb.
2069 */
2070 while (tcbhead != NULL) {
2071 i40e_tx_control_block_t *tcb = tcbhead;
2072
2073 tcbhead = tcb->tcb_next;
2074 i40e_tcb_reset(tcb);
2075 i40e_tcb_free(itrq, tcb);
2076 }
2077
2078 DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
2079 }
2080
2081 /*
2082 * We've been asked to send a message block on the wire. We'll only have a
2083 * single chain. There will not be any b_next pointers; however, there may be
2084 * multiple b_cont blocks.
2085 *
2086 * We may do one of three things with any given mblk_t chain:
2087 *
2088 * 1) Drop it
2089 * 2) Transmit it
2090 * 3) Return it
2091 *
2092 * If we return it to MAC, then MAC will flow control on our behalf. In other
2093 * words, it won't send us anything until we tell it that it's okay to send us
2094 * something.
2095 */
2096 mblk_t *
2097 i40e_ring_tx(void *arg, mblk_t *mp)
2098 {
2099 const mblk_t *nmp;
2100 size_t mpsize;
2101 i40e_tx_control_block_t *tcb;
2102 i40e_tx_desc_t *txdesc;
2103 i40e_tx_context_t tctx;
2104 int cmd, type;
2105
2106 i40e_trqpair_t *itrq = arg;
2107 i40e_t *i40e = itrq->itrq_i40e;
2108 i40e_hw_t *hw = &i40e->i40e_hw_space;
2109 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2110
2111 ASSERT(mp->b_next == NULL);
2112
2113 if (!(i40e->i40e_state & I40E_STARTED) ||
2114 (i40e->i40e_state & I40E_OVERTEMP) ||
2115 (i40e->i40e_state & I40E_SUSPENDED) ||
2116 (i40e->i40e_state & I40E_ERROR) ||
2117 (i40e->i40e_link_state != LINK_STATE_UP)) {
2118 freemsg(mp);
2119 return (NULL);
2120 }
2121
2122 /*
2123 * Figure out the relevant context about this frame that we might need
2124 * for enabling checksum, lso, etc. This also fills in information that
2125 * we might set around the packet type, etc.
2126 */
2127 if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) {
2128 freemsg(mp);
2129 itrq->itrq_txstat.itxs_err_context.value.ui64++;
2130 return (NULL);
2131 }
2132
2133 /*
2134 * For the primordial driver we can punt on doing any recycling right
2135 * now; however, longer term we need to probably do some more pro-active
2136 * recycling to cut back on stalls in the tx path.
2137 */
2138
2139 /*
2140 * Do a quick size check to make sure it fits into what we think it
2141 * should for this device. Note that longer term this will be false,
2142 * particularly when we have the world of TSO.
2143 */
2144 mpsize = 0;
2145 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
2146 mpsize += MBLKL(nmp);
2147 }
2148
2149 /*
2150 * First we allocate our tx control block and prepare the packet for
2151 * transmit before we do a final check for descriptors. We do it this
2152 * way to minimize the time under the tx lock.
2153 */
2154 tcb = i40e_tcb_alloc(itrq);
2155 if (tcb == NULL) {
2156 txs->itxs_err_notcb.value.ui64++;
2157 goto txfail;
2158 }
2159
2160 /*
2161 * For transmitting a block, we're currently going to use just a
2162 * single control block and bcopy all of the fragments into it. We
2163 * should be more intelligent about doing DMA binding or otherwise, but
2164 * for getting off the ground this will have to do.
2165 */
2166 ASSERT(tcb->tcb_dma.dmab_len == 0);
2167 ASSERT(tcb->tcb_dma.dmab_size >= mpsize);
2168 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
2169 size_t clen = MBLKL(nmp);
2170 void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
2171
2172 bcopy(nmp->b_rptr, coff, clen);
2173 tcb->tcb_dma.dmab_len += clen;
2174 }
2175 ASSERT(tcb->tcb_dma.dmab_len == mpsize);
2176
2177 /*
2178 * There's really no need to keep the mp here, but let's just do
2179 * it to help with our own debugging for now.
2180 */
2181 tcb->tcb_mp = mp;
2182 tcb->tcb_type = I40E_TX_COPY;
2183 I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
2184
2185 mutex_enter(&itrq->itrq_tx_lock);
2186 if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) {
2187 txs->itxs_err_nodescs.value.ui64++;
2188 mutex_exit(&itrq->itrq_tx_lock);
2189 goto txfail;
2190 }
2191
2192 /*
2193 * Build up the descriptor and send it out. Thankfully at the moment
2194 * we only need a single desc, because we're not doing anything fancy
2195 * yet.
2196 */
2197 ASSERT(itrq->itrq_desc_free > 0);
2198 itrq->itrq_desc_free--;
2199 txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
2200 itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
2201 itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
2202 itrq->itrq_tx_ring_size);
2203
2204 /*
2205 * Note, we always set EOP and RS which indicates that this is the last
2206 * data frame and that we should ask for it to be transmitted. We also
2207 * must always set ICRC, because that is an internal bit that must be
2208 * set to one for data descriptors. The remaining bits in the command
2209 * descriptor depend on checksumming and are determined based on the
2210 * information set up in i40e_tx_context().
2211 */
2212 type = I40E_TX_DESC_DTYPE_DATA;
2213 cmd = I40E_TX_DESC_CMD_EOP |
2214 I40E_TX_DESC_CMD_RS |
2215 I40E_TX_DESC_CMD_ICRC |
2216 tctx.itc_cmdflags;
2217 txdesc->buffer_addr =
2218 CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address);
2219 txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type |
2220 ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
2221 ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
2222 ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
2223
2224 /*
2225 * Now, finally, sync the DMA data and alert hardware.
2226 */
2227 I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);
2228
2229 I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
2230 itrq->itrq_desc_tail);
2231 if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
2232 DDI_FM_OK) {
2233 /*
2234 * Note, we can't really go through and clean this up very well,
2235 * because the memory has been given to the device, so just
2236 * indicate it's been transmitted.
2237 */
2238 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2239 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2240 }
2241
2242 txs->itxs_bytes.value.ui64 += mpsize;
2243 txs->itxs_packets.value.ui64++;
2244 txs->itxs_descriptors.value.ui64++;
2245
2246 mutex_exit(&itrq->itrq_tx_lock);
2247
2248 return (NULL);
2249
2250 txfail:
2251 /*
2252 * We ran out of resources. Return it to MAC and indicate that we'll
2253 * need to signal MAC. If there are allocated tcb's, return them now.
2254 * Make sure to reset their message blocks, since we'll return them
2255 * back to MAC.
2256 */
2257 if (tcb != NULL) {
2258 tcb->tcb_mp = NULL;
2259 i40e_tcb_reset(tcb);
2260 i40e_tcb_free(itrq, tcb);
2261 }
2262
2263 mutex_enter(&itrq->itrq_tx_lock);
2264 itrq->itrq_tx_blocked = B_TRUE;
2265 mutex_exit(&itrq->itrq_tx_lock);
2266
2267 return (mp);
2268 }
|
1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
14 * Copyright 2019 Joyent, Inc.
15 */
16
17 #include "i40e_sw.h"
18
19 /*
20 * ---------------------------------------------------------
21 * Buffer and Memory Management, Receiving, and Transmitting
22 * ---------------------------------------------------------
23 *
24 * Each physical function (PF), which is what we think of as an instance of the
25 * device driver, has a series of associated transmit and receive queue pairs.
26 * Effectively, what we think of in MAC as rings. Each of these has their own
27 * ring of descriptors which is used as part of doing DMA activity.
28 *
29 * The transmit ring of descriptors are 16-byte entries which are used to send
30 * packets, program filters, etc. The receive ring of descriptors are either
31 * 16-byte or 32-bytes each. At the moment, we opt to use the larger descriptor
32 * format so that we're in a better position if we ever want to leverage that
33 * information later on.
34 *
43 * we end up allocating. Those are:
44 *
45 * o The size of the ring (controlled through the driver.conf file)
46 *
47 * o The maximum size frame we can receive.
48 *
49 * The size of the rings currently defaults to 1024 descriptors and is stored in
50 * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size.
51 *
52 * While the size of the rings is controlled by the driver.conf, the maximum
53 * size frame is informed primarily through the use of dladm and the setting of
54 * the MTU property on the device. From the MTU, we then go and do some
55 * machinations. The first thing we do is we then have to add in space for the
56 * Ethernet header, potentially a VLAN header, and the FCS check. This value is
57 * what's stored as i40e_t`i40e_frame_max and is derived any time
58 * i40e_t`i40e_sdu changes.
59 *
60 * This size is then rounded up to the nearest 1k chunk, which represents the
61 * actual amount of memory that we'll allocate for a single frame.
62 *
63 * Note, that for RX, we do something that might be unexpected. We always add
64 * an extra two bytes to the frame size that we allocate. We then offset the DMA
65 * address that we receive a packet into by two bytes. This ensures that the IP
66 * header will always be 4 byte aligned because the MAC header is either 14 or
67 * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
68 * and MAC's lives easier.
69 *
70 * Both the RX and TX descriptor rings (which are what we use to communicate
71 * with hardware) are allocated as a single region of DMA memory which is the
72 * size of the descriptor (4 bytes and 2 bytes respectively) times the total
73 * number of descriptors for an RX and TX ring.
74 *
75 * While the RX and TX descriptors are allocated using DMA-based memory, the
76 * control blocks for each of them are allocated using normal kernel memory.
77 * They aren't special from a DMA perspective. We'll go over the design of both
78 * receiving and transmitting separately, as they have slightly different
79 * control blocks and different ways that we manage the relationship between
80 * control blocks and descriptors.
81 *
82 * ---------------------------------
83 * RX Descriptors and Control Blocks
84 * ---------------------------------
85 *
86 * For every descriptor in the ring that the driver has, we need some associated
87 * memory, which means that we need to have the receive specific control block.
88 * We have a couple different, but related goals:
89 *
90 * o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do
91 * not want to do any additional memory allocations or DMA allocations if
92 * we don't have to.
93 *
94 * o We'd like to try and do as much zero-copy as possible, while taking into
95 * account the cost of mapping in DMA resources.
96 *
97 * o We'd like to have every receive descriptor available.
98 *
99 * Now, these rules are a bit in tension with one another. The act of mapping in
100 * is an exercise of trying to find the break-even point between page table
101 * updates and bcopy. We currently start by using the same metrics that ixgbe
102 * used; however, it should be known that this value has effectively been
103 * cargo-culted across to yet another driver, sorry.
104 *
105 * If we receive a packet which is larger than our copy threshold, we'll create
106 * a message block out of the DMA memory via desballoc(9F) and send that up to
107 * MAC that way. This will cause us to be notified when the message block is
108 * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
109 * it's less than the threshold, we'll try to use allocb and bcopy it into the
110 * block, thus allowing us to immediately reuse the DMA resource. Note, on debug
111 * builds, we allow someone to whack the variable i40e_debug_rx_mode to override
112 * the behavior and always do a bcopy or a DMA bind.
113 *
114 * To try and ensure that the device always has blocks that it can receive data
115 * into, we maintain two lists of control blocks, a working list and a free
116 * list. Each list is sized equal to the number of descriptors in the RX ring.
117 * During the GLDv3 mc_start routine, we allocate a number of RX control blocks
118 * equal to twice the number of descriptors in the ring and we assign them
119 * equally to the free list and to the working list. Each control block also has
120 * DMA memory allocated and associated with which it will be used to receive the
121 * actual packet data. All of a received frame's data will end up in a single
122 * DMA buffer.
123 *
124 * During operation, we always maintain the invariant that each RX descriptor
125 * has an associated RX control block which lives in the working list. If we
126 * feel that we should loan up DMA memory to MAC in the form of a message block,
127 * we can only do so if we can maintain this invariant. To do that, we swap in
128 * one of the buffers from the free list. If none are available, then we resort
129 * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
130 * size.
131 *
132 * Loaned message blocks come back to use when freemsg(9F) or freeb(9F) is
133 * called on the block, at which point we restore the RX control block to the
134 * free list and are able to reuse the DMA memory again. While the scheme may
135 * seem odd, it importantly keeps us out of trying to do any DMA allocations in
136 * the normal path of operation, even though we may still have to allocate
137 * message blocks and copy.
138 *
 * The following state machine describes the lifetime of an RX control block. In
 * the diagram we abbreviate the RX ring descriptor entry as rxd and the rx
141 * control block entry as rcb.
142 *
143 * | |
144 * * ... 1/2 of all initial rcb's ... *
145 * | |
146 * v v
147 * +------------------+ +------------------+
148 * | rcb on free list |---*---------->| rcb on work list |
149 * +------------------+ . +------------------+
150 * ^ . moved to |
151 * | replace rcb * . . Frame received,
152 * | loaned to | entry on free list
153 * | MAC + co. | available. rcb's
154 * | | memory made into mblk_t
155 * * . freemsg(9F) | and sent up to MAC.
156 * | called on |
157 * | loaned rcb |
158 * | and it is v
159 * | recycled. +-------------------+
160 * +--------------------<-----| rcb loaned to MAC |
161 * +-------------------+
162 *
163 * Finally, note that every RX control block has a reference count on it. One
164 * reference is added as long as the driver has had the GLDv3 mc_start endpoint
165 * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and
166 * no other DLPI consumers remain, then we'll decrement the reference count by
167 * one. Whenever we loan up the RX control block and associated buffer to MAC,
168 * then we bump the reference count again. Even though the device is stopped,
169 * there may still be loaned frames in upper levels that we'll want to account
170 * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure
171 * that it is cleaned up.
172 *
173 * --------------------
174 * Managing the RX Ring
175 * --------------------
176 *
177 * The receive ring descriptors are arranged in a circular buffer with a head
178 * and tail pointer. There are both the conventional head and tail pointers
179 * which are used to partition the ring into two portions, a portion that we,
180 * the operating system, manage and a portion that is managed by hardware. When
181 * hardware owns a descriptor in the ring, it means that it is waiting for data
182 * to be filled in. However, when a portion of the ring is owned by the driver,
183 * then that means that the descriptor has been consumed and we need to go take
184 * a look at it.
185 *
186 * The initial head is configured to be zero by writing it as such in the
187 * receive queue context in the FPM (function private memory from the host). The
188 * initial tail is written to be the last descriptor. This is written to via the
189 * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
190 * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
191 * the only values we ever consult ourselves are the TAIL register and our own
192 * state tracking. Effectively, we cache the HEAD register and then update it
193 * ourselves based on our work.
194 *
195 * When we iterate over the RX descriptors and thus the received frames, we are
196 * either in an interrupt context or we've been asked by MAC to poll on the
197 * ring. If we've been asked to poll on the ring, we have a maximum number of
198 * bytes of mblk_t's to return. If processing an RX descriptor would cause us to
199 * exceed that count, then we do not process it. When in interrupt context, we
200 * don't have a strict byte count. However, to ensure liveness, we limit the
201 * amount of data based on a configuration value
202 * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
203 * is based on similar numbers that are used for ixgbe. After some additional
204 * time in the field, we'll have a sense as to whether or not it should be
205 * changed.
206 *
207 * When processing, we start at our own HEAD pointer
208 * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
209 * processing. Every RX descriptor has what's described as the DD bit. This bit
210 * (the LSB of the second 8-byte word), indicates whether or not the descriptor
211 * is done. When we give descriptors to the hardware, this value is always
212 * zero. When the hardware has finished a descriptor, it will always be one.
213 *
214 * The first thing that we check is whether the DD bit indicates that the
215 * current HEAD is ready. If it isn't, then we're done. That's the primary
216 * invariant of processing a frame. If it's done, then there are a few other
217 * things that we want to look at. In the same status word as the DD bit, there
218 * are two other important bits:
232 * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information
233 * on how that selection is made.
234 *
235 * Regardless of whether we construct an mblk_t or encounter an error, we end up
236 * resetting the descriptor. This re-arms the descriptor for hardware and in the
 * process, we may end up assigning it a new receive control block. After we do
238 * this, we always update our HEAD pointer, no matter what.
239 *
240 * Finally, once we've consumed as much as we will in a given window, we go and
241 * update the TAIL register to indicate all the frames we've consumed. We only
242 * do a single bulk write for the ring.
243 *
244 * ---------------------------------
245 * TX Descriptors and Control Blocks
246 * ---------------------------------
247 *
248 * While the transmit path is similar in spirit to the receive path, it works
249 * differently due to the fact that all data is originated by the operating
250 * system and not by the device.
251 *
 * Like RX, there is both a descriptor ring that we use to communicate with the
 * hardware and which points to the memory used to transmit a frame. Similarly,
 * there is a corresponding transmit control block; however, the correspondence
255 * between descriptors and control blocks is more complex and not necessarily
256 * 1-to-1.
257 *
258 * The driver is asked to process a single frame at a time. That message block
259 * may be made up of multiple fragments linked together by the mblk_t`b_cont
260 * member. The device has a hard limit of up to 8 buffers being allowed for use
 * for a single non-LSO packet or LSO segment. The number of TX ring entries
262 * (and thus TX control blocks) used depends on the fragment sizes and DMA
263 * layout, as explained below.
264 *
265 * We alter our DMA strategy based on a threshold tied to the fragment size.
266 * This threshold is configurable via the tx_dma_threshold property. If the
267 * fragment is above the threshold, we DMA bind it -- consuming one TCB and
268 * potentially several data descriptors. The exact number of descriptors (equal
269 * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset
270 * into page, b_wptr offset into page, and the physical layout of the dblk's
271 * memory (contiguous or not). Essentially, we are at the mercy of the DMA
272 * engine and the dblk's memory allocation. Knowing the exact number of
273 * descriptors up front is a task best not taken on by the driver itself.
274 * Instead, we attempt to DMA bind the fragment and verify the descriptor
275 * layout meets hardware constraints. If the proposed DMA bind does not satisfy
 * the hardware constraints, then we discard it and instead copy the entire
277 * fragment into the pre-allocated TCB buffer (or buffers if the fragment is
278 * larger than the TCB buffer).
279 *
280 * If the fragment is below or at the threshold, we copy it to the pre-allocated
281 * buffer of a TCB. We compress consecutive copy fragments into a single TCB to
282 * conserve resources. We are guaranteed that the TCB buffer is made up of only
283 * 1 DMA cookie; and therefore consumes only one descriptor on the controller.
284 *
285 * Furthermore, if the frame requires HW offloads such as LSO, tunneling or
 * filtering, then the TX data descriptors must be preceded by a single TX
287 * context descriptor. Because there is no DMA transfer associated with the
288 * context descriptor, we allocate a control block with a special type which
289 * indicates to the TX ring recycle code that there are no associated DMA
290 * resources to unbind when the control block is free'd.
291 *
292 * If we don't have enough space in the ring or TX control blocks available,
293 * then we'll return the unprocessed message block to MAC. This will induce flow
294 * control and once we recycle enough entries, we'll once again enable sending
295 * on the ring.
296 *
297 * We size the working list as equal to the number of descriptors in the ring.
298 * We size the free list as equal to 1.5 times the number of descriptors in the
299 * ring. We'll allocate a number of TX control block entries equal to the number
300 * of entries in the free list. By default, all entries are placed in the free
301 * list. As we come along and try to send something, we'll allocate entries from
302 * the free list and add them to the working list, where they'll stay until the
303 * hardware indicates that all of the data has been written back to us. The
304 * reason that we start with 1.5x is to help facilitate having more than one TX
305 * buffer associated with the DMA activity.
306 *
307 * --------------------
308 * Managing the TX Ring
309 * --------------------
310 *
311 * The transmit descriptor ring is driven by us. We maintain our own notion of a
312 * HEAD and TAIL register and we update the hardware with updates to the TAIL
313 * register. When the hardware is done writing out data, it updates us by
314 * writing back to a specific address, not by updating the individual
315 * descriptors. That address is a 4-byte region after the main transmit
316 * descriptor ring. This is why the descriptor ring has an extra descriptor's
317 * worth allocated to it.
318 *
319 * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and
331 * has been freed, we'll return it to the list.
332 *
333 * The transmit control block free list is managed by keeping track of the
334 * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to
335 * index into the free list and add things to it. In effect, we always push and
336 * pop from the tail and protect it with a single lock,
337 * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not
338 * stand up to further performance testing; however, it does allow us to get off
339 * the ground with the device driver.
340 *
341 * The following image describes where a given transmit control block lives in
342 * its lifetime:
343 *
344 * |
345 * * ... Initial placement for all tcb's
346 * |
347 * v
348 * +------------------+ +------------------+
349 * | tcb on free list |---*------------------>| tcb on work list |
350 * +------------------+ . +------------------+
351 * ^ . N tcbs allocated[1] |
352 * | to send frame v
353 * | or fragment on |
354 * | wire, mblk from |
355 * | MAC associated. |
356 * | |
357 * +------*-------------------------------<----+
358 * .
359 * . Hardware indicates
360 * entry transmitted.
361 * tcbs recycled, mblk
362 * from MAC freed.
363 *
364 * [1] We allocate N tcbs to transmit a single frame where N can be 1 context
365 * descriptor plus 1 data descriptor, in the non-DMA-bind case. In the DMA
366 * bind case, N can be 1 context descriptor plus 1 data descriptor per
367 * b_cont in the mblk. In this case, the mblk is associated with the first
368 * data descriptor and freed as part of freeing that data descriptor.
369 *
370 * ------------
371 * Blocking MAC
372 * ------------
373 *
374 * When performing transmit, we can run out of descriptors and ring entries.
375 * When such a case happens, we return the mblk_t to MAC to indicate that we've
376 * been blocked. At that point in time, MAC becomes blocked and will not
377 * transmit anything out that specific ring until we notify MAC. To indicate
378 * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member
379 * to B_TRUE.
380 *
381 * When we recycle TX descriptors then we'll end up signaling MAC by calling
382 * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
383 * start sending frames out to us again.
384 */
385
386 /*
387 * We set our DMA alignment requests based on the smallest supported page size
388 * of the corresponding platform.
389 */
390 #if defined(__sparc)
391 #define I40E_DMA_ALIGNMENT 0x2000ull
392 #elif defined(__x86)
393 #define I40E_DMA_ALIGNMENT 0x1000ull
394 #else
395 #error "unknown architecture for i40e"
396 #endif
397
398 /*
399 * This structure is used to maintain information and flags related to
400 * transmitting a frame. These fields are ultimately used to construct the
401 * TX data descriptor(s) and, if necessary, the TX context descriptor.
402 */
403 typedef struct i40e_tx_context {
404 enum i40e_tx_desc_cmd_bits itc_data_cmdflags;
405 uint32_t itc_data_offsets;
406 enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags;
407 uint32_t itc_ctx_tsolen;
408 uint32_t itc_ctx_mss;
409 } i40e_tx_context_t;
410
411 /*
412 * Toggles on debug builds which can be used to override our RX behaviour based
413 * on thresholds.
414 */
415 #ifdef DEBUG
416 typedef enum {
417 I40E_DEBUG_RX_DEFAULT = 0,
418 I40E_DEBUG_RX_BCOPY = 1,
419 I40E_DEBUG_RX_DMABIND = 2
420 } i40e_debug_rx_t;
421
422 i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
423 #endif /* DEBUG */
424
425 /*
426 * Notes on the following pair of DMA attributes. The first attribute,
427 * i40e_static_dma_attr, is designed to be used for both the descriptor rings
428 * and the static buffers that we associate with control blocks. For this
429 * reason, we force an SGL length of one. While technically the driver supports
430 * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our
431 * management here. In addition, when the Intel common code wants to allocate
432 * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
433 * the static dma attr.
434 *
 * The latter two sets of attributes are what we use when we're binding a
436 * bunch of mblk_t fragments to go out the door. Note that the main difference
437 * here is that we're allowed a larger SGL length. For non-LSO TX, we
438 * restrict the SGL length to match the number of TX buffers available to the
439 * PF (8). For the LSO case we can go much larger, with the caveat that each
440 * MSS-sized chunk (segment) must not span more than 8 data descriptors and
441 * hence must not span more than 8 cookies.
442 *
 * Note, we default to setting ourselves to be FMA capable here. However,
444 * because we could have multiple instances which have different FMA error
445 * checking capabilities, or end up on different buses, we make these static
446 * and const and copy them into the i40e_t for the given device with the actual
447 * values that reflect the actual capabilities.
448 */
449 static const ddi_dma_attr_t i40e_g_static_dma_attr = {
450 DMA_ATTR_V0, /* version number */
451 0x0000000000000000ull, /* low address */
452 0xFFFFFFFFFFFFFFFFull, /* high address */
453 0x00000000FFFFFFFFull, /* dma counter max */
454 I40E_DMA_ALIGNMENT, /* alignment */
455 0x00000FFF, /* burst sizes */
456 0x00000001, /* minimum transfer size */
457 0x00000000FFFFFFFFull, /* maximum transfer size */
458 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */
459 1, /* scatter/gather list length */
460 0x00000001, /* granularity */
461 DDI_DMA_FLAGERR /* DMA flags */
462 };
463
464 static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
465 DMA_ATTR_V0, /* version number */
466 0x0000000000000000ull, /* low address */
467 0xFFFFFFFFFFFFFFFFull, /* high address */
468 I40E_MAX_TX_BUFSZ - 1, /* dma counter max */
469 I40E_DMA_ALIGNMENT, /* alignment */
470 0x00000FFF, /* burst sizes */
471 0x00000001, /* minimum transfer size */
472 0x00000000FFFFFFFFull, /* maximum transfer size */
473 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */
474 I40E_TX_MAX_COOKIE, /* scatter/gather list length */
475 0x00000001, /* granularity */
476 DDI_DMA_FLAGERR /* DMA flags */
477 };
478
479 static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = {
480 DMA_ATTR_V0, /* version number */
481 0x0000000000000000ull, /* low address */
482 0xFFFFFFFFFFFFFFFFull, /* high address */
483 I40E_MAX_TX_BUFSZ - 1, /* dma counter max */
484 I40E_DMA_ALIGNMENT, /* alignment */
485 0x00000FFF, /* burst sizes */
486 0x00000001, /* minimum transfer size */
487 0x00000000FFFFFFFFull, /* maximum transfer size */
488 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */
489 I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */
490 0x00000001, /* granularity */
491 DDI_DMA_FLAGERR /* DMA flags */
492 };
493
494 /*
495 * Next, we have the attributes for these structures. The descriptor rings are
496 * all strictly little endian, while the data buffers are just arrays of bytes
497 * representing frames. Because of this, we purposefully simplify the driver
498 * programming life by programming the descriptor ring as little endian, while
499 * for the buffer data we keep it as unstructured.
500 *
 * Note that, to keep the Intel common code operating in a reasonable way, when
502 * we allocate DMA memory for it, we do not use byte swapping and thus use the
503 * standard i40e_buf_acc_attr.
504 */
505 static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
506 DDI_DEVICE_ATTR_V0,
507 DDI_STRUCTURE_LE_ACC,
508 DDI_STRICTORDER_ACC
509 };
510
511 static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
512 DDI_DEVICE_ATTR_V0,
513 DDI_NEVERSWAP_ACC,
702
703 static boolean_t
704 i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
705 {
706 i40e_rx_data_t *rxd;
707
708 rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP);
709 if (rxd == NULL)
710 return (B_FALSE);
711 itrq->itrq_rxdata = rxd;
712 rxd->rxd_i40e = i40e;
713
714 rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
715 rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;
716
717 rxd->rxd_rcb_free = rxd->rxd_free_list_size;
718
719 rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
720 rxd->rxd_ring_size, KM_NOSLEEP);
721 if (rxd->rxd_work_list == NULL) {
722 i40e_error(i40e, "failed to allocate RX work list for a ring "
723 "of %d entries for ring %d", rxd->rxd_ring_size,
724 itrq->itrq_index);
725 goto cleanup;
726 }
727
728 rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
729 rxd->rxd_free_list_size, KM_NOSLEEP);
730 if (rxd->rxd_free_list == NULL) {
731 i40e_error(i40e, "failed to allocate a %d entry RX free list "
732 "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
733 goto cleanup;
734 }
735
736 rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
737 (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
738 if (rxd->rxd_rcb_area == NULL) {
739 i40e_error(i40e, "failed to allocate a %d entry rcb area for "
740 "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
741 itrq->itrq_index);
742 goto cleanup;
743 }
744
745 return (B_TRUE);
746
747 cleanup:
748 i40e_free_rx_data(rxd);
749 itrq->itrq_rxdata = NULL;
750 return (B_FALSE);
751 }
799 atomic_inc_32(&rxd->rxd_rcb_pending);
800 atomic_inc_32(&i40e->i40e_rx_pending);
801 }
802 }
803 mutex_exit(&i40e->i40e_rx_pending_lock);
804 }
805
806 /*
807 * Initialize the DMA memory for the descriptor ring and for each frame in the
808 * control block list.
809 */
810 static boolean_t
811 i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
812 {
813 int i, count;
814 size_t dmasz;
815 i40e_rx_control_block_t *rcb;
816 i40e_t *i40e = rxd->rxd_i40e;
817
818 /*
819 * First allocate the RX descriptor ring.
820 */
821 dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
822 VERIFY(dmasz > 0);
823 if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
824 &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
825 B_TRUE, dmasz) == B_FALSE) {
826 i40e_error(i40e, "failed to allocate DMA resources "
827 "for RX descriptor ring");
828 return (B_FALSE);
829 }
830 rxd->rxd_desc_ring =
831 (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
832 rxd->rxd_desc_next = 0;
833
834 count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
835 rcb = rxd->rxd_rcb_area;
836
837 dmasz = i40e->i40e_rx_buf_size;
838 VERIFY(dmasz > 0);
839 for (i = 0; i < count; i++, rcb++) {
840 i40e_dma_buffer_t *dmap;
841 VERIFY(rcb != NULL);
842
843 if (i < rxd->rxd_ring_size) {
844 rxd->rxd_work_list[i] = rcb;
845 } else {
846 rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb;
847 }
848
849 dmap = &rcb->rcb_dma;
850 if (i40e_alloc_dma_buffer(i40e, dmap,
851 &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
852 B_TRUE, B_FALSE, dmasz) == B_FALSE) {
853 i40e_error(i40e, "failed to allocate RX dma buffer");
854 return (B_FALSE);
855 }
856
857 /*
858 * Initialize the control block and offset the DMA address. See
859 * the note in the big theory statement that explains how this
860 * helps IP deal with alignment. Note, we don't worry about
861 * whether or not we successfully get an mblk_t from desballoc,
862 * it's a common case that we have to handle later on in the
863 * system.
864 */
865 dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT;
866 dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
867 dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
868
869 rcb->rcb_ref = 1;
870 rcb->rcb_rxd = rxd;
871 rcb->rcb_free_rtn.free_func = i40e_rx_recycle;
872 rcb->rcb_free_rtn.free_arg = (caddr_t)rcb;
873 rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address,
875 }
876
877 return (B_TRUE);
878 }
879
880 static void
881 i40e_free_tx_dma(i40e_trqpair_t *itrq)
882 {
883 size_t fsz;
884
885 if (itrq->itrq_tcb_area != NULL) {
886 uint32_t i;
887 i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area;
888
889 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
890 i40e_free_dma_buffer(&tcb->tcb_dma);
891 if (tcb->tcb_dma_handle != NULL) {
892 ddi_dma_free_handle(&tcb->tcb_dma_handle);
893 tcb->tcb_dma_handle = NULL;
894 }
895 if (tcb->tcb_lso_dma_handle != NULL) {
896 ddi_dma_free_handle(&tcb->tcb_lso_dma_handle);
897 tcb->tcb_lso_dma_handle = NULL;
898 }
899 }
900
901 fsz = sizeof (i40e_tx_control_block_t) *
902 itrq->itrq_tx_free_list_size;
903 kmem_free(itrq->itrq_tcb_area, fsz);
904 itrq->itrq_tcb_area = NULL;
905 }
906
907 if (itrq->itrq_tcb_free_list != NULL) {
908 fsz = sizeof (i40e_tx_control_block_t *) *
909 itrq->itrq_tx_free_list_size;
910 kmem_free(itrq->itrq_tcb_free_list, fsz);
911 itrq->itrq_tcb_free_list = NULL;
912 }
913
914 if (itrq->itrq_tcb_work_list != NULL) {
915 fsz = sizeof (i40e_tx_control_block_t *) *
916 itrq->itrq_tx_ring_size;
917 kmem_free(itrq->itrq_tcb_work_list, fsz);
918 itrq->itrq_tcb_work_list = NULL;
919 }
920
921 i40e_free_dma_buffer(&itrq->itrq_desc_area);
922 itrq->itrq_desc_ring = NULL;
923
924 }
925
926 static boolean_t
927 i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
928 {
929 int i, ret;
930 size_t dmasz;
931 i40e_tx_control_block_t *tcb;
932 i40e_t *i40e = itrq->itrq_i40e;
933
934 itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
935 itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
936 (i40e->i40e_tx_ring_size >> 1);
937
938 /*
939 * Allocate an additional TX descriptor for the writeback head.
940 */
941 dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
942 dmasz += sizeof (i40e_tx_desc_t);
943
944 VERIFY(dmasz > 0);
945 if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
946 &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
947 B_FALSE, B_TRUE, dmasz) == B_FALSE) {
948 i40e_error(i40e, "failed to allocate DMA resources for TX "
949 "descriptor ring");
950 return (B_FALSE);
951 }
952 itrq->itrq_desc_ring =
953 (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
954 itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
955 itrq->itrq_tx_ring_size);
956 itrq->itrq_desc_head = 0;
957 itrq->itrq_desc_tail = 0;
958 itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
959
960 itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
961 sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
962 if (itrq->itrq_tcb_work_list == NULL) {
963 i40e_error(i40e, "failed to allocate a %d entry TX work list "
964 "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
965 goto cleanup;
966 }
967
968 itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
970 if (itrq->itrq_tcb_free_list == NULL) {
971 i40e_error(i40e, "failed to allocate a %d entry TX free list "
972 "for ring %d", itrq->itrq_tx_free_list_size,
973 itrq->itrq_index);
974 goto cleanup;
975 }
976
977 /*
978 * We allocate enough TX control blocks to cover the free list.
979 */
980 itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
981 itrq->itrq_tx_free_list_size, KM_NOSLEEP);
982 if (itrq->itrq_tcb_area == NULL) {
983 i40e_error(i40e, "failed to allocate a %d entry tcb area for "
984 "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
985 goto cleanup;
986 }
987
988 /*
989 * For each tcb, allocate DMA memory.
990 */
991 dmasz = i40e->i40e_tx_buf_size;
992 VERIFY(dmasz > 0);
993 tcb = itrq->itrq_tcb_area;
994 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
995 VERIFY(tcb != NULL);
996
997 /*
998 * Allocate both a DMA buffer which we'll use for when we copy
999 * packets for transmission and allocate a DMA handle which
1000 * we'll use when we bind data.
1001 */
1002 ret = ddi_dma_alloc_handle(i40e->i40e_dip,
1003 &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
1004 &tcb->tcb_dma_handle);
1005 if (ret != DDI_SUCCESS) {
1006 i40e_error(i40e, "failed to allocate DMA handle for TX "
1007 "data binding on ring %d: %d", itrq->itrq_index,
1008 ret);
1009 tcb->tcb_dma_handle = NULL;
1010 goto cleanup;
1011 }
1012
1013 ret = ddi_dma_alloc_handle(i40e->i40e_dip,
1014 &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL,
1015 &tcb->tcb_lso_dma_handle);
1016 if (ret != DDI_SUCCESS) {
1017 i40e_error(i40e, "failed to allocate DMA handle for TX "
1018 "LSO data binding on ring %d: %d", itrq->itrq_index,
1019 ret);
1020 tcb->tcb_lso_dma_handle = NULL;
1021 goto cleanup;
1022 }
1023
1024 if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
1025 &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
1026 B_TRUE, B_FALSE, dmasz) == B_FALSE) {
1027 i40e_error(i40e, "failed to allocate %ld bytes of "
1028 "DMA for TX data binding on ring %d", dmasz,
1029 itrq->itrq_index);
1030 goto cleanup;
1031 }
1032
1033 itrq->itrq_tcb_free_list[i] = tcb;
1034 }
1035
1036 itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
1037
1038 return (B_TRUE);
1039
1040 cleanup:
1041 i40e_free_tx_dma(itrq);
1042 return (B_FALSE);
1043 }
1044
1045 /*
1046 * Free all memory associated with all of the rings on this i40e instance. Note,
1047 * this is done as part of the GLDv3 stop routine.
1048 */
1049 void
1050 i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init)
1051 {
1052 int i;
1053
1054 for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
1055 i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata;
1056
1057 /*
1058 * In some cases i40e_alloc_rx_data() may have failed
1059 * and in that case there is no rxd to free.
1060 */
1061 if (rxd == NULL)
1062 continue;
1063
1064 /*
1065 * Clean up our RX data. We have to free DMA resources first and
1066 * then if we have no more pending RCB's, then we'll go ahead
1067 * and clean things up. Note, we can't set the stopped flag on
1068 * the RX data until after we've done the first pass of the
1069 * pending resources. Otherwise we might race with
1070 * i40e_rx_recycle on determining who should free the
1071 * i40e_rx_data_t above.
1072 */
1073 i40e_free_rx_dma(rxd, failed_init);
1074
1075 mutex_enter(&i40e->i40e_rx_pending_lock);
1076 rxd->rxd_shutdown = B_TRUE;
1077 if (rxd->rxd_rcb_pending == 0) {
1078 i40e_free_rx_data(rxd);
1079 i40e->i40e_trqpairs[i].itrq_rxdata = NULL;
1080 }
1081 mutex_exit(&i40e->i40e_rx_pending_lock);
1082
1083 i40e_free_tx_dma(&i40e->i40e_trqpairs[i]);
1084 }
1085 }
1086
1087 /*
1088 * Allocate all of the resources associated with all of the rings on this i40e
1111 return (B_TRUE);
1112
1113 unwind:
1114 i40e_free_ring_mem(i40e, B_TRUE);
1115 return (B_FALSE);
1116 }
1117
1118
1119 /*
1120 * Because every instance of i40e may have different support for FMA
1121 * capabilities, we copy the DMA attributes into the i40e_t and set them that
1122 * way and use them for determining attributes.
1123 */
1124 void
1125 i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
1126 {
1127 bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
1128 sizeof (ddi_dma_attr_t));
1129 bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
1130 sizeof (ddi_dma_attr_t));
1131 bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr,
1132 sizeof (ddi_dma_attr_t));
1133 bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
1134 sizeof (ddi_device_acc_attr_t));
1135 bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
1136 sizeof (ddi_device_acc_attr_t));
1137
1138 if (fma == B_TRUE) {
1139 i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1140 i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1141 i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |=
1142 DDI_DMA_FLAGERR;
1143 } else {
1144 i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1145 i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1146 i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &=
1147 ~DDI_DMA_FLAGERR;
1148 }
1149 }
1150
1151 static void
1152 i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
1153 {
1154 mutex_enter(&rxd->rxd_free_lock);
1155 ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
1156 ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL);
1157 rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb;
1158 rxd->rxd_rcb_free++;
1159 mutex_exit(&rxd->rxd_free_lock);
1160 }
1161
1162 static i40e_rx_control_block_t *
1163 i40e_rcb_alloc(i40e_rx_data_t *rxd)
1164 {
1165 i40e_rx_control_block_t *rcb;
1166
1167 mutex_enter(&rxd->rxd_free_lock);
1168 if (rxd->rxd_rcb_free == 0) {
1169 mutex_exit(&rxd->rxd_free_lock);
1170 return (NULL);
1171 }
1172 rxd->rxd_rcb_free--;
1173 rcb = rxd->rxd_free_list[rxd->rxd_rcb_free];
1174 VERIFY(rcb != NULL);
1175 rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
1176 mutex_exit(&rxd->rxd_free_lock);
1177
1178 return (rcb);
1179 }
1180
1181 /*
1182 * This is the callback that we get from the OS when freemsg(9F) has been called
1183 * on a loaned descriptor. In addition, if we take the last reference count
1184 * here, then we have to tear down all of the RX data.
1185 */
1186 void
1187 i40e_rx_recycle(caddr_t arg)
1188 {
1189 uint32_t ref;
1190 i40e_rx_control_block_t *rcb;
1191 i40e_rx_data_t *rxd;
1192 i40e_t *i40e;
1193
1194 /* LINTED: E_BAD_PTR_CAST_ALIGN */
1195 rcb = (i40e_rx_control_block_t *)arg;
1196 rxd = rcb->rcb_rxd;
1197 i40e = rxd->rxd_i40e;
1198
1199 /*
1200 * It's possible for this to be called with a reference count of zero.
1201 * That will happen when we're doing the freemsg after taking the last
1202 * reference because we're tearing down everything and this rcb is not
1203 * outstanding.
1204 */
1830 case IPPROTO_UDP:
1831 l4len = sizeof (struct udphdr);
1832 break;
1833 case IPPROTO_SCTP:
1834 l4len = sizeof (sctp_hdr_t);
1835 break;
1836 default:
1837 return (0);
1838 }
1839
1840 meoi->meoi_l4hlen = l4len;
1841 meoi->meoi_flags |= MEOI_L4INFO_SET;
1842 return (0);
1843 }
1844
1845 /*
 * Attempt to put together the information we'll need to feed into a descriptor
1847 * to properly program the hardware for checksum offload as well as the
1848 * generally required flags.
1849 *
1850 * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to
1851 * 'or' into the descriptor based on the checksum flags for this mblk_t and the
1852 * actual information we care about.
1853 *
1854 * If the mblk requires LSO then we'll also gather the information that will be
1855 * used to construct the Transmit Context Descriptor.
1856 */
1857 static int
1858 i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
1859 mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx)
1860 {
1861 uint32_t chkflags, start, mss, lsoflags;
1862 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
1863
1864 bzero(tctx, sizeof (i40e_tx_context_t));
1865
1866 if (i40e->i40e_tx_hcksum_enable != B_TRUE)
1867 return (0);
1868
1869 mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags);
1870 mac_lso_get(mp, &mss, &lsoflags);
1871
1872 if (chkflags == 0 && lsoflags == 0)
1873 return (0);
1874
1875 /*
	 * Have we been asked to checksum an IPv4 header? If so, verify that we
1877 * have sufficient information and then set the proper fields in the
1878 * command structure.
1879 */
1880 if (chkflags & HCK_IPV4_HDRCKSUM) {
1881 if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1882 txs->itxs_hck_nol2info.value.ui64++;
1883 return (-1);
1884 }
1885 if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1886 txs->itxs_hck_nol3info.value.ui64++;
1887 return (-1);
1888 }
1889 if (meo->meoi_l3proto != ETHERTYPE_IP) {
1890 txs->itxs_hck_badl3.value.ui64++;
1891 return (-1);
1892 }
1893 tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
1894 tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1895 I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1896 tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1897 I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1898 }
1899
1900 /*
	 * If we've been asked to provide an L4 checksum, first set up the IP
	 * information in the descriptor if we haven't already, before moving
	 * on to checking whether we have enough information for the L4
	 * checksum offload.
1905 */
1906 if (chkflags & HCK_PARTIALCKSUM) {
1907 if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) {
1908 txs->itxs_hck_nol4info.value.ui64++;
1909 return (-1);
1910 }
1911
1912 if (!(chkflags & HCK_IPV4_HDRCKSUM)) {
1913 if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1914 txs->itxs_hck_nol2info.value.ui64++;
1915 return (-1);
1916 }
1917 if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1918 txs->itxs_hck_nol3info.value.ui64++;
1919 return (-1);
1920 }
1921
1922 if (meo->meoi_l3proto == ETHERTYPE_IP) {
1923 tctx->itc_data_cmdflags |=
1924 I40E_TX_DESC_CMD_IIPT_IPV4;
1925 } else if (meo->meoi_l3proto == ETHERTYPE_IPV6) {
1926 tctx->itc_data_cmdflags |=
1927 I40E_TX_DESC_CMD_IIPT_IPV6;
1928 } else {
1929 txs->itxs_hck_badl3.value.ui64++;
1930 return (-1);
1931 }
1932 tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1933 I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1934 tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1935 I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1936 }
1937
1938 switch (meo->meoi_l4proto) {
1939 case IPPROTO_TCP:
1940 tctx->itc_data_cmdflags |=
1941 I40E_TX_DESC_CMD_L4T_EOFT_TCP;
1942 break;
1943 case IPPROTO_UDP:
1944 tctx->itc_data_cmdflags |=
1945 I40E_TX_DESC_CMD_L4T_EOFT_UDP;
1946 break;
1947 case IPPROTO_SCTP:
1948 tctx->itc_data_cmdflags |=
1949 I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
1950 break;
1951 default:
1952 txs->itxs_hck_badl4.value.ui64++;
1953 return (-1);
1954 }
1955
1956 tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) <<
1957 I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
1958 }
1959
1960 if (lsoflags & HW_LSO) {
1961 /*
1962 * LSO requires that checksum offloads are enabled. If for
1963 * some reason they're not we bail out with an error.
1964 */
1965 if ((chkflags & HCK_IPV4_HDRCKSUM) == 0 ||
1966 (chkflags & HCK_PARTIALCKSUM) == 0) {
1967 txs->itxs_lso_nohck.value.ui64++;
1968 return (-1);
1969 }
1970
1971 tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO;
1972 tctx->itc_ctx_mss = mss;
1973 tctx->itc_ctx_tsolen = msgsize(mp) -
1974 (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen);
1975 }
1976
1977 return (0);
1978 }
1979
1980 static void
1981 i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
1982 {
1983 ASSERT(tcb != NULL);
1984
1985 mutex_enter(&itrq->itrq_tcb_lock);
1986 ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
1987 itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb;
1988 itrq->itrq_tcb_free++;
1989 mutex_exit(&itrq->itrq_tcb_lock);
1990 }
1991
1992 static i40e_tx_control_block_t *
1993 i40e_tcb_alloc(i40e_trqpair_t *itrq)
1994 {
1995 i40e_tx_control_block_t *ret;
1996
2005 itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
2006 mutex_exit(&itrq->itrq_tcb_lock);
2007
2008 ASSERT(ret != NULL);
2009 return (ret);
2010 }
2011
2012 /*
2013 * This should be used to free any DMA resources, associated mblk_t's, etc. It's
2014 * used as part of recycling the message blocks when we have either an interrupt
2015 * or other activity that indicates that we need to take a look.
2016 */
2017 static void
2018 i40e_tcb_reset(i40e_tx_control_block_t *tcb)
2019 {
2020 switch (tcb->tcb_type) {
2021 case I40E_TX_COPY:
2022 tcb->tcb_dma.dmab_len = 0;
2023 break;
2024 case I40E_TX_DMA:
2025 if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0)
2026 (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle);
2027 else if (tcb->tcb_bind_ncookies > 0)
2028 (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
2029 if (tcb->tcb_bind_info != NULL) {
2030 kmem_free(tcb->tcb_bind_info,
2031 tcb->tcb_bind_ncookies *
2032 sizeof (struct i40e_dma_bind_info));
2033 }
2034 tcb->tcb_bind_info = NULL;
2035 tcb->tcb_bind_ncookies = 0;
2036 tcb->tcb_used_lso = B_FALSE;
2037 break;
2038 case I40E_TX_DESC:
2039 break;
2040 case I40E_TX_NONE:
2041 /* Cast to pacify lint */
2042 panic("trying to free tcb %p with bad type none", (void *)tcb);
2043 default:
2044 panic("unknown i40e tcb type: %d", tcb->tcb_type);
2045 }
2046
2047 tcb->tcb_type = I40E_TX_NONE;
2048 if (tcb->tcb_mp != NULL) {
2049 freemsg(tcb->tcb_mp);
2050 tcb->tcb_mp = NULL;
2051 }
2052 tcb->tcb_next = NULL;
2053 }
2054
2055 /*
2056 * This is called as part of shutting down to clean up all outstanding
2057 * descriptors. Similar to recycle, except we don't re-arm anything and instead
2058 * just return control blocks to the free list.
2059 */
2060 void
2061 i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
2062 {
2063 uint32_t index;
2064
2065 ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
2066 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2067
2068 /*
2069 * Because we should have shut down the chip at this point, it should be
2070 * safe to just clean up all the entries between our head and tail.
2071 */
2072 #ifdef DEBUG
2073 index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
2074 I40E_QTX_ENA(itrq->itrq_index));
2075 VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
2076 I40E_QTX_ENA_QENA_STAT_MASK));
2077 #endif
2078
2079 index = itrq->itrq_desc_head;
2080 while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
2081 i40e_tx_control_block_t *tcb;
2082
2083 tcb = itrq->itrq_tcb_work_list[index];
2084 if (tcb != NULL) {
2085 itrq->itrq_tcb_work_list[index] = NULL;
2086 i40e_tcb_reset(tcb);
2087 i40e_tcb_free(itrq, tcb);
2088 }
2089
2090 bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
2091 index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
2092 itrq->itrq_desc_free++;
2093 }
2094
2095 ASSERT(index == itrq->itrq_desc_tail);
2096 itrq->itrq_desc_head = index;
2097 }
2098
2099 /*
2100 * We're here either by hook or by crook. We need to see if there are transmit
2101 * descriptors available for us to go and clean up and return to the hardware.
2102 * We may also be blocked, and if so, we should make sure that we let it know
2103 * we're good to go.
2104 */
2105 void
2106 i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
2107 {
2108 uint32_t wbhead, toclean, count;
2109 i40e_tx_control_block_t *tcbhead;
2110 i40e_t *i40e = itrq->itrq_i40e;
2111 uint_t desc_per_tcb, i;
2112
2113 mutex_enter(&itrq->itrq_tx_lock);
2114
2115 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2116 if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
2117 if (itrq->itrq_tx_blocked == B_TRUE) {
2118 itrq->itrq_tx_blocked = B_FALSE;
2119 mac_tx_ring_update(i40e->i40e_mac_hdl,
2120 itrq->itrq_mactxring);
2121 itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2122 }
2123 mutex_exit(&itrq->itrq_tx_lock);
2124 return;
2125 }
2126
2127 /*
	 * Now we need to try and see if there's anything available. The
	 * hardware will write to the head location and it guarantees that it
	 * does not use relaxed ordering.
2131 */
2139 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2140 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2141 return;
2142 }
2143
2144 wbhead = *itrq->itrq_desc_wbhead;
2145 toclean = itrq->itrq_desc_head;
2146 count = 0;
2147 tcbhead = NULL;
2148
2149 while (toclean != wbhead) {
2150 i40e_tx_control_block_t *tcb;
2151
2152 tcb = itrq->itrq_tcb_work_list[toclean];
2153 itrq->itrq_tcb_work_list[toclean] = NULL;
2154 ASSERT(tcb != NULL);
2155 tcb->tcb_next = tcbhead;
2156 tcbhead = tcb;
2157
2158 /*
2159 * In the DMA bind case, there may not necessarily be a 1:1
2160 * mapping between tcb's and descriptors. If the tcb type
2161 * indicates a DMA binding then check the number of DMA
2162 * cookies to determine how many entries to clean in the
2163 * descriptor ring.
2164 */
2165 if (tcb->tcb_type == I40E_TX_DMA)
2166 desc_per_tcb = tcb->tcb_bind_ncookies;
2167 else
2168 desc_per_tcb = 1;
2169
2170 for (i = 0; i < desc_per_tcb; i++) {
2171 /*
2172 * We zero this out for sanity purposes.
2173 */
2174 bzero(&itrq->itrq_desc_ring[toclean],
2175 sizeof (i40e_tx_desc_t));
2176 toclean = i40e_next_desc(toclean, 1,
2177 itrq->itrq_tx_ring_size);
2178 count++;
2179 }
2180 }
2181
2182 itrq->itrq_desc_head = wbhead;
2183 itrq->itrq_desc_free += count;
2184 itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
2185 ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2186
2187 if (itrq->itrq_tx_blocked == B_TRUE &&
2188 itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
2189 itrq->itrq_tx_blocked = B_FALSE;
2190
2191 mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
2192 itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2193 }
2194
2195 mutex_exit(&itrq->itrq_tx_lock);
2196
2197 /*
2198 * Now clean up the tcb.
2199 */
2200 while (tcbhead != NULL) {
2201 i40e_tx_control_block_t *tcb = tcbhead;
2202
2203 tcbhead = tcb->tcb_next;
2204 i40e_tcb_reset(tcb);
2205 i40e_tcb_free(itrq, tcb);
2206 }
2207
2208 DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
2209 }
2210
2211 static void
2212 i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp,
2213 const size_t off, const size_t len)
2214 {
2215 const void *soff = mp->b_rptr + off;
2216 void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
2217
2218 ASSERT3U(len, >, 0);
2219 ASSERT3P(soff, >=, mp->b_rptr);
2220 ASSERT3P(soff, <=, mp->b_wptr);
2221 ASSERT3U(len, <=, MBLKL(mp));
2222 ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr);
2223 ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len);
2224 bcopy(soff, doff, len);
2225 tcb->tcb_type = I40E_TX_COPY;
2226 tcb->tcb_dma.dmab_len += len;
2227 I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
2228 }
2229
2230 static i40e_tx_control_block_t *
2231 i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp,
2232 size_t off, boolean_t use_lso)
2233 {
2234 ddi_dma_handle_t dma_handle;
2235 ddi_dma_cookie_t dma_cookie;
2236 uint_t i = 0, ncookies = 0, dmaflags;
2237 i40e_tx_control_block_t *tcb;
2238 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2239
2240 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2241 txs->itxs_err_notcb.value.ui64++;
2242 return (NULL);
2243 }
2244 tcb->tcb_type = I40E_TX_DMA;
2245
2246 if (use_lso == B_TRUE)
2247 dma_handle = tcb->tcb_lso_dma_handle;
2248 else
2249 dma_handle = tcb->tcb_dma_handle;
2250
2251 dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING;
2252 if (ddi_dma_addr_bind_handle(dma_handle, NULL,
2253 (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags,
2254 DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) {
2255 txs->itxs_bind_fails.value.ui64++;
2256 goto bffail;
2257 }
2258
2259 tcb->tcb_bind_ncookies = ncookies;
2260 tcb->tcb_used_lso = use_lso;
2261
2262 tcb->tcb_bind_info =
2263 kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info),
2264 KM_NOSLEEP);
2265 if (tcb->tcb_bind_info == NULL)
2266 goto bffail;
2267
2268 while (i < ncookies) {
2269 if (i > 0)
2270 ddi_dma_nextcookie(dma_handle, &dma_cookie);
2271
2272 tcb->tcb_bind_info[i].dbi_paddr =
2273 (caddr_t)dma_cookie.dmac_laddress;
2274 tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size;
2275 }
2276
2277 return (tcb);
2278
2279 bffail:
2280 i40e_tcb_reset(tcb);
2281 i40e_tcb_free(itrq, tcb);
2282 return (NULL);
2283 }
2284
2285 static void
2286 i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx,
2287 caddr_t buff, size_t len, boolean_t last_desc)
2288 {
2289 i40e_tx_desc_t *txdesc;
2290 int cmd;
2291
2292 ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
2293 itrq->itrq_desc_free--;
2294 txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
2295 itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
2296 itrq->itrq_tx_ring_size);
2297
2298 cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags;
2299
2300 /*
2301 * The last data descriptor needs the EOP bit set, so that the HW knows
2302 * that we're ready to send. Additionally, we set the RS (Report
2303 * Status) bit, so that we are notified when the transmit engine has
2304 * completed DMA'ing all of the data descriptors and data buffers
2305 * associated with this frame.
2306 */
2307 if (last_desc == B_TRUE) {
2308 cmd |= I40E_TX_DESC_CMD_EOP;
2309 cmd |= I40E_TX_DESC_CMD_RS;
2310 }
2311
2312 /*
2313 * Per the X710 manual, section 8.4.2.1.1, the buffer size
2314 * must be a value from 1 to 16K minus 1, inclusive.
2315 */
2316 ASSERT3U(len, >=, 1);
2317 ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ - 1);
2318
2319 txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff);
2320 txdesc->cmd_type_offset_bsz =
2321 LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA |
2322 ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
2323 ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
2324 ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
2325 }
2326
2327 /*
2328 * Place 'tcb' on the tail of the list represented by 'head'/'tail'.
2329 */
2330 static inline void
2331 tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail,
2332 i40e_tx_control_block_t *tcb)
2333 {
2334 if (*head == NULL) {
2335 *head = tcb;
2336 *tail = *head;
2337 } else {
2338 ASSERT3P(*tail, !=, NULL);
2339 ASSERT3P((*tail)->tcb_next, ==, NULL);
2340 (*tail)->tcb_next = tcb;
2341 *tail = tcb;
2342 }
2343 }
2344
2345 /*
2346 * This function takes a single packet, possibly consisting of
2347 * multiple mblks, and creates a TCB chain to send to the controller.
2348 * This TCB chain may span up to a maximum of 8 descriptors. A copy
2349 * TCB consumes one descriptor; whereas a DMA TCB may consume 1 or
 * more, depending on several factors. For each fragment (individual
2351 * mblk making up the packet), we determine if its size dictates a
2352 * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a
2353 * count of descriptors used; when that count reaches the max we force
2354 * all remaining fragments into a single TCB buffer. We have a
2355 * guarantee that the TCB buffer is always larger than the MTU -- so
2356 * there is always enough room. Consecutive fragments below the DMA
2357 * threshold are copied into a single TCB. In the event of an error
2358 * this function returns NULL but leaves 'mp' alone.
2359 */
2360 static i40e_tx_control_block_t *
2361 i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc)
2362 {
2363 const mblk_t *nmp = mp;
2364 uint_t needed_desc = 0;
2365 boolean_t force_copy = B_FALSE;
2366 i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
2367 i40e_t *i40e = itrq->itrq_i40e;
2368 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2369
2370 /* TCB buffer is always larger than MTU. */
2371 ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size);
2372
2373 while (nmp != NULL) {
2374 const size_t nmp_len = MBLKL(nmp);
2375
2376 /* Ignore zero-length mblks. */
2377 if (nmp_len == 0) {
2378 nmp = nmp->b_cont;
2379 continue;
2380 }
2381
2382 if (nmp_len < i40e->i40e_tx_dma_min || force_copy) {
2383 /* Compress consecutive copies into one TCB. */
2384 if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) {
2385 i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
2386 nmp = nmp->b_cont;
2387 continue;
2388 }
2389
2390 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2391 txs->itxs_err_notcb.value.ui64++;
2392 goto fail;
2393 }
2394
2395 /*
2396 * TCB DMA buffer is guaranteed to be one
2397 * cookie by i40e_alloc_dma_buffer().
2398 */
2399 i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
2400 needed_desc++;
2401 tcb_list_append(&tcbhead, &tcbtail, tcb);
2402 } else {
2403 uint_t total_desc;
2404
2405 tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE);
2406 if (tcb == NULL) {
2407 i40e_error(i40e, "dma bind failed!");
2408 goto fail;
2409 }
2410
2411 /*
2412 * If the new total exceeds the max or we've
2413 * reached the limit and there's data left,
2414 * then give up binding and copy the rest into
2415 * the pre-allocated TCB buffer.
2416 */
2417 total_desc = needed_desc + tcb->tcb_bind_ncookies;
2418 if ((total_desc > I40E_TX_MAX_COOKIE) ||
2419 (total_desc == I40E_TX_MAX_COOKIE &&
2420 nmp->b_cont != NULL)) {
2421 i40e_tcb_reset(tcb);
2422 i40e_tcb_free(itrq, tcb);
2423
2424 if (tcbtail != NULL &&
2425 tcbtail->tcb_type == I40E_TX_COPY) {
2426 tcb = tcbtail;
2427 } else {
2428 tcb = NULL;
2429 }
2430
2431 force_copy = B_TRUE;
2432 txs->itxs_force_copy.value.ui64++;
2433 continue;
2434 }
2435
2436 needed_desc += tcb->tcb_bind_ncookies;
2437 tcb_list_append(&tcbhead, &tcbtail, tcb);
2438 }
2439
2440 nmp = nmp->b_cont;
2441 }
2442
2443 ASSERT3P(nmp, ==, NULL);
2444 ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE);
2445 ASSERT3P(tcbhead, !=, NULL);
2446 *ndesc += needed_desc;
2447 return (tcbhead);
2448
2449 fail:
2450 tcb = tcbhead;
2451 while (tcb != NULL) {
2452 i40e_tx_control_block_t *next = tcb->tcb_next;
2453
2454 ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2455 tcb->tcb_type == I40E_TX_COPY);
2456
2457 tcb->tcb_mp = NULL;
2458 i40e_tcb_reset(tcb);
2459 i40e_tcb_free(itrq, tcb);
2460 tcb = next;
2461 }
2462
2463 return (NULL);
2464 }
2465
2466 /*
2467 * Section 8.4.1 of the 700-series programming guide states that a
2468 * segment may span up to 8 data descriptors; including both header
2469 * and payload data. However, empirical evidence shows that the
2470 * controller freezes the Tx queue when presented with a segment of 8
2471 * descriptors. Or, at least, when the first segment contains 8
2472 * descriptors. One explanation is that the controller counts the
2473 * context descriptor against the first segment, even though the
2474 * programming guide makes no mention of such a constraint. In any
2475 * case, we limit TSO segments to 7 descriptors to prevent Tx queue
2476 * freezes. We still allow non-TSO segments to utilize all 8
2477 * descriptors as they have not demonstrated the faulty behavior.
2478 */
2479 uint_t i40e_lso_num_descs = 7;
2480
2481 #define I40E_TCB_LEFT(tcb) \
2482 ((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len)
2483
2484 /*
2485 * This function is similar in spirit to i40e_non_lso_chain(), but
2486 * much more complicated in reality. Like the previous function, it
2487 * takes a packet (an LSO packet) as input and returns a chain of
2488 * TCBs. The complication comes with the fact that we are no longer
2489 * trying to fit the entire packet into 8 descriptors, but rather we
2490 * must fit each MSS-size segment of the LSO packet into 8 descriptors.
2491 * Except it's really 7 descriptors, see i40e_lso_num_descs.
2492 *
2493 * Your first inclination might be to verify that a given segment
2494 * spans no more than 7 mblks; but it's actually much more subtle than
2495 * that. First, let's describe what the hardware expects, and then we
2496 * can expound on the software side of things.
2497 *
2498 * For an LSO packet the hardware expects the following:
2499 *
2500 * o Each MSS-sized segment must span no more than 7 descriptors.
2501 *
2502 * o The header size does not count towards the segment size.
2503 *
2504 * o If header and payload share the first descriptor, then the
2505 * controller will count the descriptor twice.
2506 *
2507 * The most important thing to keep in mind is that the hardware does
2508 * not view the segments in terms of mblks, like we do. The hardware
2509 * only sees descriptors. It will iterate each descriptor in turn,
2510 * keeping a tally of bytes seen and descriptors visited. If the byte
2511 * count hasn't reached MSS by the time the descriptor count reaches
2512 * 7, then the controller freezes the queue and we are stuck.
2513 * Furthermore, the hardware picks up its tally where it left off. So
2514 * if it reached MSS in the middle of a descriptor, it will start
2515 * tallying the next segment in the middle of that descriptor. The
2516 * hardware's view is entirely removed from the mblk chain or even the
2517 * descriptor layout. Consider these facts:
2518 *
2519 * o The MSS will vary dpeneding on MTU and other factors.
2520 *
2521 * o The dblk allocation will sit at various offsets within a
2522 * memory page.
2523 *
2524 * o The page size itself could vary in the future (i.e. not
2525 * always 4K).
2526 *
2527 * o Just because a dblk is virtually contiguous doesn't mean
2528 * it's physically contiguous. The number of cookies
2529 * (descriptors) required by a DMA bind of a single dblk is at
2530 * the mercy of the page size and physical layout.
2531 *
2532 * o The descriptors will most often NOT start/end on a MSS
2533 * boundary. Thus the hardware will often start counting the
2534 * MSS mid descriptor and finish mid descriptor.
2535 *
2536 * The upshot of all this is that the driver must learn to think like
2537 * the controller; and verify that none of the constraints are broken.
2538 * It does this by tallying up the segment just like the hardware
2539 * would. This is handled by the two variables 'segsz' and 'segdesc'.
 * After each attempt to bind a dblk, we check the constraints. If
2541 * violated, we undo the DMA and force a copy until MSS is met. We
2542 * have a guarantee that the TCB buffer is larger than MTU; thus
2543 * ensuring we can always meet the MSS with a single copy buffer. We
2544 * also copy consecutive non-DMA fragments into the same TCB buffer.
2545 */
2546 static i40e_tx_control_block_t *
2547 i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp,
2548 const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx,
2549 uint_t *ndesc)
2550 {
2551 size_t mp_len = MBLKL(mp);
2552 /*
2553 * The cpoff (copy offset) variable tracks the offset inside
2554 * the current mp. There are cases where the entire mp is not
2555 * fully copied in one go: such as the header copy followed by
2556 * a non-DMA mblk, or a TCB buffer that only has enough space
2557 * to copy part of the current mp.
2558 */
2559 size_t cpoff = 0;
2560 /*
2561 * The segsz and segdesc variables track the controller's view
2562 * of the segment. The needed_desc variable tracks the total
2563 * number of data descriptors used by the driver.
2564 */
2565 size_t segsz = 0;
2566 uint_t segdesc = 0;
2567 uint_t needed_desc = 0;
2568 const size_t hdrlen =
2569 meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen;
2570 const size_t mss = tctx->itc_ctx_mss;
2571 boolean_t force_copy = B_FALSE;
2572 i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
2573 i40e_t *i40e = itrq->itrq_i40e;
2574 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2575
2576 /*
2577 * We always copy the header in order to avoid more
2578 * complicated code dealing with various edge cases.
2579 */
2580 ASSERT3U(MBLKL(mp), >=, hdrlen);
2581 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2582 txs->itxs_err_notcb.value.ui64++;
2583 goto fail;
2584 }
2585 needed_desc++;
2586
2587 tcb_list_append(&tcbhead, &tcbtail, tcb);
2588 i40e_tx_copy_fragment(tcb, mp, 0, hdrlen);
2589 cpoff += hdrlen;
2590
2591 /*
2592 * A single descriptor containing both header and data is
2593 * counted twice by the controller.
2594 */
2595 if ((mp_len > hdrlen && mp_len < i40e->i40e_tx_dma_min) ||
2596 (mp->b_cont != NULL &&
2597 MBLKL(mp->b_cont) < i40e->i40e_tx_dma_min)) {
2598 segdesc = 2;
2599 } else {
2600 segdesc = 1;
2601 }
2602
2603 /* If this fragment was pure header, then move to the next one. */
2604 if (cpoff == mp_len) {
2605 mp = mp->b_cont;
2606 cpoff = 0;
2607 }
2608
2609 while (mp != NULL) {
2610 mp_len = MBLKL(mp);
2611 force_copy:
2612 /* Ignore zero-length mblks. */
2613 if (mp_len == 0) {
2614 mp = mp->b_cont;
2615 cpoff = 0;
2616 continue;
2617 }
2618
2619 /*
2620 * We copy into the preallocated TCB buffer when the
2621 * current fragment is less than the DMA threshold OR
2622 * when the DMA bind can't meet the controller's
2623 * segment descriptor limit.
2624 */
2625 if (mp_len < i40e->i40e_tx_dma_min || force_copy) {
2626 size_t tocopy;
2627
2628 /*
2629 * Our objective here is to compress
2630 * consecutive copies into one TCB (until it
2631 * is full). If there is no current TCB, or if
2632 * it is a DMA TCB, then allocate a new one.
2633 */
2634 if (tcb == NULL ||
2635 tcb->tcb_type != I40E_TX_COPY) {
2636 if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
2637 txs->itxs_err_notcb.value.ui64++;
2638 goto fail;
2639 }
2640
2641 /*
2642 * The TCB DMA buffer is guaranteed to
2643 * be one cookie by i40e_alloc_dma_buffer().
2644 */
2645 needed_desc++;
2646 segdesc++;
2647 ASSERT3U(segdesc, <=, i40e_lso_num_descs);
2648 tcb_list_append(&tcbhead, &tcbtail, tcb);
2649 }
2650
2651 tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff);
2652 i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy);
2653 cpoff += tocopy;
2654 segsz += tocopy;
2655
2656 /* We have consumed the current mp. */
2657 if (cpoff == mp_len) {
2658 mp = mp->b_cont;
2659 cpoff = 0;
2660 }
2661
2662 /* We have consumed the current TCB buffer. */
2663 if (I40E_TCB_LEFT(tcb) == 0) {
2664 tcb = NULL;
2665 }
2666
2667 /*
2668 * We have met MSS with this copy; restart the
2669 * counters.
2670 */
2671 if (segsz >= mss) {
2672 segsz = segsz % mss;
2673 segdesc = segsz == 0 ? 0 : 1;
2674 force_copy = B_FALSE;
2675 }
2676
2677 /*
2678 * We are at the controller's descriptor
2679 * limit; we must copy into the current TCB
2680 * until MSS is reached. The TCB buffer is
2681 * always bigger than the MTU so we know it is
2682 * big enough to meet the MSS.
2683 */
2684 if (segdesc == i40e_lso_num_descs) {
2685 force_copy = B_TRUE;
2686 }
2687 } else {
2688 uint_t tsegdesc = segdesc;
2689 size_t tsegsz = segsz;
2690
2691 ASSERT(force_copy == B_FALSE);
2692 ASSERT3U(tsegdesc, <, i40e_lso_num_descs);
2693
2694 tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE);
2695 if (tcb == NULL) {
2696 i40e_error(i40e, "dma bind failed!");
2697 goto fail;
2698 }
2699
2700 for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) {
2701 struct i40e_dma_bind_info dbi =
2702 tcb->tcb_bind_info[i];
2703
2704 tsegsz += dbi.dbi_len;
2705 tsegdesc++;
2706 ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
2707
2708 /*
2709 * We've met the MSS with this portion
2710 * of the DMA.
2711 */
2712 if (tsegsz >= mss) {
2713 tsegdesc = 1;
2714 tsegsz = tsegsz % mss;
2715 }
2716
2717 /*
2718 * We've reached max descriptors but
2719 * have not met the MSS. Undo the bind
2720 * and instead copy.
2721 */
2722 if (tsegdesc == i40e_lso_num_descs) {
2723 i40e_tcb_reset(tcb);
2724 i40e_tcb_free(itrq, tcb);
2725
2726 if (tcbtail != NULL &&
2727 tcbtail->tcb_type == I40E_TX_COPY &&
2728 I40E_TCB_LEFT(tcbtail) > 0) {
2729 tcb = tcbtail;
2730 } else {
2731 tcb = NULL;
2732 }
2733
2734 /*
2735 * Remember, we are still on
2736 * the same mp.
2737 */
2738 force_copy = B_TRUE;
2739 txs->itxs_tso_force_copy.value.ui64++;
2740 goto force_copy;
2741 }
2742 }
2743
2744 ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
2745 ASSERT3U(tsegsz, <, mss);
2746
2747 /*
2748 * We've made it through the loop without
2749 * breaking the segment descriptor contract
2750 * with the controller -- replace the segment
2751 * tracking values with the temporary ones.
2752 */
2753 segdesc = tsegdesc;
2754 segsz = tsegsz;
2755 needed_desc += tcb->tcb_bind_ncookies;
2756 cpoff = 0;
2757 tcb_list_append(&tcbhead, &tcbtail, tcb);
2758 mp = mp->b_cont;
2759 }
2760 }
2761
2762 ASSERT3P(mp, ==, NULL);
2763 ASSERT3P(tcbhead, !=, NULL);
2764 *ndesc += needed_desc;
2765 return (tcbhead);
2766
2767 fail:
2768 tcb = tcbhead;
2769 while (tcb != NULL) {
2770 i40e_tx_control_block_t *next = tcb->tcb_next;
2771
2772 ASSERT(tcb->tcb_type == I40E_TX_DMA ||
2773 tcb->tcb_type == I40E_TX_COPY);
2774
2775 tcb->tcb_mp = NULL;
2776 i40e_tcb_reset(tcb);
2777 i40e_tcb_free(itrq, tcb);
2778 tcb = next;
2779 }
2780
2781 return (NULL);
2782 }
2783
2784 /*
2785 * We've been asked to send a message block on the wire. We'll only have a
2786 * single chain. There will not be any b_next pointers; however, there may be
2787 * multiple b_cont blocks. The number of b_cont blocks may exceed the
2788 * controller's Tx descriptor limit.
2789 *
2790 * We may do one of three things with any given mblk_t chain:
2791 *
2792 * 1) Drop it
2793 * 2) Transmit it
2794 * 3) Return it
2795 *
2796 * If we return it to MAC, then MAC will flow control on our behalf. In other
2797 * words, it won't send us anything until we tell it that it's okay to send us
2798 * something.
2799 */
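/*
 * A note on case 3: when the mblk is returned, the ring is also marked
 * blocked (itrq_tx_blocked, set near the bottom of this function). The Tx
 * descriptor recycle path is then responsible for clearing that flag and
 * telling MAC that it may resume sending. Roughly (the mac handle and
 * ring handle member names below are assumptions for illustration; the
 * authoritative version lives in the recycle code):
 *
 *	if (itrq->itrq_tx_blocked &&
 *	    itrq->itrq_desc_free >= i40e->i40e_tx_block_thresh) {
 *		itrq->itrq_tx_blocked = B_FALSE;
 *		mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
 *	}
 */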
2800 mblk_t *
2801 i40e_ring_tx(void *arg, mblk_t *mp)
2802 {
2803 size_t msglen;
2804 i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL;
2805 i40e_tx_context_desc_t *ctxdesc;
2806 mac_ether_offload_info_t meo;
2807 i40e_tx_context_t tctx;
2808 int type;
2809 uint_t needed_desc = 0;
2810 boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE;
2811
2812 i40e_trqpair_t *itrq = arg;
2813 i40e_t *i40e = itrq->itrq_i40e;
2814 i40e_hw_t *hw = &i40e->i40e_hw_space;
2815 i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2816
2817 ASSERT(mp->b_next == NULL);
2818
2819 if (!(i40e->i40e_state & I40E_STARTED) ||
2820 (i40e->i40e_state & I40E_OVERTEMP) ||
2821 (i40e->i40e_state & I40E_SUSPENDED) ||
2822 (i40e->i40e_state & I40E_ERROR) ||
2823 (i40e->i40e_link_state != LINK_STATE_UP)) {
2824 freemsg(mp);
2825 return (NULL);
2826 }
2827
2828 if (mac_ether_offload_info(mp, &meo) != 0) {
2829 freemsg(mp);
2830 itrq->itrq_txstat.itxs_hck_meoifail.value.ui64++;
2831 return (NULL);
2832 }
2833
2834 /*
2835 * Figure out the relevant context about this frame that we might need
2836 * for enabling checksum, LSO, etc. This also fills in information that
2837 * we might set around the packet type, etc.
2838 */
2839 if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) {
2840 freemsg(mp);
2841 itrq->itrq_txstat.itxs_err_context.value.ui64++;
2842 return (NULL);
2843 }
2844 if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
2845 use_lso = B_TRUE;
2846 do_ctx_desc = B_TRUE;
2847 }
2848
2849 /*
2850 * For the primordial driver we can punt on doing any recycling right
2851 * now; however, longer term we will probably need to do more proactive
2852 * recycling to cut back on stalls in the TX path.
2853 */
2854
2855 msglen = msgsize(mp);
2856
2857 if (do_ctx_desc) {
2858 /*
2859 * If we're doing tunneling or LSO, then we'll need a TX
2860 * context descriptor in addition to one or more TX data
2861 * descriptors. Since there's no data DMA block or handle
2862 * associated with the context descriptor, we create a special
2863 * control block that behaves effectively like a NOP.
2864 */
2865 if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) {
2866 txs->itxs_err_notcb.value.ui64++;
2867 goto txfail;
2868 }
2869 tcb_ctx->tcb_type = I40E_TX_DESC;
2870 needed_desc++;
2871 }
2872
2873 if (!use_lso) {
2874 tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc);
2875 } else {
2876 tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc);
2877 }
2878
2879 if (tcbhead == NULL)
2880 goto txfail;
2881
2882 tcbhead->tcb_mp = mp;
2883
2884 /*
2885 * The second condition ensures that 'itrq_desc_tail' never
2886 * equals 'itrq_desc_head'. This enforces the rule found in
2887 * the second bullet point of section 8.4.3.1.5 of the XL710
2888 * PG, which declares the TAIL pointer in I40E_QTX_TAIL should
2889 * never overlap with the head. This means that we only ever
2890 * have 'itrq_tx_ring_size - 1' total available descriptors.
2891 */
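/*
 * Concretely: with the default 1024-entry ring at most 1023 descriptors
 * are ever usable, and the frame is accepted only if (itrq_desc_free - 1)
 * covers needed_desc and the free count is at least the blocking
 * threshold.
 */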
2892 mutex_enter(&itrq->itrq_tx_lock);
2893 if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh ||
2894 (itrq->itrq_desc_free - 1) < needed_desc) {
2895 txs->itxs_err_nodescs.value.ui64++;
2896 mutex_exit(&itrq->itrq_tx_lock);
2897 goto txfail;
2898 }
2899
2900 if (do_ctx_desc) {
2901 /*
2902 * If we're enabling any offloads for this frame, then we'll
2903 * need to build up a transmit context descriptor, first. The
2904 * context descriptor needs to be placed in the TX ring before
2905 * the data descriptor(s). See section 8.4.2, table 8-16.
2906 */
2907 uint_t tail = itrq->itrq_desc_tail;
2908 itrq->itrq_desc_free--;
2909 ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail];
2910 itrq->itrq_tcb_work_list[tail] = tcb_ctx;
2911 itrq->itrq_desc_tail = i40e_next_desc(tail, 1,
2912 itrq->itrq_tx_ring_size);
2913
2914 /* QW0 */
2915 type = I40E_TX_DESC_DTYPE_CONTEXT;
2916 ctxdesc->tunneling_params = 0;
2917 ctxdesc->l2tag2 = 0;
2918
2919 /* QW1 */
2920 ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type);
2921 if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
2922 ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t)
2923 ((uint64_t)tctx.itc_ctx_cmdflags <<
2924 I40E_TXD_CTX_QW1_CMD_SHIFT) |
2925 ((uint64_t)tctx.itc_ctx_tsolen <<
2926 I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
2927 ((uint64_t)tctx.itc_ctx_mss <<
2928 I40E_TXD_CTX_QW1_MSS_SHIFT));
2929 }
2930 }
2931
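/*
 * Walk the TCB chain and emit one data descriptor per copy buffer, or
 * one per DMA cookie for bound fragments. Only the final descriptor of
 * the frame is flagged as the last one; i40e_tx_set_data_desc() uses
 * that flag for its end-of-packet handling.
 */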
2932 tcb = tcbhead;
2933 while (tcb != NULL) {
2934
2935 itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
2936 if (tcb->tcb_type == I40E_TX_COPY) {
2937 boolean_t last_desc = (tcb->tcb_next == NULL);
2938
2939 i40e_tx_set_data_desc(itrq, &tctx,
2940 (caddr_t)tcb->tcb_dma.dmab_dma_address,
2941 tcb->tcb_dma.dmab_len, last_desc);
2942 } else {
2943 boolean_t last_desc = B_FALSE;
2944 ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA);
2945
2946 for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) {
2947 last_desc = (c == tcb->tcb_bind_ncookies - 1) &&
2948 (tcb->tcb_next == NULL);
2949
2950 i40e_tx_set_data_desc(itrq, &tctx,
2951 tcb->tcb_bind_info[c].dbi_paddr,
2952 tcb->tcb_bind_info[c].dbi_len,
2953 last_desc);
2954 }
2955 }
2956
2957 tcb = tcb->tcb_next;
2958 }
2959
2960 /*
2961 * Now, finally, sync the DMA data and alert hardware.
2962 */
2963 I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);
2964
2965 I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
2966 itrq->itrq_desc_tail);
2967
2968 if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
2969 DDI_FM_OK) {
2970 /*
2971 * Note that we can't really clean this up at this point, because the
2972 * memory has already been handed to the device, so just indicate that
2973 * it's been transmitted.
2974 */
2975 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2976 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2977 }
2978
2979 txs->itxs_bytes.value.ui64 += msglen;
2980 txs->itxs_packets.value.ui64++;
2981 txs->itxs_descriptors.value.ui64 += needed_desc;
2982
2983 mutex_exit(&itrq->itrq_tx_lock);
2984
2985 return (NULL);
2986
2987 txfail:
2988 /*
2989 * We ran out of resources. Return the mblk to MAC and mark the ring
2990 * blocked so that MAC holds off until we signal it may resume. If any
2991 * TCBs were allocated, release them to the free list now, making sure
2992 * to clear their message block pointers since the mblk goes back to MAC.
2993 */
2994 if (tcb_ctx != NULL) {
2995 tcb_ctx->tcb_mp = NULL;
2996 i40e_tcb_reset(tcb_ctx);
2997 i40e_tcb_free(itrq, tcb_ctx);
2998 }
2999
3000 tcb = tcbhead;
3001 while (tcb != NULL) {
3002 i40e_tx_control_block_t *next = tcb->tcb_next;
3003
3004 ASSERT(tcb->tcb_type == I40E_TX_DMA ||
3005 tcb->tcb_type == I40E_TX_COPY);
3006
3007 tcb->tcb_mp = NULL;
3008 i40e_tcb_reset(tcb);
3009 i40e_tcb_free(itrq, tcb);
3010 tcb = next;
3011 }
3012
3013 mutex_enter(&itrq->itrq_tx_lock);
3014 itrq->itrq_tx_blocked = B_TRUE;
3015 mutex_exit(&itrq->itrq_tx_lock);
3016
3017 return (mp);
3018 }
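/*
 * For reference, i40e_ring_tx() is the per-ring transmit entry point that
 * the driver hands to MAC when its Tx rings are registered. A minimal
 * sketch of that wiring, with illustrative names (the real registration
 * lives in the GLDv3 glue code, not here):
 *
 *	static void
 *	i40e_example_fill_tx_ring(void *arg, mac_ring_type_t rtype,
 *	    const int group_index, const int ring_index,
 *	    mac_ring_info_t *infop, mac_ring_handle_t rh)
 *	{
 *		i40e_trqpair_t *itrq = ...;
 *
 *		infop->mri_driver = (mac_ring_driver_t)itrq;
 *		infop->mri_tx = i40e_ring_tx;
 *	}
 */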