1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
  14  * Copyright 2016 Joyent, Inc.
  15  */
  16 
  17 #include "i40e_sw.h"
  18 
  19 /*
  20  * ---------------------------------------------------------
  21  * Buffer and Memory Management, Receiving, and Transmitting
  22  * ---------------------------------------------------------
  23  *
  24  * Each physical function (PF), which is what we think of as an instance of the
  25  * device driver, has a series of associated transmit and receive queue pairs.
  26  * Effectively, what we think of in MAC as rings. Each of these has their own
  27  * ring of descriptors which is used as part of doing DMA activity.
  28  *
  29  * The transmit ring of descriptors are 16-byte entries which are used to send
  30  * packets, program filters, etc. The receive ring of descriptors are either
  31  * 16-byte or 32-bytes each. At the moment, we opt to use the larger descriptor
  32  * format so that we're in a better position if we ever want to leverage that
  33  * information later on.
  34  *
  35  * However, these rings are just for descriptors, they don't talk or deal with
  36  * how we actually store the memory that we need for DMA or the associated
  37  * information that we need for keeping track of message blocks. To correspond
  38  * to the hardware descriptor ring which is how we communicate with hardware, we
  39  * introduce a control block which keeps track of our required metadata like DMA
  40  * mappings.
  41  *
  42  * There are two main considerations that dictate how much memory and buffers
  43  * we end up allocating. Those are:
  44  *
  45  *   o The size of the ring (controlled through the driver.conf file)
  46  *
  47  *   o The maximum size frame we can receive.
  48  *
  49  * The size of the rings currently defaults to 1024 descriptors and is stored in
  50  * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size.
  51  *
  52  * While the size of the rings is controlled by the driver.conf, the maximum
  53  * size frame is informed primarily through the use of dladm and the setting of
  54  * the MTU property on the device. From the MTU, we then go and do some
  55  * machinations. The first thing we do is we then have to add in space for the
  56  * Ethernet header, potentially a VLAN header, and the FCS check. This value is
  57  * what's stored as i40e_t`i40e_frame_max and is derived any time
  58  * i40e_t`i40e_sdu changes.
  59  *
  60  * This size is then rounded up to the nearest 1k chunk, which represents the
  61  * actual amount of memory that we'll allocate for a single frame.
  62  *
  63  * Note, that for rx, we do something that might be unexpected. We always add
  64  * an extra two bytes to the frame size that we allocate. We then offset the DMA
  65  * address that we receive a packet into by two bytes. This ensures that the IP
  66  * header will always be 4 byte aligned because the MAC header is either 14 or
  67  * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
  68  * and MAC's lives easier.
  69  *
  70  * Both the rx and tx descriptor rings (which are what we use to communicate
  71  * with hardware) are allocated as a single region of DMA memory which is the
  72  * size of the descriptor (4 bytes and 2 bytes respectively) times the total
  73  * number of descriptors for an rx and tx ring.
  74  *
  75  * While the rx and tx descriptors are allocated using DMA-based memory, the
  76  * control blocks for each of them are allocated using normal kernel memory.
  77  * They aren't special from a DMA perspective. We'll go over the design of both
  78  * receiving and transmitting separately, as they have slightly different
  79  * control blocks and different ways that we manage the relationship between
  80  * control blocks and descriptors.
  81  *
  82  * ---------------------------------
  83  * RX Descriptors and Control Blocks
  84  * ---------------------------------
  85  *
  86  * For every descriptor in the ring that the driver has, we need some associated
  87  * memory, which means that we need to have the receive specific control block.
  88  * We have a couple different, but related goals:
  89  *
  90  *   o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do
  91  *     not want to do any additional memory allocations or DMA allocations if
  92  *     we don't have to.
  93  *
  94  *   o We'd like to try and do as much zero-copy as possible, while taking into
  95  *     account the cost of mapping in DMA resources.
  96  *
  97  *   o We'd like to have every receive descriptor available.
  98  *
  99  * Now, these rules are a bit in tension with one another. The act of mapping in
 100  * is an exercise of trying to find the break-even point between page table
 101  * updates and bcopy. We currently start by using the same metrics that ixgbe
 102  * used; however, it should be known that this value has effectively been
 103  * cargo-culted across to yet another driver, sorry.
 104  *
 105  * If we receive a packet which is larger than our copy threshold, we'll create
 106  * a message block out of the DMA memory via desballoc(9F) and send that up to
 107  * MAC that way. This will cause us to be notified when the message block is
 108  * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
 109  * it's less than the threshold, we'll try to use allocb and bcopy it into the
 110  * block, thus allowing us to immediately reuse the DMA resource. Note, on debug
 111  * builds, we allow someone to whack the variable i40e_debug_rx_mode to override
 112  * the behavior and always do a bcopy or a DMA bind.
 113  *
 114  * To try and ensure that the device always has blocks that it can receive data
 115  * into, we maintain two lists of control blocks, a working list and a free
 116  * list. Each list is sized equal to the number of descriptors in the rx ring.
 117  * During the GLDv3 mc_start routine, we allocate a number of rx control blocks
 118  * equal to twice the number of descriptors in the ring and we assign them
 119  * equally to the free list and to the working list. Each control block also has
 120  * DMA memory allocated and associated with which it will be used to receive the
 121  * actual packet data. All of a received frame's data will end up in a single
 122  * DMA buffer.
 123  *
 124  * During operation, we always maintain the invariant that each rx descriptor
 125  * has an associated rx control block which lives in the working list. If we
 126  * feel that we should loan up DMA memory to MAC in the form of a message block,
 127  * we can only do so if we can maintain this invariant. To do that, we swap in
 128  * one of the buffers from the free list. If none are available, then we resort
 129  * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
 130  * size.
 131  *
 132  * Loaned message blocks come back to use when freemsg(9F) or freeb(9F) is
 133  * called on the block, at which point we restore the rx control block to the
 134  * free list and are able to reuse the DMA memory again. While the scheme may
 135  * seem odd, it importantly keeps us out of trying to do any DMA allocations in
 136  * the normal path of operation, even though we may still have to allocate
 137  * message blocks and copy.
 138  *
 139  * The following state machine describes the life time of a rx control block. In
 140  * the diagram we abbrviate the rx ring descriptor entry as rxd and the rx
 141  * control block entry as rcb.
 142  *
 143  *             |                                   |
 144  *             * ... 1/2 of all initial rcb's  ... *
 145  *             |                                   |
 146  *             v                                   v
 147  *     +------------------+               +------------------+
 148  *     | rcb on free list |---*---------->| rcb on work list |
 149  *     +------------------+   .           +------------------+
 150  *             ^              . moved to          |
 151  *             |                replace rcb       * . . Frame received,
 152  *             |                loaned to         |     entry on free list
 153  *             |                MAC + co.         |     available. rcb's
 154  *             |                                  |     memory made into mblk_t
 155  *             * . freemsg(9F)                    |     and sent up to MAC.
 156  *             |   called on                      |
 157  *             |   loaned rcb                     |
 158  *             |   and it is                      v
 159  *             |   recycled.              +-------------------+
 160  *             +--------------------<-----| rcb loaned to MAC |
 161  *                                        +-------------------+
 162  *
 163  * Finally, note that every rx control block has a reference count on it. One
 164  * reference is added as long as the driver has had the GLDv3 mc_start endpoint
 165  * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and
 166  * no other DLPI consumers remain, then we'll decrement the reference count by
 167  * one. Whenever we loan up the rx control block and associated buffer to MAC,
 168  * then we bump the reference count again. Even though the device is stopped,
 169  * there may still be loaned frames in upper levels that we'll want to account
 170  * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure
 171  * that it is cleaned up.
 172  *
 173  * --------------------
 174  * Managing the RX Ring
 175  * --------------------
 176  *
 177  * The receive ring descriptors are arranged in a circular buffer with a head
 178  * and tail pointer. There are both the conventional head and tail pointers
 179  * which are used to partition the ring into two portions, a portion that we,
 180  * the operating system, manage and a portion that is managed by hardware. When
 181  * hardware owns a descriptor in the ring, it means that it is waiting for data
 182  * to be filled in. However, when a portion of the ring is owned by the driver,
 183  * then that means that the descriptor has been consumed and we need to go take
 184  * a look at it.
 185  *
 186  * The initial head is configured to be zero by writing it as such in the
 187  * receive queue context in the FPM (function private memory from the host). The
 188  * initial tail is written to be the last descriptor. This is written to via the
 189  * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
 190  * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
 191  * the only values we ever consult ourselves are the TAIL register and our own
 192  * state tracking. Effectively, we cache the HEAD register and then update it
 193  * ourselves based on our work.
 194  *
 195  * When we iterate over the rx descriptors and thus the received frames, we are
 196  * either in an interrupt context or we've been asked by MAC to poll on the
 197  * ring. If we've been asked to poll on the ring, we have a maximum number of
 198  * bytes of mblk_t's to return. If processing an rx descriptor would cause us to
 199  * exceed that count, then we do not process it. When in interrupt context, we
 200  * don't have a strict byte count. However, to ensure liveness, we limit the
 201  * amount of data based on a configuration value
 202  * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
 203  * is based on similar numbers that are used for ixgbe. After some additional
 204  * time in the field, we'll have a sense as to whether or not it should be
 205  * changed.
 206  *
 207  * When processing, we start at our own HEAD pointer
 208  * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
 209  * processing. Every RX descriptor has what's described as the DD bit. This bit
 210  * (the LSB of the second 8-byte word), indicates whether or not the descriptor
 211  * is done.  When we give descriptors to the hardware, this value is always
 212  * zero. When the hardware has finished a descriptor, it will always be one.
 213  *
 214  * The first thing that we check is whether the DD bit indicates that the
 215  * current HEAD is ready. If it isn't, then we're done. That's the primary
 216  * invariant of processing a frame. If it's done, then there are a few other
 217  * things that we want to look at. In the same status word as the DD bit, there
 218  * are two other important bits:
 219  *
 220  *   o End of Packet (EOP)
 221  *   o Error bits
 222  *
 223  * The end of packet indicates that we have reached the last descriptor. Now,
 224  * you might ask when would there be more than one descriptor. The reason for
 225  * that might be due to large receive offload (lro) or header splitting
 226  * functionality, which presently isn't supported in the driver. The error bits
 227  * in the frame are only valid when EOP is set.
 228  *
 229  * If error bits are set on the frame, then we still consume it; however, we
 230  * will not generate an mblk_t to send up to MAC. If there are no error bits
 231  * set, then we'll consume the descriptor either using bcopy or DMA binding. See
 232  * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information
 233  * on how that selection is made.
 234  *
 235  * Regardless of whether we construct an mblk_t or encounter an error, we end up
 236  * resetting the descriptor. This re-arms the descriptor for hardware and in the
 237  * process, we may end up assigning it a new receive control bock. After we do
 238  * this, we always update our HEAD pointer, no matter what.
 239  *
 240  * Finally, once we've consumed as much as we will in a given window, we go and
 241  * update the TAIL register to indicate all the frames we've consumed. We only
 242  * do a single bulk write for the ring.
 243  *
 244  * ---------------------------------
 245  * TX Descriptors and Control Blocks
 246  * ---------------------------------
 247  *
 248  * While the transmit path is similar in spirit to the receive path, it works
 249  * differently due to the fact that all data is originated by the operating
 250  * system and not by the device.
 251  *
 252  * Like rx, there is both a descriptor ring that we use to communicate to the
 253  * driver and which points to the memory used to transmit a frame. Similarly,
 254  * there is a corresponding transmit control block. Each transmit control block
 255  * has a region of DMA memory allocated to it; however, the way we use it
 256  * varies.
 257  *
 258  * The driver is asked to process a single frame at a time. That message block
 259  * may be made up of multiple fragments linked together by the mblk_t`b_cont
 260  * member. The device has a hard limit of up to 8 buffers being allowed for use
 261  * for a single logical frame. For each fragment, we'll try and use an entry
 262  * from the tx descriptor ring and then we'll allocate a corresponding tx
 263  * control block. Depending on the size of the fragment, we may copy it around
 264  * or we might instead try to do DMA binding of the fragment.
 265  *
 266  * If we exceed the number of blocks that fit, we'll try to pull up the block
 267  * and then we'll do a DMA bind and send it out.
 268  *
 269  * If we don't have enough space in the ring or tx control blocks available,
 270  * then we'll return the unprocessed message block to MAC. This will induce flow
 271  * control and once we recycle enough entries, we'll once again enable sending
 272  * on the ring.
 273  *
 274  * We size the working list as equal to the number of descriptors in the ring.
 275  * We size the free list as equal to 1.5 times the number of descriptors in the
 276  * ring. We'll allocate a number of tx control block entries equal to the number
 277  * of entries in the free list. By default, all entries are placed in the free
 278  * list. As we come along and try to send something, we'll allocate entries from
 279  * the free list and add them to the working list, where they'll stay until the
 280  * hardware indicates that all of the data has been written back to us. The
 281  * reason that we start with 1.5x is to help facilitate having more than one TX
 282  * buffer associated with the DMA activity.
 283  *
 284  * --------------------
 285  * Managing the TX Ring
 286  * --------------------
 287  *
 288  * The transmit descriptor ring is driven by us. We maintain our own notion of a
 289  * HEAD and TAIL register and we update the hardware with updates to the TAIL
 290  * register. When the hardware is done writing out data, it updates us by
 291  * writing back to a specific address, not by updating the individual
 292  * descriptors. That address is a 4-byte region after the main transmit
 293  * descriptor ring. This is why the descriptor ring has an extra descriptor's
 294  * worth allocated to it.
 295  *
 296  * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and
 297  * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames,
 298  * we'll update the tail there and in the I40E_QTX_TAIL() register. At various
 299  * points in time, through both interrupts, and our own internal checks, we'll
 300  * sync the write-back head portion of the DMA space. Based on the index it
 301  * reports back, we'll free everything between our current HEAD and the
 302  * indicated index and update HEAD to the new index.
 303  *
 304  * When a frame comes in, we try to use a number of transmit control blocks and
 305  * we'll transition them from the free list to the work list. They'll get moved
 306  * to the entry on the work list that corresponds with the transmit descriptor
 307  * they correspond to. Once we are indicated that the corresponding descriptor
 308  * has been freed, we'll return it to the list.
 309  *
 310  * The transmit control block free list is managed by keeping track of the
 311  * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to
 312  * index into the free list and add things to it. In effect, we always push and
 313  * pop from the tail and protect it with a single lock,
 314  * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not
 315  * stand up to further performance testing; however, it does allow us to get off
 316  * the ground with the device driver.
 317  *
 318  * The following image describes where a given transmit control block lives in
 319  * its lifetime:
 320  *
 321  *             |
 322  *             * ... Initial placement for all tcb's
 323  *             |
 324  *             v
 325  *    +------------------+                       +------------------+
 326  *    | tcb on free list |---*------------------>| tcb on work list |
 327  *    +------------------+   .                   +------------------+
 328  *             ^             . tcb allocated               |
 329  *             |               to send frame               v
 330  *             |               or fragment on              |
 331  *             |               wire, mblk from             |
 332  *             |               MAC associated.             |
 333  *             |                                           |
 334  *             +------*-------------------------------<----+
 335  *                    .
 336  *                    . Hardware indicates
 337  *                      entry transmitted.
 338  *                      tcb recycled, mblk
 339  *                      from MAC freed.
 340  *
 341  * ------------
 342  * Blocking MAC
 343  * ------------
 344  *
 345  * Wen performing transmit, we can run out of descriptors and ring entries. When
 346  * such a case happens, we return the mblk_t to MAC to indicate that we've been
 347  * blocked. At that point in time, MAC becomes blocked and will not transmit
 348  * anything out that specific ring until we notify MAC. To indicate that we're
 349  * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE.
 350  *
 351  * When we recycle tx descriptors then we'll end up signaling MAC by calling
 352  * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
 353  * start sending frames out to us again.
 354  */
 355 
 356 /*
 357  * We set our DMA alignment requests based on the smallest supported page size
 358  * of the corresponding platform.
 359  */
 360 #if     defined(__sparc)
 361 #define I40E_DMA_ALIGNMENT 0x2000ull
 362 #elif defined(__x86)
 363 #define I40E_DMA_ALIGNMENT 0x1000ull
 364 #else
 365 #error  "unknown architecture for i40e"
 366 #endif
 367 
 368 /*
 369  * This structure is used to maintain information and flags related to
 370  * transmitting a frame. The first member is the set of flags we need to or into
 371  * the command word (generally checksumming related). The second member controls
 372  * the word offsets which is required for IP and L4 checksumming.
 373  */
 374 typedef struct i40e_tx_context {
 375         enum i40e_tx_desc_cmd_bits      itc_cmdflags;
 376         uint32_t                        itc_offsets;
 377 } i40e_tx_context_t;
 378 
 379 /*
 380  * Toggles on debug builds which can be used to override our RX behaviour based
 381  * on thresholds.
 382  */
 383 #ifdef  DEBUG
 384 typedef enum {
 385         I40E_DEBUG_RX_DEFAULT   = 0,
 386         I40E_DEBUG_RX_BCOPY     = 1,
 387         I40E_DEBUG_RX_DMABIND   = 2
 388 } i40e_debug_rx_t;
 389 
 390 i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
 391 #endif  /* DEBUG */
 392 
 393 /*
 394  * Notes on the following pair of DMA attributes. The first attribute,
 395  * i40e_static_dma_attr, is designed to be used for both the descriptor rings
 396  * and the static buffers that we associate with control blocks. For this
 397  * reason, we force an SGL length of one. While technically the driver supports
 398  * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our
 399  * management here. In addition, when the Intel common code wants to allocate
 400  * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
 401  * the static dma attr.
 402  *
 403  * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're
 404  * binding a bunch of mblk_t fragments to go out the door. Note that the main
 405  * difference here is that we're allowed a larger SGL length -- eight.
 406  *
 407  * Note, we default to setting ourselves to be DMA capable here. However,
 408  * because we could have multiple instances which have different FMA error
 409  * checking capabilities, or end up on different buses, we make these static
 410  * and const and copy them into the i40e_t for the given device with the actual
 411  * values that reflect the actual capabilities.
 412  */
 413 static const ddi_dma_attr_t i40e_g_static_dma_attr = {
 414         DMA_ATTR_V0,                    /* version number */
 415         0x0000000000000000ull,          /* low address */
 416         0xFFFFFFFFFFFFFFFFull,          /* high address */
 417         0x00000000FFFFFFFFull,          /* dma counter max */
 418         I40E_DMA_ALIGNMENT,             /* alignment */
 419         0x00000FFF,                     /* burst sizes */
 420         0x00000001,                     /* minimum transfer size */
 421         0x00000000FFFFFFFFull,          /* maximum transfer size */
 422         0xFFFFFFFFFFFFFFFFull,          /* maximum segment size */
 423         1,                              /* scatter/gather list length */
 424         0x00000001,                     /* granularity */
 425         DDI_DMA_FLAGERR                 /* DMA flags */
 426 };
 427 
 428 static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
 429         DMA_ATTR_V0,                    /* version number */
 430         0x0000000000000000ull,          /* low address */
 431         0xFFFFFFFFFFFFFFFFull,          /* high address */
 432         0x00000000FFFFFFFFull,          /* dma counter max */
 433         I40E_DMA_ALIGNMENT,             /* alignment */
 434         0x00000FFF,                     /* burst sizes */
 435         0x00000001,                     /* minimum transfer size */
 436         0x00000000FFFFFFFFull,          /* maximum transfer size */
 437         0xFFFFFFFFFFFFFFFFull,          /* maximum segment size  */
 438         I40E_TX_MAX_COOKIE,             /* scatter/gather list length */
 439         0x00000001,                     /* granularity */
 440         DDI_DMA_FLAGERR                 /* DMA flags */
 441 };
 442 
 443 /*
 444  * Next, we have the attributes for these structures. The descriptor rings are
 445  * all strictly little endian, while the data buffers are just arrays of bytes
 446  * representing frames. Because of this, we purposefully simplify the driver
 447  * programming life by programming the descriptor ring as little endian, while
 448  * for the buffer data we keep it as unstructured.
 449  *
 450  * Note, that to keep the Intel common code operating in a reasonable way, when
 451  * we allocate DMA memory for it, we do not use byte swapping and thus use the
 452  * standard i40e_buf_acc_attr.
 453  */
 454 static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
 455         DDI_DEVICE_ATTR_V0,
 456         DDI_STRUCTURE_LE_ACC,
 457         DDI_STRICTORDER_ACC
 458 };
 459 
 460 static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
 461         DDI_DEVICE_ATTR_V0,
 462         DDI_NEVERSWAP_ACC,
 463         DDI_STRICTORDER_ACC
 464 };
 465 
 466 /*
 467  * The next two functions are designed to be type-safe versions of macros that
 468  * are used to increment and decrement a descriptor index in the loop. Note,
 469  * these are marked inline to try and keep the data path hot and they were
 470  * effectively inlined in their previous life as macros.
 471  */
 472 static inline int
 473 i40e_next_desc(int base, int count, int size)
 474 {
 475         int out;
 476 
 477         ASSERT(base >= 0);
 478         ASSERT(count > 0);
 479         ASSERT(size > 0);
 480 
 481         if (base + count < size) {
 482                 out = base + count;
 483         } else {
 484                 out = base + count - size;
 485         }
 486 
 487         ASSERT(out >= 0 && out < size);
 488         return (out);
 489 }
 490 
 491 static inline int
 492 i40e_prev_desc(int base, int count, int size)
 493 {
 494         int out;
 495 
 496         ASSERT(base >= 0);
 497         ASSERT(count > 0);
 498         ASSERT(size > 0);
 499 
 500         if (base >= count) {
 501                 out = base - count;
 502         } else {
 503                 out = base - count + size;
 504         }
 505 
 506         ASSERT(out >= 0 && out < size);
 507         return (out);
 508 }
 509 
 510 /*
 511  * Free DMA memory that is represented by a i40e_dma_buffer_t.
 512  */
 513 static void
 514 i40e_free_dma_buffer(i40e_dma_buffer_t *dmap)
 515 {
 516         if (dmap->dmab_dma_address != NULL) {
 517                 VERIFY(dmap->dmab_dma_handle != NULL);
 518                 (void) ddi_dma_unbind_handle(dmap->dmab_dma_handle);
 519                 dmap->dmab_dma_address = NULL;
 520                 dmap->dmab_size = 0;
 521         }
 522 
 523         if (dmap->dmab_acc_handle != NULL) {
 524                 ddi_dma_mem_free(&dmap->dmab_acc_handle);
 525                 dmap->dmab_acc_handle = NULL;
 526                 dmap->dmab_address = NULL;
 527         }
 528 
 529         if (dmap->dmab_dma_handle != NULL) {
 530                 ddi_dma_free_handle(&dmap->dmab_dma_handle);
 531                 dmap->dmab_dma_handle = NULL;
 532         }
 533 
 534         /*
 535          * These should only be set if we have valid handles allocated and
 536          * therefore should always be NULLed out due to the above code. This
 537          * is here to catch us acting sloppy.
 538          */
 539         ASSERT(dmap->dmab_dma_address == NULL);
 540         ASSERT(dmap->dmab_address == NULL);
 541         ASSERT(dmap->dmab_size == 0);
 542         dmap->dmab_len = 0;
 543 }
 544 
 545 /*
 546  * Allocate size bytes of DMA memory based on the passed in attributes. This
 547  * fills in the information in dmap and is designed for all of our single cookie
 548  * allocations.
 549  */
 550 static boolean_t
 551 i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap,
 552     ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream,
 553     boolean_t zero, size_t size)
 554 {
 555         int ret;
 556         uint_t flags;
 557         size_t len;
 558         ddi_dma_cookie_t cookie;
 559         uint_t ncookies;
 560 
 561         if (stream == B_TRUE)
 562                 flags = DDI_DMA_STREAMING;
 563         else
 564                 flags = DDI_DMA_CONSISTENT;
 565 
 566         /*
 567          * Step one: Allocate the DMA handle
 568          */
 569         ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT,
 570             NULL, &dmap->dmab_dma_handle);
 571         if (ret != DDI_SUCCESS) {
 572                 i40e_error(i40e, "failed to allocate dma handle for I/O "
 573                     "buffers: %d", ret);
 574                 dmap->dmab_dma_handle = NULL;
 575                 return (B_FALSE);
 576         }
 577 
 578         /*
 579          * Step two: Allocate the DMA memory
 580          */
 581         ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags,
 582             DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len,
 583             &dmap->dmab_acc_handle);
 584         if (ret != DDI_SUCCESS) {
 585                 i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
 586                     "buffers", size);
 587                 dmap->dmab_address = NULL;
 588                 dmap->dmab_acc_handle = NULL;
 589                 i40e_free_dma_buffer(dmap);
 590                 return (B_FALSE);
 591         }
 592 
 593         /*
 594          * Step three: Optionally zero
 595          */
 596         if (zero == B_TRUE)
 597                 bzero(dmap->dmab_address, len);
 598 
 599         /*
 600          * Step four: Bind the memory
 601          */
 602         ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL,
 603             dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT,
 604             NULL, &cookie, &ncookies);
 605         if (ret != DDI_DMA_MAPPED) {
 606                 i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O "
 607                     "buffers: %d", size, ret);
 608                 i40e_free_dma_buffer(dmap);
 609                 return (B_FALSE);
 610         }
 611 
 612         VERIFY(ncookies == 1);
 613         dmap->dmab_dma_address = cookie.dmac_laddress;
 614         dmap->dmab_size = len;
 615         dmap->dmab_len = 0;
 616         return (B_TRUE);
 617 }
 618 
 619 /*
 620  * This function is called once the last pending rcb has been freed by the upper
 621  * levels of the system.
 622  */
 623 static void
 624 i40e_free_rx_data(i40e_rx_data_t *rxd)
 625 {
 626         VERIFY(rxd->rxd_rcb_pending == 0);
 627 
 628         if (rxd->rxd_rcb_area != NULL) {
 629                 kmem_free(rxd->rxd_rcb_area,
 630                     sizeof (i40e_rx_control_block_t) *
 631                     (rxd->rxd_free_list_size + rxd->rxd_ring_size));
 632                 rxd->rxd_rcb_area = NULL;
 633         }
 634 
 635         if (rxd->rxd_free_list != NULL) {
 636                 kmem_free(rxd->rxd_free_list,
 637                     sizeof (i40e_rx_control_block_t *) *
 638                     rxd->rxd_free_list_size);
 639                 rxd->rxd_free_list = NULL;
 640         }
 641 
 642         if (rxd->rxd_work_list != NULL) {
 643                 kmem_free(rxd->rxd_work_list,
 644                     sizeof (i40e_rx_control_block_t *) *
 645                     rxd->rxd_ring_size);
 646                 rxd->rxd_work_list = NULL;
 647         }
 648 
 649         kmem_free(rxd, sizeof (i40e_rx_data_t));
 650 }
 651 
 652 static boolean_t
 653 i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
 654 {
 655         i40e_rx_data_t *rxd;
 656 
 657         rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP);
 658         if (rxd == NULL)
 659                 return (B_FALSE);
 660         itrq->itrq_rxdata = rxd;
 661         rxd->rxd_i40e = i40e;
 662 
 663         rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
 664         rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;
 665 
 666         rxd->rxd_rcb_free = rxd->rxd_free_list_size;
 667 
 668         rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
 669             rxd->rxd_ring_size, KM_NOSLEEP);
 670         if (rxd->rxd_work_list == NULL) {
 671                 i40e_error(i40e, "failed to allocate rx work list for a ring "
 672                     "of %d entries for ring %d", rxd->rxd_ring_size,
 673                     itrq->itrq_index);
 674                 goto cleanup;
 675         }
 676 
 677         rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
 678             rxd->rxd_free_list_size, KM_NOSLEEP);
 679         if (rxd->rxd_free_list == NULL) {
 680                 i40e_error(i40e, "failed to allocate a %d entry rx free list "
 681                     "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
 682                 goto cleanup;
 683         }
 684 
 685         rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
 686             (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
 687         if (rxd->rxd_rcb_area == NULL) {
 688                 i40e_error(i40e, "failed to allocate a %d entry rcb area for "
 689                     "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
 690                     itrq->itrq_index);
 691                 goto cleanup;
 692         }
 693 
 694         return (B_TRUE);
 695 
 696 cleanup:
 697         i40e_free_rx_data(rxd);
 698         itrq->itrq_rxdata = NULL;
 699         return (B_FALSE);
 700 }
 701 
 702 /*
 703  * Free all of the memory that we've allocated for DMA. Note that we may have
 704  * buffers that we've loaned up to the OS which are still outstanding. We'll
 705  * always free up the descriptor ring, because we no longer need that. For each
 706  * rcb, we'll iterate over it and if we send the reference count to zero, then
 707  * we'll free the message block and DMA related resources. However, if we don't
 708  * take the last one, then we'll go ahead and keep track that we'll have pending
 709  * data and clean it up when we get there.
 710  */
 711 static void
 712 i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init)
 713 {
 714         uint32_t i, count, ref;
 715 
 716         i40e_rx_control_block_t *rcb;
 717         i40e_t *i40e = rxd->rxd_i40e;
 718 
 719         i40e_free_dma_buffer(&rxd->rxd_desc_area);
 720         rxd->rxd_desc_ring = NULL;
 721         rxd->rxd_desc_next = 0;
 722 
 723         mutex_enter(&i40e->i40e_rx_pending_lock);
 724 
 725         rcb = rxd->rxd_rcb_area;
 726         count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
 727 
 728         for (i = 0; i < count; i++, rcb++) {
 729                 VERIFY(rcb != NULL);
 730 
 731                 /*
 732                  * If we're cleaning up from a failed creation attempt, then an
 733                  * entry may never have been assembled which would mean that
 734                  * it's reference count is zero. If we find that, we leave it
 735                  * be, because nothing else should be modifying it at this
 736                  * point. We're not at the point that any more references can be
 737                  * added, just removed.
 738                  */
 739                 if (failed_init == B_TRUE && rcb->rcb_ref == 0)
 740                         continue;
 741 
 742                 ref = atomic_dec_32_nv(&rcb->rcb_ref);
 743                 if (ref == 0) {
 744                         freemsg(rcb->rcb_mp);
 745                         rcb->rcb_mp = NULL;
 746                         i40e_free_dma_buffer(&rcb->rcb_dma);
 747                 } else {
 748                         atomic_inc_32(&rxd->rxd_rcb_pending);
 749                         atomic_inc_32(&i40e->i40e_rx_pending);
 750                 }
 751         }
 752         mutex_exit(&i40e->i40e_rx_pending_lock);
 753 }
 754 
 755 /*
 756  * Initialize the DMA memory for the descriptor ring and for each frame in the
 757  * control block list.
 758  */
 759 static boolean_t
 760 i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
 761 {
 762         int i, count;
 763         size_t dmasz;
 764         i40e_rx_control_block_t *rcb;
 765         i40e_t *i40e = rxd->rxd_i40e;
 766 
 767         /*
 768          * First allocate the rx descriptor ring.
 769          */
 770         dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
 771         VERIFY(dmasz > 0);
 772         if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
 773             &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
 774             B_TRUE, dmasz) == B_FALSE) {
 775                 i40e_error(i40e, "failed to allocate DMA resources "
 776                     "for rx descriptor ring");
 777                 return (B_FALSE);
 778         }
 779         rxd->rxd_desc_ring =
 780             (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
 781         rxd->rxd_desc_next = 0;
 782 
 783         count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
 784         rcb = rxd->rxd_rcb_area;
 785 
 786         dmasz = i40e->i40e_rx_buf_size;
 787         VERIFY(dmasz > 0);
 788         for (i = 0; i < count; i++, rcb++) {
 789                 i40e_dma_buffer_t *dmap;
 790                 VERIFY(rcb != NULL);
 791 
 792                 if (i < rxd->rxd_ring_size) {
 793                         rxd->rxd_work_list[i] = rcb;
 794                 } else {
 795                         rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb;
 796                 }
 797 
 798                 dmap = &rcb->rcb_dma;
 799                 if (i40e_alloc_dma_buffer(i40e, dmap,
 800                     &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
 801                     B_TRUE, B_FALSE, dmasz) == B_FALSE) {
 802                         i40e_error(i40e, "failed to allocate rx dma buffer");
 803                         return (B_FALSE);
 804                 }
 805 
 806                 /*
 807                  * Initialize the control block and offset the DMA address. See
 808                  * the note in the big theory statement that explains how this
 809                  * helps IP deal with alignment. Note, we don't worry about
 810                  * whether or not we successfully get an mblk_t from desballoc,
 811                  * it's a common case that we have to handle later on in the
 812                  * system.
 813                  */
 814                 dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT;
 815                 dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
 816                 dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
 817 
 818                 rcb->rcb_ref = 1;
 819                 rcb->rcb_rxd = rxd;
 820                 rcb->rcb_free_rtn.free_func = i40e_rx_recycle;
 821                 rcb->rcb_free_rtn.free_arg = (caddr_t)rcb;
 822                 rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address,
 823                     dmap->dmab_size, 0, &rcb->rcb_free_rtn);
 824         }
 825 
 826         return (B_TRUE);
 827 }
 828 
 829 static void
 830 i40e_free_tx_dma(i40e_trqpair_t *itrq)
 831 {
 832         size_t fsz;
 833 
 834         if (itrq->itrq_tcb_area != NULL) {
 835                 uint32_t i;
 836                 i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area;
 837 
 838                 for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
 839                         i40e_free_dma_buffer(&tcb->tcb_dma);
 840                         if (tcb->tcb_dma_handle != NULL) {
 841                                 ddi_dma_free_handle(&tcb->tcb_dma_handle);
 842                                 tcb->tcb_dma_handle = NULL;
 843                         }
 844                 }
 845 
 846                 fsz = sizeof (i40e_tx_control_block_t) *
 847                     itrq->itrq_tx_free_list_size;
 848                 kmem_free(itrq->itrq_tcb_area, fsz);
 849                 itrq->itrq_tcb_area = NULL;
 850         }
 851 
 852         if (itrq->itrq_tcb_free_list != NULL) {
 853                 fsz = sizeof (i40e_tx_control_block_t *) *
 854                     itrq->itrq_tx_free_list_size;
 855                 kmem_free(itrq->itrq_tcb_free_list, fsz);
 856                 itrq->itrq_tcb_free_list = NULL;
 857         }
 858 
 859         if (itrq->itrq_tcb_work_list != NULL) {
 860                 fsz = sizeof (i40e_tx_control_block_t *) *
 861                     itrq->itrq_tx_ring_size;
 862                 kmem_free(itrq->itrq_tcb_work_list, fsz);
 863                 itrq->itrq_tcb_work_list = NULL;
 864         }
 865 
 866         i40e_free_dma_buffer(&itrq->itrq_desc_area);
 867         itrq->itrq_desc_ring = NULL;
 868 
 869 }
 870 
 871 static boolean_t
 872 i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
 873 {
 874         int i, ret;
 875         size_t dmasz;
 876         i40e_tx_control_block_t *tcb;
 877         i40e_t *i40e = itrq->itrq_i40e;
 878 
 879         itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
 880         itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
 881             (i40e->i40e_tx_ring_size >> 1);
 882 
 883         /*
 884          * Allocate an additional tx descriptor for the writeback head.
 885          */
 886         dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
 887         dmasz += sizeof (i40e_tx_desc_t);
 888 
 889         VERIFY(dmasz > 0);
 890         if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
 891             &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
 892             B_FALSE, B_TRUE, dmasz) == B_FALSE) {
 893                 i40e_error(i40e, "failed to allocate DMA resources for tx "
 894                     "descriptor ring");
 895                 return (B_FALSE);
 896         }
 897         itrq->itrq_desc_ring =
 898             (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
 899         itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
 900             itrq->itrq_tx_ring_size);
 901         itrq->itrq_desc_head = 0;
 902         itrq->itrq_desc_tail = 0;
 903         itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
 904 
 905         itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
 906             sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
 907         if (itrq->itrq_tcb_work_list == NULL) {
 908                 i40e_error(i40e, "failed to allocate a %d entry tx work list "
 909                     "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
 910                 goto cleanup;
 911         }
 912 
 913         itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
 914             sizeof (i40e_tx_control_block_t *), KM_SLEEP);
 915         if (itrq->itrq_tcb_free_list == NULL) {
 916                 i40e_error(i40e, "failed to allocate a %d entry tx free list "
 917                     "for ring %d", itrq->itrq_tx_free_list_size,
 918                     itrq->itrq_index);
 919                 goto cleanup;
 920         }
 921 
 922         /*
 923          * We allocate enough tx control blocks to cover the free list.
 924          */
 925         itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
 926             itrq->itrq_tx_free_list_size, KM_NOSLEEP);
 927         if (itrq->itrq_tcb_area == NULL) {
 928                 i40e_error(i40e, "failed to allocate a %d entry tcb area for "
 929                     "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
 930                 goto cleanup;
 931         }
 932 
 933         /*
 934          * For each tcb, allocate DMA memory.
 935          */
 936         dmasz = i40e->i40e_tx_buf_size;
 937         VERIFY(dmasz > 0);
 938         tcb = itrq->itrq_tcb_area;
 939         for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
 940                 VERIFY(tcb != NULL);
 941 
 942                 /*
 943                  * Allocate both a DMA buffer which we'll use for when we copy
 944                  * packets for transmission and allocate a DMA handle which
 945                  * we'll use when we bind data.
 946                  */
 947                 ret = ddi_dma_alloc_handle(i40e->i40e_dip,
 948                     &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
 949                     &tcb->tcb_dma_handle);
 950                 if (ret != DDI_SUCCESS) {
 951                         i40e_error(i40e, "failed to allocate DMA handle for tx "
 952                             "data binding on ring %d: %d", itrq->itrq_index,
 953                             ret);
 954                         tcb->tcb_dma_handle = NULL;
 955                         goto cleanup;
 956                 }
 957 
 958                 if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
 959                     &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
 960                     B_TRUE, B_FALSE, dmasz) == B_FALSE) {
 961                         i40e_error(i40e, "failed to allocate %ld bytes of "
 962                             "DMA for tx data binding on ring %d", dmasz,
 963                             itrq->itrq_index);
 964                         goto cleanup;
 965                 }
 966 
 967                 itrq->itrq_tcb_free_list[i] = tcb;
 968         }
 969 
 970         itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
 971 
 972         return (B_TRUE);
 973 
 974 cleanup:
 975         i40e_free_tx_dma(itrq);
 976         return (B_FALSE);
 977 }
 978 
 979 /*
 980  * Free all memory associated with all of the rings on this i40e instance. Note,
 981  * this is done as part of the GLDv3 stop routine.
 982  */
 983 void
 984 i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init)
 985 {
 986         int i;
 987 
 988         for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
 989                 i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata;
 990 
 991                 /*
 992                  * Clean up our rx data. We have to free DMA resources first and
 993                  * then if we have no more pending RCB's, then we'll go ahead
 994                  * and clean things up. Note, we can't set the stopped flag on
 995                  * the rx data until after we've done the first pass of the
 996                  * pending resources. Otherwise we might race with
 997                  * i40e_rx_recycle on determining who should free the
 998                  * i40e_rx_data_t above.
 999                  */
1000                 i40e_free_rx_dma(rxd, failed_init);
1001 
1002                 mutex_enter(&i40e->i40e_rx_pending_lock);
1003                 rxd->rxd_shutdown = B_TRUE;
1004                 if (rxd->rxd_rcb_pending == 0) {
1005                         i40e_free_rx_data(rxd);
1006                         i40e->i40e_trqpairs[i].itrq_rxdata = NULL;
1007                 }
1008                 mutex_exit(&i40e->i40e_rx_pending_lock);
1009 
1010                 i40e_free_tx_dma(&i40e->i40e_trqpairs[i]);
1011         }
1012 }
1013 
1014 /*
1015  * Allocate all of the resources associated with all of the rings on this i40e
1016  * instance. Note this is done as part of the GLDv3 start routine and thus we
1017  * should not use blocking allocations. This takes care of both DMA and non-DMA
1018  * related resources.
1019  */
1020 boolean_t
1021 i40e_alloc_ring_mem(i40e_t *i40e)
1022 {
1023         int i;
1024 
1025         for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
1026                 if (i40e_alloc_rx_data(i40e, &i40e->i40e_trqpairs[i]) ==
1027                     B_FALSE)
1028                         goto unwind;
1029 
1030                 if (i40e_alloc_rx_dma(i40e->i40e_trqpairs[i].itrq_rxdata) ==
1031                     B_FALSE)
1032                         goto unwind;
1033 
1034                 if (i40e_alloc_tx_dma(&i40e->i40e_trqpairs[i]) == B_FALSE)
1035                         goto unwind;
1036         }
1037 
1038         return (B_TRUE);
1039 
1040 unwind:
1041         i40e_free_ring_mem(i40e, B_TRUE);
1042         return (B_FALSE);
1043 }
1044 
1045 
1046 /*
1047  * Because every instance of i40e may have different support for FMA
1048  * capabilities, we copy the DMA attributes into the i40e_t and set them that
1049  * way and use them for determining attributes.
1050  */
1051 void
1052 i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
1053 {
1054         bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
1055             sizeof (ddi_dma_attr_t));
1056         bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
1057             sizeof (ddi_dma_attr_t));
1058         bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
1059             sizeof (ddi_device_acc_attr_t));
1060         bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
1061             sizeof (ddi_device_acc_attr_t));
1062 
1063         if (fma == B_TRUE) {
1064                 i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1065                 i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1066         } else {
1067                 i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1068                 i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1069         }
1070 }
1071 
1072 static void
1073 i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
1074 {
1075         mutex_enter(&rxd->rxd_free_lock);
1076         ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
1077         ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL);
1078         rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb;
1079         rxd->rxd_rcb_free++;
1080         mutex_exit(&rxd->rxd_free_lock);
1081 }
1082 
1083 static i40e_rx_control_block_t *
1084 i40e_rcb_alloc(i40e_rx_data_t *rxd)
1085 {
1086         i40e_rx_control_block_t *rcb;
1087 
1088         mutex_enter(&rxd->rxd_free_lock);
1089         if (rxd->rxd_rcb_free == 0) {
1090                 mutex_exit(&rxd->rxd_free_lock);
1091                 return (NULL);
1092         }
1093         rxd->rxd_rcb_free--;
1094         rcb = rxd->rxd_free_list[rxd->rxd_rcb_free];
1095         VERIFY(rcb != NULL);
1096         rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
1097         mutex_exit(&rxd->rxd_free_lock);
1098 
1099         return (rcb);
1100 }
1101 
1102 /*
1103  * This is the callback that we get from the OS when freemsg(9F) has been called
1104  * on a loaned descriptor. In addition, if we take the last reference count
1105  * here, then we have to tear down all of the rx data.
1106  */
1107 void
1108 i40e_rx_recycle(caddr_t arg)
1109 {
1110         uint32_t ref;
1111         i40e_rx_control_block_t *rcb;
1112         i40e_rx_data_t *rxd;
1113         i40e_t *i40e;
1114 
1115         /* LINTED: E_BAD_PTR_CAST_ALIGN */
1116         rcb = (i40e_rx_control_block_t *)arg;
1117         rxd = rcb->rcb_rxd;
1118         i40e = rxd->rxd_i40e;
1119 
1120         /*
1121          * It's possible for this to be called with a reference count of zero.
1122          * That will happen when we're doing the freemsg after taking the last
1123          * reference because we're tearing down everything and this rcb is not
1124          * outstanding.
1125          */
1126         if (rcb->rcb_ref == 0)
1127                 return;
1128 
1129         /*
1130          * Don't worry about failure of desballoc here. It'll only become fatal
1131          * if we're trying to use it and we can't in i40e_rx_bind().
1132          */
1133         rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
1134             rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
1135         i40e_rcb_free(rxd, rcb);
1136 
1137         /*
1138          * It's possible that the rcb was being used while we are shutting down
1139          * the device. In that case, we'll take the final reference from the
1140          * device here.
1141          */
1142         ref = atomic_dec_32_nv(&rcb->rcb_ref);
1143         if (ref == 0) {
1144                 freemsg(rcb->rcb_mp);
1145                 rcb->rcb_mp = NULL;
1146                 i40e_free_dma_buffer(&rcb->rcb_dma);
1147 
1148                 mutex_enter(&i40e->i40e_rx_pending_lock);
1149                 atomic_dec_32(&rxd->rxd_rcb_pending);
1150                 atomic_dec_32(&i40e->i40e_rx_pending);
1151 
1152                 /*
1153                  * If this was the last block and it's been indicated that we've
1154                  * passed the shutdown point, we should clean up.
1155                  */
1156                 if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) {
1157                         i40e_free_rx_data(rxd);
1158                         cv_broadcast(&i40e->i40e_rx_pending_cv);
1159                 }
1160 
1161                 mutex_exit(&i40e->i40e_rx_pending_lock);
1162         }
1163 }
1164 
1165 static mblk_t *
1166 i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1167     uint32_t plen)
1168 {
1169         mblk_t *mp;
1170         i40e_t *i40e = rxd->rxd_i40e;
1171         i40e_rx_control_block_t *rcb, *rep_rcb;
1172 
1173         ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
1174 
1175         if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) {
1176                 itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++;
1177                 return (NULL);
1178         }
1179 
1180         rcb = rxd->rxd_work_list[index];
1181 
1182         /*
1183          * Check to make sure we have a mblk_t. If we don't, this is our last
1184          * chance to try and get one.
1185          */
1186         if (rcb->rcb_mp == NULL) {
1187                 rcb->rcb_mp =
1188                     desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
1189                     rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
1190                 if (rcb->rcb_mp == NULL) {
1191                         itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++;
1192                         i40e_rcb_free(rxd, rcb);
1193                         return (NULL);
1194                 }
1195         }
1196 
1197         I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1198 
1199         if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1200                 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1201                 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1202                 i40e_rcb_free(rxd, rcb);
1203                 return (NULL);
1204         }
1205 
1206         /*
1207          * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT.
1208          */
1209         mp = rcb->rcb_mp;
1210         atomic_inc_32(&rcb->rcb_ref);
1211         mp->b_wptr = mp->b_rptr + plen;
1212         mp->b_next = mp->b_cont = NULL;
1213 
1214         rxd->rxd_work_list[index] = rep_rcb;
1215         return (mp);
1216 }
1217 
1218 /*
1219  * We're going to allocate a new message block for this frame and attempt to
1220  * receive it. See the big theory statement for more information on when we copy
1221  * versus bind.
1222  */
1223 static mblk_t *
1224 i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
1225     uint32_t plen)
1226 {
1227         i40e_t *i40e = rxd->rxd_i40e;
1228         i40e_rx_control_block_t *rcb;
1229         mblk_t *mp;
1230 
1231         ASSERT(index < rxd->rxd_ring_size);
1232         rcb = rxd->rxd_work_list[index];
1233 
1234         I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
1235 
1236         if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
1237                 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1238                 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1239                 return (NULL);
1240         }
1241 
1242         mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0);
1243         if (mp == NULL) {
1244                 itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++;
1245                 return (NULL);
1246         }
1247 
1248         mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT;
1249         bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen);
1250         mp->b_wptr = mp->b_rptr + plen;
1251 
1252         return (mp);
1253 }
1254 
1255 /*
1256  * Determine if the device has enabled any checksum flags for us. The level of
1257  * checksum computed will depend on the type packet that we have, which is
1258  * contained in ptype. For example, the checksum logic it does will vary
1259  * depending on whether or not the packet is considered tunneled, whether it
1260  * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are
1261  * valid.
1262  *
1263  * While there are additional checksums that we could recognize here, we'll need
1264  * to get some additional GLDv3 enhancements to be able to properly describe
1265  * them.
1266  */
1267 static void
1268 i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err,
1269     uint32_t ptype)
1270 {
1271         uint32_t cksum;
1272         struct i40e_rx_ptype_decoded pinfo;
1273 
1274         ASSERT(ptype <= 255);
1275         pinfo = decode_rx_desc_ptype(ptype);
1276 
1277         cksum = 0;
1278 
1279         /*
1280          * If the ptype isn't something that we know in the driver, then we
1281          * shouldn't even consider moving forward.
1282          */
1283         if (pinfo.known == 0) {
1284                 itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++;
1285                 return;
1286         }
1287 
1288         /*
1289          * If hardware didn't set the L3L4P bit on the frame, then there is no
1290          * checksum offload to consider.
1291          */
1292         if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) {
1293                 itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++;
1294                 return;
1295         }
1296 
1297         /*
1298          * The device tells us that IPv6 checksums where a Destination Options
1299          * Header or a Routing header shouldn't be trusted. Discard all
1300          * checksums in this case.
1301          */
1302         if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1303             pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 &&
1304             (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) {
1305                 itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++;
1306                 return;
1307         }
1308 
1309         /*
1310          * The hardware denotes three kinds of possible errors. Two are reserved
1311          * for inner and outer IP checksum errors (IPE and EIPE) and the latter
1312          * is for L4 checksum errors (L4E). If there is only one IP header, then
1313          * the only thing that we care about is IPE. Note that since we don't
1314          * support inner checksums, we will ignore IPE being set on tunneled
1315          * packets and only care about EIPE.
1316          */
1317         if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1318             pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
1319                 if (pinfo.tunnel_type == I40E_RX_PTYPE_OUTER_NONE) {
1320                         if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) {
1321                                 itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++;
1322                         } else {
1323                                 itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1324                                 cksum |= HCK_IPV4_HDRCKSUM_OK;
1325                         }
1326                 } else {
1327                         if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) {
1328                                 itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++;
1329                         } else {
1330                                 itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
1331                                 cksum |= HCK_IPV4_HDRCKSUM_OK;
1332                         }
1333                 }
1334         }
1335 
1336         /*
1337          * We only have meaningful L4 checksums in the case of IP->L4 and
1338          * IP->IP->L4. There is not outer L4 checksum data available in any
1339          * other case. Further, we don't bother reporting the valid checksum in
1340          * the case of IP->IP->L4 set.
1341          */
1342         if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1343             pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE &&
1344             (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP ||
1345             pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP ||
1346             pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP ||
1347             pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) {
1348                 ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4);
1349                 if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) {
1350                         itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++;
1351                 } else {
1352                         itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++;
1353                         cksum |= HCK_FULLCKSUM_OK;
1354                 }
1355         }
1356 
1357         if (cksum != 0) {
1358                 itrq->itrq_rxstat.irxs_hck_set.value.ui64++;
1359                 mac_hcksum_set(mp, 0, 0, 0, 0, cksum);
1360         } else {
1361                 itrq->itrq_rxstat.irxs_hck_miss.value.ui64++;
1362         }
1363 }
1364 
1365 mblk_t *
1366 i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes)
1367 {
1368         i40e_t *i40e;
1369         i40e_hw_t *hw;
1370         i40e_rx_data_t *rxd;
1371         uint32_t cur_head;
1372         i40e_rx_desc_t *cur_desc;
1373         i40e_rx_control_block_t *rcb;
1374         uint64_t rx_bytes, rx_frames;
1375         uint64_t stword;
1376         mblk_t *mp, *mp_head, **mp_tail;
1377 
1378         ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
1379         rxd = itrq->itrq_rxdata;
1380         i40e = itrq->itrq_i40e;
1381         hw = &i40e->i40e_hw_space;
1382 
1383         if (!(i40e->i40e_state & I40E_STARTED) ||
1384             (i40e->i40e_state & I40E_OVERTEMP) ||
1385             (i40e->i40e_state & I40E_SUSPENDED) ||
1386             (i40e->i40e_state & I40E_ERROR))
1387                 return (NULL);
1388 
1389         /*
1390          * Before we do anything else, we have to make sure that all of the DMA
1391          * buffers are synced up and then check to make sure that they're
1392          * actually good from an FM perspective.
1393          */
1394         I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL);
1395         if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
1396             DDI_FM_OK) {
1397                 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1398                 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1399                 return (NULL);
1400         }
1401 
1402         /*
1403          * Prepare our stats. We do a limited amount of processing in both
1404          * polling and interrupt context. The limit in interrupt context is
1405          * based on frames, in polling context based on bytes.
1406          */
1407         rx_bytes = rx_frames = 0;
1408         mp_head = NULL;
1409         mp_tail = &mp_head;
1410 
1411         /*
1412          * At this point, the descriptor ring is available to check. We'll try
1413          * and process until we either run out of poll_bytes or descriptors.
1414          */
1415         cur_head = rxd->rxd_desc_next;
1416         cur_desc = &rxd->rxd_desc_ring[cur_head];
1417         stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
1418 
1419         /*
1420          * Note, the primary invariant of this loop should be that cur_head,
1421          * cur_desc, and stword always point to the currently processed
1422          * descriptor. When we leave the loop, it should point to a descriptor
1423          * that HAS NOT been processed. Meaning, that if we haven't consumed the
1424          * frame, the descriptor should not be advanced.
1425          */
1426         while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) {
1427                 uint32_t error, eop, plen, ptype;
1428 
1429                 /*
1430                  * The DD, PLEN, and EOP bits are the only ones that are valid
1431                  * in every frame. The error information is only valid when EOP
1432                  * is set in the same frame.
1433                  *
1434                  * At this time, because we don't do any LRO or header
1435                  * splitting. We expect that every frame should have EOP set in
1436                  * it. When later functionality comes in, we'll want to
1437                  * re-evaluate this.
1438                  */
1439                 eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
1440                 VERIFY(eop != 0);
1441 
1442                 error = (stword & I40E_RXD_QW1_ERROR_MASK) >>
1443                     I40E_RXD_QW1_ERROR_SHIFT;
1444                 if (error & I40E_RX_ERR_BITS) {
1445                         itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++;
1446                         goto discard;
1447                 }
1448 
1449                 plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
1450                     I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1451 
1452                 ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >>
1453                     I40E_RXD_QW1_PTYPE_SHIFT;
1454 
1455                 /*
1456                  * This packet contains valid data. We should check to see if
1457                  * we're actually going to consume it based on its length (to
1458                  * ensure that we don't overshoot our quota). We determine
1459                  * whether to bcopy or bind the DMA resources based on the size
1460                  * of the frame. However, if on debug, we allow it to be
1461                  * overridden for testing purposes.
1462                  *
1463                  * We should be smarter about this and do DMA binding for
1464                  * larger frames, but for now, it's really more important that
1465                  * we actually just get something simple working.
1466                  */
1467 
1468                 /*
1469                  * Ensure we don't exceed our polling quota by reading this
1470                  * frame. Note we only bump bytes now, we bump frames later.
1471                  */
1472                 if ((poll_bytes != I40E_POLL_NULL) &&
1473                     (rx_bytes + plen) > poll_bytes)
1474                         break;
1475                 rx_bytes += plen;
1476 
1477                 mp = NULL;
1478                 if (plen >= i40e->i40e_rx_dma_min)
1479                         mp = i40e_rx_bind(itrq, rxd, cur_head, plen);
1480                 if (mp == NULL)
1481                         mp = i40e_rx_copy(itrq, rxd, cur_head, plen);
1482 
1483                 if (mp != NULL) {
1484                         if (i40e->i40e_rx_hcksum_enable)
1485                                 i40e_rx_hcksum(itrq, mp, stword, error, ptype);
1486                         *mp_tail = mp;
1487                         mp_tail = &mp->b_next;
1488                 }
1489 
1490                 /*
1491                  * Now we need to prepare this frame for use again. See the
1492                  * discussion in the big theory statements.
1493                  *
1494                  * However, right now we're doing the simple version of this.
1495                  * Normally what we'd do would depend on whether or not we were
1496                  * doing DMA binding or bcopying. But because we're always doing
1497                  * bcopying, we can just always use the current index as a key
1498                  * for what to do and reassign the buffer based on the ring.
1499                  */
1500 discard:
1501                 rcb = rxd->rxd_work_list[cur_head];
1502                 cur_desc->read.pkt_addr =
1503                     CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address);
1504                 cur_desc->read.hdr_addr = 0;
1505 
1506                 /*
1507                  * Finally, update our loop invariants.
1508                  */
1509                 cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size);
1510                 cur_desc = &rxd->rxd_desc_ring[cur_head];
1511                 stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
1512 
1513                 /*
1514                  * To help provide liveness, we limit the amount of data that
1515                  * we'll end up counting. Note that in these cases, an interrupt
1516                  * is not dissimilar from a polling request.
1517                  */
1518                 rx_frames++;
1519                 if (rx_frames > i40e->i40e_rx_limit_per_intr) {
1520                         itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++;
1521                         break;
1522                 }
1523         }
1524 
1525         /*
1526          * As we've modified the ring, we need to make sure that we sync the
1527          * descriptor ring for the device. Next, we update the hardware and
1528          * update our notion of where the head for us to read from hardware is
1529          * next.
1530          */
1531         I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV);
1532         if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
1533             DDI_FM_OK) {
1534                 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
1535                 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1536         }
1537 
1538         if (rx_frames != 0) {
1539                 uint32_t tail;
1540                 ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle;
1541                 rxd->rxd_desc_next = cur_head;
1542                 tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size);
1543 
1544                 I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail);
1545                 if (i40e_check_acc_handle(rh) != DDI_FM_OK) {
1546                         ddi_fm_service_impact(i40e->i40e_dip,
1547                             DDI_SERVICE_DEGRADED);
1548                         atomic_or_32(&i40e->i40e_state, I40E_ERROR);
1549                 }
1550 
1551                 itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes;
1552                 itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames;
1553         }
1554 
1555 #ifdef DEBUG
1556         if (rx_frames == 0) {
1557                 ASSERT(rx_bytes == 0);
1558         }
1559 #endif
1560 
1561         return (mp_head);
1562 }
1563 
1564 /*
1565  * This function is called by the GLDv3 when it wants to poll on a ring. The
1566  * only primary difference from when we call this during an interrupt is that we
1567  * have a limit on the number of bytes that we should consume.
1568  */
1569 mblk_t *
1570 i40e_ring_rx_poll(void *arg, int poll_bytes)
1571 {
1572         i40e_trqpair_t *itrq = arg;
1573         mblk_t *mp;
1574 
1575         ASSERT(poll_bytes > 0);
1576         if (poll_bytes == 0)
1577                 return (NULL);
1578 
1579         mutex_enter(&itrq->itrq_rx_lock);
1580         mp = i40e_ring_rx(itrq, poll_bytes);
1581         mutex_exit(&itrq->itrq_rx_lock);
1582 
1583         return (mp);
1584 }
1585 
1586 /*
1587  * This is a structure I wish someone would fill out for me for dorking with the
1588  * checksums. When we get some more experience with this, we should go ahead and
1589  * consider adding this to MAC.
1590  */
1591 typedef enum mac_ether_offload_flags {
1592         MEOI_L2INFO_SET         = 0x01,
1593         MEOI_VLAN_TAGGED        = 0x02,
1594         MEOI_L3INFO_SET         = 0x04,
1595         MEOI_L3CKSUM_SET        = 0x08,
1596         MEOI_L4INFO_SET         = 0x10,
1597         MEOI_L4CKSUM_SET        = 0x20
1598 } mac_ether_offload_flags_t;
1599 
1600 typedef struct mac_ether_offload_info {
1601         mac_ether_offload_flags_t       meoi_flags;
1602         uint8_t         meoi_l2hlen;    /* How long is the Ethernet header? */
1603         uint16_t        meoi_l3proto;   /* What's the Ethertype */
1604         uint8_t         meoi_l3hlen;    /* How long is the header? */
1605         uint8_t         meoi_l4proto;   /* What is the payload type? */
1606         uint8_t         meoi_l4hlen;    /* How long is the L4 header */
1607         mblk_t          *meoi_l3ckmp;   /* Which mblk has the l3 checksum */
1608         off_t           meoi_l3ckoff;   /* What's the offset to it */
1609         mblk_t          *meoi_l4ckmp;   /* Which mblk has the L4 checksum */
1610         off_t           meoi_l4off;     /* What is the offset to it? */
1611 } mac_ether_offload_info_t;
1612 
1613 /*
1614  * This is something that we'd like to make a general MAC function. Before we do
1615  * that, we should add support for TSO.
1616  *
1617  * We should really keep track of our offset and not walk everything every
1618  * time. I can't imagine that this will be kind to us at high packet rates;
1619  * however, for the moment, let's leave that.
1620  *
1621  * This walks a message block chain without pulling up to fill in the context
1622  * information. Note that the data we care about could be hidden across more
1623  * than one mblk_t.
1624  */
1625 static int
1626 i40e_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out)
1627 {
1628         size_t mpsize;
1629         uint8_t *bp;
1630 
1631         mpsize = msgsize(mp);
1632         /* Check for overflow */
1633         if (off + sizeof (uint16_t) > mpsize)
1634                 return (-1);
1635 
1636         mpsize = MBLKL(mp);
1637         while (off >= mpsize) {
1638                 mp = mp->b_cont;
1639                 off -= mpsize;
1640                 mpsize = MBLKL(mp);
1641         }
1642 
1643         bp = mp->b_rptr + off;
1644         *out = *bp;
1645         return (0);
1646 
1647 }
1648 
1649 static int
1650 i40e_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out)
1651 {
1652         size_t mpsize;
1653         uint8_t *bp;
1654 
1655         mpsize = msgsize(mp);
1656         /* Check for overflow */
1657         if (off + sizeof (uint16_t) > mpsize)
1658                 return (-1);
1659 
1660         mpsize = MBLKL(mp);
1661         while (off >= mpsize) {
1662                 mp = mp->b_cont;
1663                 off -= mpsize;
1664                 mpsize = MBLKL(mp);
1665         }
1666 
1667         /*
1668          * Data is in network order. Note the second byte of data might be in
1669          * the next mp.
1670          */
1671         bp = mp->b_rptr + off;
1672         *out = *bp << 8;
1673         if (off + 1 == mpsize) {
1674                 mp = mp->b_cont;
1675                 bp = mp->b_rptr;
1676         } else {
1677                 bp++;
1678         }
1679 
1680         *out |= *bp;
1681         return (0);
1682 
1683 }
1684 
1685 static int
1686 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
1687 {
1688         size_t off;
1689         uint16_t ether;
1690         uint8_t ipproto, iplen, l4len, maclen;
1691 
1692         bzero(meoi, sizeof (mac_ether_offload_info_t));
1693 
1694         off = offsetof(struct ether_header, ether_type);
1695         if (i40e_meoi_get_uint16(mp, off, &ether) != 0)
1696                 return (-1);
1697 
1698         if (ether == ETHERTYPE_VLAN) {
1699                 off = offsetof(struct ether_vlan_header, ether_type);
1700                 if (i40e_meoi_get_uint16(mp, off, &ether) != 0)
1701                         return (-1);
1702                 meoi->meoi_flags |= MEOI_VLAN_TAGGED;
1703                 maclen = sizeof (struct ether_vlan_header);
1704         } else {
1705                 maclen = sizeof (struct ether_header);
1706         }
1707         meoi->meoi_flags |= MEOI_L2INFO_SET;
1708         meoi->meoi_l2hlen = maclen;
1709         meoi->meoi_l3proto = ether;
1710 
1711         switch (ether) {
1712         case ETHERTYPE_IP:
1713                 /*
1714                  * For IPv4 we need to get the length of the header, as it can
1715                  * be variable.
1716                  */
1717                 off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen;
1718                 if (i40e_meoi_get_uint8(mp, off, &iplen) != 0)
1719                         return (-1);
1720                 iplen &= 0x0f;
1721                 if (iplen < 5 || iplen > 0x0f)
1722                         return (-1);
1723                 iplen *= 4;
1724                 off = offsetof(ipha_t, ipha_protocol) + maclen;
1725                 if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1)
1726                         return (-1);
1727                 break;
1728         case ETHERTYPE_IPV6:
1729                 iplen = 40;
1730                 off = offsetof(ip6_t, ip6_nxt) + maclen;
1731                 if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1)
1732                         return (-1);
1733                 break;
1734         default:
1735                 return (0);
1736         }
1737         meoi->meoi_l3hlen = iplen;
1738         meoi->meoi_l4proto = ipproto;
1739         meoi->meoi_flags |= MEOI_L3INFO_SET;
1740 
1741         switch (ipproto) {
1742         case IPPROTO_TCP:
1743                 off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen;
1744                 if (i40e_meoi_get_uint8(mp, off, &l4len) == -1)
1745                         return (-1);
1746                 l4len = (l4len & 0xf0) >> 4;
1747                 if (l4len < 5 || l4len > 0xf)
1748                         return (-1);
1749                 l4len *= 4;
1750                 break;
1751         case IPPROTO_UDP:
1752                 l4len = sizeof (struct udphdr);
1753                 break;
1754         case IPPROTO_SCTP:
1755                 l4len = sizeof (sctp_hdr_t);
1756                 break;
1757         default:
1758                 return (0);
1759         }
1760 
1761         meoi->meoi_l4hlen = l4len;
1762         meoi->meoi_flags |= MEOI_L4INFO_SET;
1763         return (0);
1764 }
1765 
1766 /*
1767  * Attempt to put togther the information we'll need to feed into a descriptor
1768  * to properly program the hardware for checksum offload as well as the
1769  * generally required flags.
1770  *
1771  * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to or
1772  * into the descriptor based on the checksum flags for this mblk_t and the
1773  * actual information we care about.
1774  */
1775 static int
1776 i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
1777     i40e_tx_context_t *tctx)
1778 {
1779         int ret;
1780         uint32_t flags, start;
1781         mac_ether_offload_info_t meo;
1782         i40e_txq_stat_t *txs = &itrq->itrq_txstat;
1783 
1784         bzero(tctx, sizeof (i40e_tx_context_t));
1785 
1786         if (i40e->i40e_tx_hcksum_enable != B_TRUE)
1787                 return (0);
1788 
1789         mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags);
1790         if (flags == 0)
1791                 return (0);
1792 
1793         if ((ret = mac_ether_offload_info(mp, &meo)) != 0) {
1794                 txs->itxs_hck_meoifail.value.ui64++;
1795                 return (ret);
1796         }
1797 
1798         /*
1799          * Have we been asked to checksum an IPv4 header. If so, verify that we
1800          * have sufficient information and then set the proper fields in the
1801          * command structure.
1802          */
1803         if (flags & HCK_IPV4_HDRCKSUM) {
1804                 if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
1805                         txs->itxs_hck_nol2info.value.ui64++;
1806                         return (-1);
1807                 }
1808                 if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
1809                         txs->itxs_hck_nol3info.value.ui64++;
1810                         return (-1);
1811                 }
1812                 if (meo.meoi_l3proto != ETHERTYPE_IP) {
1813                         txs->itxs_hck_badl3.value.ui64++;
1814                         return (-1);
1815                 }
1816                 tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
1817                 tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
1818                     I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1819                 tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
1820                     I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1821         }
1822 
1823         /*
1824          * We've been asked to provide an L4 header, first, set up the IP
1825          * information in the descriptor if we haven't already before moving
1826          * onto seeing if we have enough information for the L4 checksum
1827          * offload.
1828          */
1829         if (flags & HCK_PARTIALCKSUM) {
1830                 if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) {
1831                         txs->itxs_hck_nol4info.value.ui64++;
1832                         return (-1);
1833                 }
1834 
1835                 if (!(flags & HCK_IPV4_HDRCKSUM)) {
1836                         if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
1837                                 txs->itxs_hck_nol2info.value.ui64++;
1838                                 return (-1);
1839                         }
1840                         if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
1841                                 txs->itxs_hck_nol3info.value.ui64++;
1842                                 return (-1);
1843                         }
1844 
1845                         if (meo.meoi_l3proto == ETHERTYPE_IP) {
1846                                 tctx->itc_cmdflags |=
1847                                     I40E_TX_DESC_CMD_IIPT_IPV4;
1848                         } else if (meo.meoi_l3proto == ETHERTYPE_IPV6) {
1849                                 tctx->itc_cmdflags |=
1850                                     I40E_TX_DESC_CMD_IIPT_IPV6;
1851                         } else {
1852                                 txs->itxs_hck_badl3.value.ui64++;
1853                                 return (-1);
1854                         }
1855                         tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
1856                             I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1857                         tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
1858                             I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1859                 }
1860 
1861                 switch (meo.meoi_l4proto) {
1862                 case IPPROTO_TCP:
1863                         tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
1864                         break;
1865                 case IPPROTO_UDP:
1866                         tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
1867                         break;
1868                 case IPPROTO_SCTP:
1869                         tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
1870                         break;
1871                 default:
1872                         txs->itxs_hck_badl4.value.ui64++;
1873                         return (-1);
1874                 }
1875 
1876                 tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) <<
1877                     I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
1878         }
1879 
1880         return (0);
1881 }
1882 
1883 static void
1884 i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
1885 {
1886         ASSERT(tcb != NULL);
1887 
1888         mutex_enter(&itrq->itrq_tcb_lock);
1889         ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
1890         itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb;
1891         itrq->itrq_tcb_free++;
1892         mutex_exit(&itrq->itrq_tcb_lock);
1893 }
1894 
1895 static i40e_tx_control_block_t *
1896 i40e_tcb_alloc(i40e_trqpair_t *itrq)
1897 {
1898         i40e_tx_control_block_t *ret;
1899 
1900         mutex_enter(&itrq->itrq_tcb_lock);
1901         if (itrq->itrq_tcb_free == 0) {
1902                 mutex_exit(&itrq->itrq_tcb_lock);
1903                 return (NULL);
1904         }
1905 
1906         itrq->itrq_tcb_free--;
1907         ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free];
1908         itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
1909         mutex_exit(&itrq->itrq_tcb_lock);
1910 
1911         ASSERT(ret != NULL);
1912         return (ret);
1913 }
1914 
1915 /*
1916  * This should be used to free any DMA resources, associated mblk_t's, etc. It's
1917  * used as part of recycling the message blocks when we have either an interrupt
1918  * or other activity that indicates that we need to take a look.
1919  */
1920 static void
1921 i40e_tcb_reset(i40e_tx_control_block_t *tcb)
1922 {
1923         switch (tcb->tcb_type) {
1924         case I40E_TX_COPY:
1925                 tcb->tcb_dma.dmab_len = 0;
1926                 break;
1927         case I40E_TX_DMA:
1928                 (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
1929                 break;
1930         case I40E_TX_NONE:
1931                 /* Cast to pacify lint */
1932                 panic("trying to free tcb %p with bad type none", (void *)tcb);
1933         default:
1934                 panic("unknown i40e tcb type: %d", tcb->tcb_type);
1935         }
1936 
1937         tcb->tcb_type = I40E_TX_NONE;
1938         freemsg(tcb->tcb_mp);
1939         tcb->tcb_mp = NULL;
1940         tcb->tcb_next = NULL;
1941 }
1942 
1943 /*
1944  * This is called as part of shutting down to clean up all outstanding
1945  * descriptors. Similar to recycle, except we don't re-arm anything and instead
1946  * just return control blocks to the free list.
1947  */
1948 void
1949 i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
1950 {
1951         uint32_t index;
1952 
1953         ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
1954         ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
1955 
1956         /*
1957          * Because we should have shut down the chip at this point, it should be
1958          * safe to just clean up all the entries between our head and tail.
1959          */
1960 #ifdef  DEBUG
1961         index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
1962             I40E_QTX_ENA(itrq->itrq_index));
1963         VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
1964             I40E_QTX_ENA_QENA_STAT_MASK));
1965 #endif
1966 
1967         index = itrq->itrq_desc_head;
1968         while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
1969                 i40e_tx_control_block_t *tcb;
1970 
1971                 tcb = itrq->itrq_tcb_work_list[index];
1972                 VERIFY(tcb != NULL);
1973                 itrq->itrq_tcb_work_list[index] = NULL;
1974                 i40e_tcb_reset(tcb);
1975                 i40e_tcb_free(itrq, tcb);
1976 
1977                 bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
1978                 index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
1979                 itrq->itrq_desc_free++;
1980         }
1981 
1982         ASSERT(index == itrq->itrq_desc_tail);
1983         itrq->itrq_desc_head = index;
1984 }
1985 
1986 /*
1987  * We're here either by hook or by crook. We need to see if there are transmit
1988  * descriptors available for us to go and clean up and return to the hardware.
1989  * We may also be blocked, and if so, we should make sure that we let it know
1990  * we're good to go.
1991  */
1992 void
1993 i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
1994 {
1995         uint32_t wbhead, toclean, count;
1996         i40e_tx_control_block_t *tcbhead;
1997         i40e_t *i40e = itrq->itrq_i40e;
1998 
1999         mutex_enter(&itrq->itrq_tx_lock);
2000 
2001         ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2002         if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
2003                 if (itrq->itrq_tx_blocked == B_TRUE) {
2004                         itrq->itrq_tx_blocked = B_FALSE;
2005                         mac_tx_ring_update(i40e->i40e_mac_hdl,
2006                             itrq->itrq_mactxring);
2007                         itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2008                 }
2009                 mutex_exit(&itrq->itrq_tx_lock);
2010                 return;
2011         }
2012 
2013         /*
2014          * Now we need to try and see if there's anything available. The driver
2015          * will write to the head location and it guarantees that it does not
2016          * use relaxed ordering.
2017          */
2018         VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle,
2019             (uintptr_t)itrq->itrq_desc_wbhead,
2020             sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL));
2021 
2022         if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) !=
2023             DDI_FM_OK) {
2024                 mutex_exit(&itrq->itrq_tx_lock);
2025                 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2026                 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2027                 return;
2028         }
2029 
2030         wbhead = *itrq->itrq_desc_wbhead;
2031         toclean = itrq->itrq_desc_head;
2032         count = 0;
2033         tcbhead = NULL;
2034 
2035         while (toclean != wbhead) {
2036                 i40e_tx_control_block_t *tcb;
2037 
2038                 tcb = itrq->itrq_tcb_work_list[toclean];
2039                 itrq->itrq_tcb_work_list[toclean] = NULL;
2040                 ASSERT(tcb != NULL);
2041                 tcb->tcb_next = tcbhead;
2042                 tcbhead = tcb;
2043 
2044                 /*
2045                  * We zero this out for sanity purposes.
2046                  */
2047                 bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t));
2048                 toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size);
2049                 count++;
2050         }
2051 
2052         itrq->itrq_desc_head = wbhead;
2053         itrq->itrq_desc_free += count;
2054         itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
2055         ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2056 
2057         if (itrq->itrq_tx_blocked == B_TRUE &&
2058             itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
2059                 itrq->itrq_tx_blocked = B_FALSE;
2060 
2061                 mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
2062                 itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
2063         }
2064 
2065         mutex_exit(&itrq->itrq_tx_lock);
2066 
2067         /*
2068          * Now clean up the tcb.
2069          */
2070         while (tcbhead != NULL) {
2071                 i40e_tx_control_block_t *tcb = tcbhead;
2072 
2073                 tcbhead = tcb->tcb_next;
2074                 i40e_tcb_reset(tcb);
2075                 i40e_tcb_free(itrq, tcb);
2076         }
2077 
2078         DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
2079 }
2080 
2081 /*
2082  * We've been asked to send a message block on the wire. We'll only have a
2083  * single chain. There will not be any b_next pointers; however, there may be
2084  * multiple b_cont blocks.
2085  *
2086  * We may do one of three things with any given mblk_t chain:
2087  *
2088  *   1) Drop it
2089  *   2) Transmit it
2090  *   3) Return it
2091  *
2092  * If we return it to MAC, then MAC will flow control on our behalf. In other
2093  * words, it won't send us anything until we tell it that it's okay to send us
2094  * something.
2095  */
2096 mblk_t *
2097 i40e_ring_tx(void *arg, mblk_t *mp)
2098 {
2099         const mblk_t *nmp;
2100         size_t mpsize;
2101         i40e_tx_control_block_t *tcb;
2102         i40e_tx_desc_t *txdesc;
2103         i40e_tx_context_t tctx;
2104         int cmd, type;
2105 
2106         i40e_trqpair_t *itrq = arg;
2107         i40e_t *i40e = itrq->itrq_i40e;
2108         i40e_hw_t *hw = &i40e->i40e_hw_space;
2109         i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2110 
2111         ASSERT(mp->b_next == NULL);
2112 
2113         if (!(i40e->i40e_state & I40E_STARTED) ||
2114             (i40e->i40e_state & I40E_OVERTEMP) ||
2115             (i40e->i40e_state & I40E_SUSPENDED) ||
2116             (i40e->i40e_state & I40E_ERROR) ||
2117             (i40e->i40e_link_state != LINK_STATE_UP)) {
2118                 freemsg(mp);
2119                 return (NULL);
2120         }
2121 
2122         /*
2123          * Figure out the relevant context about this frame that we might need
2124          * for enabling checksum, lso, etc. This also fills in information that
2125          * we might set around the packet type, etc.
2126          */
2127         if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) {
2128                 freemsg(mp);
2129                 itrq->itrq_txstat.itxs_err_context.value.ui64++;
2130                 return (NULL);
2131         }
2132 
2133         /*
2134          * For the primordial driver we can punt on doing any recycling right
2135          * now; however, longer term we need to probably do some more pro-active
2136          * recycling to cut back on stalls in the tx path.
2137          */
2138 
2139         /*
2140          * Do a quick size check to make sure it fits into what we think it
2141          * should for this device. Note that longer term this will be false,
2142          * particularly when we have the world of TSO.
2143          */
2144         mpsize = 0;
2145         for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
2146                 mpsize += MBLKL(nmp);
2147         }
2148 
2149         /*
2150          * First we allocate our tx control block and prepare the packet for
2151          * transmit before we do a final check for descriptors. We do it this
2152          * way to minimize the time under the tx lock.
2153          */
2154         tcb = i40e_tcb_alloc(itrq);
2155         if (tcb == NULL) {
2156                 txs->itxs_err_notcb.value.ui64++;
2157                 goto txfail;
2158         }
2159 
2160         /*
2161          * For transmitting a block, we're currently going to use just a
2162          * single control block and bcopy all of the fragments into it. We
2163          * should be more intelligent about doing DMA binding or otherwise, but
2164          * for getting off the ground this will have to do.
2165          */
2166         ASSERT(tcb->tcb_dma.dmab_len == 0);
2167         ASSERT(tcb->tcb_dma.dmab_size >= mpsize);
2168         for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
2169                 size_t clen = MBLKL(nmp);
2170                 void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
2171 
2172                 bcopy(nmp->b_rptr, coff, clen);
2173                 tcb->tcb_dma.dmab_len += clen;
2174         }
2175         ASSERT(tcb->tcb_dma.dmab_len == mpsize);
2176 
2177         /*
2178          * While there's really no need to keep the mp here, but let's just do
2179          * it to help with our own debugging for now.
2180          */
2181         tcb->tcb_mp = mp;
2182         tcb->tcb_type = I40E_TX_COPY;
2183         I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
2184 
2185         mutex_enter(&itrq->itrq_tx_lock);
2186         if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) {
2187                 txs->itxs_err_nodescs.value.ui64++;
2188                 mutex_exit(&itrq->itrq_tx_lock);
2189                 goto txfail;
2190         }
2191 
2192         /*
2193          * Build up the descriptor and send it out. Thankfully at the moment
2194          * we only need a single desc, because we're not doing anything fancy
2195          * yet.
2196          */
2197         ASSERT(itrq->itrq_desc_free > 0);
2198         itrq->itrq_desc_free--;
2199         txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
2200         itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
2201         itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
2202             itrq->itrq_tx_ring_size);
2203 
2204         /*
2205          * Note, we always set EOP and RS which indicates that this is the last
2206          * data frame and that we should ask for it to be transmitted. We also
2207          * must always set ICRC, because that is an internal bit that must be
2208          * set to one for data descriptors. The remaining bits in the command
2209          * descriptor depend on checksumming and are determined based on the
2210          * information set up in i40e_tx_context().
2211          */
2212         type = I40E_TX_DESC_DTYPE_DATA;
2213         cmd = I40E_TX_DESC_CMD_EOP |
2214             I40E_TX_DESC_CMD_RS |
2215             I40E_TX_DESC_CMD_ICRC |
2216             tctx.itc_cmdflags;
2217         txdesc->buffer_addr =
2218             CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address);
2219         txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type |
2220             ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
2221             ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
2222             ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
2223 
2224         /*
2225          * Now, finally, sync the DMA data and alert hardware.
2226          */
2227         I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);
2228 
2229         I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
2230             itrq->itrq_desc_tail);
2231         if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
2232             DDI_FM_OK) {
2233                 /*
2234                  * Note, we can't really go through and clean this up very well,
2235                  * because the memory has been given to the device, so just
2236                  * indicate it's been transmitted.
2237                  */
2238                 ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2239                 atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2240         }
2241 
2242         txs->itxs_bytes.value.ui64 += mpsize;
2243         txs->itxs_packets.value.ui64++;
2244         txs->itxs_descriptors.value.ui64++;
2245 
2246         mutex_exit(&itrq->itrq_tx_lock);
2247 
2248         return (NULL);
2249 
2250 txfail:
2251         /*
2252          * We ran out of resources. Return it to MAC and indicate that we'll
2253          * need to signal MAC. If there are allocated tcb's, return them now.
2254          * Make sure to reset their message block's, since we'll return them
2255          * back to MAC.
2256          */
2257         if (tcb != NULL) {
2258                 tcb->tcb_mp = NULL;
2259                 i40e_tcb_reset(tcb);
2260                 i40e_tcb_free(itrq, tcb);
2261         }
2262 
2263         mutex_enter(&itrq->itrq_tx_lock);
2264         itrq->itrq_tx_blocked = B_TRUE;
2265         mutex_exit(&itrq->itrq_tx_lock);
2266 
2267         return (mp);
2268 }