NEX-20178 Heavy read load using 10G i40e causes network disconnect
MFV illumos-joyent@83a8d0d616db36010b59cc850d1926c0f6a30de1
OS-7457 i40e Tx freezes on zero descriptors
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Rob Johnston <rob.johnston@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
MFV illumos-joyent@0d3f2b61dcfb18edace4fd257054f6fdbe07c99c
OS-7492 i40e Tx freeze when b_cont chain exceeds 8 descriptors
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Rob Johnston <rob.johnston@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
MFV illumos-joyent@b4bede175d4c50ac1b36078a677b69388f6fb59f
OS-7577 initialize FC for i40e
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Rob Johnston <rob.johnston@joyent.com>
MFV: illumos-joyent@61dc3dec4f82a3e13e94609a0a83d5f66c64e760
OS-6846 want i40e multi-group support
OS-7372 i40e_alloc_ring_mem() unwinds when it shouldn't
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Ryan Zezeski <rpz@joyent.com>
MFV: illumos-joyent@6f6fae1b433b461a7b014f48ad94fc7f4927c6ed
OS-7344 i40e Tx freeze caused by off-by-one DMA
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Rob Johnston <rob.johnston@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Ryan Zezeski <rpz@joyent.com>
MFV: illumos-joyent@757454db6669c1186f60bc625510c1b67217aae6
OS-7082 i40e: blown assert in i40e_tx_cleanup_ring()
OS-7086 i40e: add mdb dcmd to dump info on tx descriptor rings
OS-7101 i40e: add kstat to track TX DMA bind failures
Reviewed by: Ryan Zezeski <rpz@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Patrick Mooney <patrick.mooney@joyent.com>
Author: Rob Johnston <rob.johnston@joyent.com>
MFV: illumos-joyent@9e30beee2f0c127bf41868db46257124206e28d6
OS-5225 Want Fortville TSO support
Reviewed by: Ryan Zezeski <rpz@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Patrick Mooney <patrick.mooney@joyent.com>
Author: Rob Johnston <rob.johnston@joyent.com>

          --- old/usr/src/uts/common/io/i40e/i40e_transceiver.c
          +++ new/usr/src/uts/common/io/i40e/i40e_transceiver.c
↓ open down ↓ 3 lines elided ↑ open up ↑
   4    4   * You may only use this file in accordance with the terms of version
   5    5   * 1.0 of the CDDL.
   6    6   *
   7    7   * A full copy of the text of the CDDL should have accompanied this
   8    8   * source.  A copy of the CDDL is also available via the Internet at
   9    9   * http://www.illumos.org/license/CDDL.
  10   10   */
  11   11  
  12   12  /*
  13   13   * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
  14      - * Copyright 2016 Joyent, Inc.
       14 + * Copyright 2019 Joyent, Inc.
  15   15   */
  16   16  
  17   17  #include "i40e_sw.h"
  18   18  
  19   19  /*
  20   20   * ---------------------------------------------------------
  21   21   * Buffer and Memory Management, Receiving, and Transmitting
  22   22   * ---------------------------------------------------------
  23   23   *
  24   24   * Each physical function (PF), which is what we think of as an instance of the
↓ open down ↓ 28 lines elided ↑ open up ↑
  53   53   * size frame is informed primarily through the use of dladm and the setting of
  54   54   * the MTU property on the device. From the MTU, we then go and do some
  55   55   * machinations. The first thing we do is we then have to add in space for the
  56   56   * Ethernet header, potentially a VLAN header, and the FCS check. This value is
  57   57   * what's stored as i40e_t`i40e_frame_max and is derived any time
  58   58   * i40e_t`i40e_sdu changes.
  59   59   *
  60   60   * This size is then rounded up to the nearest 1k chunk, which represents the
  61   61   * actual amount of memory that we'll allocate for a single frame.
  62   62   *
  63      - * Note, that for rx, we do something that might be unexpected. We always add
       63 + * Note, that for RX, we do something that might be unexpected. We always add
  64   64   * an extra two bytes to the frame size that we allocate. We then offset the DMA
  65   65   * address that we receive a packet into by two bytes. This ensures that the IP
  66   66   * header will always be 4 byte aligned because the MAC header is either 14 or
  67   67   * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
  68   68   * and MAC's lives easier.
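
As an editorial aside, the buffer sizing and the two-byte receive offset can be made concrete with a small standalone sketch. The constants and helper names below are illustrative rather than the driver's actual identifiers, and the exact ordering of the rounding versus the pad is simplified:

#include <stdint.h>
#include <stdio.h>

#define ETHER_HDR_LEN   14      /* basic MAC header */
#define VLAN_TAG_LEN    4       /* optional 802.1Q tag */
#define FCS_LEN         4       /* frame check sequence */
#define RX_IP_PAD       2       /* pad so the IP header lands 4-byte aligned */

int
main(void)
{
        uint32_t sdu = 1500;    /* MTU, as set via dladm */

        /* Frame maximum: MTU plus Ethernet header, VLAN tag, and FCS. */
        uint32_t frame_max = sdu + ETHER_HDR_LEN + VLAN_TAG_LEN + FCS_LEN;

        /* The allocation is rounded up to the nearest 1 KiB chunk, and the
         * RX buffer also carries the extra two pad bytes. */
        uint32_t rx_buf_size = ((frame_max + RX_IP_PAD) + 1023) & ~1023u;

        /* Receive DMA starts RX_IP_PAD bytes into the buffer, so a 14-byte
         * (or 18-byte tagged) MAC header leaves the IP header 4-byte
         * aligned. */
        printf("frame_max=%u rx_buf_size=%u rx_dma_offset=%u\n",
            frame_max, rx_buf_size, RX_IP_PAD);
        return (0);
}
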
  69   69   *
  70      - * Both the rx and tx descriptor rings (which are what we use to communicate
       70 + * Both the RX and TX descriptor rings (which are what we use to communicate
  71   71   * with hardware) are allocated as a single region of DMA memory which is the
  72   72   * size of the descriptor (4 bytes and 2 bytes respectively) times the total
  73      - * number of descriptors for an rx and tx ring.
       73 + * number of descriptors for an RX and TX ring.
  74   74   *
  75      - * While the rx and tx descriptors are allocated using DMA-based memory, the
       75 + * While the RX and TX descriptors are allocated using DMA-based memory, the
  76   76   * control blocks for each of them are allocated using normal kernel memory.
  77   77   * They aren't special from a DMA perspective. We'll go over the design of both
  78   78   * receiving and transmitting separately, as they have slightly different
  79   79   * control blocks and different ways that we manage the relationship between
  80   80   * control blocks and descriptors.
  81   81   *
  82   82   * ---------------------------------
  83   83   * RX Descriptors and Control Blocks
  84   84   * ---------------------------------
  85   85   *
↓ open down ↓ 20 lines elided ↑ open up ↑
 106  106   * a message block out of the DMA memory via desballoc(9F) and send that up to
 107  107   * MAC that way. This will cause us to be notified when the message block is
 108  108   * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
 109  109   * it's less than the threshold, we'll try to use allocb and bcopy it into the
 110  110   * block, thus allowing us to immediately reuse the DMA resource. Note, on debug
 111  111   * builds, we allow someone to whack the variable i40e_debug_rx_mode to override
 112  112   * the behavior and always do a bcopy or a DMA bind.
 113  113   *
 114  114   * To try and ensure that the device always has blocks that it can receive data
 115  115   * into, we maintain two lists of control blocks, a working list and a free
 116      - * list. Each list is sized equal to the number of descriptors in the rx ring.
 117      - * During the GLDv3 mc_start routine, we allocate a number of rx control blocks
      116 + * list. Each list is sized equal to the number of descriptors in the RX ring.
      117 + * During the GLDv3 mc_start routine, we allocate a number of RX control blocks
 118  118   * equal to twice the number of descriptors in the ring and we assign them
 119  119   * equally to the free list and to the working list. Each control block also has
 120  120   * DMA memory allocated and associated with which it will be used to receive the
 121  121   * actual packet data. All of a received frame's data will end up in a single
 122  122   * DMA buffer.
 123  123   *
 124      - * During operation, we always maintain the invariant that each rx descriptor
 125      - * has an associated rx control block which lives in the working list. If we
      124 + * During operation, we always maintain the invariant that each RX descriptor
      125 + * has an associated RX control block which lives in the working list. If we
 126  126   * feel that we should loan up DMA memory to MAC in the form of a message block,
 127  127   * we can only do so if we can maintain this invariant. To do that, we swap in
 128  128   * one of the buffers from the free list. If none are available, then we resort
 129  129   * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
 130  130   * size.
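
To illustrate the invariant just described, here is a hedged, userland-style sketch of the loan-or-copy decision for a single received frame; the function name, threshold value, and free-list count are illustrative stand-ins, not the driver's own:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Loan the DMA buffer up to MAC only if a free control block exists to
 * swap onto the work list; otherwise (or for small frames) copy the data
 * so that the descriptor keeps its buffer.
 */
static bool
rx_should_loan(size_t frame_len, size_t copy_threshold, unsigned free_rcbs)
{
        if (frame_len <= copy_threshold)
                return (false);         /* small frame: allocb(9F) + bcopy(9F) */
        if (free_rcbs == 0)
                return (false);         /* nothing to swap in: must copy */
        return (true);                  /* loan the desballoc(9F)'d mblk to MAC */
}

int
main(void)
{
        printf("%d\n", rx_should_loan(64, 256, 8));     /* 0: copy */
        printf("%d\n", rx_should_loan(1514, 256, 0));   /* 0: copy, free list empty */
        printf("%d\n", rx_should_loan(1514, 256, 8));   /* 1: loan */
        return (0);
}
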
 131  131   *
 132  132   * Loaned message blocks come back to use when freemsg(9F) or freeb(9F) is
 133      - * called on the block, at which point we restore the rx control block to the
      133 + * called on the block, at which point we restore the RX control block to the
 134  134   * free list and are able to reuse the DMA memory again. While the scheme may
 135  135   * seem odd, it importantly keeps us out of trying to do any DMA allocations in
 136  136   * the normal path of operation, even though we may still have to allocate
 137  137   * message blocks and copy.
 138  138   *
 139      - * The following state machine describes the life time of a rx control block. In
 140      - * the diagram we abbrviate the rx ring descriptor entry as rxd and the rx
       139 + * The following state machine describes the lifetime of an RX control block. In
       140 + * the diagram we abbreviate the RX ring descriptor entry as rxd and the RX
 141  141   * control block entry as rcb.
 142  142   *
 143  143   *             |                                   |
 144  144   *             * ... 1/2 of all initial rcb's  ... *
 145  145   *             |                                   |
 146  146   *             v                                   v
 147  147   *     +------------------+               +------------------+
 148  148   *     | rcb on free list |---*---------->| rcb on work list |
 149  149   *     +------------------+   .           +------------------+
 150  150   *             ^              . moved to          |
↓ open down ↓ 2 lines elided ↑ open up ↑
 153  153   *             |                MAC + co.         |     available. rcb's
 154  154   *             |                                  |     memory made into mblk_t
 155  155   *             * . freemsg(9F)                    |     and sent up to MAC.
 156  156   *             |   called on                      |
 157  157   *             |   loaned rcb                     |
 158  158   *             |   and it is                      v
 159  159   *             |   recycled.              +-------------------+
 160  160   *             +--------------------<-----| rcb loaned to MAC |
 161  161   *                                        +-------------------+
 162  162   *
 163      - * Finally, note that every rx control block has a reference count on it. One
      163 + * Finally, note that every RX control block has a reference count on it. One
 164  164   * reference is added as long as the driver has had the GLDv3 mc_start endpoint
 165  165   * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and
 166  166   * no other DLPI consumers remain, then we'll decrement the reference count by
 167      - * one. Whenever we loan up the rx control block and associated buffer to MAC,
      167 + * one. Whenever we loan up the RX control block and associated buffer to MAC,
 168  168   * then we bump the reference count again. Even though the device is stopped,
 169  169   * there may still be loaned frames in upper levels that we'll want to account
 170  170   * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure
 171  171   * that it is cleaned up.
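
The reference-count rule above (one count while the device is started, one per loaned buffer, teardown on the last release) can be modeled with a plain atomic counter. This is a hedged sketch using C11 atomics in place of the kernel's atomic routines, with illustrative names:

#include <stdatomic.h>
#include <stdio.h>

typedef struct rx_data {
        atomic_uint rcb_refcnt;    /* 1 for the started device + 1 per loaned rcb */
} rx_data_t;

/* Called when a loaned mblk is returned via freemsg(9F)/freeb(9F), and
 * once more when the device is finally stopped. */
static void
rx_data_release(rx_data_t *rxd)
{
        if (atomic_fetch_sub(&rxd->rcb_refcnt, 1) == 1) {
                /* Last reference gone: safe to tear down all RX data. */
                printf("tearing down RX data\n");
        }
}

int
main(void)
{
        rx_data_t rxd;

        atomic_init(&rxd.rcb_refcnt, 2);    /* device started + one loaned buffer */
        rx_data_release(&rxd);              /* loaned buffer comes home */
        rx_data_release(&rxd);              /* mc_stop drops the last reference */
        return (0);
}
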
 172  172   *
 173  173   * --------------------
 174  174   * Managing the RX Ring
 175  175   * --------------------
 176  176   *
 177  177   * The receive ring descriptors are arranged in a circular buffer with a head
↓ open down ↓ 7 lines elided ↑ open up ↑
 185  185   *
 186  186   * The initial head is configured to be zero by writing it as such in the
 187  187   * receive queue context in the FPM (function private memory from the host). The
 188  188   * initial tail is written to be the last descriptor. This is written to via the
 189  189   * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
 190  190   * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
 191  191   * the only values we ever consult ourselves are the TAIL register and our own
 192  192   * state tracking. Effectively, we cache the HEAD register and then update it
 193  193   * ourselves based on our work.
 194  194   *
 195      - * When we iterate over the rx descriptors and thus the received frames, we are
      195 + * When we iterate over the RX descriptors and thus the received frames, we are
 196  196   * either in an interrupt context or we've been asked by MAC to poll on the
 197  197   * ring. If we've been asked to poll on the ring, we have a maximum number of
 198      - * bytes of mblk_t's to return. If processing an rx descriptor would cause us to
      198 + * bytes of mblk_t's to return. If processing an RX descriptor would cause us to
 199  199   * exceed that count, then we do not process it. When in interrupt context, we
 200  200   * don't have a strict byte count. However, to ensure liveness, we limit the
 201  201   * amount of data based on a configuration value
 202  202   * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
 203  203   * is based on similar numbers that are used for ixgbe. After some additional
 204  204   * time in the field, we'll have a sense as to whether or not it should be
 205  205   * changed.
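
A hedged sketch of the byte-limited processing loop follows; the frame lengths and budgets are made up, and in the driver the budget is either MAC's poll limit or i40e_t`i40e_rx_limit_per_intr:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the lengths reported by RX descriptor write-back. */
static const uint32_t frame_len[] = { 1514, 1514, 9000, 1514, 60 };
#define NFRAMES (sizeof (frame_len) / sizeof (frame_len[0]))

/*
 * Process descriptors until taking the next frame would exceed the byte
 * budget; the remaining descriptors wait for the next poll or interrupt.
 */
static uint32_t
rx_ring_work(uint32_t byte_budget)
{
        uint32_t done = 0, bytes = 0;

        for (uint32_t i = 0; i < NFRAMES; i++) {
                if (bytes + frame_len[i] > byte_budget)
                        break;
                bytes += frame_len[i];
                done++;
        }
        printf("budget %u: processed %u descriptors, %u bytes\n",
            byte_budget, done, bytes);
        return (done);
}

int
main(void)
{
        (void) rx_ring_work(4096);      /* e.g. a MAC poll budget */
        (void) rx_ring_work(65536);     /* e.g. a per-interrupt limit */
        return (0);
}
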
 206  206   *
 207  207   * When processing, we start at our own HEAD pointer
 208  208   * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
↓ open down ↓ 33 lines elided ↑ open up ↑
 242  242   * do a single bulk write for the ring.
 243  243   *
 244  244   * ---------------------------------
 245  245   * TX Descriptors and Control Blocks
 246  246   * ---------------------------------
 247  247   *
 248  248   * While the transmit path is similar in spirit to the receive path, it works
 249  249   * differently due to the fact that all data is originated by the operating
 250  250   * system and not by the device.
 251  251   *
 252      - * Like rx, there is both a descriptor ring that we use to communicate to the
 253      - * driver and which points to the memory used to transmit a frame. Similarly,
 254      - * there is a corresponding transmit control block. Each transmit control block
 255      - * has a region of DMA memory allocated to it; however, the way we use it
 256      - * varies.
      252 + * Like RX, there is both a descriptor ring that we use to communicate to the
      253 + * driver and which points to the memory used to transmit a frame.  Similarly,
       254 + * there is a corresponding transmit control block; however, the correspondence
      255 + * between descriptors and control blocks is more complex and not necessarily
      256 + * 1-to-1.
 257  257   *
 258  258   * The driver is asked to process a single frame at a time. That message block
 259  259   * may be made up of multiple fragments linked together by the mblk_t`b_cont
 260  260   * member. The device has a hard limit of up to 8 buffers being allowed for use
 261      - * for a single logical frame. For each fragment, we'll try and use an entry
 262      - * from the tx descriptor ring and then we'll allocate a corresponding tx
 263      - * control block. Depending on the size of the fragment, we may copy it around
 264      - * or we might instead try to do DMA binding of the fragment.
       261 + * for a single non-LSO packet or LSO segment. The number of TX ring entries
      262 + * (and thus TX control blocks) used depends on the fragment sizes and DMA
      263 + * layout, as explained below.
 265  264   *
 266      - * If we exceed the number of blocks that fit, we'll try to pull up the block
 267      - * and then we'll do a DMA bind and send it out.
      265 + * We alter our DMA strategy based on a threshold tied to the fragment size.
      266 + * This threshold is configurable via the tx_dma_threshold property. If the
      267 + * fragment is above the threshold, we DMA bind it -- consuming one TCB and
      268 + * potentially several data descriptors. The exact number of descriptors (equal
      269 + * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset
      270 + * into page, b_wptr offset into page, and the physical layout of the dblk's
      271 + * memory (contiguous or not). Essentially, we are at the mercy of the DMA
      272 + * engine and the dblk's memory allocation. Knowing the exact number of
      273 + * descriptors up front is a task best not taken on by the driver itself.
      274 + * Instead, we attempt to DMA bind the fragment and verify the descriptor
      275 + * layout meets hardware constraints. If the proposed DMA bind does not satisfy
       276 + * the hardware constraints, then we discard it and instead copy the entire
      277 + * fragment into the pre-allocated TCB buffer (or buffers if the fragment is
      278 + * larger than the TCB buffer).
 268  279   *
 269      - * If we don't have enough space in the ring or tx control blocks available,
      280 + * If the fragment is below or at the threshold, we copy it to the pre-allocated
      281 + * buffer of a TCB. We compress consecutive copy fragments into a single TCB to
      282 + * conserve resources. We are guaranteed that the TCB buffer is made up of only
       283 + * 1 DMA cookie and therefore consumes only one descriptor on the controller.
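
To summarize the two paths just described, here is a hedged sketch of the per-fragment decision; the threshold constant is illustrative, and the cookie count stands in for the result of a ddi_dma_addr_bind_handle(9F) attempt:

#include <stddef.h>
#include <stdio.h>

#define TX_DMA_THRESHOLD     256  /* illustrative; tunable via tx_dma_threshold */
#define MAX_COOKIES_PER_PKT    8  /* hardware limit of 8 buffers per packet/segment */

typedef enum { TX_COPY, TX_BIND } tx_method_t;

/*
 * Small fragments are copied into the TCB's pre-allocated buffer (one
 * cookie, one descriptor).  Larger fragments are DMA bound, but only if
 * the resulting cookie layout fits the hardware limit; otherwise we fall
 * back to copying.
 */
static tx_method_t
tx_choose_method(size_t frag_len, unsigned cookies_if_bound)
{
        if (frag_len <= TX_DMA_THRESHOLD)
                return (TX_COPY);
        if (cookies_if_bound > MAX_COOKIES_PER_PKT)
                return (TX_COPY);       /* proposed bind violates constraints */
        return (TX_BIND);
}

int
main(void)
{
        printf("%d\n", tx_choose_method(64, 1));        /* copy */
        printf("%d\n", tx_choose_method(8192, 3));      /* bind */
        printf("%d\n", tx_choose_method(8192, 12));     /* copy fallback */
        return (0);
}
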
      284 + *
      285 + * Furthermore, if the frame requires HW offloads such as LSO, tunneling or
       286 + * filtering, then the TX data descriptors must be preceded by a single TX
      287 + * context descriptor.  Because there is no DMA transfer associated with the
      288 + * context descriptor, we allocate a control block with a special type which
      289 + * indicates to the TX ring recycle code that there are no associated DMA
      290 + * resources to unbind when the control block is free'd.
      291 + *
      292 + * If we don't have enough space in the ring or TX control blocks available,
 270  293   * then we'll return the unprocessed message block to MAC. This will induce flow
 271  294   * control and once we recycle enough entries, we'll once again enable sending
 272  295   * on the ring.
 273  296   *
 274  297   * We size the working list as equal to the number of descriptors in the ring.
 275  298   * We size the free list as equal to 1.5 times the number of descriptors in the
 276      - * ring. We'll allocate a number of tx control block entries equal to the number
      299 + * ring. We'll allocate a number of TX control block entries equal to the number
 277  300   * of entries in the free list. By default, all entries are placed in the free
 278  301   * list. As we come along and try to send something, we'll allocate entries from
 279  302   * the free list and add them to the working list, where they'll stay until the
 280  303   * hardware indicates that all of the data has been written back to us. The
 281  304   * reason that we start with 1.5x is to help facilitate having more than one TX
 282  305   * buffer associated with the DMA activity.
 283  306   *
 284  307   * --------------------
 285  308   * Managing the TX Ring
 286  309   * --------------------
↓ open down ↓ 31 lines elided ↑ open up ↑
 318  341   * The following image describes where a given transmit control block lives in
 319  342   * its lifetime:
 320  343   *
 321  344   *             |
 322  345   *             * ... Initial placement for all tcb's
 323  346   *             |
 324  347   *             v
 325  348   *    +------------------+                       +------------------+
 326  349   *    | tcb on free list |---*------------------>| tcb on work list |
 327  350   *    +------------------+   .                   +------------------+
 328      - *             ^             . tcb allocated               |
      351 + *             ^             . N tcbs allocated[1]         |
 329  352   *             |               to send frame               v
 330  353   *             |               or fragment on              |
 331  354   *             |               wire, mblk from             |
 332  355   *             |               MAC associated.             |
 333  356   *             |                                           |
 334  357   *             +------*-------------------------------<----+
 335  358   *                    .
 336  359   *                    . Hardware indicates
 337  360   *                      entry transmitted.
 338      - *                      tcb recycled, mblk
      361 + *                      tcbs recycled, mblk
 339  362   *                      from MAC freed.
 340  363   *
      364 + * [1] We allocate N tcbs to transmit a single frame where N can be 1 context
      365 + *     descriptor plus 1 data descriptor, in the non-DMA-bind case.  In the DMA
      366 + *     bind case, N can be 1 context descriptor plus 1 data descriptor per
      367 + *     b_cont in the mblk.  In this case, the mblk is associated with the first
      368 + *     data descriptor and freed as part of freeing that data descriptor.
      369 + *
 341  370   * ------------
 342  371   * Blocking MAC
 343  372   * ------------
 344  373   *
 345      - * Wen performing transmit, we can run out of descriptors and ring entries. When
 346      - * such a case happens, we return the mblk_t to MAC to indicate that we've been
 347      - * blocked. At that point in time, MAC becomes blocked and will not transmit
 348      - * anything out that specific ring until we notify MAC. To indicate that we're
 349      - * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE.
      374 + * When performing transmit, we can run out of descriptors and ring entries.
      375 + * When such a case happens, we return the mblk_t to MAC to indicate that we've
      376 + * been blocked. At that point in time, MAC becomes blocked and will not
      377 + * transmit anything out that specific ring until we notify MAC. To indicate
      378 + * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member
      379 + * to B_TRUE.
 350  380   *
 351      - * When we recycle tx descriptors then we'll end up signaling MAC by calling
      381 + * When we recycle TX descriptors then we'll end up signaling MAC by calling
 352  382   * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
 353  383   * start sending frames out to us again.
 354  384   */
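
Finally, a hedged model of the flow-control handshake described in the Blocking MAC section above; the structure and fields are simplified stand-ins for i40e_trqpair_t, and the printf stands in for the mac_tx_ring_update() notification:

#include <stdbool.h>
#include <stdio.h>

typedef struct trqpair {
        unsigned tx_desc_free;
        bool     tx_blocked;
} trqpair_t;

/* Returns false and marks the ring blocked when it can't take the frame;
 * MAC keeps the mblk and stops sending on this ring. */
static bool
tx_try_send(trqpair_t *t, unsigned desc_needed)
{
        if (t->tx_desc_free < desc_needed) {
                t->tx_blocked = true;
                return (false);
        }
        t->tx_desc_free -= desc_needed;
        return (true);
}

/* Called as descriptors are recycled after hardware write-back. */
static void
tx_recycle(trqpair_t *t, unsigned recycled)
{
        t->tx_desc_free += recycled;
        if (t->tx_blocked) {
                t->tx_blocked = false;
                printf("ring unblocked: notify MAC\n");  /* mac_tx_ring_update() */
        }
}

int
main(void)
{
        trqpair_t t = { .tx_desc_free = 1, .tx_blocked = false };

        (void) tx_try_send(&t, 3);      /* not enough descriptors: block */
        tx_recycle(&t, 8);              /* recycling unblocks and notifies MAC */
        return (0);
}
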
 355  385  
 356  386  /*
 357  387   * We set our DMA alignment requests based on the smallest supported page size
 358  388   * of the corresponding platform.
 359  389   */
 360  390  #if     defined(__sparc)
 361  391  #define I40E_DMA_ALIGNMENT 0x2000ull
 362  392  #elif defined(__x86)
 363  393  #define I40E_DMA_ALIGNMENT 0x1000ull
 364  394  #else
 365  395  #error  "unknown architecture for i40e"
 366  396  #endif
 367  397  
 368  398  /*
 369  399   * This structure is used to maintain information and flags related to
 370      - * transmitting a frame. The first member is the set of flags we need to or into
 371      - * the command word (generally checksumming related). The second member controls
 372      - * the word offsets which is required for IP and L4 checksumming.
      400 + * transmitting a frame.  These fields are ultimately used to construct the
      401 + * TX data descriptor(s) and, if necessary, the TX context descriptor.
 373  402   */
 374  403  typedef struct i40e_tx_context {
 375      -        enum i40e_tx_desc_cmd_bits      itc_cmdflags;
 376      -        uint32_t                        itc_offsets;
      404 +        enum i40e_tx_desc_cmd_bits      itc_data_cmdflags;
      405 +        uint32_t                        itc_data_offsets;
      406 +        enum i40e_tx_ctx_desc_cmd_bits  itc_ctx_cmdflags;
      407 +        uint32_t                        itc_ctx_tsolen;
      408 +        uint32_t                        itc_ctx_mss;
 377  409  } i40e_tx_context_t;
 378  410  
 379  411  /*
 380  412   * Toggles on debug builds which can be used to override our RX behaviour based
 381  413   * on thresholds.
 382  414   */
 383  415  #ifdef  DEBUG
 384  416  typedef enum {
 385  417          I40E_DEBUG_RX_DEFAULT   = 0,
 386  418          I40E_DEBUG_RX_BCOPY     = 1,
↓ open down ↓ 1 lines elided ↑ open up ↑
 388  420  } i40e_debug_rx_t;
 389  421  
 390  422  i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
 391  423  #endif  /* DEBUG */
 392  424  
 393  425  /*
 394  426   * Notes on the following pair of DMA attributes. The first attribute,
 395  427   * i40e_static_dma_attr, is designed to be used for both the descriptor rings
 396  428   * and the static buffers that we associate with control blocks. For this
 397  429   * reason, we force an SGL length of one. While technically the driver supports
 398      - * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our
      430 + * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our
 399  431   * management here. In addition, when the Intel common code wants to allocate
 400  432   * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
 401  433   * the static dma attr.
 402  434   *
 403      - * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're
 404      - * binding a bunch of mblk_t fragments to go out the door. Note that the main
 405      - * difference here is that we're allowed a larger SGL length -- eight.
       435 + * The latter two sets of attributes are what we use when we're binding a
      436 + * bunch of mblk_t fragments to go out the door. Note that the main difference
      437 + * here is that we're allowed a larger SGL length.  For non-LSO TX, we
      438 + * restrict the SGL length to match the number of TX buffers available to the
      439 + * PF (8).  For the LSO case we can go much larger, with the caveat that each
      440 + * MSS-sized chunk (segment) must not span more than 8 data descriptors and
      441 + * hence must not span more than 8 cookies.
 406  442   *
 407  443   * Note, we default to setting ourselves to be DMA capable here. However,
 408  444   * because we could have multiple instances which have different FMA error
 409  445   * checking capabilities, or end up on different buses, we make these static
 410  446   * and const and copy them into the i40e_t for the given device with the actual
 411  447   * values that reflect the actual capabilities.
 412  448   */
 413  449  static const ddi_dma_attr_t i40e_g_static_dma_attr = {
 414  450          DMA_ATTR_V0,                    /* version number */
 415  451          0x0000000000000000ull,          /* low address */
↓ open down ↓ 6 lines elided ↑ open up ↑
 422  458          0xFFFFFFFFFFFFFFFFull,          /* maximum segment size */
 423  459          1,                              /* scatter/gather list length */
 424  460          0x00000001,                     /* granularity */
 425  461          DDI_DMA_FLAGERR                 /* DMA flags */
 426  462  };
 427  463  
 428  464  static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
 429  465          DMA_ATTR_V0,                    /* version number */
 430  466          0x0000000000000000ull,          /* low address */
 431  467          0xFFFFFFFFFFFFFFFFull,          /* high address */
 432      -        0x00000000FFFFFFFFull,          /* dma counter max */
      468 +        I40E_MAX_TX_BUFSZ - 1,          /* dma counter max */
 433  469          I40E_DMA_ALIGNMENT,             /* alignment */
 434  470          0x00000FFF,                     /* burst sizes */
 435  471          0x00000001,                     /* minimum transfer size */
 436  472          0x00000000FFFFFFFFull,          /* maximum transfer size */
 437  473          0xFFFFFFFFFFFFFFFFull,          /* maximum segment size  */
 438  474          I40E_TX_MAX_COOKIE,             /* scatter/gather list length */
 439  475          0x00000001,                     /* granularity */
 440  476          DDI_DMA_FLAGERR                 /* DMA flags */
 441  477  };
 442  478  
      479 +static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = {
      480 +        DMA_ATTR_V0,                    /* version number */
      481 +        0x0000000000000000ull,          /* low address */
      482 +        0xFFFFFFFFFFFFFFFFull,          /* high address */
      483 +        I40E_MAX_TX_BUFSZ - 1,          /* dma counter max */
      484 +        I40E_DMA_ALIGNMENT,             /* alignment */
      485 +        0x00000FFF,                     /* burst sizes */
      486 +        0x00000001,                     /* minimum transfer size */
      487 +        0x00000000FFFFFFFFull,          /* maximum transfer size */
      488 +        0xFFFFFFFFFFFFFFFFull,          /* maximum segment size  */
      489 +        I40E_TX_LSO_MAX_COOKIE,         /* scatter/gather list length */
      490 +        0x00000001,                     /* granularity */
      491 +        DDI_DMA_FLAGERR                 /* DMA flags */
      492 +};
      493 +
 443  494  /*
 444  495   * Next, we have the attributes for these structures. The descriptor rings are
 445  496   * all strictly little endian, while the data buffers are just arrays of bytes
 446  497   * representing frames. Because of this, we purposefully simplify the driver
 447  498   * programming life by programming the descriptor ring as little endian, while
 448  499   * for the buffer data we keep it as unstructured.
 449  500   *
 450  501   * Note, that to keep the Intel common code operating in a reasonable way, when
 451  502   * we allocate DMA memory for it, we do not use byte swapping and thus use the
 452  503   * standard i40e_buf_acc_attr.
↓ open down ↓ 208 lines elided ↑ open up ↑
 661  712          rxd->rxd_i40e = i40e;
 662  713  
 663  714          rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
 664  715          rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;
 665  716  
 666  717          rxd->rxd_rcb_free = rxd->rxd_free_list_size;
 667  718  
 668  719          rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
 669  720              rxd->rxd_ring_size, KM_NOSLEEP);
 670  721          if (rxd->rxd_work_list == NULL) {
 671      -                i40e_error(i40e, "failed to allocate rx work list for a ring "
      722 +                i40e_error(i40e, "failed to allocate RX work list for a ring "
 672  723                      "of %d entries for ring %d", rxd->rxd_ring_size,
 673  724                      itrq->itrq_index);
 674  725                  goto cleanup;
 675  726          }
 676  727  
 677  728          rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
 678  729              rxd->rxd_free_list_size, KM_NOSLEEP);
 679  730          if (rxd->rxd_free_list == NULL) {
 680      -                i40e_error(i40e, "failed to allocate a %d entry rx free list "
      731 +                i40e_error(i40e, "failed to allocate a %d entry RX free list "
 681  732                      "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
 682  733                  goto cleanup;
 683  734          }
 684  735  
 685  736          rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
 686  737              (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
 687  738          if (rxd->rxd_rcb_area == NULL) {
 688  739                  i40e_error(i40e, "failed to allocate a %d entry rcb area for "
 689  740                      "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
 690  741                      itrq->itrq_index);
↓ open down ↓ 67 lines elided ↑ open up ↑
 758  809   */
 759  810  static boolean_t
 760  811  i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
 761  812  {
 762  813          int i, count;
 763  814          size_t dmasz;
 764  815          i40e_rx_control_block_t *rcb;
 765  816          i40e_t *i40e = rxd->rxd_i40e;
 766  817  
 767  818          /*
 768      -         * First allocate the rx descriptor ring.
      819 +         * First allocate the RX descriptor ring.
 769  820           */
 770  821          dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
 771  822          VERIFY(dmasz > 0);
 772  823          if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
 773  824              &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
 774  825              B_TRUE, dmasz) == B_FALSE) {
 775  826                  i40e_error(i40e, "failed to allocate DMA resources "
 776      -                    "for rx descriptor ring");
      827 +                    "for RX descriptor ring");
 777  828                  return (B_FALSE);
 778  829          }
 779  830          rxd->rxd_desc_ring =
 780  831              (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
 781  832          rxd->rxd_desc_next = 0;
 782  833  
 783  834          count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
 784  835          rcb = rxd->rxd_rcb_area;
 785  836  
 786  837          dmasz = i40e->i40e_rx_buf_size;
↓ open down ↓ 5 lines elided ↑ open up ↑
 792  843                  if (i < rxd->rxd_ring_size) {
 793  844                          rxd->rxd_work_list[i] = rcb;
 794  845                  } else {
 795  846                          rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb;
 796  847                  }
 797  848  
 798  849                  dmap = &rcb->rcb_dma;
 799  850                  if (i40e_alloc_dma_buffer(i40e, dmap,
 800  851                      &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
 801  852                      B_TRUE, B_FALSE, dmasz) == B_FALSE) {
 802      -                        i40e_error(i40e, "failed to allocate rx dma buffer");
      853 +                        i40e_error(i40e, "failed to allocate RX dma buffer");
 803  854                          return (B_FALSE);
 804  855                  }
 805  856  
 806  857                  /*
 807  858                   * Initialize the control block and offset the DMA address. See
 808  859                   * the note in the big theory statement that explains how this
 809  860                   * helps IP deal with alignment. Note, we don't worry about
 810  861                   * whether or not we successfully get an mblk_t from desballoc,
 811  862                   * it's a common case that we have to handle later on in the
 812  863                   * system.
↓ open down ↓ 21 lines elided ↑ open up ↑
 834  885          if (itrq->itrq_tcb_area != NULL) {
 835  886                  uint32_t i;
 836  887                  i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area;
 837  888  
 838  889                  for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
 839  890                          i40e_free_dma_buffer(&tcb->tcb_dma);
 840  891                          if (tcb->tcb_dma_handle != NULL) {
 841  892                                  ddi_dma_free_handle(&tcb->tcb_dma_handle);
 842  893                                  tcb->tcb_dma_handle = NULL;
 843  894                          }
      895 +                        if (tcb->tcb_lso_dma_handle != NULL) {
      896 +                                ddi_dma_free_handle(&tcb->tcb_lso_dma_handle);
      897 +                                tcb->tcb_lso_dma_handle = NULL;
      898 +                        }
 844  899                  }
 845  900  
 846  901                  fsz = sizeof (i40e_tx_control_block_t) *
 847  902                      itrq->itrq_tx_free_list_size;
 848  903                  kmem_free(itrq->itrq_tcb_area, fsz);
 849  904                  itrq->itrq_tcb_area = NULL;
 850  905          }
 851  906  
 852  907          if (itrq->itrq_tcb_free_list != NULL) {
 853  908                  fsz = sizeof (i40e_tx_control_block_t *) *
↓ open down ↓ 20 lines elided ↑ open up ↑
 874  929          int i, ret;
 875  930          size_t dmasz;
 876  931          i40e_tx_control_block_t *tcb;
 877  932          i40e_t *i40e = itrq->itrq_i40e;
 878  933  
 879  934          itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
 880  935          itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
 881  936              (i40e->i40e_tx_ring_size >> 1);
 882  937  
 883  938          /*
 884      -         * Allocate an additional tx descriptor for the writeback head.
      939 +         * Allocate an additional TX descriptor for the writeback head.
 885  940           */
 886  941          dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
 887  942          dmasz += sizeof (i40e_tx_desc_t);
 888  943  
 889  944          VERIFY(dmasz > 0);
 890  945          if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
 891  946              &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
 892  947              B_FALSE, B_TRUE, dmasz) == B_FALSE) {
 893      -                i40e_error(i40e, "failed to allocate DMA resources for tx "
      948 +                i40e_error(i40e, "failed to allocate DMA resources for TX "
 894  949                      "descriptor ring");
 895  950                  return (B_FALSE);
 896  951          }
 897  952          itrq->itrq_desc_ring =
 898  953              (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
 899  954          itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
 900  955              itrq->itrq_tx_ring_size);
 901  956          itrq->itrq_desc_head = 0;
 902  957          itrq->itrq_desc_tail = 0;
 903  958          itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
 904  959  
 905  960          itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
 906  961              sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
 907  962          if (itrq->itrq_tcb_work_list == NULL) {
 908      -                i40e_error(i40e, "failed to allocate a %d entry tx work list "
      963 +                i40e_error(i40e, "failed to allocate a %d entry TX work list "
 909  964                      "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
 910  965                  goto cleanup;
 911  966          }
 912  967  
 913  968          itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
 914  969              sizeof (i40e_tx_control_block_t *), KM_SLEEP);
 915  970          if (itrq->itrq_tcb_free_list == NULL) {
 916      -                i40e_error(i40e, "failed to allocate a %d entry tx free list "
      971 +                i40e_error(i40e, "failed to allocate a %d entry TX free list "
 917  972                      "for ring %d", itrq->itrq_tx_free_list_size,
 918  973                      itrq->itrq_index);
 919  974                  goto cleanup;
 920  975          }
 921  976  
 922  977          /*
 923      -         * We allocate enough tx control blocks to cover the free list.
      978 +         * We allocate enough TX control blocks to cover the free list.
 924  979           */
 925  980          itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
 926  981              itrq->itrq_tx_free_list_size, KM_NOSLEEP);
 927  982          if (itrq->itrq_tcb_area == NULL) {
 928  983                  i40e_error(i40e, "failed to allocate a %d entry tcb area for "
 929  984                      "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
 930  985                  goto cleanup;
 931  986          }
 932  987  
 933  988          /*
↓ open down ↓ 7 lines elided ↑ open up ↑
 941  996  
 942  997                  /*
 943  998                   * Allocate both a DMA buffer which we'll use for when we copy
 944  999                   * packets for transmission and allocate a DMA handle which
 945 1000                   * we'll use when we bind data.
 946 1001                   */
 947 1002                  ret = ddi_dma_alloc_handle(i40e->i40e_dip,
 948 1003                      &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
 949 1004                      &tcb->tcb_dma_handle);
 950 1005                  if (ret != DDI_SUCCESS) {
 951      -                        i40e_error(i40e, "failed to allocate DMA handle for tx "
     1006 +                        i40e_error(i40e, "failed to allocate DMA handle for TX "
 952 1007                              "data binding on ring %d: %d", itrq->itrq_index,
 953 1008                              ret);
 954 1009                          tcb->tcb_dma_handle = NULL;
 955 1010                          goto cleanup;
 956 1011                  }
 957 1012  
     1013 +                ret = ddi_dma_alloc_handle(i40e->i40e_dip,
     1014 +                    &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL,
     1015 +                    &tcb->tcb_lso_dma_handle);
     1016 +                if (ret != DDI_SUCCESS) {
     1017 +                        i40e_error(i40e, "failed to allocate DMA handle for TX "
     1018 +                            "LSO data binding on ring %d: %d", itrq->itrq_index,
     1019 +                            ret);
     1020 +                        tcb->tcb_lso_dma_handle = NULL;
     1021 +                        goto cleanup;
     1022 +                }
     1023 +
 958 1024                  if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
 959 1025                      &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
 960 1026                      B_TRUE, B_FALSE, dmasz) == B_FALSE) {
 961 1027                          i40e_error(i40e, "failed to allocate %ld bytes of "
 962      -                            "DMA for tx data binding on ring %d", dmasz,
     1028 +                            "DMA for TX data binding on ring %d", dmasz,
 963 1029                              itrq->itrq_index);
 964 1030                          goto cleanup;
 965 1031                  }
 966 1032  
 967 1033                  itrq->itrq_tcb_free_list[i] = tcb;
 968 1034          }
 969 1035  
 970 1036          itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
 971 1037  
 972 1038          return (B_TRUE);
↓ open down ↓ 9 lines elided ↑ open up ↑
 982 1048   */
 983 1049  void
 984 1050  i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init)
 985 1051  {
 986 1052          int i;
 987 1053  
 988 1054          for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
 989 1055                  i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata;
 990 1056  
 991 1057                  /*
 992      -                 * Clean up our rx data. We have to free DMA resources first and
     1058 +                 * In some cases i40e_alloc_rx_data() may have failed
     1059 +                 * and in that case there is no rxd to free.
     1060 +                 */
     1061 +                if (rxd == NULL)
     1062 +                        continue;
     1063 +
     1064 +                /*
     1065 +                 * Clean up our RX data. We have to free DMA resources first and
 993 1066                   * then if we have no more pending RCB's, then we'll go ahead
 994 1067                   * and clean things up. Note, we can't set the stopped flag on
 995      -                 * the rx data until after we've done the first pass of the
     1068 +                 * the RX data until after we've done the first pass of the
 996 1069                   * pending resources. Otherwise we might race with
 997 1070                   * i40e_rx_recycle on determining who should free the
 998 1071                   * i40e_rx_data_t above.
 999 1072                   */
1000 1073                  i40e_free_rx_dma(rxd, failed_init);
1001 1074  
1002 1075                  mutex_enter(&i40e->i40e_rx_pending_lock);
1003 1076                  rxd->rxd_shutdown = B_TRUE;
1004 1077                  if (rxd->rxd_rcb_pending == 0) {
1005 1078                          i40e_free_rx_data(rxd);
↓ open down ↓ 42 lines elided ↑ open up ↑
1048 1121   * capabilities, we copy the DMA attributes into the i40e_t and set them that
1049 1122   * way and use them for determining attributes.
1050 1123   */
1051 1124  void
1052 1125  i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
1053 1126  {
1054 1127          bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
1055 1128              sizeof (ddi_dma_attr_t));
1056 1129          bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
1057 1130              sizeof (ddi_dma_attr_t));
     1131 +        bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr,
     1132 +            sizeof (ddi_dma_attr_t));
1058 1133          bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
1059 1134              sizeof (ddi_device_acc_attr_t));
1060 1135          bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
1061 1136              sizeof (ddi_device_acc_attr_t));
1062 1137  
1063 1138          if (fma == B_TRUE) {
1064 1139                  i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
1065 1140                  i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
     1141 +                i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |=
     1142 +                    DDI_DMA_FLAGERR;
1066 1143          } else {
1067 1144                  i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
1068 1145                  i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
     1146 +                i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &=
     1147 +                    ~DDI_DMA_FLAGERR;
1069 1148          }
1070 1149  }
1071 1150  
1072 1151  static void
1073 1152  i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
1074 1153  {
1075 1154          mutex_enter(&rxd->rxd_free_lock);
1076 1155          ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
1077 1156          ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL);
1078 1157          rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb;
↓ open down ↓ 16 lines elided ↑ open up ↑
1095 1174          VERIFY(rcb != NULL);
1096 1175          rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
1097 1176          mutex_exit(&rxd->rxd_free_lock);
1098 1177  
1099 1178          return (rcb);
1100 1179  }
1101 1180  
1102 1181  /*
1103 1182   * This is the callback that we get from the OS when freemsg(9F) has been called
1104 1183   * on a loaned descriptor. In addition, if we take the last reference count
1105      - * here, then we have to tear down all of the rx data.
     1184 + * here, then we have to tear down all of the RX data.
1106 1185   */
1107 1186  void
1108 1187  i40e_rx_recycle(caddr_t arg)
1109 1188  {
1110 1189          uint32_t ref;
1111 1190          i40e_rx_control_block_t *rcb;
1112 1191          i40e_rx_data_t *rxd;
1113 1192          i40e_t *i40e;
1114 1193  
1115 1194          /* LINTED: E_BAD_PTR_CAST_ALIGN */
↓ open down ↓ 645 lines elided ↑ open up ↑
1761 1840          meoi->meoi_l4hlen = l4len;
1762 1841          meoi->meoi_flags |= MEOI_L4INFO_SET;
1763 1842          return (0);
1764 1843  }
1765 1844  
1766 1845  /*
1767 1846   * Attempt to put togther the information we'll need to feed into a descriptor
1768 1847   * to properly program the hardware for checksum offload as well as the
1769 1848   * generally required flags.
1770 1849   *
1771      - * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to or
1772      - * into the descriptor based on the checksum flags for this mblk_t and the
     1850 + * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to
     1851 + * 'or' into the descriptor based on the checksum flags for this mblk_t and the
1773 1852   * actual information we care about.
     1853 + *
     1854 + * If the mblk requires LSO then we'll also gather the information that will be
     1855 + * used to construct the Transmit Context Descriptor.
1774 1856   */
1775 1857  static int
1776 1858  i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
1777      -    i40e_tx_context_t *tctx)
     1859 +    mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx)
1778 1860  {
1779      -        int ret;
1780      -        uint32_t flags, start;
1781      -        mac_ether_offload_info_t meo;
     1861 +        uint32_t chkflags, start, mss, lsoflags;
1782 1862          i40e_txq_stat_t *txs = &itrq->itrq_txstat;
1783 1863  
1784 1864          bzero(tctx, sizeof (i40e_tx_context_t));
1785 1865  
1786 1866          if (i40e->i40e_tx_hcksum_enable != B_TRUE)
1787 1867                  return (0);
1788 1868  
1789      -        mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags);
1790      -        if (flags == 0)
     1869 +        mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags);
     1870 +        mac_lso_get(mp, &mss, &lsoflags);
     1871 +
     1872 +        if (chkflags == 0 && lsoflags == 0)
1791 1873                  return (0);
1792 1874  
1793      -        if ((ret = mac_ether_offload_info(mp, &meo)) != 0) {
1794      -                txs->itxs_hck_meoifail.value.ui64++;
1795      -                return (ret);
1796      -        }
1797      -
1798 1875          /*
1799 1876           * Have we been asked to checksum an IPv4 header. If so, verify that we
1800 1877           * have sufficient information and then set the proper fields in the
1801 1878           * command structure.
1802 1879           */
1803      -        if (flags & HCK_IPV4_HDRCKSUM) {
1804      -                if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
     1880 +        if (chkflags & HCK_IPV4_HDRCKSUM) {
     1881 +                if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1805 1882                          txs->itxs_hck_nol2info.value.ui64++;
1806 1883                          return (-1);
1807 1884                  }
1808      -                if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
     1885 +                if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1809 1886                          txs->itxs_hck_nol3info.value.ui64++;
1810 1887                          return (-1);
1811 1888                  }
1812      -                if (meo.meoi_l3proto != ETHERTYPE_IP) {
     1889 +                if (meo->meoi_l3proto != ETHERTYPE_IP) {
1813 1890                          txs->itxs_hck_badl3.value.ui64++;
1814 1891                          return (-1);
1815 1892                  }
1816      -                tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
1817      -                tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
     1893 +                tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
     1894 +                tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1818 1895                      I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1819      -                tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
     1896 +                tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1820 1897                      I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1821 1898          }
1822 1899  
1823 1900          /*
1824 1901           * We've been asked to provide an L4 header, first, set up the IP
1825 1902           * information in the descriptor if we haven't already before moving
1826 1903           * onto seeing if we have enough information for the L4 checksum
1827 1904           * offload.
1828 1905           */
1829      -        if (flags & HCK_PARTIALCKSUM) {
1830      -                if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) {
     1906 +        if (chkflags & HCK_PARTIALCKSUM) {
     1907 +                if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) {
1831 1908                          txs->itxs_hck_nol4info.value.ui64++;
1832 1909                          return (-1);
1833 1910                  }
1834 1911  
1835      -                if (!(flags & HCK_IPV4_HDRCKSUM)) {
1836      -                        if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
     1912 +                if (!(chkflags & HCK_IPV4_HDRCKSUM)) {
     1913 +                        if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) {
1837 1914                                  txs->itxs_hck_nol2info.value.ui64++;
1838 1915                                  return (-1);
1839 1916                          }
1840      -                        if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
     1917 +                        if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) {
1841 1918                                  txs->itxs_hck_nol3info.value.ui64++;
1842 1919                                  return (-1);
1843 1920                          }
1844 1921  
1845      -                        if (meo.meoi_l3proto == ETHERTYPE_IP) {
1846      -                                tctx->itc_cmdflags |=
     1922 +                        if (meo->meoi_l3proto == ETHERTYPE_IP) {
     1923 +                                tctx->itc_data_cmdflags |=
1847 1924                                      I40E_TX_DESC_CMD_IIPT_IPV4;
1848      -                        } else if (meo.meoi_l3proto == ETHERTYPE_IPV6) {
1849      -                                tctx->itc_cmdflags |=
     1925 +                        } else if (meo->meoi_l3proto == ETHERTYPE_IPV6) {
     1926 +                                tctx->itc_data_cmdflags |=
1850 1927                                      I40E_TX_DESC_CMD_IIPT_IPV6;
1851 1928                          } else {
1852 1929                                  txs->itxs_hck_badl3.value.ui64++;
1853 1930                                  return (-1);
1854 1931                          }
1855      -                        tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
     1932 +                        tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) <<
1856 1933                              I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
1857      -                        tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
     1934 +                        tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) <<
1858 1935                              I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
1859 1936                  }
1860 1937  
1861      -                switch (meo.meoi_l4proto) {
     1938 +                switch (meo->meoi_l4proto) {
1862 1939                  case IPPROTO_TCP:
1863      -                        tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
     1940 +                        tctx->itc_data_cmdflags |=
     1941 +                            I40E_TX_DESC_CMD_L4T_EOFT_TCP;
1864 1942                          break;
1865 1943                  case IPPROTO_UDP:
1866      -                        tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
     1944 +                        tctx->itc_data_cmdflags |=
     1945 +                            I40E_TX_DESC_CMD_L4T_EOFT_UDP;
1867 1946                          break;
1868 1947                  case IPPROTO_SCTP:
1869      -                        tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
     1948 +                        tctx->itc_data_cmdflags |=
     1949 +                            I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
1870 1950                          break;
1871 1951                  default:
1872 1952                          txs->itxs_hck_badl4.value.ui64++;
1873 1953                          return (-1);
1874 1954                  }
1875 1955  
1876      -                tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) <<
     1956 +                tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) <<
1877 1957                      I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
1878 1958          }
1879 1959  
     1960 +        if (lsoflags & HW_LSO) {
     1961 +                /*
     1962 +                 * LSO requires that checksum offloads are enabled.  If for
      1963 +                 * some reason they're not, we bail out with an error.
     1964 +                 */
     1965 +                if ((chkflags & HCK_IPV4_HDRCKSUM) == 0 ||
     1966 +                    (chkflags & HCK_PARTIALCKSUM) == 0) {
     1967 +                        txs->itxs_lso_nohck.value.ui64++;
     1968 +                        return (-1);
     1969 +                }
     1970 +
     1971 +                tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO;
     1972 +                tctx->itc_ctx_mss = mss;
     1973 +                tctx->itc_ctx_tsolen = msgsize(mp) -
     1974 +                    (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen);
     1975 +        }
     1976 +
1880 1977          return (0);
1881 1978  }
1882 1979  
1883 1980  static void
1884 1981  i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
1885 1982  {
1886 1983          ASSERT(tcb != NULL);
1887 1984  
1888 1985          mutex_enter(&itrq->itrq_tcb_lock);
1889 1986          ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
↓ open down ↓ 28 lines elided ↑ open up ↑
1918 2015   * or other activity that indicates that we need to take a look.
1919 2016   */
1920 2017  static void
1921 2018  i40e_tcb_reset(i40e_tx_control_block_t *tcb)
1922 2019  {
1923 2020          switch (tcb->tcb_type) {
1924 2021          case I40E_TX_COPY:
1925 2022                  tcb->tcb_dma.dmab_len = 0;
1926 2023                  break;
1927 2024          case I40E_TX_DMA:
1928      -                (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
     2025 +                if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0)
     2026 +                        (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle);
     2027 +                else if (tcb->tcb_bind_ncookies > 0)
     2028 +                        (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
     2029 +                if (tcb->tcb_bind_info != NULL) {
     2030 +                        kmem_free(tcb->tcb_bind_info,
     2031 +                            tcb->tcb_bind_ncookies *
     2032 +                            sizeof (struct i40e_dma_bind_info));
     2033 +                }
     2034 +                tcb->tcb_bind_info = NULL;
     2035 +                tcb->tcb_bind_ncookies = 0;
     2036 +                tcb->tcb_used_lso = B_FALSE;
1929 2037                  break;
     2038 +        case I40E_TX_DESC:
     2039 +                break;
1930 2040          case I40E_TX_NONE:
1931 2041                  /* Cast to pacify lint */
1932 2042                  panic("trying to free tcb %p with bad type none", (void *)tcb);
1933 2043          default:
1934 2044                  panic("unknown i40e tcb type: %d", tcb->tcb_type);
1935 2045          }
1936 2046  
1937 2047          tcb->tcb_type = I40E_TX_NONE;
1938      -        freemsg(tcb->tcb_mp);
1939      -        tcb->tcb_mp = NULL;
     2048 +        if (tcb->tcb_mp != NULL) {
     2049 +                freemsg(tcb->tcb_mp);
     2050 +                tcb->tcb_mp = NULL;
     2051 +        }
1940 2052          tcb->tcb_next = NULL;
1941 2053  }
1942 2054  
1943 2055  /*
1944 2056   * This is called as part of shutting down to clean up all outstanding
1945 2057   * descriptors. Similar to recycle, except we don't re-arm anything and instead
1946 2058   * just return control blocks to the free list.
1947 2059   */
1948 2060  void
1949 2061  i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
↓ open down ↓ 12 lines elided ↑ open up ↑
1962 2074              I40E_QTX_ENA(itrq->itrq_index));
1963 2075          VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
1964 2076              I40E_QTX_ENA_QENA_STAT_MASK));
1965 2077  #endif
1966 2078  
1967 2079          index = itrq->itrq_desc_head;
1968 2080          while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
1969 2081                  i40e_tx_control_block_t *tcb;
1970 2082  
1971 2083                  tcb = itrq->itrq_tcb_work_list[index];
1972      -                VERIFY(tcb != NULL);
1973      -                itrq->itrq_tcb_work_list[index] = NULL;
1974      -                i40e_tcb_reset(tcb);
1975      -                i40e_tcb_free(itrq, tcb);
     2084 +                if (tcb != NULL) {
     2085 +                        itrq->itrq_tcb_work_list[index] = NULL;
     2086 +                        i40e_tcb_reset(tcb);
     2087 +                        i40e_tcb_free(itrq, tcb);
     2088 +                }
1976 2089  
1977 2090                  bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
1978 2091                  index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
1979 2092                  itrq->itrq_desc_free++;
1980 2093          }
1981 2094  
1982 2095          ASSERT(index == itrq->itrq_desc_tail);
1983 2096          itrq->itrq_desc_head = index;
1984 2097  }
1985 2098  
↓ open down ↓ 2 lines elided ↑ open up ↑
1988 2101   * descriptors available for us to go and clean up and return to the hardware.
1989 2102   * We may also be blocked, and if so, we should make sure that we let it know
1990 2103   * we're good to go.
1991 2104   */
1992 2105  void
1993 2106  i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
1994 2107  {
1995 2108          uint32_t wbhead, toclean, count;
1996 2109          i40e_tx_control_block_t *tcbhead;
1997 2110          i40e_t *i40e = itrq->itrq_i40e;
     2111 +        uint_t desc_per_tcb, i;
1998 2112  
1999 2113          mutex_enter(&itrq->itrq_tx_lock);
2000 2114  
2001 2115          ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2002 2116          if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
2003 2117                  if (itrq->itrq_tx_blocked == B_TRUE) {
2004 2118                          itrq->itrq_tx_blocked = B_FALSE;
2005 2119                          mac_tx_ring_update(i40e->i40e_mac_hdl,
2006 2120                              itrq->itrq_mactxring);
2007 2121                          itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
↓ open down ↓ 27 lines elided ↑ open up ↑
2035 2149          while (toclean != wbhead) {
2036 2150                  i40e_tx_control_block_t *tcb;
2037 2151  
2038 2152                  tcb = itrq->itrq_tcb_work_list[toclean];
2039 2153                  itrq->itrq_tcb_work_list[toclean] = NULL;
2040 2154                  ASSERT(tcb != NULL);
2041 2155                  tcb->tcb_next = tcbhead;
2042 2156                  tcbhead = tcb;
2043 2157  
2044 2158                  /*
2045      -                 * We zero this out for sanity purposes.
     2159 +                 * In the DMA bind case, there may not necessarily be a 1:1
     2160 +                 * mapping between tcbs and descriptors.  If the tcb type
     2161 +                 * indicates a DMA binding then check the number of DMA
     2162 +                 * cookies to determine how many entries to clean in the
     2163 +                 * descriptor ring.
2046 2164                   */
2047      -                bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t));
2048      -                toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size);
2049      -                count++;
     2165 +                if (tcb->tcb_type == I40E_TX_DMA)
     2166 +                        desc_per_tcb = tcb->tcb_bind_ncookies;
     2167 +                else
     2168 +                        desc_per_tcb = 1;
     2169 +
     2170 +                for (i = 0; i < desc_per_tcb; i++) {
     2171 +                        /*
     2172 +                         * We zero this out for sanity purposes.
     2173 +                         */
     2174 +                        bzero(&itrq->itrq_desc_ring[toclean],
     2175 +                            sizeof (i40e_tx_desc_t));
     2176 +                        toclean = i40e_next_desc(toclean, 1,
     2177 +                            itrq->itrq_tx_ring_size);
     2178 +                        count++;
     2179 +                }
2050 2180          }
2051 2181  
2052 2182          itrq->itrq_desc_head = wbhead;
2053 2183          itrq->itrq_desc_free += count;
2054 2184          itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
2055 2185          ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
2056 2186  
2057 2187          if (itrq->itrq_tx_blocked == B_TRUE &&
2058 2188              itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
2059 2189                  itrq->itrq_tx_blocked = B_FALSE;
↓ open down ↓ 11 lines elided ↑ open up ↑
2071 2201                  i40e_tx_control_block_t *tcb = tcbhead;
2072 2202  
2073 2203                  tcbhead = tcb->tcb_next;
2074 2204                  i40e_tcb_reset(tcb);
2075 2205                  i40e_tcb_free(itrq, tcb);
2076 2206          }
2077 2207  
2078 2208          DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
2079 2209  }
2080 2210  
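The recycling change above means a DMA-bound TCB owns one descriptor per DMA cookie, so cleanup advances the ring index by the cookie count rather than by one. A minimal standalone sketch of that accounting; the work-list contents, cookie counts, and ring geometry below are made up for illustration:

#include <stdio.h>

int
main(void)
{
        /*
         * Hypothetical work list: a copy TCB always consumes one
         * descriptor, a DMA TCB consumes one descriptor per cookie.
         * A value of 0 below marks a copy TCB; anything else is a
         * DMA TCB's cookie count.
         */
        unsigned int cookies[] = { 0, 3, 0, 2 };
        unsigned int ring_size = 1024, index = 1020, count = 0;

        for (unsigned int t = 0; t < 4; t++) {
                unsigned int per_tcb = (cookies[t] == 0) ? 1 : cookies[t];

                for (unsigned int i = 0; i < per_tcb; i++) {
                        index = (index + 1) % ring_size; /* i40e_next_desc() */
                        count++;
                }
        }

        printf("cleaned %u descriptors, index now %u\n", count, index);
        return (0);
}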
     2211 +static void
     2212 +i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp,
     2213 +    const size_t off, const size_t len)
     2214 +{
     2215 +        const void *soff = mp->b_rptr + off;
     2216 +        void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
     2217 +
     2218 +        ASSERT3U(len, >, 0);
     2219 +        ASSERT3P(soff, >=, mp->b_rptr);
     2220 +        ASSERT3P(soff, <=, mp->b_wptr);
     2221 +        ASSERT3U(len, <=, MBLKL(mp));
     2222 +        ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr);
     2223 +        ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len);
     2224 +        bcopy(soff, doff, len);
     2225 +        tcb->tcb_type = I40E_TX_COPY;
     2226 +        tcb->tcb_dma.dmab_len += len;
     2227 +        I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
     2228 +}
     2229 +
     2230 +static i40e_tx_control_block_t *
     2231 +i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp,
     2232 +    size_t off, boolean_t use_lso)
     2233 +{
     2234 +        ddi_dma_handle_t dma_handle;
     2235 +        ddi_dma_cookie_t dma_cookie;
     2236 +        uint_t i = 0, ncookies = 0, dmaflags;
     2237 +        i40e_tx_control_block_t *tcb;
     2238 +        i40e_txq_stat_t *txs = &itrq->itrq_txstat;
     2239 +
     2240 +        if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
     2241 +                txs->itxs_err_notcb.value.ui64++;
     2242 +                return (NULL);
     2243 +        }
     2244 +        tcb->tcb_type = I40E_TX_DMA;
     2245 +
     2246 +        if (use_lso == B_TRUE)
     2247 +                dma_handle = tcb->tcb_lso_dma_handle;
     2248 +        else
     2249 +                dma_handle = tcb->tcb_dma_handle;
     2250 +
     2251 +        dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING;
     2252 +        if (ddi_dma_addr_bind_handle(dma_handle, NULL,
     2253 +            (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags,
     2254 +            DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) {
     2255 +                txs->itxs_bind_fails.value.ui64++;
     2256 +                goto bffail;
     2257 +        }
     2258 +
     2259 +        tcb->tcb_bind_ncookies = ncookies;
     2260 +        tcb->tcb_used_lso = use_lso;
     2261 +
     2262 +        tcb->tcb_bind_info =
     2263 +            kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info),
     2264 +            KM_NOSLEEP);
     2265 +        if (tcb->tcb_bind_info == NULL)
     2266 +                goto bffail;
     2267 +
     2268 +        while (i < ncookies) {
     2269 +                if (i > 0)
     2270 +                        ddi_dma_nextcookie(dma_handle, &dma_cookie);
     2271 +
     2272 +                tcb->tcb_bind_info[i].dbi_paddr =
     2273 +                    (caddr_t)dma_cookie.dmac_laddress;
     2274 +                tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size;
     2275 +        }
     2276 +
     2277 +        return (tcb);
     2278 +
     2279 +bffail:
     2280 +        i40e_tcb_reset(tcb);
     2281 +        i40e_tcb_free(itrq, tcb);
     2282 +        return (NULL);
     2283 +}
     2284 +
     2285 +static void
     2286 +i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx,
     2287 +    caddr_t buff, size_t len, boolean_t last_desc)
     2288 +{
     2289 +        i40e_tx_desc_t *txdesc;
     2290 +        int cmd;
     2291 +
     2292 +        ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
     2293 +        itrq->itrq_desc_free--;
     2294 +        txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
     2295 +        itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
     2296 +            itrq->itrq_tx_ring_size);
     2297 +
     2298 +        cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags;
     2299 +
     2300 +        /*
     2301 +         * The last data descriptor needs the EOP bit set, so that the HW knows
     2302 +         * that we're ready to send.  Additionally, we set the RS (Report
     2303 +         * Status) bit, so that we are notified when the transmit engine has
     2304 +         * completed DMA'ing all of the data descriptors and data buffers
     2305 +         * associated with this frame.
     2306 +         */
     2307 +        if (last_desc == B_TRUE) {
     2308 +                cmd |= I40E_TX_DESC_CMD_EOP;
     2309 +                cmd |= I40E_TX_DESC_CMD_RS;
     2310 +        }
     2311 +
     2312 +        /*
     2313 +         * Per the X710 manual, section 8.4.2.1.1, the buffer size
     2314 +         * must be a value from 1 to 16K minus 1, inclusive.
     2315 +         */
     2316 +        ASSERT3U(len, >=, 1);
     2317 +        ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ - 1);
     2318 +
     2319 +        txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff);
     2320 +        txdesc->cmd_type_offset_bsz =
     2321 +            LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA |
     2322 +            ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
     2323 +            ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
     2324 +            ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
     2325 +}
     2326 +
2081 2327  /*
     2328 + * Place 'tcb' on the tail of the list represented by 'head'/'tail'.
     2329 + */
     2330 +static inline void
     2331 +tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail,
     2332 +    i40e_tx_control_block_t *tcb)
     2333 +{
     2334 +        if (*head == NULL) {
     2335 +                *head = tcb;
     2336 +                *tail = *head;
     2337 +        } else {
     2338 +                ASSERT3P(*tail, !=, NULL);
     2339 +                ASSERT3P((*tail)->tcb_next, ==, NULL);
     2340 +                (*tail)->tcb_next = tcb;
     2341 +                *tail = tcb;
     2342 +        }
     2343 +}
     2344 +
     2345 +/*
     2346 + * This function takes a single packet, possibly consisting of
     2347 + * multiple mblks, and creates a TCB chain to send to the controller.
     2348 + * This TCB chain may span up to a maximum of 8 descriptors. A copy
     2349 + * TCB consumes one descriptor; whereas a DMA TCB may consume 1 or
     2350 + * more, depending on several factors. For each fragment (individual
     2351 + * mblk making up the packet), we determine if its size dictates a
     2352 + * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a
     2353 + * count of descriptors used; when that count reaches the max we force
     2354 + * all remaining fragments into a single TCB buffer. We have a
     2355 + * guarantee that the TCB buffer is always larger than the MTU -- so
     2356 + * there is always enough room. Consecutive fragments below the DMA
     2357 + * threshold are copied into a single TCB. In the event of an error
     2358 + * this function returns NULL but leaves 'mp' alone.
     2359 + */
     2360 +static i40e_tx_control_block_t *
     2361 +i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc)
     2362 +{
     2363 +        const mblk_t *nmp = mp;
     2364 +        uint_t needed_desc = 0;
     2365 +        boolean_t force_copy = B_FALSE;
     2366 +        i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
     2367 +        i40e_t *i40e = itrq->itrq_i40e;
     2368 +        i40e_txq_stat_t *txs = &itrq->itrq_txstat;
     2369 +
     2370 +        /* TCB buffer is always larger than MTU. */
     2371 +        ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size);
     2372 +
     2373 +        while (nmp != NULL) {
     2374 +                const size_t nmp_len = MBLKL(nmp);
     2375 +
     2376 +                /* Ignore zero-length mblks. */
     2377 +                if (nmp_len == 0) {
     2378 +                        nmp = nmp->b_cont;
     2379 +                        continue;
     2380 +                }
     2381 +
     2382 +                if (nmp_len < i40e->i40e_tx_dma_min || force_copy) {
     2383 +                        /* Compress consecutive copies into one TCB. */
     2384 +                        if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) {
     2385 +                                i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
     2386 +                                nmp = nmp->b_cont;
     2387 +                                continue;
     2388 +                        }
     2389 +
     2390 +                        if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
     2391 +                                txs->itxs_err_notcb.value.ui64++;
     2392 +                                goto fail;
     2393 +                        }
     2394 +
     2395 +                        /*
     2396 +                         * TCB DMA buffer is guaranteed to be one
     2397 +                         * cookie by i40e_alloc_dma_buffer().
     2398 +                         */
     2399 +                        i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len);
     2400 +                        needed_desc++;
     2401 +                        tcb_list_append(&tcbhead, &tcbtail, tcb);
     2402 +                } else {
     2403 +                        uint_t total_desc;
     2404 +
     2405 +                        tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE);
     2406 +                        if (tcb == NULL) {
     2407 +                                i40e_error(i40e, "dma bind failed!");
     2408 +                                goto fail;
     2409 +                        }
     2410 +
     2411 +                        /*
     2412 +                         * If the new total exceeds the max or we've
     2413 +                         * reached the limit and there's data left,
     2414 +                         * then give up binding and copy the rest into
     2415 +                         * the pre-allocated TCB buffer.
     2416 +                         */
     2417 +                        total_desc = needed_desc + tcb->tcb_bind_ncookies;
     2418 +                        if ((total_desc > I40E_TX_MAX_COOKIE) ||
     2419 +                            (total_desc == I40E_TX_MAX_COOKIE &&
     2420 +                            nmp->b_cont != NULL)) {
     2421 +                                i40e_tcb_reset(tcb);
     2422 +                                i40e_tcb_free(itrq, tcb);
     2423 +
     2424 +                                if (tcbtail != NULL &&
     2425 +                                    tcbtail->tcb_type == I40E_TX_COPY) {
     2426 +                                        tcb = tcbtail;
     2427 +                                } else {
     2428 +                                        tcb = NULL;
     2429 +                                }
     2430 +
     2431 +                                force_copy = B_TRUE;
     2432 +                                txs->itxs_force_copy.value.ui64++;
     2433 +                                continue;
     2434 +                        }
     2435 +
     2436 +                        needed_desc += tcb->tcb_bind_ncookies;
     2437 +                        tcb_list_append(&tcbhead, &tcbtail, tcb);
     2438 +                }
     2439 +
     2440 +                nmp = nmp->b_cont;
     2441 +        }
     2442 +
     2443 +        ASSERT3P(nmp, ==, NULL);
     2444 +        ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE);
     2445 +        ASSERT3P(tcbhead, !=, NULL);
     2446 +        *ndesc += needed_desc;
     2447 +        return (tcbhead);
     2448 +
     2449 +fail:
     2450 +        tcb = tcbhead;
     2451 +        while (tcb != NULL) {
     2452 +                i40e_tx_control_block_t *next = tcb->tcb_next;
     2453 +
     2454 +                ASSERT(tcb->tcb_type == I40E_TX_DMA ||
     2455 +                    tcb->tcb_type == I40E_TX_COPY);
     2456 +
     2457 +                tcb->tcb_mp = NULL;
     2458 +                i40e_tcb_reset(tcb);
     2459 +                i40e_tcb_free(itrq, tcb);
     2460 +                tcb = next;
     2461 +        }
     2462 +
     2463 +        return (NULL);
     2464 +}
     2465 +
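To illustrate the accounting described in the comment above i40e_non_lso_chain(), here is a minimal standalone simulation of the copy-versus-bind decision. It is a simplification: the fragment lengths, DMA threshold, and the assumption that every bind costs two cookies are invented inputs, and the exact edge cases in the driver (for example the b_cont check at the limit) are ignored:

#include <stdio.h>

#define MAX_COOKIE      8       /* mirrors I40E_TX_MAX_COOKIE */
#define DMA_MIN         256     /* hypothetical copy/bind threshold */

int
main(void)
{
        /* Hypothetical fragment (mblk) lengths for one packet, in bytes. */
        unsigned int frag[] = { 100, 1400, 60, 1200 };
        unsigned int nfrags = 4, needed = 0;
        int prev_was_copy = 0, force_copy = 0;

        for (unsigned int i = 0; i < nfrags; i++) {
                if (frag[i] < DMA_MIN || force_copy) {
                        /* Consecutive copies are squeezed into one TCB. */
                        if (!prev_was_copy)
                                needed++;
                        prev_was_copy = 1;
                        continue;
                }

                /* Assume each bound fragment would cost two cookies. */
                if (needed + 2 > MAX_COOKIE) {
                        /* Binding would bust the limit: copy from here on. */
                        force_copy = 1;
                        if (!prev_was_copy)
                                needed++;
                        prev_was_copy = 1;
                        continue;
                }
                needed += 2;
                prev_was_copy = 0;
        }

        printf("descriptors needed: %u (limit %u)\n", needed, MAX_COOKIE);
        return (0);
}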
     2466 +/*
     2467 + * Section 8.4.1 of the 700-series programming guide states that a
     2468 + * segment may span up to 8 data descriptors; including both header
     2469 + * and payload data. However, empirical evidence shows that the
     2470 + * controller freezes the Tx queue when presented with a segment of 8
     2471 + * descriptors. Or, at least, when the first segment contains 8
     2472 + * descriptors. One explanation is that the controller counts the
     2473 + * context descriptor against the first segment, even though the
     2474 + * programming guide makes no mention of such a constraint. In any
     2475 + * case, we limit TSO segments to 7 descriptors to prevent Tx queue
     2476 + * freezes. We still allow non-TSO segments to utilize all 8
     2477 + * descriptors as they have not demonstrated the faulty behavior.
     2478 + */
     2479 +uint_t i40e_lso_num_descs = 7;
     2480 +
     2481 +#define I40E_TCB_LEFT(tcb)                              \
     2482 +        ((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len)
     2483 +
     2484 +/*
     2485 + * This function is similar in spirit to i40e_non_lso_chain(), but
     2486 + * much more complicated in reality. Like the previous function, it
     2487 + * takes a packet (an LSO packet) as input and returns a chain of
     2488 + * TCBs. The complication comes with the fact that we are no longer
     2489 + * trying to fit the entire packet into 8 descriptors, but rather we
     2490 + * must fit each MSS-size segment of the LSO packet into 8 descriptors.
     2491 + * Except it's really 7 descriptors, see i40e_lso_num_descs.
     2492 + *
     2493 + * Your first inclination might be to verify that a given segment
     2494 + * spans no more than 7 mblks; but it's actually much more subtle than
     2495 + * that. First, let's describe what the hardware expects, and then we
     2496 + * can expound on the software side of things.
     2497 + *
     2498 + * For an LSO packet the hardware expects the following:
     2499 + *
     2500 + *      o Each MSS-sized segment must span no more than 7 descriptors.
     2501 + *
     2502 + *      o The header size does not count towards the segment size.
     2503 + *
     2504 + *      o If header and payload share the first descriptor, then the
     2505 + *        controller will count the descriptor twice.
     2506 + *
     2507 + * The most important thing to keep in mind is that the hardware does
     2508 + * not view the segments in terms of mblks, like we do. The hardware
     2509 + * only sees descriptors. It will iterate each descriptor in turn,
     2510 + * keeping a tally of bytes seen and descriptors visited. If the byte
     2511 + * count hasn't reached MSS by the time the descriptor count reaches
     2512 + * 7, then the controller freezes the queue and we are stuck.
     2513 + * Furthermore, the hardware picks up its tally where it left off. So
     2514 + * if it reached MSS in the middle of a descriptor, it will start
     2515 + * tallying the next segment in the middle of that descriptor. The
     2516 + * hardware's view is entirely removed from the mblk chain or even the
     2517 + * descriptor layout. Consider these facts:
     2518 + *
     2519 + *      o The MSS will vary depending on MTU and other factors.
     2520 + *
     2521 + *      o The dblk allocation will sit at various offsets within a
     2522 + *        memory page.
     2523 + *
     2524 + *      o The page size itself could vary in the future (i.e. not
     2525 + *        always 4K).
     2526 + *
     2527 + *      o Just because a dblk is virtually contiguous doesn't mean
     2528 + *        it's physically contiguous. The number of cookies
     2529 + *        (descriptors) required by a DMA bind of a single dblk is at
     2530 + *        the mercy of the page size and physical layout.
     2531 + *
     2532 + *      o The descriptors will most often NOT start/end on a MSS
     2533 + *        boundary. Thus the hardware will often start counting the
     2534 + *        MSS mid descriptor and finish mid descriptor.
     2535 + *
     2536 + * The upshot of all this is that the driver must learn to think like
     2537 + * the controller; and verify that none of the constraints are broken.
     2538 + * It does this by tallying up the segment just like the hardware
     2539 + * would. This is handled by the two variables 'segsz' and 'segdesc'.
     2540 + * After each attempt to bind a dblk, we check the constraints. If
     2541 + * violated, we undo the DMA and force a copy until MSS is met. We
     2542 + * have a guarantee that the TCB buffer is larger than MTU; thus
     2543 + * ensuring we can always meet the MSS with a single copy buffer. We
     2544 + * also copy consecutive non-DMA fragments into the same TCB buffer.
     2545 + */
     2546 +static i40e_tx_control_block_t *
     2547 +i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp,
     2548 +    const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx,
     2549 +    uint_t *ndesc)
     2550 +{
     2551 +        size_t mp_len = MBLKL(mp);
     2552 +        /*
     2553 +         * The cpoff (copy offset) variable tracks the offset inside
     2554 +         * the current mp. There are cases where the entire mp is not
     2555 +         * fully copied in one go: such as the header copy followed by
     2556 +         * a non-DMA mblk, or a TCB buffer that only has enough space
     2557 +         * to copy part of the current mp.
     2558 +         */
     2559 +        size_t cpoff = 0;
     2560 +        /*
     2561 +         * The segsz and segdesc variables track the controller's view
     2562 +         * of the segment. The needed_desc variable tracks the total
     2563 +         * number of data descriptors used by the driver.
     2564 +         */
     2565 +        size_t segsz = 0;
     2566 +        uint_t segdesc = 0;
     2567 +        uint_t needed_desc = 0;
     2568 +        const size_t hdrlen =
     2569 +            meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen;
     2570 +        const size_t mss = tctx->itc_ctx_mss;
     2571 +        boolean_t force_copy = B_FALSE;
     2572 +        i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
     2573 +        i40e_t *i40e = itrq->itrq_i40e;
     2574 +        i40e_txq_stat_t *txs = &itrq->itrq_txstat;
     2575 +
     2576 +        /*
     2577 +         * We always copy the header in order to avoid more
     2578 +         * complicated code dealing with various edge cases.
     2579 +         */
     2580 +        ASSERT3U(MBLKL(mp), >=, hdrlen);
     2581 +        if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
     2582 +                txs->itxs_err_notcb.value.ui64++;
     2583 +                goto fail;
     2584 +        }
     2585 +        needed_desc++;
     2586 +
     2587 +        tcb_list_append(&tcbhead, &tcbtail, tcb);
     2588 +        i40e_tx_copy_fragment(tcb, mp, 0, hdrlen);
     2589 +        cpoff += hdrlen;
     2590 +
     2591 +        /*
     2592 +         * A single descriptor containing both header and data is
     2593 +         * counted twice by the controller.
     2594 +         */
     2595 +        if ((mp_len > hdrlen && mp_len < i40e->i40e_tx_dma_min) ||
     2596 +            (mp->b_cont != NULL &&
     2597 +            MBLKL(mp->b_cont) < i40e->i40e_tx_dma_min)) {
     2598 +                segdesc = 2;
     2599 +        } else {
     2600 +                segdesc = 1;
     2601 +        }
     2602 +
     2603 +        /* If this fragment was pure header, then move to the next one. */
     2604 +        if (cpoff == mp_len) {
     2605 +                mp = mp->b_cont;
     2606 +                cpoff = 0;
     2607 +        }
     2608 +
     2609 +        while (mp != NULL) {
     2610 +                mp_len = MBLKL(mp);
     2611 +force_copy:
     2612 +                /* Ignore zero-length mblks. */
     2613 +                if (mp_len == 0) {
     2614 +                        mp = mp->b_cont;
     2615 +                        cpoff = 0;
     2616 +                        continue;
     2617 +                }
     2618 +
     2619 +                /*
     2620 +                 * We copy into the preallocated TCB buffer when the
     2621 +                 * current fragment is less than the DMA threshold OR
     2622 +                 * when the DMA bind can't meet the controller's
     2623 +                 * segment descriptor limit.
     2624 +                 */
     2625 +                if (mp_len < i40e->i40e_tx_dma_min || force_copy) {
     2626 +                        size_t tocopy;
     2627 +
     2628 +                        /*
     2629 +                         * Our objective here is to compress
     2630 +                         * consecutive copies into one TCB (until it
     2631 +                         * is full). If there is no current TCB, or if
     2632 +                         * it is a DMA TCB, then allocate a new one.
     2633 +                         */
     2634 +                        if (tcb == NULL ||
     2635 +                            (tcb != NULL && tcb->tcb_type != I40E_TX_COPY)) {
     2636 +                                if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
     2637 +                                        txs->itxs_err_notcb.value.ui64++;
     2638 +                                        goto fail;
     2639 +                                }
     2640 +
     2641 +                                /*
     2642 +                                 * The TCB DMA buffer is guaranteed to
     2643 +                                 * be one cookie by i40e_alloc_dma_buffer().
     2644 +                                 */
     2645 +                                needed_desc++;
     2646 +                                segdesc++;
     2647 +                                ASSERT3U(segdesc, <=, i40e_lso_num_descs);
     2648 +                                tcb_list_append(&tcbhead, &tcbtail, tcb);
     2649 +                        }
     2650 +
     2651 +                        tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff);
     2652 +                        i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy);
     2653 +                        cpoff += tocopy;
     2654 +                        segsz += tocopy;
     2655 +
     2656 +                        /* We have consumed the current mp. */
     2657 +                        if (cpoff == mp_len) {
     2658 +                                mp = mp->b_cont;
     2659 +                                cpoff = 0;
     2660 +                        }
     2661 +
     2662 +                        /* We have consumed the current TCB buffer. */
     2663 +                        if (I40E_TCB_LEFT(tcb) == 0) {
     2664 +                                tcb = NULL;
     2665 +                        }
     2666 +
     2667 +                        /*
     2668 +                         * We have met MSS with this copy; restart the
     2669 +                         * counters.
     2670 +                         */
     2671 +                        if (segsz >= mss) {
     2672 +                                segsz = segsz % mss;
     2673 +                                segdesc = segsz == 0 ? 0 : 1;
     2674 +                                force_copy = B_FALSE;
     2675 +                        }
     2676 +
     2677 +                        /*
     2678 +                         * We are at the controller's descriptor
     2679 +                         * limit; we must copy into the current TCB
     2680 +                         * until MSS is reached. The TCB buffer is
     2681 +                         * always bigger than the MTU so we know it is
     2682 +                         * big enough to meet the MSS.
     2683 +                         */
     2684 +                        if (segdesc == i40e_lso_num_descs) {
     2685 +                                force_copy = B_TRUE;
     2686 +                        }
     2687 +                } else {
     2688 +                        uint_t tsegdesc = segdesc;
     2689 +                        size_t tsegsz = segsz;
     2690 +
     2691 +                        ASSERT(force_copy == B_FALSE);
     2692 +                        ASSERT3U(tsegdesc, <, i40e_lso_num_descs);
     2693 +
     2694 +                        tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE);
     2695 +                        if (tcb == NULL) {
     2696 +                                i40e_error(i40e, "dma bind failed!");
     2697 +                                goto fail;
     2698 +                        }
     2699 +
     2700 +                        for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) {
     2701 +                                struct i40e_dma_bind_info dbi =
     2702 +                                    tcb->tcb_bind_info[i];
     2703 +
     2704 +                                tsegsz += dbi.dbi_len;
     2705 +                                tsegdesc++;
     2706 +                                ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
     2707 +
     2708 +                                /*
     2709 +                                 * We've met the MSS with this portion
     2710 +                                 * of the DMA.
     2711 +                                 */
     2712 +                                if (tsegsz >= mss) {
     2713 +                                        tsegdesc = 1;
     2714 +                                        tsegsz = tsegsz % mss;
     2715 +                                }
     2716 +
     2717 +                                /*
     2718 +                                 * We've reached max descriptors but
     2719 +                                 * have not met the MSS. Undo the bind
     2720 +                                 * and instead copy.
     2721 +                                 */
     2722 +                                if (tsegdesc == i40e_lso_num_descs) {
     2723 +                                        i40e_tcb_reset(tcb);
     2724 +                                        i40e_tcb_free(itrq, tcb);
     2725 +
     2726 +                                        if (tcbtail != NULL &&
     2727 +                                            I40E_TCB_LEFT(tcbtail) > 0 &&
     2728 +                                            tcbtail->tcb_type == I40E_TX_COPY) {
     2729 +                                                tcb = tcbtail;
     2730 +                                        } else {
     2731 +                                                tcb = NULL;
     2732 +                                        }
     2733 +
     2734 +                                        /*
     2735 +                                         * Remember, we are still on
     2736 +                                         * the same mp.
     2737 +                                         */
     2738 +                                        force_copy = B_TRUE;
     2739 +                                        txs->itxs_tso_force_copy.value.ui64++;
     2740 +                                        goto force_copy;
     2741 +                                }
     2742 +                        }
     2743 +
     2744 +                        ASSERT3U(tsegdesc, <=, i40e_lso_num_descs);
     2745 +                        ASSERT3U(tsegsz, <, mss);
     2746 +
     2747 +                        /*
     2748 +                         * We've made it through the loop without
     2749 +                         * breaking the segment descriptor contract
     2750 +                         * with the controller -- replace the segment
     2751 +                         * tracking values with the temporary ones.
     2752 +                         */
     2753 +                        segdesc = tsegdesc;
     2754 +                        segsz = tsegsz;
     2755 +                        needed_desc += tcb->tcb_bind_ncookies;
     2756 +                        cpoff = 0;
     2757 +                        tcb_list_append(&tcbhead, &tcbtail, tcb);
     2758 +                        mp = mp->b_cont;
     2759 +                }
     2760 +        }
     2761 +
     2762 +        ASSERT3P(mp, ==, NULL);
     2763 +        ASSERT3P(tcbhead, !=, NULL);
     2764 +        *ndesc += needed_desc;
     2765 +        return (tcbhead);
     2766 +
     2767 +fail:
     2768 +        tcb = tcbhead;
     2769 +        while (tcb != NULL) {
     2770 +                i40e_tx_control_block_t *next = tcb->tcb_next;
     2771 +
     2772 +                ASSERT(tcb->tcb_type == I40E_TX_DMA ||
     2773 +                    tcb->tcb_type == I40E_TX_COPY);
     2774 +
     2775 +                tcb->tcb_mp = NULL;
     2776 +                i40e_tcb_reset(tcb);
     2777 +                i40e_tcb_free(itrq, tcb);
     2778 +                tcb = next;
     2779 +        }
     2780 +
     2781 +        return (NULL);
     2782 +}
     2783 +
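The comment above i40e_lso_chain() describes how the controller tallies bytes and descriptors per MSS window. A minimal standalone simulation of that tally over a made-up payload descriptor layout; the lengths, MSS, and the 7-descriptor limit (mirroring i40e_lso_num_descs) are illustrative, the double-counting of a shared header/payload descriptor is side-stepped by giving the header its own descriptor, and the sketch only checks the constraint rather than building TCBs:

#include <stdio.h>

int
main(void)
{
        /*
         * Hypothetical payload descriptor lengths (bytes) for one LSO
         * packet. The header is assumed to sit in its own descriptor,
         * so the tally starts at segdesc = 1 with no payload bytes,
         * and header bytes never count towards the segment size.
         */
        unsigned int desc_len[] = { 700, 700, 300, 1460, 1460 };
        unsigned int ndesc = 5;
        unsigned int mss = 1460;
        unsigned int limit = 7;         /* mirrors i40e_lso_num_descs */
        unsigned int segsz = 0, segdesc = 1;

        for (unsigned int i = 0; i < ndesc; i++) {
                segdesc++;
                segsz += desc_len[i];

                /* Each time MSS is met, the tally restarts mid-descriptor. */
                while (segsz >= mss) {
                        segsz -= mss;
                        segdesc = (segsz == 0) ? 0 : 1;
                }

                if (segdesc >= limit) {
                        printf("descriptor %u: %u descriptors without "
                            "meeting MSS -> the controller would freeze\n",
                            i, segdesc);
                        return (1);
                }
        }

        printf("layout OK: every MSS window completes within %u "
            "descriptors\n", limit);
        return (0);
}

Replacing the payload layout with a run of small descriptors that never reaches the MSS makes the check fire, which is exactly the situation the force-copy path above avoids.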
     2784 +/*
2082 2785   * We've been asked to send a message block on the wire. We'll only have a
2083 2786   * single chain. There will not be any b_next pointers; however, there may be
2084      - * multiple b_cont blocks.
     2787 + * multiple b_cont blocks. The number of b_cont blocks may exceed the
     2788 + * controller's Tx descriptor limit.
2085 2789   *
2086 2790   * We may do one of three things with any given mblk_t chain:
2087 2791   *
2088 2792   *   1) Drop it
2089 2793   *   2) Transmit it
2090 2794   *   3) Return it
2091 2795   *
2092 2796   * If we return it to MAC, then MAC will flow control on our behalf. In other
2093 2797   * words, it won't send us anything until we tell it that it's okay to send us
2094 2798   * something.
2095 2799   */
2096 2800  mblk_t *
2097 2801  i40e_ring_tx(void *arg, mblk_t *mp)
2098 2802  {
2099      -        const mblk_t *nmp;
2100      -        size_t mpsize;
2101      -        i40e_tx_control_block_t *tcb;
2102      -        i40e_tx_desc_t *txdesc;
     2803 +        size_t msglen;
     2804 +        i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL;
     2805 +        i40e_tx_context_desc_t *ctxdesc;
     2806 +        mac_ether_offload_info_t meo;
2103 2807          i40e_tx_context_t tctx;
2104      -        int cmd, type;
     2808 +        int type;
     2809 +        uint_t needed_desc = 0;
     2810 +        boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE;
2105 2811  
2106 2812          i40e_trqpair_t *itrq = arg;
2107 2813          i40e_t *i40e = itrq->itrq_i40e;
2108 2814          i40e_hw_t *hw = &i40e->i40e_hw_space;
2109 2815          i40e_txq_stat_t *txs = &itrq->itrq_txstat;
2110 2816  
2111 2817          ASSERT(mp->b_next == NULL);
2112 2818  
2113 2819          if (!(i40e->i40e_state & I40E_STARTED) ||
2114 2820              (i40e->i40e_state & I40E_OVERTEMP) ||
2115 2821              (i40e->i40e_state & I40E_SUSPENDED) ||
2116 2822              (i40e->i40e_state & I40E_ERROR) ||
2117 2823              (i40e->i40e_link_state != LINK_STATE_UP)) {
2118 2824                  freemsg(mp);
2119 2825                  return (NULL);
2120 2826          }
2121 2827  
     2828 +        if (mac_ether_offload_info(mp, &meo) != 0) {
     2829 +                freemsg(mp);
     2830 +                itrq->itrq_txstat.itxs_hck_meoifail.value.ui64++;
     2831 +                return (NULL);
     2832 +        }
     2833 +
2122 2834          /*
2123 2835           * Figure out the relevant context about this frame that we might need
2124      -         * for enabling checksum, lso, etc. This also fills in information that
     2836 +         * for enabling checksum, LSO, etc. This also fills in information that
2125 2837           * we might set around the packet type, etc.
2126 2838           */
2127      -        if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) {
     2839 +        if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) {
2128 2840                  freemsg(mp);
2129 2841                  itrq->itrq_txstat.itxs_err_context.value.ui64++;
2130 2842                  return (NULL);
2131 2843          }
     2844 +        if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
     2845 +                use_lso = B_TRUE;
     2846 +                do_ctx_desc = B_TRUE;
     2847 +        }
2132 2848  
2133 2849          /*
2134 2850           * For the primordial driver we can punt on doing any recycling right
2135 2851           * now; however, longer term we need to probably do some more pro-active
2136      -         * recycling to cut back on stalls in the tx path.
     2852 +         * recycling to cut back on stalls in the TX path.
2137 2853           */
2138 2854  
2139      -        /*
2140      -         * Do a quick size check to make sure it fits into what we think it
2141      -         * should for this device. Note that longer term this will be false,
2142      -         * particularly when we have the world of TSO.
2143      -         */
2144      -        mpsize = 0;
2145      -        for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
2146      -                mpsize += MBLKL(nmp);
     2855 +        msglen = msgsize(mp);
     2856 +
     2857 +        if (do_ctx_desc) {
     2858 +                /*
     2859 +                 * If we're doing tunneling or LSO, then we'll need a TX
     2860 +                 * context descriptor in addition to one or more TX data
     2861 +                 * descriptors.  Since there's no data DMA block or handle
     2862 +                 * associated with the context descriptor, we create a special
     2863 +                 * control block that behaves effectively like a NOP.
     2864 +                 */
     2865 +                if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) {
     2866 +                        txs->itxs_err_notcb.value.ui64++;
     2867 +                        goto txfail;
     2868 +                }
     2869 +                tcb_ctx->tcb_type = I40E_TX_DESC;
     2870 +                needed_desc++;
2147 2871          }
2148 2872  
2149      -        /*
2150      -         * First we allocate our tx control block and prepare the packet for
2151      -         * transmit before we do a final check for descriptors. We do it this
2152      -         * way to minimize the time under the tx lock.
2153      -         */
2154      -        tcb = i40e_tcb_alloc(itrq);
2155      -        if (tcb == NULL) {
2156      -                txs->itxs_err_notcb.value.ui64++;
2157      -                goto txfail;
     2873 +        if (!use_lso) {
     2874 +                tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc);
     2875 +        } else {
     2876 +                tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc);
2158 2877          }
2159 2878  
2160      -        /*
2161      -         * For transmitting a block, we're currently going to use just a
2162      -         * single control block and bcopy all of the fragments into it. We
2163      -         * should be more intelligent about doing DMA binding or otherwise, but
2164      -         * for getting off the ground this will have to do.
2165      -         */
2166      -        ASSERT(tcb->tcb_dma.dmab_len == 0);
2167      -        ASSERT(tcb->tcb_dma.dmab_size >= mpsize);
2168      -        for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
2169      -                size_t clen = MBLKL(nmp);
2170      -                void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
     2879 +        if (tcbhead == NULL)
     2880 +                goto txfail;
2171 2881  
2172      -                bcopy(nmp->b_rptr, coff, clen);
2173      -                tcb->tcb_dma.dmab_len += clen;
2174      -        }
2175      -        ASSERT(tcb->tcb_dma.dmab_len == mpsize);
     2882 +        tcbhead->tcb_mp = mp;
2176 2883  
2177 2884          /*
2178      -         * While there's really no need to keep the mp here, but let's just do
2179      -         * it to help with our own debugging for now.
     2885 +         * The second condition ensures that 'itrq_desc_tail' never
     2886 +         * equals 'itrq_desc_head'. This enforces the rule found in
     2887 +         * the second bullet point of section 8.4.3.1.5 of the XL710
     2888 +         * PG, which declares the TAIL pointer in I40E_QTX_TAIL should
     2889 +         * never overlap with the head. This means that we only ever
     2890 +         * have 'itrq_tx_ring_size - 1' total available descriptors.
2180 2891           */
2181      -        tcb->tcb_mp = mp;
2182      -        tcb->tcb_type = I40E_TX_COPY;
2183      -        I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
2184      -
2185 2892          mutex_enter(&itrq->itrq_tx_lock);
2186      -        if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) {
     2893 +        if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh ||
     2894 +            (itrq->itrq_desc_free - 1) < needed_desc) {
2187 2895                  txs->itxs_err_nodescs.value.ui64++;
2188 2896                  mutex_exit(&itrq->itrq_tx_lock);
2189 2897                  goto txfail;
2190 2898          }
2191 2899  
2192      -        /*
2193      -         * Build up the descriptor and send it out. Thankfully at the moment
2194      -         * we only need a single desc, because we're not doing anything fancy
2195      -         * yet.
2196      -         */
2197      -        ASSERT(itrq->itrq_desc_free > 0);
2198      -        itrq->itrq_desc_free--;
2199      -        txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
2200      -        itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
2201      -        itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
2202      -            itrq->itrq_tx_ring_size);
     2900 +        if (do_ctx_desc) {
     2901 +                /*
     2902 +                 * If we're enabling any offloads for this frame, then we'll
     2903 +                 * need to build up a transmit context descriptor, first.  The
     2904 +                 * context descriptor needs to be placed in the TX ring before
     2905 +                 * the data descriptor(s).  See section 8.4.2, table 8-16
     2906 +                 */
     2907 +                uint_t tail = itrq->itrq_desc_tail;
     2908 +                itrq->itrq_desc_free--;
     2909 +                ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail];
     2910 +                itrq->itrq_tcb_work_list[tail] = tcb_ctx;
     2911 +                itrq->itrq_desc_tail = i40e_next_desc(tail, 1,
     2912 +                    itrq->itrq_tx_ring_size);
2203 2913  
2204      -        /*
2205      -         * Note, we always set EOP and RS which indicates that this is the last
2206      -         * data frame and that we should ask for it to be transmitted. We also
2207      -         * must always set ICRC, because that is an internal bit that must be
2208      -         * set to one for data descriptors. The remaining bits in the command
2209      -         * descriptor depend on checksumming and are determined based on the
2210      -         * information set up in i40e_tx_context().
2211      -         */
2212      -        type = I40E_TX_DESC_DTYPE_DATA;
2213      -        cmd = I40E_TX_DESC_CMD_EOP |
2214      -            I40E_TX_DESC_CMD_RS |
2215      -            I40E_TX_DESC_CMD_ICRC |
2216      -            tctx.itc_cmdflags;
2217      -        txdesc->buffer_addr =
2218      -            CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address);
2219      -        txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type |
2220      -            ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
2221      -            ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
2222      -            ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
     2914 +                /* QW0 */
     2915 +                type = I40E_TX_DESC_DTYPE_CONTEXT;
     2916 +                ctxdesc->tunneling_params = 0;
     2917 +                ctxdesc->l2tag2 = 0;
2223 2918  
     2919 +                /* QW1 */
     2920 +                ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type);
     2921 +                if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
     2922 +                        ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t)
     2923 +                            ((uint64_t)tctx.itc_ctx_cmdflags <<
     2924 +                            I40E_TXD_CTX_QW1_CMD_SHIFT) |
     2925 +                            ((uint64_t)tctx.itc_ctx_tsolen <<
     2926 +                            I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
     2927 +                            ((uint64_t)tctx.itc_ctx_mss <<
     2928 +                            I40E_TXD_CTX_QW1_MSS_SHIFT));
     2929 +                }
     2930 +        }
     2931 +
     2932 +        tcb = tcbhead;
     2933 +        while (tcb != NULL) {
     2934 +
     2935 +                itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
     2936 +                if (tcb->tcb_type == I40E_TX_COPY) {
     2937 +                        boolean_t last_desc = (tcb->tcb_next == NULL);
     2938 +
     2939 +                        i40e_tx_set_data_desc(itrq, &tctx,
     2940 +                            (caddr_t)tcb->tcb_dma.dmab_dma_address,
     2941 +                            tcb->tcb_dma.dmab_len, last_desc);
     2942 +                } else {
     2943 +                        boolean_t last_desc = B_FALSE;
     2944 +                        ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA);
     2945 +
     2946 +                        for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) {
     2947 +                                last_desc = (c == tcb->tcb_bind_ncookies - 1) &&
     2948 +                                    (tcb->tcb_next == NULL);
     2949 +
     2950 +                                i40e_tx_set_data_desc(itrq, &tctx,
     2951 +                                    tcb->tcb_bind_info[c].dbi_paddr,
     2952 +                                    tcb->tcb_bind_info[c].dbi_len,
     2953 +                                    last_desc);
     2954 +                        }
     2955 +                }
     2956 +
     2957 +                tcb = tcb->tcb_next;
     2958 +        }
     2959 +
2224 2960          /*
2225 2961           * Now, finally, sync the DMA data and alert hardware.
2226 2962           */
2227 2963          I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);
2228 2964  
2229 2965          I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
2230 2966              itrq->itrq_desc_tail);
     2967 +
2231 2968          if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
2232 2969              DDI_FM_OK) {
2233 2970                  /*
2234 2971                   * Note, we can't really go through and clean this up very well,
2235 2972                   * because the memory has been given to the device, so just
2236 2973                   * indicate it's been transmitted.
2237 2974                   */
2238 2975                  ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
2239 2976                  atomic_or_32(&i40e->i40e_state, I40E_ERROR);
2240 2977          }
2241 2978  
2242      -        txs->itxs_bytes.value.ui64 += mpsize;
     2979 +        txs->itxs_bytes.value.ui64 += msglen;
2243 2980          txs->itxs_packets.value.ui64++;
2244      -        txs->itxs_descriptors.value.ui64++;
     2981 +        txs->itxs_descriptors.value.ui64 += needed_desc;
2245 2982  
2246 2983          mutex_exit(&itrq->itrq_tx_lock);
2247 2984  
2248 2985          return (NULL);
2249 2986  
2250 2987  txfail:
2251 2988          /*
2252 2989           * We ran out of resources. Return it to MAC and indicate that we'll
2253 2990           * need to signal MAC. If there are allocated tcb's, return them now.
 2254 2991           * need to signal MAC. If there are allocated tcbs, return them now.
 2255 2992           * Make sure to reset their message blocks, since we'll return them
2256 2993           */
2257      -        if (tcb != NULL) {
     2994 +        if (tcb_ctx != NULL) {
     2995 +                tcb_ctx->tcb_mp = NULL;
     2996 +                i40e_tcb_reset(tcb_ctx);
     2997 +                i40e_tcb_free(itrq, tcb_ctx);
     2998 +        }
     2999 +
     3000 +        tcb = tcbhead;
     3001 +        while (tcb != NULL) {
     3002 +                i40e_tx_control_block_t *next = tcb->tcb_next;
     3003 +
     3004 +                ASSERT(tcb->tcb_type == I40E_TX_DMA ||
     3005 +                    tcb->tcb_type == I40E_TX_COPY);
     3006 +
2258 3007                  tcb->tcb_mp = NULL;
2259 3008                  i40e_tcb_reset(tcb);
2260 3009                  i40e_tcb_free(itrq, tcb);
     3010 +                tcb = next;
2261 3011          }
2262 3012  
2263 3013          mutex_enter(&itrq->itrq_tx_lock);
2264 3014          itrq->itrq_tx_blocked = B_TRUE;
2265 3015          mutex_exit(&itrq->itrq_tx_lock);
2266 3016  
2267 3017          return (mp);
2268 3018  }
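The descriptor-availability check above, together with the comment on the TAIL pointer, means only itrq_tx_ring_size - 1 descriptors are ever usable, since the tail must never land on the head. A minimal standalone ring-buffer sketch of that invariant; the ring size and fill pattern are illustrative only:

#include <stdio.h>

#define RING_SIZE       8       /* illustrative; real Tx rings are larger */

static unsigned int
next_desc(unsigned int idx, unsigned int count)
{
        return ((idx + count) % RING_SIZE);     /* like i40e_next_desc() */
}

int
main(void)
{
        unsigned int head = 0, tail = 0, used = 0;

        /*
         * Fill the ring one descriptor at a time, but never let the
         * tail advance onto the head: if it did, an empty ring and a
         * full ring would be indistinguishable.
         */
        while (next_desc(tail, 1) != head) {
                tail = next_desc(tail, 1);
                used++;
        }

        printf("usable descriptors: %u of %u\n", used, RING_SIZE);
        return (0);
}

This is why the availability check compares needed_desc against itrq_desc_free - 1 rather than itrq_desc_free.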
    