1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  28  */
  29 
  30 #include "ixgbe_sw.h"
  31 
  32 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  33     uint32_t, boolean_t);
  34 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  35     uint32_t);
  36 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
  37     ixgbe_tx_context_t *, size_t);
  38 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
  39 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);
  40 
  41 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
  42 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
  43     ixgbe_tx_context_t *);
  44 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
  45     ixgbe_tx_context_t *);
  46 
  47 #ifndef IXGBE_DEBUG
  48 #pragma inline(ixgbe_save_desc)
  49 #pragma inline(ixgbe_get_context)
  50 #pragma inline(ixgbe_check_context)
  51 #pragma inline(ixgbe_fill_context)
  52 #endif
  53 
  54 /*
  55  * ixgbe_ring_tx
  56  *
  57  * To transmit one mblk through one specified ring.
  58  *
   59  * One mblk can consist of several fragments; each fragment
   60  * is processed with a different method based on its size.
   61  * Fragments with size less than the bcopy threshold are
   62  * processed by using bcopy; otherwise, they are processed
   63  * by using DMA binding.
  64  *
   65  * To process the mblk, a tx control block is taken from the
   66  * free list. One tx control block contains one tx buffer, which
   67  * is used to copy mblk fragments' data; and one tx DMA handle,
   68  * which is used to bind an mblk fragment to a DMA resource.
  69  *
  70  * Several small mblk fragments can be copied into one tx control
  71  * block's buffer, and then the buffer will be transmitted with
  72  * one tx descriptor.
  73  *
  74  * A large fragment only binds with one tx control block's DMA
  75  * handle, and it can span several tx descriptors for transmitting.
  76  *
   77  * So to transmit a packet (mblk), several tx control blocks may
   78  * be used. After the processing, those tx control blocks are
   79  * put on the work list.
  80  */
  81 mblk_t *
  82 ixgbe_ring_tx(void *arg, mblk_t *mp)
  83 {
  84         ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
  85         ixgbe_t *ixgbe = tx_ring->ixgbe;
  86         tx_type_t current_flag, next_flag;
  87         uint32_t current_len, next_len;
  88         uint32_t desc_total;
  89         size_t mbsize;
  90         int desc_num;
  91         boolean_t copy_done, eop;
  92         mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
  93         tx_control_block_t *tcb;
  94         ixgbe_tx_context_t tx_context, *ctx;
  95         link_list_t pending_list;
  96         uint32_t len, hdr_frag_len, hdr_len;
  97         uint32_t copy_thresh;
  98         mblk_t *hdr_new_mp = NULL;
  99         mblk_t *hdr_pre_mp = NULL;
 100         mblk_t *hdr_nmp = NULL;
 101 
 102         ASSERT(mp->b_next == NULL);
 103 
 104         if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
 105             (ixgbe->ixgbe_state & IXGBE_ERROR) ||
 106             (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
 107             !(ixgbe->ixgbe_state & IXGBE_STARTED)) {
 108                 return (mp);
 109         }
 110 
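              /*
               * Fragments no larger than the bcopy threshold will be copied
               * into the tx buffer; larger fragments will be bound with DMA.
               */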
 111         copy_thresh = ixgbe->tx_copy_thresh;
 112 
 113         /* Get the mblk size */
 114         mbsize = 0;
 115         for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
 116                 mbsize += MBLKL(nmp);
 117         }
 118 
 119         if (ixgbe->tx_hcksum_enable) {
 120                 /*
 121                  * Retrieve checksum context information from the mblk
 122                  * that will be used to decide whether/how to fill the
 123                  * context descriptor.
 124                  */
 125                 ctx = &tx_context;
 126                 if (ixgbe_get_context(mp, ctx) < 0) {
 127                         freemsg(mp);
 128                         return (NULL);
 129                 }
 130 
 131                 /*
  132                  * If the mblk size exceeds the maximum size ixgbe can
  133                  * process, discard this mblk and return NULL.
 134                  */
 135                 if ((ctx->lso_flag &&
 136                     ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
 137                     (!ctx->lso_flag &&
 138                     (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
 139                         freemsg(mp);
 140                         IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
 141                         return (NULL);
 142                 }
 143         } else {
 144                 ctx = NULL;
 145         }
 146 
 147         /*
 148          * Check and recycle tx descriptors.
 149          * The recycle threshold here should be selected carefully
 150          */
 151         if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
 152                 tx_ring->tx_recycle(tx_ring);
 153         }
 154 
 155         /*
  156          * After the recycling, if tbd_free is still less than the
  157          * overload threshold, assert overload and return mp;
  158          * the transmit will need to be rescheduled.
 159          */
 160         if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
 161                 tx_ring->reschedule = B_TRUE;
 162                 IXGBE_DEBUG_STAT(tx_ring->stat_overload);
 163                 return (mp);
 164         }
 165 
 166         /*
 167          * The pending_list is a linked list that is used to save
  168          * the tx control blocks whose packet data has been processed
  169          * but not yet placed on the tx descriptor ring.
  170          * It is used to reduce contention on the tx_lock.
 171          */
 172         LINK_LIST_INIT(&pending_list);
 173         desc_num = 0;
 174         desc_total = 0;
 175 
 176         /*
  177          * The software should guarantee that the LSO packet header
  178          * (MAC+IP+TCP) fits within one descriptor. Here we reallocate
  179          * and refill the header if it is not physically contiguous.
 180          */
 181         if ((ctx != NULL) && ctx->lso_flag) {
 182                 /* find the last fragment of the header */
 183                 len = MBLKL(mp);
 184                 ASSERT(len > 0);
 185                 hdr_nmp = mp;
 186                 hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
 187                 while (len < hdr_len) {
 188                         hdr_pre_mp = hdr_nmp;
 189                         hdr_nmp = hdr_nmp->b_cont;
 190                         len += MBLKL(hdr_nmp);
 191                 }
 192                 /*
 193                  * If the header and the payload are in different mblks,
  194                  * we simply force the header to be copied into the
  195                  * pre-allocated page-aligned buffer.
 196                  */
 197                 if (len == hdr_len)
 198                         goto adjust_threshold;
 199 
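                      /*
                       * hdr_frag_len is the number of header bytes that
                       * reside in the last header mblk (hdr_nmp).
                       */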
 200                 hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
 201                 /*
  202                  * There are two cases where we need to reallocate an mblk
  203                  * for the last header fragment:
  204                  * 1. the header is in multiple mblks and the last fragment
  205                  * shares the same mblk with the payload
  206                  * 2. the header is in a single mblk shared with the payload
  207                  * and the header is not physically contiguous
 208                  */
 209                 if ((hdr_nmp != mp) ||
 210                     (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
 211                     < hdr_len)) {
 212                         IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);
 213                         /*
  214                          * Reallocate the mblk for the last header fragment
  215                          * so that it can be bcopied into the pre-allocated
  216                          * page-aligned buffer.
 217                          */
 218                         hdr_new_mp = allocb(hdr_frag_len, NULL);
 219                         if (!hdr_new_mp)
 220                                 return (mp);
 221                         bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
 222                             hdr_frag_len);
 223                         /* link the new header fragment with the other parts */
 224                         hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
 225                         hdr_new_mp->b_cont = hdr_nmp;
 226                         if (hdr_pre_mp)
 227                                 hdr_pre_mp->b_cont = hdr_new_mp;
 228                         else
 229                                 mp = hdr_new_mp;
 230                         hdr_nmp->b_rptr += hdr_frag_len;
 231                 }
 232 adjust_threshold:
 233                 /*
  234                  * Adjust the bcopy threshold to guarantee that
  235                  * the header is processed with bcopy.
 236                  */
 237                 if (copy_thresh < hdr_len)
 238                         copy_thresh = hdr_len;
 239         }
 240 
 241         current_mp = mp;
 242         current_len = MBLKL(current_mp);
 243         /*
 244          * Decide which method to use for the first fragment
 245          */
 246         current_flag = (current_len <= copy_thresh) ?
 247             USE_COPY : USE_DMA;
 248         /*
 249          * If the mblk includes several contiguous small fragments,
 250          * they may be copied into one buffer. This flag is used to
 251          * indicate whether there are pending fragments that need to
 252          * be copied to the current tx buffer.
 253          *
 254          * If this flag is B_TRUE, it indicates that a new tx control
 255          * block is needed to process the next fragment using either
 256          * copy or DMA binding.
 257          *
 258          * Otherwise, it indicates that the next fragment will be
 259          * copied to the current tx buffer that is maintained by the
 260          * current tx control block. No new tx control block is needed.
 261          */
 262         copy_done = B_TRUE;
 263         while (current_mp) {
 264                 next_mp = current_mp->b_cont;
 265                 eop = (next_mp == NULL); /* Last fragment of the packet? */
 266                 next_len = eop ? 0: MBLKL(next_mp);
 267 
 268                 /*
  269                  * When the current fragment is an empty fragment and
  270                  * the next fragment will still be copied to the current
  271                  * tx buffer, we cannot skip this fragment here, because
  272                  * the copy processing is still pending; we have to
  273                  * process this empty fragment in the tx_copy routine.
 274                  *
 275                  * If the copy processing is completed or a DMA binding
 276                  * processing is just completed, we can just skip this
 277                  * empty fragment.
 278                  */
 279                 if ((current_len == 0) && (copy_done)) {
 280                         current_mp = next_mp;
 281                         current_len = next_len;
 282                         current_flag = (current_len <= copy_thresh) ?
 283                             USE_COPY : USE_DMA;
 284                         continue;
 285                 }
 286 
 287                 if (copy_done) {
 288                         /*
 289                          * Get a new tx control block from the free list
 290                          */
 291                         tcb = ixgbe_get_free_list(tx_ring);
 292 
 293                         if (tcb == NULL) {
 294                                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 295                                 goto tx_failure;
 296                         }
 297 
 298                         /*
 299                          * Push the tx control block to the pending list
  300                          * to avoid taking the tx lock too early.
 301                          */
 302                         LIST_PUSH_TAIL(&pending_list, &tcb->link);
 303                 }
 304 
 305                 if (current_flag == USE_COPY) {
 306                         /*
 307                          * Check whether to use bcopy or DMA binding to process
 308                          * the next fragment, and if using bcopy, whether we
 309                          * need to continue copying the next fragment into the
 310                          * current tx buffer.
 311                          */
 312                         ASSERT((tcb->tx_buf.len + current_len) <=
 313                             tcb->tx_buf.size);
 314 
 315                         if (eop) {
 316                                 /*
 317                                  * This is the last fragment of the packet, so
 318                                  * the copy processing will be completed with
 319                                  * this fragment.
 320                                  */
 321                                 next_flag = USE_NONE;
 322                                 copy_done = B_TRUE;
 323                         } else if ((tcb->tx_buf.len + current_len + next_len) >
 324                             tcb->tx_buf.size) {
 325                                 /*
 326                                  * If the next fragment is too large to be
 327                                  * copied to the current tx buffer, we need
 328                                  * to complete the current copy processing.
 329                                  */
 330                                 next_flag = (next_len > copy_thresh) ?
 331                                     USE_DMA: USE_COPY;
 332                                 copy_done = B_TRUE;
 333                         } else if (next_len > copy_thresh) {
 334                                 /*
  335                                  * DMA binding. So the copy processing will be
 336                                  * DMA binding. So the copy prcessing will be
 337                                  * completed with the current fragment.
 338                                  */
 339                                 next_flag = USE_DMA;
 340                                 copy_done = B_TRUE;
 341                         } else {
 342                                 /*
 343                                  * Continue to copy the next fragment to the
 344                                  * current tx buffer.
 345                                  */
 346                                 next_flag = USE_COPY;
 347                                 copy_done = B_FALSE;
 348                         }
 349 
 350                         desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
 351                             current_len, copy_done);
 352                 } else {
 353                         /*
 354                          * Check whether to use bcopy or DMA binding to process
 355                          * the next fragment.
 356                          */
 357                         next_flag = (next_len > copy_thresh) ?
 358                             USE_DMA: USE_COPY;
 359                         ASSERT(copy_done == B_TRUE);
 360 
 361                         desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
 362                             current_len);
 363                 }
 364 
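                      /*
                       * A negative desc_num indicates a DMA binding failure,
                       * in which case the transmit of this mblk is aborted.
                       */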
 365                 if (desc_num > 0)
 366                         desc_total += desc_num;
 367                 else if (desc_num < 0)
 368                         goto tx_failure;
 369 
 370                 current_mp = next_mp;
 371                 current_len = next_len;
 372                 current_flag = next_flag;
 373         }
 374 
 375         /*
 376          * Attach the mblk to the last tx control block
 377          */
 378         ASSERT(tcb);
 379         ASSERT(tcb->mp == NULL);
 380         tcb->mp = mp;
 381 
 382         /*
  383          * The 82598/82599 chipsets have a limitation that no more than
  384          * 32 tx descriptors can be transmitted out at one time.
  385          *
  386          * Here is a workaround for it: pull up the mblk and then send it
  387          * out with DMA binding. By doing so, no more than MAX_COOKIE (18)
  388          * descriptors are needed.
 389          */
 390         if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
 391                 IXGBE_DEBUG_STAT(tx_ring->stat_break_tbd_limit);
 392 
 393                 /*
 394                  * Discard the mblk and free the used resources
 395                  */
 396                 tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 397                 while (tcb) {
 398                         tcb->mp = NULL;
 399                         ixgbe_free_tcb(tcb);
 400                         tcb = (tx_control_block_t *)
 401                             LIST_GET_NEXT(&pending_list, &tcb->link);
 402                 }
 403 
 404                 /*
 405                  * Return the tx control blocks in the pending list to
 406                  * the free list.
 407                  */
 408                 ixgbe_put_free_list(tx_ring, &pending_list);
 409 
 410                 /*
  411                  * Pull up the mblk and send it out with DMA binding.
 412                  */
 413                 if ((pull_mp = msgpullup(mp, -1)) == NULL) {
 414                         tx_ring->reschedule = B_TRUE;
 415 
 416                         /*
  417                          * If a new mblk has been allocated for the last
  418                          * header fragment of an LSO packet, we should
  419                          * restore the modified mp.
 420                          */
 421                         if (hdr_new_mp) {
 422                                 hdr_new_mp->b_cont = NULL;
 423                                 freeb(hdr_new_mp);
 424                                 hdr_nmp->b_rptr -= hdr_frag_len;
 425                                 if (hdr_pre_mp)
 426                                         hdr_pre_mp->b_cont = hdr_nmp;
 427                                 else
 428                                         mp = hdr_nmp;
 429                         }
 430                         return (mp);
 431                 }
 432 
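                      /*
                       * Start over: rebuild the pending list and the
                       * descriptor count for the pulled-up mblk.
                       */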
 433                 LINK_LIST_INIT(&pending_list);
 434                 desc_total = 0;
 435 
 436                 /*
  437                  * If the packet is an LSO packet, we simply transmit
  438                  * the header in one descriptor using bcopy.
 439                  */
 440                 if ((ctx != NULL) && ctx->lso_flag) {
 441                         hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
 442                             ctx->l4_hdr_len;
 443 
 444                         tcb = ixgbe_get_free_list(tx_ring);
 445                         if (tcb == NULL) {
 446                                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 447                                 goto tx_failure;
 448                         }
 449                         desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
 450                             hdr_len, B_TRUE);
 451                         LIST_PUSH_TAIL(&pending_list, &tcb->link);
 452                         desc_total  += desc_num;
 453 
 454                         pull_mp->b_rptr += hdr_len;
 455                 }
 456 
 457                 tcb = ixgbe_get_free_list(tx_ring);
 458                 if (tcb == NULL) {
 459                         IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 460                         goto tx_failure;
 461                 }
 462                 if ((ctx != NULL) && ctx->lso_flag) {
 463                         desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 464                             mbsize - hdr_len);
 465                 } else {
 466                         desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 467                             mbsize);
 468                 }
 469                 if (desc_num < 0) {
 470                         goto tx_failure;
 471                 }
 472                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
 473 
 474                 desc_total += desc_num;
 475                 tcb->mp = pull_mp;
 476         }
 477 
 478         /*
  479          * Before filling the tx descriptor ring with the data, we need to
  480          * ensure there are adequate free descriptors for transmit
  481          * (including one context descriptor).
  482          * Do not use up all the tx descriptors; otherwise tx recycling
  483          * will fail and cause a false hang.
 484          */
 485         if (tx_ring->tbd_free <= (desc_total + 1)) {
 486                 tx_ring->tx_recycle(tx_ring);
 487         }
 488 
 489         mutex_enter(&tx_ring->tx_lock);
 490         /*
 491          * If the number of free tx descriptors is not enough for transmit
 492          * then return mp.
 493          *
 494          * Note: we must put this check under the mutex protection to
 495          * ensure the correctness when multiple threads access it in
 496          * parallel.
 497          */
 498         if (tx_ring->tbd_free <= (desc_total + 1)) {
 499                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
 500                 mutex_exit(&tx_ring->tx_lock);
 501                 goto tx_failure;
 502         }
 503 
 504         desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
 505             mbsize);
 506 
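              /*
               * ixgbe_tx_fill_ring() may add one context descriptor in
               * addition to the data descriptors counted in desc_total.
               */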
 507         ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
 508 
 509         tx_ring->stat_obytes += mbsize;
  510         tx_ring->stat_opackets++;
 511 
 512         mutex_exit(&tx_ring->tx_lock);
 513 
 514         /*
  515          * Now that the transmission has succeeded, free the original
  516          * mp if the pulled-up mblk was used for the transmission.
 517          */
 518         if (pull_mp) {
 519                 freemsg(mp);
 520         }
 521 
 522         return (NULL);
 523 
 524 tx_failure:
 525         /*
  526          * If the transmission fails, free the pulled-up mblk.
 527          */
 528         if (pull_mp) {
 529                 freemsg(pull_mp);
 530         }
 531 
 532         /*
  533          * If a new mblk has been allocated for the last header
  534          * fragment of an LSO packet, we should restore the
 535          * modified mp.
 536          */
 537         if (hdr_new_mp) {
 538                 hdr_new_mp->b_cont = NULL;
 539                 freeb(hdr_new_mp);
 540                 hdr_nmp->b_rptr -= hdr_frag_len;
 541                 if (hdr_pre_mp)
 542                         hdr_pre_mp->b_cont = hdr_nmp;
 543                 else
 544                         mp = hdr_nmp;
 545         }
 546         /*
 547          * Discard the mblk and free the used resources
 548          */
 549         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 550         while (tcb) {
 551                 tcb->mp = NULL;
 552 
 553                 ixgbe_free_tcb(tcb);
 554 
 555                 tcb = (tx_control_block_t *)
 556                     LIST_GET_NEXT(&pending_list, &tcb->link);
 557         }
 558 
 559         /*
 560          * Return the tx control blocks in the pending list to the free list.
 561          */
 562         ixgbe_put_free_list(tx_ring, &pending_list);
 563 
  564         /* Transmit failed; do not drop the mblk, reschedule the transmit */
 565         tx_ring->reschedule = B_TRUE;
 566 
 567         return (mp);
 568 }
 569 
 570 /*
 571  * ixgbe_tx_copy
 572  *
 573  * Copy the mblk fragment to the pre-allocated tx buffer
 574  */
 575 static int
 576 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 577     uint32_t len, boolean_t copy_done)
 578 {
 579         dma_buffer_t *tx_buf;
 580         uint32_t desc_num;
 581         _NOTE(ARGUNUSED(tx_ring));
 582 
 583         tx_buf = &tcb->tx_buf;
 584 
 585         /*
 586          * Copy the packet data of the mblk fragment into the
 587          * pre-allocated tx buffer, which is maintained by the
 588          * tx control block.
 589          *
 590          * Several mblk fragments can be copied into one tx buffer.
 591          * The destination address of the current copied fragment in
 592          * the tx buffer is next to the end of the previous copied
 593          * fragment.
 594          */
 595         if (len > 0) {
 596                 bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
 597 
 598                 tx_buf->len += len;
 599                 tcb->frag_num++;
 600         }
 601 
 602         desc_num = 0;
 603 
 604         /*
 605          * If it is the last fragment copied to the current tx buffer,
 606          * in other words, if there's no remaining fragment or the remaining
 607          * fragment requires a new tx control block to process, we need to
 608          * complete the current copy processing by syncing up the current
 609          * DMA buffer and saving the descriptor data.
 610          */
 611         if (copy_done) {
 612                 /*
 613                  * Sync the DMA buffer of the packet data
 614                  */
 615                 DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
 616 
 617                 tcb->tx_type = USE_COPY;
 618 
 619                 /*
 620                  * Save the address and length to the private data structure
 621                  * of the tx control block, which will be used to fill the
 622                  * tx descriptor ring after all the fragments are processed.
 623                  */
 624                 ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
 625                 desc_num++;
 626         }
 627 
 628         return (desc_num);
 629 }
 630 
 631 /*
 632  * ixgbe_tx_bind
 633  *
 634  * Bind the mblk fragment with DMA
 635  */
 636 static int
 637 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 638     uint32_t len)
 639 {
 640         int status, i;
 641         ddi_dma_cookie_t dma_cookie;
 642         uint_t ncookies;
 643         int desc_num;
 644 
 645         /*
 646          * Use DMA binding to process the mblk fragment
 647          */
 648         status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
 649             (caddr_t)mp->b_rptr, len,
 650             DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
 651             0, &dma_cookie, &ncookies);
 652 
 653         if (status != DDI_DMA_MAPPED) {
 654                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
 655                 return (-1);
 656         }
 657 
 658         tcb->frag_num++;
 659         tcb->tx_type = USE_DMA;
 660         /*
 661          * Each fragment can span several cookies. One cookie will have
 662          * one tx descriptor to transmit.
 663          */
 664         desc_num = 0;
 665         for (i = ncookies; i > 0; i--) {
 666                 /*
 667                  * Save the address and length to the private data structure
 668                  * of the tx control block, which will be used to fill the
 669                  * tx descriptor ring after all the fragments are processed.
 670                  */
 671                 ixgbe_save_desc(tcb,
 672                     dma_cookie.dmac_laddress,
 673                     dma_cookie.dmac_size);
 674 
 675                 desc_num++;
 676 
 677                 if (i > 1)
 678                         ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
 679         }
 680 
 681         return (desc_num);
 682 }
 683 
 684 /*
 685  * ixgbe_get_context
 686  *
 687  * Get the context information from the mblk
 688  */
 689 static int
 690 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
 691 {
 692         uint32_t start;
 693         uint32_t hckflags;
 694         uint32_t lsoflags;
 695         uint32_t mss;
 696         uint32_t len;
 697         uint32_t size;
 698         uint32_t offset;
 699         unsigned char *pos;
 700         ushort_t etype;
 701         uint32_t mac_hdr_len;
 702         uint32_t l4_proto;
 703         uint32_t l4_hdr_len;
 704 
 705         ASSERT(mp != NULL);
 706 
 707         mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
 708         bzero(ctx, sizeof (ixgbe_tx_context_t));
 709 
 710         if (hckflags == 0) {
 711                 return (0);
 712         }
 713 
 714         ctx->hcksum_flags = hckflags;
 715 
 716         mac_lso_get(mp, &mss, &lsoflags);
 717         ctx->mss = mss;
 718         ctx->lso_flag = (lsoflags == HW_LSO);
 719 
 720         /*
  721          * LSO relies on tx h/w checksum, so drop the packet here
  722          * if the h/w checksum flags are not set.
 723          */
 724         if (ctx->lso_flag) {
 725                 if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
 726                     (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
 727                         IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
 728                             "checksum flags are not specified when doing LSO");
 729                         return (-1);
 730                 }
 731         }
 732 
 733         etype = 0;
 734         mac_hdr_len = 0;
 735         l4_proto = 0;
 736 
 737         /*
  738          * First, get the position of the ether_type/ether_tpid.
  739          * Here we don't assume the ether (VLAN) header is fully included
  740          * in one mblk fragment, so we go through the fragments to parse
 741          * the ether type.
 742          */
 743         size = len = MBLKL(mp);
 744         offset = offsetof(struct ether_header, ether_type);
 745         while (size <= offset) {
 746                 mp = mp->b_cont;
 747                 ASSERT(mp != NULL);
 748                 len = MBLKL(mp);
 749                 size += len;
 750         }
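              /*
               * size is the total length of the fragments walked so far
               * (including the current mblk) and len is the length of the
               * current mblk, so (offset + len - size) is the offset of
               * the field within the current mblk.
               */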
 751         pos = mp->b_rptr + offset + len - size;
 752 
 753         etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 754         if (etype == ETHERTYPE_VLAN) {
 755                 /*
 756                  * Get the position of the ether_type in VLAN header
 757                  */
 758                 offset = offsetof(struct ether_vlan_header, ether_type);
 759                 while (size <= offset) {
 760                         mp = mp->b_cont;
 761                         ASSERT(mp != NULL);
 762                         len = MBLKL(mp);
 763                         size += len;
 764                 }
 765                 pos = mp->b_rptr + offset + len - size;
 766 
 767                 etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 768                 mac_hdr_len = sizeof (struct ether_vlan_header);
 769         } else {
 770                 mac_hdr_len = sizeof (struct ether_header);
 771         }
 772 
 773         /*
 774          * Here we don't assume the IP(V6) header is fully included in
 775          * one mblk fragment.
 776          */
 777         switch (etype) {
 778         case ETHERTYPE_IP:
 779                 if (ctx->lso_flag) {
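                              /*
                               * For LSO, zero the IP total length and IP
                               * header checksum fields as required for
                               * hardware LSO; they are recomputed for each
                               * segment.
                               */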
 780                         offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
 781                         while (size <= offset) {
 782                                 mp = mp->b_cont;
 783                                 ASSERT(mp != NULL);
 784                                 len = MBLKL(mp);
 785                                 size += len;
 786                         }
 787                         pos = mp->b_rptr + offset + len - size;
 788                         *((uint16_t *)(uintptr_t)(pos)) = 0;
 789 
 790                         offset = offsetof(ipha_t, ipha_hdr_checksum) +
 791                             mac_hdr_len;
 792                         while (size <= offset) {
 793                                 mp = mp->b_cont;
 794                                 ASSERT(mp != NULL);
 795                                 len = MBLKL(mp);
 796                                 size += len;
 797                         }
 798                         pos = mp->b_rptr + offset + len - size;
 799                         *((uint16_t *)(uintptr_t)(pos)) = 0;
 800 
 801                         /*
  802                          * To perform ixgbe LSO, the tcp checksum field of
  803                          * the packet also needs to be filled with the
  804                          * pseudo-header checksum over:
  805                          * (ip_source_addr, ip_destination_addr, l4_proto)
  806                          * Currently the tcp/ip stack has already done it.
 807                          */
 808                 }
 809 
 810                 offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
 811                 while (size <= offset) {
 812                         mp = mp->b_cont;
 813                         ASSERT(mp != NULL);
 814                         len = MBLKL(mp);
 815                         size += len;
 816                 }
 817                 pos = mp->b_rptr + offset + len - size;
 818 
 819                 l4_proto = *(uint8_t *)pos;
 820                 break;
 821         case ETHERTYPE_IPV6:
 822                 offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
 823                 while (size <= offset) {
 824                         mp = mp->b_cont;
 825                         ASSERT(mp != NULL);
 826                         len = MBLKL(mp);
 827                         size += len;
 828                 }
 829                 pos = mp->b_rptr + offset + len - size;
 830 
 831                 l4_proto = *(uint8_t *)pos;
 832                 break;
 833         default:
 834                 /* Unrecoverable error */
 835                 IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
 836                 return (-2);
 837         }
 838 
 839         if (ctx->lso_flag) {
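                      /*
                       * Locate the TCP header to get its length; 'start' is
                       * the checksum start offset returned by
                       * mac_hcksum_get(), which is used here as the IP
                       * header length.
                       */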
 840                 offset = mac_hdr_len + start;
 841                 while (size <= offset) {
 842                         mp = mp->b_cont;
 843                         ASSERT(mp != NULL);
 844                         len = MBLKL(mp);
 845                         size += len;
 846                 }
 847                 pos = mp->b_rptr + offset + len - size;
 848 
 849                 l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
 850         } else {
 851                 /*
 852                  * l4 header length is only required for LSO
 853                  */
 854                 l4_hdr_len = 0;
 855         }
 856 
 857         ctx->mac_hdr_len = mac_hdr_len;
 858         ctx->ip_hdr_len = start;
 859         ctx->l4_proto = l4_proto;
 860         ctx->l4_hdr_len = l4_hdr_len;
 861 
 862         return (0);
 863 }
 864 
 865 /*
 866  * ixgbe_check_context
 867  *
 868  * Check if a new context descriptor is needed
 869  */
 870 static boolean_t
 871 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
 872 {
 873         ixgbe_tx_context_t *last;
 874 
 875         if (ctx == NULL)
 876                 return (B_FALSE);
 877 
 878         /*
 879          * Compare the context data retrieved from the mblk and the
  880          * stored data of the last context descriptor. The fields that
  881          * need to be checked are:
 882          *      hcksum_flags
 883          *      l4_proto
 884          *      mac_hdr_len
 885          *      ip_hdr_len
 886          *      lso_flag
 887          *      mss (only checked for LSO)
  888          *      l4_hdr_len (only checked for LSO)
  889          * If any one of the above fields has changed, a new context
  890          * descriptor will be needed.
 891          */
 892         last = &tx_ring->tx_context;
 893 
 894         if ((ctx->hcksum_flags != last->hcksum_flags) ||
 895             (ctx->l4_proto != last->l4_proto) ||
 896             (ctx->mac_hdr_len != last->mac_hdr_len) ||
 897             (ctx->ip_hdr_len != last->ip_hdr_len) ||
 898             (ctx->lso_flag != last->lso_flag) ||
 899             (ctx->lso_flag && ((ctx->mss != last->mss) ||
 900             (ctx->l4_hdr_len != last->l4_hdr_len)))) {
 901                 return (B_TRUE);
 902         }
 903 
 904         return (B_FALSE);
 905 }
 906 
 907 /*
 908  * ixgbe_fill_context
 909  *
  910  * Fill the context descriptor with hardware checksum information
 911  */
 912 static void
 913 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
 914     ixgbe_tx_context_t *ctx)
 915 {
 916         /*
 917          * Fill the context descriptor with the checksum
 918          * context information we've got.
 919          */
 920         ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
 921         ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
 922             IXGBE_ADVTXD_MACLEN_SHIFT;
 923 
 924         ctx_tbd->type_tucmd_mlhl =
 925             IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
 926 
 927         if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
 928                 ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
 929 
 930         if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
 931                 switch (ctx->l4_proto) {
 932                 case IPPROTO_TCP:
 933                         ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
 934                         break;
 935                 case IPPROTO_UDP:
 936                         /*
 937                          * We don't have to explicitly set:
 938                          *      ctx_tbd->type_tucmd_mlhl |=
 939                          *          IXGBE_ADVTXD_TUCMD_L4T_UDP;
 940                          * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
 941                          */
 942                         break;
 943                 default:
 944                         /* Unrecoverable error */
 945                         IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
 946                         break;
 947                 }
 948         }
 949 
 950         ctx_tbd->seqnum_seed = 0;
 951 
 952         if (ctx->lso_flag) {
 953                 ctx_tbd->mss_l4len_idx =
 954                     (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
 955                     (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
 956         } else {
 957                 ctx_tbd->mss_l4len_idx = 0;
 958         }
 959 }
 960 
 961 /*
 962  * ixgbe_tx_fill_ring
 963  *
 964  * Fill the tx descriptor ring with the data
 965  */
 966 static int
 967 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
 968     ixgbe_tx_context_t *ctx, size_t mbsize)
 969 {
 970         struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
 971         boolean_t load_context;
 972         uint32_t index, tcb_index, desc_num;
 973         union ixgbe_adv_tx_desc *tbd, *first_tbd;
 974         tx_control_block_t *tcb, *first_tcb;
 975         uint32_t hcksum_flags;
 976         int i;
 977 
 978         ASSERT(mutex_owned(&tx_ring->tx_lock));
 979 
 980         tbd = NULL;
 981         first_tbd = NULL;
 982         first_tcb = NULL;
 983         desc_num = 0;
 984         hcksum_flags = 0;
 985         load_context = B_FALSE;
 986 
 987         /*
 988          * Get the index of the first tx descriptor that will be filled,
 989          * and the index of the first work list item that will be attached
 990          * with the first used tx control block in the pending list.
 991          * Note: the two indexes are the same.
 992          */
 993         index = tx_ring->tbd_tail;
 994         tcb_index = tx_ring->tbd_tail;
 995 
 996         if (ctx != NULL) {
 997                 hcksum_flags = ctx->hcksum_flags;
 998 
 999                 /*
1000                  * Check if a new context descriptor is needed for this packet
1001                  */
1002                 load_context = ixgbe_check_context(tx_ring, ctx);
1003 
1004                 if (load_context) {
1005                         tbd = &tx_ring->tbd_ring[index];
1006 
1007                         /*
1008                          * Fill the context descriptor with the
 1009                          * hardware checksum offload information.
1010                          */
1011                         ixgbe_fill_context(
1012                             (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
1013 
1014                         index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1015                         desc_num++;
1016 
1017                         /*
1018                          * Store the checksum context data if
1019                          * a new context descriptor is added
1020                          */
1021                         tx_ring->tx_context = *ctx;
1022                 }
1023         }
1024 
1025         first_tbd = &tx_ring->tbd_ring[index];
1026 
1027         /*
1028          * Fill tx data descriptors with the data saved in the pending list.
1029          * The tx control blocks in the pending list are added to the work list
1030          * at the same time.
1031          *
1032          * The work list is strictly 1:1 corresponding to the descriptor ring.
1033          * One item of the work list corresponds to one tx descriptor. Because
1034          * one tx control block can span multiple tx descriptors, the tx
1035          * control block will be added to the first work list item that
1036          * corresponds to the first tx descriptor generated from that tx
1037          * control block.
1038          */
1039         tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1040         first_tcb = tcb;
1041         while (tcb != NULL) {
1042 
1043                 for (i = 0; i < tcb->desc_num; i++) {
1044                         tbd = &tx_ring->tbd_ring[index];
1045 
1046                         tbd->read.buffer_addr = tcb->desc[i].address;
1047                         tbd->read.cmd_type_len = tcb->desc[i].length;
1048 
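                              /*
                               * Mark this as an advanced (extended) data
                               * descriptor.
                               */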
1049                         tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
1050                             | IXGBE_ADVTXD_DTYP_DATA;
1051 
1052                         tbd->read.olinfo_status = 0;
1053 
1054                         index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1055                         desc_num++;
1056                 }
1057 
1058                 /*
1059                  * Add the tx control block to the work list
1060                  */
1061                 ASSERT(tx_ring->work_list[tcb_index] == NULL);
1062                 tx_ring->work_list[tcb_index] = tcb;
1063 
1064                 tcb_index = index;
1065                 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1066         }
1067 
1068         if (load_context) {
1069                 /*
1070                  * Count the context descriptor for
1071                  * the first tx control block.
1072                  */
1073                 first_tcb->desc_num++;
1074         }
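              /*
               * Record the index of the last descriptor of this packet in
               * the first tx control block; tx recycling uses it to check
               * whether the whole packet has been transmitted.
               */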
1075         first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
1076 
1077         /*
 1078          * The Insert Ethernet CRC (IFCS) bit and the checksum fields are
 1079          * only valid in the first descriptor of the packet.
 1080          * The paylen field is set in the first_tbd:
 1081          * 82599 and X540 require the packet length in the paylen field with
 1082          * or without LSO, while 82598 ignores it in non-LSO mode.
1083          */
1084         ASSERT(first_tbd != NULL);
1085         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
1086 
1087         switch (hw->mac.type) {
1088         case ixgbe_mac_82598EB:
1089                 if (ctx != NULL && ctx->lso_flag) {
1090                         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1091                         first_tbd->read.olinfo_status |=
1092                             (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1093                             - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1094                 }
1095                 break;
1096 
1097         case ixgbe_mac_82599EB:
1098         case ixgbe_mac_X540:
1099                 if (ctx != NULL && ctx->lso_flag) {
1100                         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1101                         first_tbd->read.olinfo_status |=
1102                             (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1103                             - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1104                 } else {
1105                         first_tbd->read.olinfo_status |=
1106                             (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1107                 }
1108                 break;
1109 
1110         default:
1111                 break;
1112         }
1113 
1114         /* Set hardware checksum bits */
1115         if (hcksum_flags != 0) {
1116                 if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1117                         first_tbd->read.olinfo_status |=
1118                             IXGBE_ADVTXD_POPTS_IXSM;
1119                 if (hcksum_flags & HCK_PARTIALCKSUM)
1120                         first_tbd->read.olinfo_status |=
1121                             IXGBE_ADVTXD_POPTS_TXSM;
1122         }
1123 
1124         /*
1125          * The last descriptor of packet needs End Of Packet (EOP),
1126          * and Report Status (RS) bits set
1127          */
1128         ASSERT(tbd != NULL);
1129         tbd->read.cmd_type_len |=
1130             IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1131 
1132         /*
1133          * Sync the DMA buffer of the tx descriptor ring
1134          */
1135         DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1136 
1137         /*
1138          * Update the number of the free tx descriptors.
1139          * The mutual exclusion between the transmission and the recycling
1140          * (for the tx descriptor ring and the work list) is implemented
1141          * with the atomic operation on the number of the free tx descriptors.
1142          *
 1143          * Note: we should always decrement the counter tbd_free before
 1144          * advancing the hardware TDT pointer, to avoid the race condition
 1145          * where the transmit of the tx descriptors completes and tbd_free
 1146          * is increased by the tx recycling before the counter has been
 1147          * decremented here.
1148          */
1149         i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1150         ASSERT(i >= 0);
1151 
1152         tx_ring->tbd_tail = index;
1153 
1154         /*
1155          * Advance the hardware TDT pointer of the tx descriptor ring
1156          */
1157         IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1158 
1159         if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1160             DDI_FM_OK) {
1161                 ddi_fm_service_impact(tx_ring->ixgbe->dip,
1162                     DDI_SERVICE_DEGRADED);
1163                 atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1164         }
1165 
1166         return (desc_num);
1167 }
1168 
1169 /*
1170  * ixgbe_save_desc
1171  *
1172  * Save the address/length pair to the private array
1173  * of the tx control block. The address/length pairs
1174  * will be filled into the tx descriptor ring later.
1175  */
1176 static void
1177 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1178 {
1179         sw_desc_t *desc;
1180 
1181         desc = &tcb->desc[tcb->desc_num];
1182         desc->address = address;
1183         desc->length = length;
1184 
1185         tcb->desc_num++;
1186 }
1187 
1188 /*
1189  * ixgbe_tx_recycle_legacy
1190  *
1191  * Recycle the tx descriptors and tx control blocks.
1192  *
1193  * The work list is traversed to check if the corresponding
1194  * tx descriptors have been transmitted. If so, the resources
1195  * bound to the tx control blocks will be freed, and those
1196  * tx control blocks will be returned to the free list.
1197  */
1198 uint32_t
1199 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1200 {
1201         uint32_t index, last_index, prev_index;
1202         int desc_num;
1203         boolean_t desc_done;
1204         tx_control_block_t *tcb;
1205         link_list_t pending_list;
1206         ixgbe_t *ixgbe = tx_ring->ixgbe;
1207 
1208         mutex_enter(&tx_ring->recycle_lock);
1209 
1210         ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1211 
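              /*
               * If all the tx descriptors are free, there is nothing to
               * recycle.
               */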
1212         if (tx_ring->tbd_free == tx_ring->ring_size) {
1213                 tx_ring->recycle_fail = 0;
1214                 tx_ring->stall_watchdog = 0;
1215                 if (tx_ring->reschedule) {
1216                         tx_ring->reschedule = B_FALSE;
1217                         mac_tx_ring_update(ixgbe->mac_hdl,
1218                             tx_ring->ring_handle);
1219                 }
1220                 mutex_exit(&tx_ring->recycle_lock);
1221                 return (0);
1222         }
1223 
1224         /*
1225          * Sync the DMA buffer of the tx descriptor ring
1226          */
1227         DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1228 
1229         if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1230                 mutex_exit(&tx_ring->recycle_lock);
1231                 ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1232                 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1233                 return (0);
1234         }
1235 
1236         LINK_LIST_INIT(&pending_list);
1237         desc_num = 0;
1238         index = tx_ring->tbd_head;   /* Index of next tbd/tcb to recycle */
1239 
1240         tcb = tx_ring->work_list[index];
1241         ASSERT(tcb != NULL);
1242 
1243         while (tcb != NULL) {
1244                 /*
1245                  * Get the last tx descriptor of this packet.
1246                  * If the last tx descriptor is done, then
1247                  * we can recycle all descriptors of a packet
1248                  * which usually includes several tx control blocks.
 1249                  * For 82599, LSO descriptors cannot be recycled
 1250                  * unless the whole packet's transmission is done.
 1251                  * That's why packet-level recycling is used here.
 1252                  * For 82598, there's no such limit.
1253                  */
1254                 last_index = tcb->last_index;
1255                 /*
 1256                  * MAX_TX_RING_SIZE is used to check whether
 1257                  * the index is a valid value.
1258                  */
1259                 if (last_index == MAX_TX_RING_SIZE)
1260                         break;
1261 
1262                 /*
1263                  * Check if the Descriptor Done bit is set
1264                  */
1265                 desc_done = tx_ring->tbd_ring[last_index].wb.status &
1266                     IXGBE_TXD_STAT_DD;
1267                 if (desc_done) {
1268                         /*
1269                          * recycle all descriptors of the packet
1270                          */
1271                         while (tcb != NULL) {
1272                                 /*
1273                                  * Strip off the tx control block from
1274                                  * the work list, and add it to the
1275                                  * pending list.
1276                                  */
1277                                 tx_ring->work_list[index] = NULL;
1278                                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1279 
1280                                 /*
1281                                  * Count the total number of the tx
1282                                  * descriptors recycled
1283                                  */
1284                                 desc_num += tcb->desc_num;
1285 
1286                                 index = NEXT_INDEX(index, tcb->desc_num,
1287                                     tx_ring->ring_size);
1288 
1289                                 tcb = tx_ring->work_list[index];
1290 
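                                      /*
                                       * Stop once the last descriptor of
                                       * this packet has been passed.
                                       */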
1291                                 prev_index = PREV_INDEX(index, 1,
1292                                     tx_ring->ring_size);
1293                                 if (prev_index == last_index)
1294                                         break;
1295                         }
1296                 } else {
1297                         break;
1298                 }
1299         }
1300 
1301         /*
1302          * If no tx descriptors are recycled, no need to do more processing
1303          */
1304         if (desc_num == 0) {
1305                 tx_ring->recycle_fail++;
1306                 mutex_exit(&tx_ring->recycle_lock);
1307                 return (0);
1308         }
1309 
1310         tx_ring->recycle_fail = 0;
1311         tx_ring->stall_watchdog = 0;
1312 
1313         /*
1314          * Update the head index of the tx descriptor ring
1315          */
1316         tx_ring->tbd_head = index;
1317 
1318         /*
1319          * Update the number of the free tx descriptors with atomic operations
1320          */
1321         atomic_add_32(&tx_ring->tbd_free, desc_num);
1322 
1323         if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1324             (tx_ring->reschedule)) {
1325                 tx_ring->reschedule = B_FALSE;
1326                 mac_tx_ring_update(ixgbe->mac_hdl,
1327                     tx_ring->ring_handle);
1328         }
1329         mutex_exit(&tx_ring->recycle_lock);
1330 
1331         /*
1332          * Free the resources used by the tx control blocks
1333          * in the pending list
1334          */
1335         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1336         while (tcb != NULL) {
1337                 /*
1338                  * Release the resources occupied by the tx control block
1339                  */
1340                 ixgbe_free_tcb(tcb);
1341 
1342                 tcb = (tx_control_block_t *)
1343                     LIST_GET_NEXT(&pending_list, &tcb->link);
1344         }
1345 
1346         /*
1347          * Add the tx control blocks in the pending list to the free list.
1348          */
1349         ixgbe_put_free_list(tx_ring, &pending_list);
1350 
1351         return (desc_num);
1352 }
1353 
1354 /*
1355  * ixgbe_tx_recycle_head_wb
1356  *
1357  * Check the head write-back, and recycle all the transmitted
1358  * tx descriptors and tx control blocks.
1359  */
1360 uint32_t
1361 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1362 {
1363         uint32_t index;
1364         uint32_t head_wb;
1365         int desc_num;
1366         tx_control_block_t *tcb;
1367         link_list_t pending_list;
1368         ixgbe_t *ixgbe = tx_ring->ixgbe;
1369 
1370         mutex_enter(&tx_ring->recycle_lock);
1371 
1372         ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1373 
1374         if (tx_ring->tbd_free == tx_ring->ring_size) {
1375                 tx_ring->recycle_fail = 0;
1376                 tx_ring->stall_watchdog = 0;
1377                 if (tx_ring->reschedule) {
1378                         tx_ring->reschedule = B_FALSE;
1379                         mac_tx_ring_update(ixgbe->mac_hdl,
1380                             tx_ring->ring_handle);
1381                 }
1382                 mutex_exit(&tx_ring->recycle_lock);
1383                 return (0);
1384         }
1385 
1386         /*
1387          * Sync the DMA buffer of the tx descriptor ring
1388          *
 1389          * Note: For head write-back mode, the tx descriptors will not
 1390          * be written back, but the head write-back value is stored at
 1391          * the last extra tbd at the end of the DMA area, so we still need
 1392          * to sync the head write-back value for the kernel.
1393          *
1394          * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1395          */
1396         (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1397             sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1398             sizeof (uint32_t),
1399             DDI_DMA_SYNC_FORKERNEL);
1400 
1401         if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1402                 mutex_exit(&tx_ring->recycle_lock);
1403                 ddi_fm_service_impact(ixgbe->dip,
1404                     DDI_SERVICE_DEGRADED);
1405                 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1406                 return (0);
1407         }
1408 
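             /*
              * Collect the tx control blocks of the completed descriptors
              * on a local pending list, so that their resources can be
              * freed after recycle_lock is dropped.
              */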
1409         LINK_LIST_INIT(&pending_list);
1410         desc_num = 0;
1411         index = tx_ring->tbd_head;   /* Next index to clean */
1412 
1413         /*
1414          * Get the value of head write-back
1415          */
1416         head_wb = *tx_ring->tbd_head_wb;
1417         while (index != head_wb) {
1418                 tcb = tx_ring->work_list[index];
1419                 ASSERT(tcb != NULL);
1420 
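                     /*
                      * OFFSET() yields the ring distance from the current
                      * clean index to the hardware-reported head, i.e. the
                      * number of descriptors the hardware has completed
                      * beyond this point.
                      */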
1421                 if (OFFSET(index, head_wb, tx_ring->ring_size) <
1422                     tcb->desc_num) {
1423                         /*
1424                          * The current tx control block is not
1425                          * completely transmitted, stop recycling
1426                          */
1427                         break;
1428                 }
1429 
1430                 /*
1431                  * Remove the tx control block from the work list,
1432                  * and add it to the pending list.
1433                  */
1434                 tx_ring->work_list[index] = NULL;
1435                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1436 
1437                 /*
1438                  * Advance the index of the tx descriptor ring
1439                  */
1440                 index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1441 
1442                 /*
1443                  * Count the total number of tx descriptors recycled
1444                  */
1445                 desc_num += tcb->desc_num;
1446         }
1447 
1448         /*
1449          * If no tx descriptors are recycled, no need to do more processing
1450          */
1451         if (desc_num == 0) {
1452                 tx_ring->recycle_fail++;
1453                 mutex_exit(&tx_ring->recycle_lock);
1454                 return (0);
1455         }
1456 
1457         tx_ring->recycle_fail = 0;
1458         tx_ring->stall_watchdog = 0;
1459 
1460         /*
1461          * Update the head index of the tx descriptor ring
1462          */
1463         tx_ring->tbd_head = index;
1464 
1465         /*
1466          * Atomically update the number of free tx descriptors
1467          */
1468         atomic_add_32(&tx_ring->tbd_free, desc_num);
1469 
1470         if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1471             (tx_ring->reschedule)) {
1472                 tx_ring->reschedule = B_FALSE;
1473                 mac_tx_ring_update(ixgbe->mac_hdl,
1474                     tx_ring->ring_handle);
1475         }
1476         mutex_exit(&tx_ring->recycle_lock);
1477 
1478         /*
1479          * Free the resources used by the tx control blocks
1480          * in the pending list
1481          */
1482         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1483         while (tcb != NULL) {
1484                 /*
1485                  * Release the resources occupied by the tx control block
1486                  */
1487                 ixgbe_free_tcb(tcb);
1488 
1489                 tcb = (tx_control_block_t *)
1490                     LIST_GET_NEXT(&pending_list, &tcb->link);
1491         }
1492 
1493         /*
1494          * Add the tx control blocks in the pending list to the free list.
1495          */
1496         ixgbe_put_free_list(tx_ring, &pending_list);
1497 
1498         return (desc_num);
1499 }
1500 
1501 /*
1502  * ixgbe_free_tcb - free up the tx control block
1503  *
1504  * Free the resources held by the tx control block: unbind the
1505  * previously bound DMA handle, free the attached mblk, and reset
1506  * the other control fields.
1507  */
1508 void
1509 ixgbe_free_tcb(tx_control_block_t *tcb)
1510 {
1511         switch (tcb->tx_type) {
1512         case USE_COPY:
1513                 /*
1514                  * Reset the buffer length that is used for copy
1515                  */
1516                 tcb->tx_buf.len = 0;
1517                 break;
1518         case USE_DMA:
1519                 /*
1520                  * Release the DMA resource that is used for
1521                  * DMA binding.
1522                  */
1523                 (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1524                 break;
1525         default:
1526                 break;
1527         }
1528 
1529         /*
1530          * Free the mblk
1531          */
1532         if (tcb->mp != NULL) {
1533                 freemsg(tcb->mp);
1534                 tcb->mp = NULL;
1535         }
1536 
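             /*
              * Reset the control fields so the tcb can be reused;
              * last_index is set to MAX_TX_RING_SIZE, an out-of-range
              * value, to show the tcb no longer owns any descriptor.
              */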
1537         tcb->tx_type = USE_NONE;
1538         tcb->last_index = MAX_TX_RING_SIZE;
1539         tcb->frag_num = 0;
1540         tcb->desc_num = 0;
1541 }
1542 
1543 /*
1544  * ixgbe_get_free_list - Get a free tx control block from the free list
1545  *
1546  * The atomic operation on the number of available tx control blocks
1547  * in the free list is used to keep this routine mutually exclusive
1548  * with the routine ixgbe_put_free_list.
1549  */
1550 static tx_control_block_t *
1551 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
1552 {
1553         tx_control_block_t *tcb;
1554 
1555         /*
1556          * Check and update the number of free tx control blocks in
1557          * the free list; fail if none is available.
1558          */
1559         if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
1560                 return (NULL);
1561 
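             /*
              * A tcb has been reserved via tcb_free, so at least one
              * entry is guaranteed to be on the free list; tcb_head_lock
              * serializes concurrent getters updating the head index.
              */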
1562         mutex_enter(&tx_ring->tcb_head_lock);
1563 
1564         tcb = tx_ring->free_list[tx_ring->tcb_head];
1565         ASSERT(tcb != NULL);
1566         tx_ring->free_list[tx_ring->tcb_head] = NULL;
1567         tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1568             tx_ring->free_list_size);
1569 
1570         mutex_exit(&tx_ring->tcb_head_lock);
1571 
1572         return (tcb);
1573 }
1574 
1575 /*
1576  * ixgbe_put_free_list
1577  *
1578  * Put a list of used tx control blocks back to the free list
1579  *
1580  * A mutex is used here to ensure serialization. The mutual exclusion
1581  * between ixgbe_get_free_list and ixgbe_put_free_list is implemented
1582  * with the atomic operation on the counter tcb_free.
1583  */
1584 void
1585 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1586 {
1587         uint32_t index;
1588         int tcb_num;
1589         tx_control_block_t *tcb;
1590 
1591         mutex_enter(&tx_ring->tcb_tail_lock);
1592 
1593         index = tx_ring->tcb_tail;
1594 
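             /*
              * Move each tx control block from the pending list to the
              * tail of the free list; tcb_tail_lock serializes concurrent
              * putters updating the tail index.
              */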
1595         tcb_num = 0;
1596         tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1597         while (tcb != NULL) {
1598                 ASSERT(tx_ring->free_list[index] == NULL);
1599                 tx_ring->free_list[index] = tcb;
1600 
1601                 tcb_num++;
1602 
1603                 index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1604 
1605                 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1606         }
1607 
1608         tx_ring->tcb_tail = index;
1609 
1610         /*
1611          * Update the number of free tx control blocks in the
1612          * free list. This operation must be placed under the
1613          * protection of the lock.
1614          */
1615         atomic_add_32(&tx_ring->tcb_free, tcb_num);
1616 
1617         mutex_exit(&tx_ring->tcb_tail_lock);
1618 }