1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  28  */
  29 
  30 #include "ixgbe_sw.h"
  31 
  32 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  33     uint32_t, boolean_t);
  34 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  35     uint32_t);
  36 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
  37     ixgbe_tx_context_t *, size_t);
  38 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
  39 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);
  40 
  41 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
  42 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
  43     ixgbe_tx_context_t *);
  44 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
  45     ixgbe_tx_context_t *);
  46 
  47 #ifndef IXGBE_DEBUG
  48 #pragma inline(ixgbe_save_desc)
  49 #pragma inline(ixgbe_get_context)
  50 #pragma inline(ixgbe_check_context)
  51 #pragma inline(ixgbe_fill_context)
  52 #endif
  53 
/*
 * ixgbe_ring_tx
 *
 * Transmit one mblk through the specified tx ring.
 *
 * One mblk can consist of several fragments, and each fragment is
 * processed with a method chosen by its size: fragments at or below
 * the bcopy threshold are copied with bcopy, while larger fragments
 * are bound with DMA.
 *
 * To process the mblk, a tx control block is taken from the free
 * list. Each tx control block contains one tx buffer, which is used
 * to copy mblk fragments' data, and one tx DMA handle, which is used
 * to bind an mblk fragment to DMA resources.
 *
 * Several small mblk fragments can be copied into one tx control
 * block's buffer, and that buffer is then transmitted with a single
 * tx descriptor.
 *
 * A large fragment binds to only one tx control block's DMA handle,
 * but it may span several tx descriptors when transmitted.
 *
 * Transmitting one packet (mblk) can therefore use several tx control
 * blocks. After processing, those tx control blocks are put on the
 * work list.
 */
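/*
 * A simplified sketch of the dispatch described above (illustration
 * only, not the driver's actual loop; it ignores tx control block
 * acquisition, empty fragments and tx buffer overflow). Each fragment
 * is classified against the bcopy threshold and handed to either the
 * copy or the bind routine defined later in this file:
 *
 *	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
 *		if (MBLKL(nmp) <= copy_thresh)
 *			(void) ixgbe_tx_copy(tx_ring, tcb, nmp,
 *			    MBLKL(nmp), copy_done);
 *		else
 *			(void) ixgbe_tx_bind(tx_ring, tcb, nmp,
 *			    MBLKL(nmp));
 *	}
 */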
  81 mblk_t *
  82 ixgbe_ring_tx(void *arg, mblk_t *mp)
  83 {
  84         ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
  85         ixgbe_t *ixgbe = tx_ring->ixgbe;
  86         tx_type_t current_flag, next_flag;
  87         uint32_t current_len, next_len;
  88         uint32_t desc_total;
  89         size_t mbsize;
  90         int desc_num;
  91         boolean_t copy_done, eop;
  92         mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
  93         tx_control_block_t *tcb;
  94         ixgbe_tx_context_t tx_context, *ctx;
  95         link_list_t pending_list;
  96         uint32_t len, hdr_frag_len, hdr_len;
  97         uint32_t copy_thresh;
  98         mblk_t *hdr_new_mp = NULL;
  99         mblk_t *hdr_pre_mp = NULL;
 100         mblk_t *hdr_nmp = NULL;
 101 
 102         ASSERT(mp->b_next == NULL);
 103 
 104         if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
 105             (ixgbe->ixgbe_state & IXGBE_ERROR) ||
 106             (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
 107             !(ixgbe->ixgbe_state & IXGBE_STARTED)) {
 108                 return (mp);
 109         }
 110 
 111         copy_thresh = ixgbe->tx_copy_thresh;
 112 
 113         /* Get the mblk size */
 114         mbsize = 0;
 115         for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
 116                 mbsize += MBLKL(nmp);
 117         }
 118 
 119         if (ixgbe->tx_hcksum_enable) {
 120                 /*
 121                  * Retrieve checksum context information from the mblk
 122                  * that will be used to decide whether/how to fill the
 123                  * context descriptor.
 124                  */
 125                 ctx = &tx_context;
 126                 if (ixgbe_get_context(mp, ctx) < 0) {
 127                         freemsg(mp);
 128                         return (NULL);
 129                 }
 130 
 131                 /*
 132                  * If the mblk size exceeds the max size ixgbe could
 133                  * process, then discard this mblk, and return NULL.
 134                  */
 135                 if ((ctx->lso_flag &&
 136                     ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
 137                     (!ctx->lso_flag &&
 138                     (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
 139                         freemsg(mp);
 140                         IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
 141                         return (NULL);
 142                 }
 143         } else {
 144                 ctx = NULL;
 145         }
 146 
 147         /*
 148          * Check and recycle tx descriptors.
 149          * The recycle threshold here should be selected carefully
 150          */
 151         if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
 152                 tx_ring->tx_recycle(tx_ring);
 153         }
 154 
        /*
         * After the recycling, if tbd_free is still less than the
         * overload threshold, declare overload and return mp;
         * the transmit will need to be rescheduled.
         */
 160         if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
 161                 tx_ring->reschedule = B_TRUE;
 162                 IXGBE_DEBUG_STAT(tx_ring->stat_overload);
 163                 return (mp);
 164         }
 165 
        /*
         * The pending_list is a linked list used to save the tx control
         * blocks whose packet data has been processed but has not yet
         * been placed on the tx descriptor ring. It is used to reduce
         * contention on tx_lock.
         */
 172         LINK_LIST_INIT(&pending_list);
 173         desc_num = 0;
 174         desc_total = 0;
 175 
        /*
         * The software must guarantee that the LSO packet header
         * (MAC + IP + TCP) fits within one descriptor. Reallocate and
         * refill the header here if it is not physically contiguous.
         */
 181         if ((ctx != NULL) && ctx->lso_flag) {
 182                 /* find the last fragment of the header */
 183                 len = MBLKL(mp);
 184                 ASSERT(len > 0);
 185                 hdr_nmp = mp;
 186                 hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
 187                 while (len < hdr_len) {
 188                         hdr_pre_mp = hdr_nmp;
 189                         hdr_nmp = hdr_nmp->b_cont;
 190                         len += MBLKL(hdr_nmp);
 191                 }
                /*
                 * If the header and the payload are in different mblks,
                 * we simply force the header to be copied into the
                 * pre-allocated page-aligned buffer.
                 */
 197                 if (len == hdr_len)
 198                         goto adjust_threshold;
 199 
 200                 hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
                /*
                 * There are two cases in which we need to reallocate an
                 * mblk for the last header fragment:
                 * 1. the header is in multiple mblks and the last fragment
                 * shares the same mblk with the payload
                 * 2. the header is in a single mblk shared with the payload
                 * and the header is not physically contiguous
                 */
 209                 if ((hdr_nmp != mp) ||
 210                     (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
 211                     < hdr_len)) {
 212                         IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);
                        /*
                         * Reallocate an mblk for the last header fragment;
                         * it is expected to be bcopied into the
                         * pre-allocated page-aligned buffer.
                         */
 218                         hdr_new_mp = allocb(hdr_frag_len, NULL);
 219                         if (!hdr_new_mp)
 220                                 return (mp);
 221                         bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
 222                             hdr_frag_len);
 223                         /* link the new header fragment with the other parts */
 224                         hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
 225                         hdr_new_mp->b_cont = hdr_nmp;
 226                         if (hdr_pre_mp)
 227                                 hdr_pre_mp->b_cont = hdr_new_mp;
 228                         else
 229                                 mp = hdr_new_mp;
 230                         hdr_nmp->b_rptr += hdr_frag_len;
 231                 }
 232 adjust_threshold:
                /*
                 * Adjust the bcopy threshold to guarantee that the
                 * header is handled by the bcopy path.
                 */
 237                 if (copy_thresh < hdr_len)
 238                         copy_thresh = hdr_len;
 239         }
 240 
 241         current_mp = mp;
 242         current_len = MBLKL(current_mp);
 243         /*
 244          * Decide which method to use for the first fragment
 245          */
 246         current_flag = (current_len <= copy_thresh) ?
 247             USE_COPY : USE_DMA;
 248         /*
 249          * If the mblk includes several contiguous small fragments,
 250          * they may be copied into one buffer. This flag is used to
 251          * indicate whether there are pending fragments that need to
 252          * be copied to the current tx buffer.
 253          *
 254          * If this flag is B_TRUE, it indicates that a new tx control
 255          * block is needed to process the next fragment using either
 256          * copy or DMA binding.
 257          *
 258          * Otherwise, it indicates that the next fragment will be
 259          * copied to the current tx buffer that is maintained by the
 260          * current tx control block. No new tx control block is needed.
 261          */
 262         copy_done = B_TRUE;
 263         while (current_mp) {
 264                 next_mp = current_mp->b_cont;
 265                 eop = (next_mp == NULL); /* Last fragment of the packet? */
 266                 next_len = eop ? 0: MBLKL(next_mp);
 267 
                /*
                 * When the current fragment is empty and the next fragment
                 * will still be copied to the current tx buffer, we cannot
                 * skip this fragment here, because the copy processing is
                 * still pending completion; the empty fragment must be
                 * handled in the tx_copy routine.
                 *
                 * If the copy processing is complete, or a DMA binding has
                 * just completed, we can simply skip this empty fragment.
                 */
 279                 if ((current_len == 0) && (copy_done)) {
 280                         current_mp = next_mp;
 281                         current_len = next_len;
 282                         current_flag = (current_len <= copy_thresh) ?
 283                             USE_COPY : USE_DMA;
 284                         continue;
 285                 }
 286 
 287                 if (copy_done) {
 288                         /*
 289                          * Get a new tx control block from the free list
 290                          */
 291                         tcb = ixgbe_get_free_list(tx_ring);
 292 
 293                         if (tcb == NULL) {
 294                                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 295                                 goto tx_failure;
 296                         }
 297 
                        /*
                         * Push the tx control block onto the pending list
                         * to avoid taking the tx lock too early.
                         */
 302                         LIST_PUSH_TAIL(&pending_list, &tcb->link);
 303                 }
 304 
 305                 if (current_flag == USE_COPY) {
 306                         /*
 307                          * Check whether to use bcopy or DMA binding to process
 308                          * the next fragment, and if using bcopy, whether we
 309                          * need to continue copying the next fragment into the
 310                          * current tx buffer.
 311                          */
 312                         ASSERT((tcb->tx_buf.len + current_len) <=
 313                             tcb->tx_buf.size);
 314 
 315                         if (eop) {
 316                                 /*
 317                                  * This is the last fragment of the packet, so
 318                                  * the copy processing will be completed with
 319                                  * this fragment.
 320                                  */
 321                                 next_flag = USE_NONE;
 322                                 copy_done = B_TRUE;
 323                         } else if ((tcb->tx_buf.len + current_len + next_len) >
 324                             tcb->tx_buf.size) {
 325                                 /*
 326                                  * If the next fragment is too large to be
 327                                  * copied to the current tx buffer, we need
 328                                  * to complete the current copy processing.
 329                                  */
 330                                 next_flag = (next_len > copy_thresh) ?
 331                                     USE_DMA: USE_COPY;
 332                                 copy_done = B_TRUE;
 333                         } else if (next_len > copy_thresh) {
                                /*
                                 * The next fragment needs to be processed by
                                 * DMA binding, so the copy processing will be
                                 * completed with the current fragment.
                                 */
 339                                 next_flag = USE_DMA;
 340                                 copy_done = B_TRUE;
 341                         } else {
 342                                 /*
 343                                  * Continue to copy the next fragment to the
 344                                  * current tx buffer.
 345                                  */
 346                                 next_flag = USE_COPY;
 347                                 copy_done = B_FALSE;
 348                         }
 349 
 350                         desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
 351                             current_len, copy_done);
 352                 } else {
 353                         /*
 354                          * Check whether to use bcopy or DMA binding to process
 355                          * the next fragment.
 356                          */
 357                         next_flag = (next_len > copy_thresh) ?
 358                             USE_DMA: USE_COPY;
 359                         ASSERT(copy_done == B_TRUE);
 360 
 361                         desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
 362                             current_len);
 363                 }
 364 
 365                 if (desc_num > 0)
 366                         desc_total += desc_num;
 367                 else if (desc_num < 0)
 368                         goto tx_failure;
 369 
 370                 current_mp = next_mp;
 371                 current_len = next_len;
 372                 current_flag = next_flag;
 373         }
 374 
 375         /*
 376          * Attach the mblk to the last tx control block
 377          */
 378         ASSERT(tcb);
 379         ASSERT(tcb->mp == NULL);
 380         tcb->mp = mp;
 381 
        /*
         * The 82598/82599 chipsets have a limitation that no more than
         * 32 tx descriptors can be transmitted at one time.
         *
         * The workaround is to pull up the mblk and then send it out by
         * DMA binding. This way, no more than MAX_COOKIE (18) descriptors
         * are needed.
         */
 390         if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
 391                 IXGBE_DEBUG_STAT(tx_ring->stat_break_tbd_limit);
 392 
 393                 /*
 394                  * Discard the mblk and free the used resources
 395                  */
 396                 tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 397                 while (tcb) {
 398                         tcb->mp = NULL;
 399                         ixgbe_free_tcb(tcb);
 400                         tcb = (tx_control_block_t *)
 401                             LIST_GET_NEXT(&pending_list, &tcb->link);
 402                 }
 403 
 404                 /*
 405                  * Return the tx control blocks in the pending list to
 406                  * the free list.
 407                  */
 408                 ixgbe_put_free_list(tx_ring, &pending_list);
 409 
                /*
                 * Pull up the mblk and send it out by DMA binding.
                 */
 413                 if ((pull_mp = msgpullup(mp, -1)) == NULL) {
 414                         tx_ring->reschedule = B_TRUE;
 415 
                        /*
                         * If a new mblk has been allocated for the last
                         * header fragment of an LSO packet, we should
                         * restore the modified mp.
                         */
 421                         if (hdr_new_mp) {
 422                                 hdr_new_mp->b_cont = NULL;
 423                                 freeb(hdr_new_mp);
 424                                 hdr_nmp->b_rptr -= hdr_frag_len;
 425                                 if (hdr_pre_mp)
 426                                         hdr_pre_mp->b_cont = hdr_nmp;
 427                                 else
 428                                         mp = hdr_nmp;
 429                         }
 430                         return (mp);
 431                 }
 432 
 433                 LINK_LIST_INIT(&pending_list);
 434                 desc_total = 0;
 435 
                /*
                 * If the packet is an LSO packet, simply transmit the
                 * header in one descriptor using the copy path.
                 */
 440                 if ((ctx != NULL) && ctx->lso_flag) {
 441                         hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
 442                             ctx->l4_hdr_len;
 443 
 444                         tcb = ixgbe_get_free_list(tx_ring);
 445                         if (tcb == NULL) {
 446                                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 447                                 goto tx_failure;
 448                         }
 449                         desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
 450                             hdr_len, B_TRUE);
 451                         LIST_PUSH_TAIL(&pending_list, &tcb->link);
                        desc_total += desc_num;
 453 
 454                         pull_mp->b_rptr += hdr_len;
 455                 }
 456 
 457                 tcb = ixgbe_get_free_list(tx_ring);
 458                 if (tcb == NULL) {
 459                         IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 460                         goto tx_failure;
 461                 }
 462                 if ((ctx != NULL) && ctx->lso_flag) {
 463                         desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 464                             mbsize - hdr_len);
 465                 } else {
 466                         desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 467                             mbsize);
 468                 }
 469                 if (desc_num < 0) {
 470                         goto tx_failure;
 471                 }
 472                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
 473 
 474                 desc_total += desc_num;
 475                 tcb->mp = pull_mp;
 476         }
 477 
        /*
         * Before filling the tx descriptor ring with the data, we need to
         * ensure there are enough free descriptors for the transmit
         * (including one context descriptor).
         * Do not use up all the tx descriptors; otherwise tx recycling
         * will fail and cause a false hang.
         */
 485         if (tx_ring->tbd_free <= (desc_total + 1)) {
 486                 tx_ring->tx_recycle(tx_ring);
 487         }
 488 
 489         mutex_enter(&tx_ring->tx_lock);
        /*
         * If the number of free tx descriptors is not enough for the
         * transmit, then return mp.
         *
         * Note: this check must be done under mutex protection to ensure
         * correctness when multiple threads access it in parallel.
         */
 498         if (tx_ring->tbd_free <= (desc_total + 1)) {
 499                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
 500                 mutex_exit(&tx_ring->tx_lock);
 501                 goto tx_failure;
 502         }
 503 
 504         desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
 505             mbsize);
 506 
 507         ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
 508 
 509         tx_ring->stat_obytes += mbsize;
        tx_ring->stat_opackets++;
 511 
 512         mutex_exit(&tx_ring->tx_lock);
 513 
        /*
         * Now that the transmission has succeeded, free the original mp
         * if we used the pulled-up mblk for the transmission.
         */
 518         if (pull_mp) {
 519                 freemsg(mp);
 520         }
 521 
 522         return (NULL);
 523 
 524 tx_failure:
        /*
         * If the transmission fails, free the pulled-up mblk.
         */
 528         if (pull_mp) {
 529                 freemsg(pull_mp);
 530         }
 531 
        /*
         * If a new mblk has been allocated for the last header
         * fragment of an LSO packet, we should restore the
         * modified mp.
         */
 537         if (hdr_new_mp) {
 538                 hdr_new_mp->b_cont = NULL;
 539                 freeb(hdr_new_mp);
 540                 hdr_nmp->b_rptr -= hdr_frag_len;
 541                 if (hdr_pre_mp)
 542                         hdr_pre_mp->b_cont = hdr_nmp;
 543                 else
 544                         mp = hdr_nmp;
 545         }
 546         /*
 547          * Discard the mblk and free the used resources
 548          */
 549         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 550         while (tcb) {
 551                 tcb->mp = NULL;
 552 
 553                 ixgbe_free_tcb(tcb);
 554 
 555                 tcb = (tx_control_block_t *)
 556                     LIST_GET_NEXT(&pending_list, &tcb->link);
 557         }
 558 
 559         /*
 560          * Return the tx control blocks in the pending list to the free list.
 561          */
 562         ixgbe_put_free_list(tx_ring, &pending_list);
 563 
        /* Transmit failed; do not drop the mblk, reschedule the transmit */
 565         tx_ring->reschedule = B_TRUE;
 566 
 567         return (mp);
 568 }
 569 
 570 /*
 571  * ixgbe_tx_copy
 572  *
 573  * Copy the mblk fragment to the pre-allocated tx buffer
 574  */
 575 static int
 576 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 577     uint32_t len, boolean_t copy_done)
 578 {
 579         dma_buffer_t *tx_buf;
 580         uint32_t desc_num;
 581         _NOTE(ARGUNUSED(tx_ring));
 582 
 583         tx_buf = &tcb->tx_buf;
 584 
        /*
         * Copy the packet data of the mblk fragment into the
         * pre-allocated tx buffer, which is maintained by the
         * tx control block.
         *
         * Several mblk fragments can be copied into one tx buffer.
         * The destination address of the currently copied fragment in
         * the tx buffer immediately follows the end of the previously
         * copied fragment.
         */
 595         if (len > 0) {
 596                 bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
 597 
 598                 tx_buf->len += len;
 599                 tcb->frag_num++;
 600         }
 601 
 602         desc_num = 0;
 603 
 604         /*
 605          * If it is the last fragment copied to the current tx buffer,
 606          * in other words, if there's no remaining fragment or the remaining
 607          * fragment requires a new tx control block to process, we need to
 608          * complete the current copy processing by syncing up the current
 609          * DMA buffer and saving the descriptor data.
 610          */
 611         if (copy_done) {
 612                 /*
 613                  * Sync the DMA buffer of the packet data
 614                  */
 615                 DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
 616 
 617                 tcb->tx_type = USE_COPY;
 618 
 619                 /*
 620                  * Save the address and length to the private data structure
 621                  * of the tx control block, which will be used to fill the
 622                  * tx descriptor ring after all the fragments are processed.
 623                  */
 624                 ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
 625                 desc_num++;
 626         }
 627 
 628         return (desc_num);
 629 }
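
/*
 * A minimal usage sketch of the coalescing behavior (illustrative
 * lengths only): three small fragments are appended into one tx
 * buffer and consume a single tx descriptor, because only the last
 * call passes copy_done == B_TRUE:
 *
 *	(void) ixgbe_tx_copy(tx_ring, tcb, mp1, 60, B_FALSE);
 *	(void) ixgbe_tx_copy(tx_ring, tcb, mp2, 40, B_FALSE);
 *	desc_num = ixgbe_tx_copy(tx_ring, tcb, mp3, 30, B_TRUE);
 *	ASSERT(desc_num == 1 && tcb->tx_buf.len == 130);
 */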
 630 
 631 /*
 632  * ixgbe_tx_bind
 633  *
 634  * Bind the mblk fragment with DMA
 635  */
 636 static int
 637 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 638     uint32_t len)
 639 {
 640         int status, i;
 641         ddi_dma_cookie_t dma_cookie;
 642         uint_t ncookies;
 643         int desc_num;
 644 
 645         /*
 646          * Use DMA binding to process the mblk fragment
 647          */
 648         status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
 649             (caddr_t)mp->b_rptr, len,
 650             DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
 651             0, &dma_cookie, &ncookies);
 652 
 653         if (status != DDI_DMA_MAPPED) {
 654                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
 655                 return (-1);
 656         }
 657 
 658         tcb->frag_num++;
 659         tcb->tx_type = USE_DMA;
        /*
         * Each fragment can span several cookies. Each cookie is
         * transmitted with one tx descriptor.
         */
 664         desc_num = 0;
 665         for (i = ncookies; i > 0; i--) {
 666                 /*
 667                  * Save the address and length to the private data structure
 668                  * of the tx control block, which will be used to fill the
 669                  * tx descriptor ring after all the fragments are processed.
 670                  */
 671                 ixgbe_save_desc(tcb,
 672                     dma_cookie.dmac_laddress,
 673                     dma_cookie.dmac_size);
 674 
 675                 desc_num++;
 676 
 677                 if (i > 1)
 678                         ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
 679         }
 680 
 681         return (desc_num);
 682 }
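
/*
 * A minimal sketch of how a bind maps to descriptors (illustration
 * only): a fragment that ddi_dma_addr_bind_handle() splits into N
 * cookies yields N address/length pairs saved in the tcb, and hence
 * N tx descriptors. For a freshly acquired tcb:
 *
 *	desc_num = ixgbe_tx_bind(tx_ring, tcb, nmp, MBLKL(nmp));
 *	if (desc_num > 0)
 *		ASSERT(desc_num == tcb->desc_num);
 *
 * The DMA handle is expected to be unbound later when the tx control
 * block is released (presumably by ixgbe_free_tcb()).
 */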
 683 
 684 /*
 685  * ixgbe_get_context
 686  *
 687  * Get the context information from the mblk
 688  */
 689 static int
 690 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
 691 {
 692         uint32_t start;
 693         uint32_t hckflags;
 694         uint32_t lsoflags;
 695         uint32_t mss;
 696         uint32_t len;
 697         uint32_t size;
 698         uint32_t offset;
 699         unsigned char *pos;
 700         ushort_t etype;
 701         uint32_t mac_hdr_len;
 702         uint32_t l4_proto;
 703         uint32_t l4_hdr_len;
 704 
 705         ASSERT(mp != NULL);
 706 
 707         mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
 708         bzero(ctx, sizeof (ixgbe_tx_context_t));
 709 
 710         if (hckflags == 0) {
 711                 return (0);
 712         }
 713 
 714         ctx->hcksum_flags = hckflags;
 715 
 716         mac_lso_get(mp, &mss, &lsoflags);
 717         ctx->mss = mss;
 718         ctx->lso_flag = (lsoflags == HW_LSO);
 719 
        /*
         * LSO relies on tx h/w checksum, so drop the packet here
         * if the h/w checksum flags are not declared.
         */
 724         if (ctx->lso_flag) {
 725                 if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
 726                     (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
 727                         IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
 728                             "checksum flags are not specified when doing LSO");
 729                         return (-1);
 730                 }
 731         }
 732 
 733         etype = 0;
 734         mac_hdr_len = 0;
 735         l4_proto = 0;
 736 
        /*
         * First get the position of the ether_type/ether_tpid.
         * Here we don't assume the ether (VLAN) header is fully included
         * in one mblk fragment, so we go through the fragments to parse
         * the ether type.
         */
 743         size = len = MBLKL(mp);
 744         offset = offsetof(struct ether_header, ether_type);
 745         while (size <= offset) {
 746                 mp = mp->b_cont;
 747                 ASSERT(mp != NULL);
 748                 len = MBLKL(mp);
 749                 size += len;
 750         }
 751         pos = mp->b_rptr + offset + len - size;
 752 
 753         etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 754         if (etype == ETHERTYPE_VLAN) {
 755                 /*
 756                  * Get the position of the ether_type in VLAN header
 757                  */
 758                 offset = offsetof(struct ether_vlan_header, ether_type);
 759                 while (size <= offset) {
 760                         mp = mp->b_cont;
 761                         ASSERT(mp != NULL);
 762                         len = MBLKL(mp);
 763                         size += len;
 764                 }
 765                 pos = mp->b_rptr + offset + len - size;
 766 
 767                 etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 768                 mac_hdr_len = sizeof (struct ether_vlan_header);
 769         } else {
 770                 mac_hdr_len = sizeof (struct ether_header);
 771         }
 772 
 773         /*
 774          * Here we don't assume the IP(V6) header is fully included in
 775          * one mblk fragment.
 776          */
 777         switch (etype) {
 778         case ETHERTYPE_IP:
 779                 if (ctx->lso_flag) {
 780                         offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
 781                         while (size <= offset) {
 782                                 mp = mp->b_cont;
 783                                 ASSERT(mp != NULL);
 784                                 len = MBLKL(mp);
 785                                 size += len;
 786                         }
 787                         pos = mp->b_rptr + offset + len - size;
 788                         *((uint16_t *)(uintptr_t)(pos)) = 0;
 789 
 790                         offset = offsetof(ipha_t, ipha_hdr_checksum) +
 791                             mac_hdr_len;
 792                         while (size <= offset) {
 793                                 mp = mp->b_cont;
 794                                 ASSERT(mp != NULL);
 795                                 len = MBLKL(mp);
 796                                 size += len;
 797                         }
 798                         pos = mp->b_rptr + offset + len - size;
 799                         *((uint16_t *)(uintptr_t)(pos)) = 0;
 800 
                        /*
                         * To perform ixgbe LSO, the TCP checksum field of
                         * the packet also needs to be filled with the
                         * pseudo-header checksum over:
                         * (ip_source_addr, ip_destination_addr, l4_proto)
                         * The tcp/ip stack has already done this.
                         */
 808                 }
 809 
 810                 offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
 811                 while (size <= offset) {
 812                         mp = mp->b_cont;
 813                         ASSERT(mp != NULL);
 814                         len = MBLKL(mp);
 815                         size += len;
 816                 }
 817                 pos = mp->b_rptr + offset + len - size;
 818 
 819                 l4_proto = *(uint8_t *)pos;
 820                 break;
 821         case ETHERTYPE_IPV6:
 822                 offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
 823                 while (size <= offset) {
 824                         mp = mp->b_cont;
 825                         ASSERT(mp != NULL);
 826                         len = MBLKL(mp);
 827                         size += len;
 828                 }
 829                 pos = mp->b_rptr + offset + len - size;
 830 
 831                 l4_proto = *(uint8_t *)pos;
 832                 break;
 833         default:
 834                 /* Unrecoverable error */
 835                 IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
 836                 return (-2);
 837         }
 838 
 839         if (ctx->lso_flag) {
 840                 offset = mac_hdr_len + start;
 841                 while (size <= offset) {
 842                         mp = mp->b_cont;
 843                         ASSERT(mp != NULL);
 844                         len = MBLKL(mp);
 845                         size += len;
 846                 }
 847                 pos = mp->b_rptr + offset + len - size;
 848 
 849                 l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
 850         } else {
 851                 /*
 852                  * l4 header length is only required for LSO
 853                  */
 854                 l4_hdr_len = 0;
 855         }
 856 
 857         ctx->mac_hdr_len = mac_hdr_len;
 858         ctx->ip_hdr_len = start;
 859         ctx->l4_proto = l4_proto;
 860         ctx->l4_hdr_len = l4_hdr_len;
 861 
 862         return (0);
 863 }
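
/*
 * The parsing loops above all follow the same pattern: walk the b_cont
 * chain until the running total of fragment lengths covers the wanted
 * offset, then address the byte inside the fragment that contains it.
 * A standalone sketch of that pattern (hypothetical helper, not used
 * by the driver):
 *
 *	static uint8_t *
 *	tx_hdr_byte(mblk_t *mp, uint32_t offset)
 *	{
 *		uint32_t size = MBLKL(mp);
 *		uint32_t len = size;
 *
 *		while (size <= offset) {
 *			mp = mp->b_cont;
 *			ASSERT(mp != NULL);
 *			len = MBLKL(mp);
 *			size += len;
 *		}
 *		return ((uint8_t *)(mp->b_rptr + offset + len - size));
 *	}
 */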
 864 
 865 /*
 866  * ixgbe_check_context
 867  *
 868  * Check if a new context descriptor is needed
 869  */
 870 static boolean_t
 871 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
 872 {
 873         ixgbe_tx_context_t *last;
 874 
 875         if (ctx == NULL)
 876                 return (B_FALSE);
 877 
        /*
         * Compare the context data retrieved from the mblk with the
         * stored data of the last context descriptor. The fields to be
         * checked are:
         *      hcksum_flags
         *      l4_proto
         *      mac_hdr_len
         *      ip_hdr_len
         *      lso_flag
         *      mss (only checked for LSO)
         *      l4_hdr_len (only checked for LSO)
         * If any of these fields has changed, a new context descriptor
         * is needed.
         */
 892         last = &tx_ring->tx_context;
 893 
 894         if ((ctx->hcksum_flags != last->hcksum_flags) ||
 895             (ctx->l4_proto != last->l4_proto) ||
 896             (ctx->mac_hdr_len != last->mac_hdr_len) ||
 897             (ctx->ip_hdr_len != last->ip_hdr_len) ||
 898             (ctx->lso_flag != last->lso_flag) ||
 899             (ctx->lso_flag && ((ctx->mss != last->mss) ||
 900             (ctx->l4_hdr_len != last->l4_hdr_len)))) {
 901                 return (B_TRUE);
 902         }
 903 
 904         return (B_FALSE);
 905 }
 906 
 907 /*
 908  * ixgbe_fill_context
 909  *
 * Fill the context descriptor with hardware checksum information
 911  */
 912 static void
 913 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
 914     ixgbe_tx_context_t *ctx)
 915 {
 916         /*
 917          * Fill the context descriptor with the checksum
 918          * context information we've got.
 919          */
 920         ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
 921         ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
 922             IXGBE_ADVTXD_MACLEN_SHIFT;
 923 
 924         ctx_tbd->type_tucmd_mlhl =
 925             IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
 926 
 927         if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
 928                 ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
 929 
 930         if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
 931                 switch (ctx->l4_proto) {
 932                 case IPPROTO_TCP:
 933                         ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
 934                         break;
 935                 case IPPROTO_UDP:
 936                         /*
 937                          * We don't have to explicitly set:
 938                          *      ctx_tbd->type_tucmd_mlhl |=
 939                          *          IXGBE_ADVTXD_TUCMD_L4T_UDP;
 940                          * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
 941                          */
 942                         break;
 943                 default:
 944                         /* Unrecoverable error */
 945                         IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
 946                         break;
 947                 }
 948         }
 949 
 950         ctx_tbd->seqnum_seed = 0;
 951 
 952         if (ctx->lso_flag) {
 953                 ctx_tbd->mss_l4len_idx =
 954                     (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
 955                     (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
 956         } else {
 957                 ctx_tbd->mss_l4len_idx = 0;
 958         }
 959 }
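
/*
 * Worked example of the vlan_macip_lens packing above (illustrative
 * values, assuming an untagged IPv4/TCP frame): with mac_hdr_len = 14
 * and ip_hdr_len = 20 the field becomes
 * 20 | (14 << IXGBE_ADVTXD_MACLEN_SHIFT), i.e. the IP header length in
 * the low bits and the MAC header length shifted into the MACLEN field.
 */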
 960 
 961 /*
 962  * ixgbe_tx_fill_ring
 963  *
 964  * Fill the tx descriptor ring with the data
 965  */
 966 static int
 967 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
 968     ixgbe_tx_context_t *ctx, size_t mbsize)
 969 {
 970         struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
 971         boolean_t load_context;
 972         uint32_t index, tcb_index, desc_num;
 973         union ixgbe_adv_tx_desc *tbd, *first_tbd;
 974         tx_control_block_t *tcb, *first_tcb;
 975         uint32_t hcksum_flags;
 976         int i;
 977 
 978         ASSERT(mutex_owned(&tx_ring->tx_lock));
 979 
 980         tbd = NULL;
 981         first_tbd = NULL;
 982         first_tcb = NULL;
 983         desc_num = 0;
 984         hcksum_flags = 0;
 985         load_context = B_FALSE;
 986 
        /*
         * Get the index of the first tx descriptor that will be filled,
         * and the index of the first work list item that the first used
         * tx control block in the pending list will be attached to.
         * Note: the two indexes are the same.
         */
 993         index = tx_ring->tbd_tail;
 994         tcb_index = tx_ring->tbd_tail;
 995 
 996         if (ctx != NULL) {
 997                 hcksum_flags = ctx->hcksum_flags;
 998 
 999                 /*
1000                  * Check if a new context descriptor is needed for this packet
1001                  */
1002                 load_context = ixgbe_check_context(tx_ring, ctx);
1003 
1004                 if (load_context) {
1005                         tbd = &tx_ring->tbd_ring[index];
1006 
                        /*
                         * Fill the context descriptor with the
                         * hardware checksum offload information.
                         */
1011                         ixgbe_fill_context(
1012                             (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
1013 
1014                         index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1015                         desc_num++;
1016 
1017                         /*
1018                          * Store the checksum context data if
1019                          * a new context descriptor is added
1020                          */
1021                         tx_ring->tx_context = *ctx;
1022                 }
1023         }
1024 
1025         first_tbd = &tx_ring->tbd_ring[index];
1026 
1027         /*
1028          * Fill tx data descriptors with the data saved in the pending list.
1029          * The tx control blocks in the pending list are added to the work list
1030          * at the same time.
1031          *
1032          * The work list is strictly 1:1 corresponding to the descriptor ring.
1033          * One item of the work list corresponds to one tx descriptor. Because
1034          * one tx control block can span multiple tx descriptors, the tx
1035          * control block will be added to the first work list item that
1036          * corresponds to the first tx descriptor generated from that tx
1037          * control block.
1038          */
1039         tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1040         first_tcb = tcb;
1041         while (tcb != NULL) {
1042 
1043                 for (i = 0; i < tcb->desc_num; i++) {
1044                         tbd = &tx_ring->tbd_ring[index];
1045 
1046                         tbd->read.buffer_addr = tcb->desc[i].address;
1047                         tbd->read.cmd_type_len = tcb->desc[i].length;
1048 
1049                         tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
1050                             | IXGBE_ADVTXD_DTYP_DATA;
1051 
1052                         tbd->read.olinfo_status = 0;
1053 
1054                         index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1055                         desc_num++;
1056                 }
1057 
1058                 /*
1059                  * Add the tx control block to the work list
1060                  */
1061                 ASSERT(tx_ring->work_list[tcb_index] == NULL);
1062                 tx_ring->work_list[tcb_index] = tcb;
1063 
1064                 tcb_index = index;
1065                 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1066         }
1067 
1068         if (load_context) {
1069                 /*
1070                  * Count the context descriptor for
1071                  * the first tx control block.
1072                  */
1073                 first_tcb->desc_num++;
1074         }
1075         first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
1076 
        /*
         * The Insert Ethernet CRC (IFCS) bit and the checksum fields are
         * only valid in the first descriptor of the packet.
         * Set paylen in the first_tbd for all cases:
         * 82599 requires the packet length in the paylen field with or
         * without LSO, and 82598 ignores it in non-LSO mode.
         */
1084         ASSERT(first_tbd != NULL);
1085         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
1086 
1087         switch (hw->mac.type) {
1088         case ixgbe_mac_82598EB:
1089                 if (ctx != NULL && ctx->lso_flag) {
1090                         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1091                         first_tbd->read.olinfo_status |=
1092                             (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1093                             - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1094                 }
1095                 break;
1096 
1097         case ixgbe_mac_82599EB:
1098                 if (ctx != NULL && ctx->lso_flag) {
1099                         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1100                         first_tbd->read.olinfo_status |=
1101                             (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1102                             - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1103                 } else {
1104                         first_tbd->read.olinfo_status |=
1105                             (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1106                 }
1107                 break;
1108 
1109         default:
1110                 break;
1111         }
1112 
1113         /* Set hardware checksum bits */
1114         if (hcksum_flags != 0) {
1115                 if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1116                         first_tbd->read.olinfo_status |=
1117                             IXGBE_ADVTXD_POPTS_IXSM;
1118                 if (hcksum_flags & HCK_PARTIALCKSUM)
1119                         first_tbd->read.olinfo_status |=
1120                             IXGBE_ADVTXD_POPTS_TXSM;
1121         }
1122 
        /*
         * The last descriptor of the packet needs the End Of Packet (EOP)
         * and Report Status (RS) bits set.
         */
1127         ASSERT(tbd != NULL);
1128         tbd->read.cmd_type_len |=
1129             IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1130 
1131         /*
1132          * Sync the DMA buffer of the tx descriptor ring
1133          */
1134         DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1135 
        /*
         * Update the number of free tx descriptors.
         * The mutual exclusion between the transmission and the recycling
         * (for the tx descriptor ring and the work list) is implemented
         * with an atomic operation on the number of free tx descriptors.
         *
         * Note: we should always decrement the counter tbd_free before
         * advancing the hardware TDT pointer, to avoid the race condition
         * where the transmit of the tx descriptors completes and the tx
         * recycling increases tbd_free before we have decremented it.
         */
1148         i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1149         ASSERT(i >= 0);
1150 
1151         tx_ring->tbd_tail = index;
1152 
1153         /*
1154          * Advance the hardware TDT pointer of the tx descriptor ring
1155          */
1156         IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1157 
1158         if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1159             DDI_FM_OK) {
1160                 ddi_fm_service_impact(tx_ring->ixgbe->dip,
1161                     DDI_SERVICE_DEGRADED);
1162                 atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1163         }
1164 
1165         return (desc_num);
1166 }
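
/*
 * Index arithmetic sketch (assuming NEXT_INDEX()/PREV_INDEX() wrap
 * modulo the ring size, as their use above implies): with a ring of
 * 1024 descriptors and tbd_tail = 1022, filling one context descriptor
 * plus three data descriptors advances the index
 * 1022 -> 1023 -> 0 -> 1 -> 2, and the TDT register is finally written
 * with 2.
 */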
1167 
1168 /*
1169  * ixgbe_save_desc
1170  *
1171  * Save the address/length pair to the private array
1172  * of the tx control block. The address/length pairs
1173  * will be filled into the tx descriptor ring later.
1174  */
1175 static void
1176 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1177 {
1178         sw_desc_t *desc;
1179 
1180         desc = &tcb->desc[tcb->desc_num];
1181         desc->address = address;
1182         desc->length = length;
1183 
1184         tcb->desc_num++;
1185 }
1186 
1187 /*
1188  * ixgbe_tx_recycle_legacy
1189  *
1190  * Recycle the tx descriptors and tx control blocks.
1191  *
1192  * The work list is traversed to check if the corresponding
1193  * tx descriptors have been transmitted. If so, the resources
1194  * bound to the tx control blocks will be freed, and those
1195  * tx control blocks will be returned to the free list.
1196  */
1197 uint32_t
1198 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1199 {
1200         uint32_t index, last_index, prev_index;
1201         int desc_num;
1202         boolean_t desc_done;
1203         tx_control_block_t *tcb;
1204         link_list_t pending_list;
1205         ixgbe_t *ixgbe = tx_ring->ixgbe;
1206 
1207         mutex_enter(&tx_ring->recycle_lock);
1208 
1209         ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1210 
1211         if (tx_ring->tbd_free == tx_ring->ring_size) {
1212                 tx_ring->recycle_fail = 0;
1213                 tx_ring->stall_watchdog = 0;
1214                 if (tx_ring->reschedule) {
1215                         tx_ring->reschedule = B_FALSE;
1216                         mac_tx_ring_update(ixgbe->mac_hdl,
1217                             tx_ring->ring_handle);
1218                 }
1219                 mutex_exit(&tx_ring->recycle_lock);
1220                 return (0);
1221         }
1222 
1223         /*
1224          * Sync the DMA buffer of the tx descriptor ring
1225          */
1226         DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1227 
1228         if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1229                 mutex_exit(&tx_ring->recycle_lock);
1230                 ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1231                 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1232                 return (0);
1233         }
1234 
1235         LINK_LIST_INIT(&pending_list);
1236         desc_num = 0;
1237         index = tx_ring->tbd_head;   /* Index of next tbd/tcb to recycle */
1238 
1239         tcb = tx_ring->work_list[index];
1240         ASSERT(tcb != NULL);
1241 
1242         while (tcb != NULL) {
                /*
                 * Get the last tx descriptor of this packet.
                 * If that descriptor is done, then we can recycle all
                 * descriptors of the packet, which usually spans several
                 * tx control blocks.
                 * For 82599, LSO descriptors cannot be recycled until the
                 * whole packet's transmission is done. That's why packet
                 * level recycling is used here. 82598 has no such limit.
                 */
1253                 last_index = tcb->last_index;
1254                 /*
1255                  * MAX_TX_RING_SIZE is used to judge whether
1256                  * the index is a valid value or not.
1257                  */
1258                 if (last_index == MAX_TX_RING_SIZE)
1259                         break;
1260 
1261                 /*
1262                  * Check if the Descriptor Done bit is set
1263                  */
1264                 desc_done = tx_ring->tbd_ring[last_index].wb.status &
1265                     IXGBE_TXD_STAT_DD;
1266                 if (desc_done) {
1267                         /*
1268                          * recycle all descriptors of the packet
1269                          */
1270                         while (tcb != NULL) {
1271                                 /*
1272                                  * Strip off the tx control block from
1273                                  * the work list, and add it to the
1274                                  * pending list.
1275                                  */
1276                                 tx_ring->work_list[index] = NULL;
1277                                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1278 
1279                                 /*
1280                                  * Count the total number of the tx
1281                                  * descriptors recycled
1282                                  */
1283                                 desc_num += tcb->desc_num;
1284 
1285                                 index = NEXT_INDEX(index, tcb->desc_num,
1286                                     tx_ring->ring_size);
1287 
1288                                 tcb = tx_ring->work_list[index];
1289 
1290                                 prev_index = PREV_INDEX(index, 1,
1291                                     tx_ring->ring_size);
1292                                 if (prev_index == last_index)
1293                                         break;
1294                         }
1295                 } else {
1296                         break;
1297                 }
1298         }
1299 
1300         /*
1301          * If no tx descriptors are recycled, no need to do more processing
1302          */
1303         if (desc_num == 0) {
1304                 tx_ring->recycle_fail++;
1305                 mutex_exit(&tx_ring->recycle_lock);
1306                 return (0);
1307         }
1308 
1309         tx_ring->recycle_fail = 0;
1310         tx_ring->stall_watchdog = 0;
1311 
1312         /*
1313          * Update the head index of the tx descriptor ring
1314          */
1315         tx_ring->tbd_head = index;
1316 
1317         /*
1318          * Update the number of the free tx descriptors with atomic operations
1319          */
1320         atomic_add_32(&tx_ring->tbd_free, desc_num);
1321 
1322         if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1323             (tx_ring->reschedule)) {
1324                 tx_ring->reschedule = B_FALSE;
1325                 mac_tx_ring_update(ixgbe->mac_hdl,
1326                     tx_ring->ring_handle);
1327         }
1328         mutex_exit(&tx_ring->recycle_lock);
1329 
1330         /*
1331          * Free the resources used by the tx control blocks
1332          * in the pending list
1333          */
1334         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1335         while (tcb != NULL) {
1336                 /*
1337                  * Release the resources occupied by the tx control block
1338                  */
1339                 ixgbe_free_tcb(tcb);
1340 
1341                 tcb = (tx_control_block_t *)
1342                     LIST_GET_NEXT(&pending_list, &tcb->link);
1343         }
1344 
1345         /*
1346          * Add the tx control blocks in the pending list to the free list.
1347          */
1348         ixgbe_put_free_list(tx_ring, &pending_list);
1349 
1350         return (desc_num);
1351 }
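
/*
 * Recycling example (illustrative numbers): a packet that used two tx
 * control blocks with desc_num 2 and 3 occupies five consecutive tx
 * descriptors, and last_index of the first tcb points at the fifth of
 * them. Only when that descriptor reports IXGBE_TXD_STAT_DD are both
 * tcbs stripped from the work list and all five descriptors counted as
 * recycled.
 */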
1352 
1353 /*
1354  * ixgbe_tx_recycle_head_wb
1355  *
1356  * Check the head write-back, and recycle all the transmitted
1357  * tx descriptors and tx control blocks.
1358  */
1359 uint32_t
1360 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1361 {
1362         uint32_t index;
1363         uint32_t head_wb;
1364         int desc_num;
1365         tx_control_block_t *tcb;
1366         link_list_t pending_list;
1367         ixgbe_t *ixgbe = tx_ring->ixgbe;
1368 
1369         mutex_enter(&tx_ring->recycle_lock);
1370 
1371         ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1372 
1373         if (tx_ring->tbd_free == tx_ring->ring_size) {
1374                 tx_ring->recycle_fail = 0;
1375                 tx_ring->stall_watchdog = 0;
1376                 if (tx_ring->reschedule) {
1377                         tx_ring->reschedule = B_FALSE;
1378                         mac_tx_ring_update(ixgbe->mac_hdl,
1379                             tx_ring->ring_handle);
1380                 }
1381                 mutex_exit(&tx_ring->recycle_lock);
1382                 return (0);
1383         }
1384 
        /*
         * Sync the DMA buffer of the tx descriptor ring.
         *
         * Note: in head write-back mode the tx descriptors are not
         * written back, but the head write-back value is stored in the
         * extra tbd at the end of the DMA area, so instead of the full
         *
         * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
         *
         * we only need to sync that head write-back value for the kernel.
         */
1395         (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1396             sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1397             sizeof (uint32_t),
1398             DDI_DMA_SYNC_FORKERNEL);
1399 
1400         if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1401                 mutex_exit(&tx_ring->recycle_lock);
1402                 ddi_fm_service_impact(ixgbe->dip,
1403                     DDI_SERVICE_DEGRADED);
1404                 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1405                 return (0);
1406         }
1407 
1408         LINK_LIST_INIT(&pending_list);
1409         desc_num = 0;
1410         index = tx_ring->tbd_head;   /* Next index to clean */
1411 
1412         /*
1413          * Get the value of head write-back
1414          */
1415         head_wb = *tx_ring->tbd_head_wb;
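        /*
         * Walk the work list from the driver's current head index toward
         * the hardware-reported head, reclaiming every tx control block
         * whose descriptors have all been consumed.
         */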
1416         while (index != head_wb) {
1417                 tcb = tx_ring->work_list[index];
1418                 ASSERT(tcb != NULL);
1419 
1420                 if (OFFSET(index, head_wb, tx_ring->ring_size) <
1421                     tcb->desc_num) {
1422                         /*
1423                          * The current tx control block has not been
1424                          * completely transmitted yet; stop recycling.
1425                          */
1426                         break;
1427                 }
1428 
1429                 /*
1430                  * Strip off the tx control block from the work list,
1431                  * and add it to the pending list.
1432                  */
1433                 tx_ring->work_list[index] = NULL;
1434                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1435 
1436                 /*
1437                  * Advance the index of the tx descriptor ring
1438                  */
1439                 index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1440 
1441                 /*
1442                  * Count the total number of the tx descriptors recycled
1443                  */
1444                 desc_num += tcb->desc_num;
1445         }
1446 
1447         /*
1448          * If no tx descriptors were recycled, there is nothing more to do.
1449          */
1450         if (desc_num == 0) {
1451                 tx_ring->recycle_fail++;
1452                 mutex_exit(&tx_ring->recycle_lock);
1453                 return (0);
1454         }
1455 
1456         tx_ring->recycle_fail = 0;
1457         tx_ring->stall_watchdog = 0;
1458 
1459         /*
1460          * Update the head index of the tx descriptor ring
1461          */
1462         tx_ring->tbd_head = index;
1463 
1464         /*
1465          * Update the number of free tx descriptors with an atomic operation
1466          */
1467         atomic_add_32(&tx_ring->tbd_free, desc_num);
1468 
1469         if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1470             (tx_ring->reschedule)) {
1471                 tx_ring->reschedule = B_FALSE;
1472                 mac_tx_ring_update(ixgbe->mac_hdl,
1473                     tx_ring->ring_handle);
1474         }
1475         mutex_exit(&tx_ring->recycle_lock);
1476 
1477         /*
1478          * Free the resources used by the tx control blocks
1479          * in the pending list
1480          */
1481         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1482         while (tcb != NULL) {
1483                 /*
1484                  * Release the resources occupied by the tx control block
1485                  */
1486                 ixgbe_free_tcb(tcb);
1487 
1488                 tcb = (tx_control_block_t *)
1489                     LIST_GET_NEXT(&pending_list, &tcb->link);
1490         }
1491 
1492         /*
1493          * Add the tx control blocks in the pending list to the free list.
1494          */
1495         ixgbe_put_free_list(tx_ring, &pending_list);
1496 
1497         return (desc_num);
1498 }
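/*
 * For reference, the ring-index arithmetic used by the recycle routines is
 * assumed to work roughly as sketched below (illustrative pseudo-code only;
 * the real OFFSET() and NEXT_INDEX() macros are presumably defined in
 * ixgbe_sw.h and may differ in detail):
 *
 *	ring_offset(start, end, size):
 *		return (end >= start) ? (end - start) : (size - start + end);
 *
 *	ring_next(index, step, size):
 *		return (index + step) % size;
 *
 * With these semantics, the OFFSET(index, head_wb, tx_ring->ring_size) <
 * tcb->desc_num test above detects a tx control block whose descriptors have
 * not yet all been consumed by the hardware, so recycling stops there.
 */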
1499 
1500 /*
1501  * ixgbe_free_tcb - free up the tx control block
1502  *
1503  * Free the resources held by the tx control block: unbind the
1504  * previously bound DMA handle (if any), free the attached mblk,
1505  * and reset the other control fields.
1506  */
1507 void
1508 ixgbe_free_tcb(tx_control_block_t *tcb)
1509 {
1510         switch (tcb->tx_type) {
1511         case USE_COPY:
1512                 /*
1513                  * Reset the buffer length that is used for copy
1514                  */
1515                 tcb->tx_buf.len = 0;
1516                 break;
1517         case USE_DMA:
1518                 /*
1519                  * Release the DMA resource that is used for
1520                  * DMA binding.
1521                  */
1522                 (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1523                 break;
1524         default:
1525                 break;
1526         }
1527 
1528         /*
1529          * Free the mblk
1530          */
1531         if (tcb->mp != NULL) {
1532                 freemsg(tcb->mp);
1533                 tcb->mp = NULL;
1534         }
1535 
1536         tcb->tx_type = USE_NONE;
1537         tcb->last_index = MAX_TX_RING_SIZE;
1538         tcb->frag_num = 0;
1539         tcb->desc_num = 0;
1540 }
1541 
1542 /*
1543  * ixgbe_get_free_list - Get a free tx control block from the free list
1544  *
1545  * The atomic operation on the number of available tx control blocks
1546  * in the free list keeps this routine mutually exclusive with
1547  * ixgbe_put_free_list() (see the sketch following this routine).
1548  */
1549 static tx_control_block_t *
1550 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
1551 {
1552         tx_control_block_t *tcb;
1553 
1554         /*
1555          * Check and update the number of free tx control blocks
1556          * in the free list.
1557          */
1558         if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
1559                 return (NULL);
1560 
1561         mutex_enter(&tx_ring->tcb_head_lock);
1562 
1563         tcb = tx_ring->free_list[tx_ring->tcb_head];
1564         ASSERT(tcb != NULL);
1565         tx_ring->free_list[tx_ring->tcb_head] = NULL;
1566         tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1567             tx_ring->free_list_size);
1568 
1569         mutex_exit(&tx_ring->tcb_head_lock);
1570 
1571         return (tcb);
1572 }
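/*
 * A minimal sketch of the reservation pattern that ixgbe_atomic_reserve()
 * is assumed to implement (illustrative only; the real helper lives
 * elsewhere in the driver and may differ in detail).  It atomically takes
 * 'n' items from '*count' and fails, returning -1, rather than letting the
 * counter go negative:
 *
 *	do {
 *		oldval = *count;
 *		if (oldval < n)
 *			return (-1);
 *		newval = oldval - n;
 *	} while (atomic_cas_32(count, oldval, newval) != oldval);
 *	return (newval);
 *
 * A successful reservation guarantees that at least one populated entry is
 * waiting at tcb_head, because ixgbe_put_free_list() fills the free_list
 * slots before it raises tcb_free.
 */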
1573 
1574 /*
1575  * ixgbe_put_free_list
1576  *
1577  * Put a list of used tx control blocks back onto the free list.
1578  *
1579  * A mutex is used here to serialize concurrent callers; the mutual
1580  * exclusion between ixgbe_get_free_list() and ixgbe_put_free_list()
1581  * is implemented with the atomic operation on the counter tcb_free.
1582  */
1583 void
1584 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1585 {
1586         uint32_t index;
1587         int tcb_num;
1588         tx_control_block_t *tcb;
1589 
1590         mutex_enter(&tx_ring->tcb_tail_lock);
1591 
1592         index = tx_ring->tcb_tail;
1593 
1594         tcb_num = 0;
1595         tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1596         while (tcb != NULL) {
1597                 ASSERT(tx_ring->free_list[index] == NULL);
1598                 tx_ring->free_list[index] = tcb;
1599 
1600                 tcb_num++;
1601 
1602                 index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1603 
1604                 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1605         }
1606 
1607         tx_ring->tcb_tail = index;
1608 
1609         /*
1610          * Update the number of free tx control blocks
1611          * in the free list. This operation must be performed
1612          * under the protection of the lock.
1613          */
1614         atomic_add_32(&tx_ring->tcb_free, tcb_num);
1615 
1616         mutex_exit(&tx_ring->tcb_tail_lock);
1617 }
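/*
 * Taken together, the free list behaves as a circular buffer: tcb_head_lock
 * serializes the consumers (the transmit path obtaining tx control blocks),
 * tcb_tail_lock serializes the producers (the recycle path returning them),
 * and the atomic tcb_free counter keeps the two ends from overrunning each
 * other.  A typical round trip, sketched in pseudo-code under those
 * assumptions, looks like this:
 *
 *	transmit path (ixgbe_ring_tx):
 *		tcb = ixgbe_get_free_list(tx_ring);
 *		... copy or DMA-bind the mblk, fill tx descriptors ...
 *		tx_ring->work_list[index] = tcb;
 *
 *	recycle path (ixgbe_tx_recycle_head_wb() and its legacy counterpart):
 *		... move finished tcbs from work_list onto a pending_list ...
 *		ixgbe_free_tcb(tcb);
 *		ixgbe_put_free_list(tx_ring, &pending_list);
 */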