1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  28  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  29  */
  30 
  31 #include "ixgbe_sw.h"
  32 
  33 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  34     uint32_t, boolean_t);
  35 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  36     uint32_t);
  37 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
  38     ixgbe_tx_context_t *, size_t);
  39 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
  40 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);
  41 
  42 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
  43 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
  44     ixgbe_tx_context_t *);
  45 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
  46     ixgbe_tx_context_t *);
  47 
  48 #ifndef IXGBE_DEBUG
  49 #pragma inline(ixgbe_save_desc)
  50 #pragma inline(ixgbe_get_context)
  51 #pragma inline(ixgbe_check_context)
  52 #pragma inline(ixgbe_fill_context)
  53 #endif
  54 
  55 /*
  56  * ixgbe_ring_tx
  57  *
  58  * Transmit one mblk through the specified tx ring.
  59  *
  60  * An mblk can consist of several fragments, and each fragment
  61  * is processed differently based on its size.
  62  * Fragments whose size is less than the bcopy threshold are
  63  * processed with bcopy; otherwise, they are processed with
  64  * DMA binding.
  65  *
  66  * To process the mblk, a tx control block is taken from the
  67  * free list. One tx control block contains one tx buffer, which
  68  * is used to copy the data of mblk fragments, and one tx DMA
  69  * handle, which is used to bind an mblk fragment to DMA resources.
  70  *
  71  * Several small mblk fragments can be copied into one tx control
  72  * block's buffer, and the buffer is then transmitted with one
  73  * tx descriptor.
  74  *
  75  * A large fragment is bound with one tx control block's DMA
  76  * handle and may span several tx descriptors.
  77  *
  78  * So transmitting one packet (mblk) may consume several tx
  79  * control blocks. After the processing, those tx control blocks
  80  * are put on the work list.
  81  */
  82 mblk_t *
  83 ixgbe_ring_tx(void *arg, mblk_t *mp)
  84 {
  85         ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
  86         ixgbe_t *ixgbe = tx_ring->ixgbe;
  87         tx_type_t current_flag, next_flag;
  88         uint32_t current_len, next_len;
  89         uint32_t desc_total;
  90         size_t mbsize;
  91         int desc_num;
  92         boolean_t copy_done, eop;
  93         mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
  94         tx_control_block_t *tcb;
  95         ixgbe_tx_context_t tx_context, *ctx;
  96         link_list_t pending_list;
  97         uint32_t len, hdr_frag_len, hdr_len;
  98         uint32_t copy_thresh;
  99         mblk_t *hdr_new_mp = NULL;
 100         mblk_t *hdr_pre_mp = NULL;
 101         mblk_t *hdr_nmp = NULL;
 102 
 103         ASSERT(mp->b_next == NULL);
 104 
 105         if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
 106             (ixgbe->ixgbe_state & IXGBE_ERROR) ||
 107             (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
 108             !(ixgbe->ixgbe_state & IXGBE_STARTED)) {
 109                 return (mp);
 110         }
 111 
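             /*
              * Local copy of the bcopy threshold; it may be raised below
              * so that the LSO header is always processed with bcopy.
              */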
 112         copy_thresh = ixgbe->tx_copy_thresh;
 113 
 114         /* Get the mblk size */
 115         mbsize = 0;
 116         for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
 117                 mbsize += MBLKL(nmp);
 118         }
 119 
 120         if (ixgbe->tx_hcksum_enable) {
 121                 /*
 122                  * Retrieve checksum context information from the mblk
 123                  * that will be used to decide whether/how to fill the
 124                  * context descriptor.
 125                  */
 126                 ctx = &tx_context;
 127                 if (ixgbe_get_context(mp, ctx) < 0) {
 128                         freemsg(mp);
 129                         return (NULL);
 130                 }
 131 
 132                 /*
 133                  * If the mblk size exceeds the max size ixgbe can
 134                  * process, discard this mblk and return NULL.
 135                  */
 136                 if ((ctx->lso_flag &&
 137                     ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
 138                     (!ctx->lso_flag &&
 139                     (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
 140                         freemsg(mp);
 141                         IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
 142                         return (NULL);
 143                 }
 144         } else {
 145                 ctx = NULL;
 146         }
 147 
 148         /*
 149          * Check and recycle tx descriptors.
 150          * The recycle threshold here should be selected carefully
 151          */
 152         if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
 153                 tx_ring->tx_recycle(tx_ring);
 154         }
 155 
 156         /*
 157          * After the recycling, if tbd_free is still less than the
 158          * overload threshold, assert overload and return mp;
 159          * the transmit will have to be re-scheduled.
 160          */
 161         if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
 162                 tx_ring->reschedule = B_TRUE;
 163                 IXGBE_DEBUG_STAT(tx_ring->stat_overload);
 164                 return (mp);
 165         }
 166 
 167         /*
 168          * The pending_list is a linked list that is used to save
 169          * the tx control blocks whose packet data has been processed
 170          * but has not yet been placed on the tx descriptor ring.
 171          * It is used to reduce the contention on tx_lock.
 172          */
 173         LINK_LIST_INIT(&pending_list);
 174         desc_num = 0;
 175         desc_total = 0;
 176 
 177         /*
 178          * The software must guarantee that the LSO packet header
 179          * (MAC+IP+TCP) fits within one descriptor. Here we reallocate
 180          * and refill the header if it is not physically contiguous.
 181          */
 182         if ((ctx != NULL) && ctx->lso_flag) {
 183                 /* find the last fragment of the header */
 184                 len = MBLKL(mp);
 185                 ASSERT(len > 0);
 186                 hdr_nmp = mp;
 187                 hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
 188                 while (len < hdr_len) {
 189                         hdr_pre_mp = hdr_nmp;
 190                         hdr_nmp = hdr_nmp->b_cont;
 191                         len += MBLKL(hdr_nmp);
 192                 }
 193                 /*
 194                  * If the header and the payload are in different mblks,
 195                  * we simply force the header to be copied into the
 196                  * pre-allocated page-aligned buffer.
 197                  */
 198                 if (len == hdr_len)
 199                         goto adjust_threshold;
 200 
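                     /*
                      * hdr_frag_len is the number of header bytes that
                      * reside in the last header fragment (hdr_nmp).
                      */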
 201                 hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
 202                 /*
 203                  * There are two cases where we need to reallocate an mblk
 204                  * for the last header fragment:
 205                  * 1. the header is in multiple mblks and the last fragment
 206                  * shares the same mblk with the payload
 207                  * 2. the header is in a single mblk shared with the payload
 208                  * and the header is not physically contiguous
 209                  */
 210                 if ((hdr_nmp != mp) ||
 211                     (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
 212                     < hdr_len)) {
 213                         IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);
 214                         /*
 215                          * reallocate the mblk for the last header fragment,
 216                          * expecting to bcopy it into the pre-allocated
 217                          * page-aligned buffer
 218                          */
 219                         hdr_new_mp = allocb(hdr_frag_len, NULL);
 220                         if (!hdr_new_mp)
 221                                 return (mp);
 222                         bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
 223                             hdr_frag_len);
 224                         /* link the new header fragment with the other parts */
 225                         hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
 226                         hdr_new_mp->b_cont = hdr_nmp;
 227                         if (hdr_pre_mp)
 228                                 hdr_pre_mp->b_cont = hdr_new_mp;
 229                         else
 230                                 mp = hdr_new_mp;
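                             /*
                              * The copied header bytes are now carried by
                              * hdr_new_mp, so skip them in the original
                              * fragment.
                              */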
 231                         hdr_nmp->b_rptr += hdr_frag_len;
 232                 }
 233 adjust_threshold:
 234                 /*
 235                  * adjust the bcopy threshold to guarantee that
 236                  * the header is always processed with bcopy
 237                  */
 238                 if (copy_thresh < hdr_len)
 239                         copy_thresh = hdr_len;
 240         }
 241 
 242         current_mp = mp;
 243         current_len = MBLKL(current_mp);
 244         /*
 245          * Decide which method to use for the first fragment
 246          */
 247         current_flag = (current_len <= copy_thresh) ?
 248             USE_COPY : USE_DMA;
 249         /*
 250          * If the mblk includes several contiguous small fragments,
 251          * they may be copied into one buffer. This flag indicates
 252          * whether the copy into the current tx buffer has completed,
 253          * or whether more fragments are still to be copied into it.
 254          *
 255          * If this flag is B_TRUE, it indicates that a new tx control
 256          * block is needed to process the next fragment using either
 257          * copy or DMA binding.
 258          *
 259          * Otherwise, it indicates that the next fragment will be
 260          * copied to the current tx buffer that is maintained by the
 261          * current tx control block. No new tx control block is needed.
 262          */
 263         copy_done = B_TRUE;
 264         while (current_mp) {
 265                 next_mp = current_mp->b_cont;
 266                 eop = (next_mp == NULL); /* Last fragment of the packet? */
 267                 next_len = eop ? 0: MBLKL(next_mp);
 268 
 269                 /*
 270                  * When the current fragment is empty and the next
 271                  * fragment will still be copied to the current tx
 272                  * buffer, we cannot skip this fragment here, because
 273                  * the copy processing has not yet completed. The empty
 274                  * fragment has to be processed in the tx_copy routine.
 275                  *
 276                  * If the copy processing has completed, or a DMA
 277                  * binding has just completed, we can simply skip this
 278                  * empty fragment.
 279                  */
 280                 if ((current_len == 0) && (copy_done)) {
 281                         current_mp = next_mp;
 282                         current_len = next_len;
 283                         current_flag = (current_len <= copy_thresh) ?
 284                             USE_COPY : USE_DMA;
 285                         continue;
 286                 }
 287 
 288                 if (copy_done) {
 289                         /*
 290                          * Get a new tx control block from the free list
 291                          */
 292                         tcb = ixgbe_get_free_list(tx_ring);
 293 
 294                         if (tcb == NULL) {
 295                                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 296                                 goto tx_failure;
 297                         }
 298 
 299                         /*
 300                          * Push the tx control block to the pending list
 301                          * to avoid taking the lock too early
 302                          */
 303                         LIST_PUSH_TAIL(&pending_list, &tcb->link);
 304                 }
 305 
 306                 if (current_flag == USE_COPY) {
 307                         /*
 308                          * Check whether to use bcopy or DMA binding to process
 309                          * the next fragment, and if using bcopy, whether we
 310                          * need to continue copying the next fragment into the
 311                          * current tx buffer.
 312                          */
 313                         ASSERT((tcb->tx_buf.len + current_len) <=
 314                             tcb->tx_buf.size);
 315 
 316                         if (eop) {
 317                                 /*
 318                                  * This is the last fragment of the packet, so
 319                                  * the copy processing will be completed with
 320                                  * this fragment.
 321                                  */
 322                                 next_flag = USE_NONE;
 323                                 copy_done = B_TRUE;
 324                         } else if ((tcb->tx_buf.len + current_len + next_len) >
 325                             tcb->tx_buf.size) {
 326                                 /*
 327                                  * If the next fragment is too large to be
 328                                  * copied to the current tx buffer, we need
 329                                  * to complete the current copy processing.
 330                                  */
 331                                 next_flag = (next_len > copy_thresh) ?
 332                                     USE_DMA: USE_COPY;
 333                                 copy_done = B_TRUE;
 334                         } else if (next_len > copy_thresh) {
 335                                 /*
 336                                  * The next fragment needs to be processed with
 337                                  * DMA binding. So the copy processing will be
 338                                  * completed with the current fragment.
 339                                  */
 340                                 next_flag = USE_DMA;
 341                                 copy_done = B_TRUE;
 342                         } else {
 343                                 /*
 344                                  * Continue to copy the next fragment to the
 345                                  * current tx buffer.
 346                                  */
 347                                 next_flag = USE_COPY;
 348                                 copy_done = B_FALSE;
 349                         }
 350 
 351                         desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
 352                             current_len, copy_done);
 353                 } else {
 354                         /*
 355                          * Check whether to use bcopy or DMA binding to process
 356                          * the next fragment.
 357                          */
 358                         next_flag = (next_len > copy_thresh) ?
 359                             USE_DMA: USE_COPY;
 360                         ASSERT(copy_done == B_TRUE);
 361 
 362                         desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
 363                             current_len);
 364                 }
 365 
 366                 if (desc_num > 0)
 367                         desc_total += desc_num;
 368                 else if (desc_num < 0)
 369                         goto tx_failure;
 370 
 371                 current_mp = next_mp;
 372                 current_len = next_len;
 373                 current_flag = next_flag;
 374         }
 375 
 376         /*
 377          * Attach the mblk to the last tx control block
 378          */
 379         ASSERT(tcb);
 380         ASSERT(tcb->mp == NULL);
 381         tcb->mp = mp;
 382 
 383         /*
 384          * The 82598/82599 chipsets have a limitation that no more than
 385          * 32 tx descriptors can be transmitted out at one time.
 386          *
 387          * Here is a workaround for it: pull up the mblk and then send
 388          * it out using DMA binding. By doing so, no more than MAX_COOKIE
 389          * (18) descriptors are needed.
 390          */
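             /*
              * desc_total + 1 leaves room for a possible context
              * descriptor (see ixgbe_tx_fill_ring()).
              */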
 391         if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
 392                 IXGBE_DEBUG_STAT(tx_ring->stat_break_tbd_limit);
 393 
 394                 /*
 395                  * Discard the mblk and free the used resources
 396                  */
 397                 tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 398                 while (tcb) {
 399                         tcb->mp = NULL;
 400                         ixgbe_free_tcb(tcb);
 401                         tcb = (tx_control_block_t *)
 402                             LIST_GET_NEXT(&pending_list, &tcb->link);
 403                 }
 404 
 405                 /*
 406                  * Return the tx control blocks in the pending list to
 407                  * the free list.
 408                  */
 409                 ixgbe_put_free_list(tx_ring, &pending_list);
 410 
 411                 /*
 412                  * pull up the mblk and send it out using DMA binding
 413                  */
 414                 if ((pull_mp = msgpullup(mp, -1)) == NULL) {
 415                         tx_ring->reschedule = B_TRUE;
 416 
 417                         /*
 418                          * If a new mblk has been allocated for the last
 419                          * header fragment of an LSO packet, we should
 420                          * restore the modified mp.
 421                          */
 422                         if (hdr_new_mp) {
 423                                 hdr_new_mp->b_cont = NULL;
 424                                 freeb(hdr_new_mp);
 425                                 hdr_nmp->b_rptr -= hdr_frag_len;
 426                                 if (hdr_pre_mp)
 427                                         hdr_pre_mp->b_cont = hdr_nmp;
 428                                 else
 429                                         mp = hdr_nmp;
 430                         }
 431                         return (mp);
 432                 }
 433 
 434                 LINK_LIST_INIT(&pending_list);
 435                 desc_total = 0;
 436 
 437                 /*
 438                  * if the packet is an LSO packet, we simply transmit
 439                  * the header in one descriptor using bcopy
 440                  */
 441                 if ((ctx != NULL) && ctx->lso_flag) {
 442                         hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
 443                             ctx->l4_hdr_len;
 444 
 445                         tcb = ixgbe_get_free_list(tx_ring);
 446                         if (tcb == NULL) {
 447                                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 448                                 goto tx_failure;
 449                         }
 450                         desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
 451                             hdr_len, B_TRUE);
 452                         LIST_PUSH_TAIL(&pending_list, &tcb->link);
 453                         desc_total += desc_num;
 454 
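                             /*
                              * The header has been copied above; advance
                              * b_rptr so that only the payload is bound
                              * with DMA below.
                              */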
 455                         pull_mp->b_rptr += hdr_len;
 456                 }
 457 
 458                 tcb = ixgbe_get_free_list(tx_ring);
 459                 if (tcb == NULL) {
 460                         IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 461                         goto tx_failure;
 462                 }
 463                 if ((ctx != NULL) && ctx->lso_flag) {
 464                         desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 465                             mbsize - hdr_len);
 466                 } else {
 467                         desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 468                             mbsize);
 469                 }
 470                 if (desc_num < 0) {
 471                         goto tx_failure;
 472                 }
 473                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
 474 
 475                 desc_total += desc_num;
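                     /*
                      * Attach the pulled-up mblk to this tcb so that it
                      * is freed when the tcb is recycled.
                      */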
 476                 tcb->mp = pull_mp;
 477         }
 478 
 479         /*
 480          * Before filling the tx descriptor ring with the data, we need to
 481          * ensure there are enough free descriptors for the transmit
 482          * (including one context descriptor).
 483          * Do not use up all the tx descriptors; otherwise tx recycle
 484          * will fail and cause a false hang.
 485          */
 486         if (tx_ring->tbd_free <= (desc_total + 1)) {
 487                 tx_ring->tx_recycle(tx_ring);
 488         }
 489 
 490         mutex_enter(&tx_ring->tx_lock);
 491         /*
 492          * If the number of free tx descriptors is not enough for the
 493          * transmit, then return mp.
 494          *
 495          * Note: this check must be done under mutex protection to
 496          * ensure correctness when multiple threads access tbd_free
 497          * in parallel.
 498          */
 499         if (tx_ring->tbd_free <= (desc_total + 1)) {
 500                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
 501                 mutex_exit(&tx_ring->tx_lock);
 502                 goto tx_failure;
 503         }
 504 
 505         desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
 506             mbsize);
 507 
 508         ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
 509 
 510         tx_ring->stat_obytes += mbsize;
 511         tx_ring->stat_opackets++;
 512 
 513         mutex_exit(&tx_ring->tx_lock);
 514 
 515         /*
 516          * Now that the transmission has succeeded, free the original
 517          * mp if the pulled-up mblk was used for the transmission.
 518          */
 519         if (pull_mp) {
 520                 freemsg(mp);
 521         }
 522 
 523         return (NULL);
 524 
 525 tx_failure:
 526         /*
 527          * If the transmission fails, free the pulled-up mblk.
 528          */
 529         if (pull_mp) {
 530                 freemsg(pull_mp);
 531         }
 532 
 533         /*
 534          * If a new mblk has been allocated for the last header
 535          * fragment of an LSO packet, we should restore the
 536          * modified mp.
 537          */
 538         if (hdr_new_mp) {
 539                 hdr_new_mp->b_cont = NULL;
 540                 freeb(hdr_new_mp);
 541                 hdr_nmp->b_rptr -= hdr_frag_len;
 542                 if (hdr_pre_mp)
 543                         hdr_pre_mp->b_cont = hdr_nmp;
 544                 else
 545                         mp = hdr_nmp;
 546         }
 547         /*
 548          * Discard the mblk and free the used resources
 549          */
 550         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 551         while (tcb) {
 552                 tcb->mp = NULL;
 553 
 554                 ixgbe_free_tcb(tcb);
 555 
 556                 tcb = (tx_control_block_t *)
 557                     LIST_GET_NEXT(&pending_list, &tcb->link);
 558         }
 559 
 560         /*
 561          * Return the tx control blocks in the pending list to the free list.
 562          */
 563         ixgbe_put_free_list(tx_ring, &pending_list);
 564 
 565         /* Transmit failed; do not drop the mblk, reschedule the transmit */
 566         tx_ring->reschedule = B_TRUE;
 567 
 568         return (mp);
 569 }
 570 
 571 /*
 572  * ixgbe_tx_copy
 573  *
 574  * Copy the mblk fragment to the pre-allocated tx buffer
 575  */
 576 static int
 577 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 578     uint32_t len, boolean_t copy_done)
 579 {
 580         dma_buffer_t *tx_buf;
 581         uint32_t desc_num;
 582         _NOTE(ARGUNUSED(tx_ring));
 583 
 584         tx_buf = &tcb->tx_buf;
 585 
 586         /*
 587          * Copy the packet data of the mblk fragment into the
 588          * pre-allocated tx buffer, which is maintained by the
 589          * tx control block.
 590          *
 591          * Several mblk fragments can be copied into one tx buffer.
 592          * The destination address of the currently copied fragment
 593          * in the tx buffer immediately follows the end of the
 594          * previously copied fragment.
 595          */
 596         if (len > 0) {
 597                 bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
 598 
 599                 tx_buf->len += len;
 600                 tcb->frag_num++;
 601         }
 602 
 603         desc_num = 0;
 604 
 605         /*
 606          * If it is the last fragment copied to the current tx buffer,
 607          * in other words, if there's no remaining fragment or the remaining
 608          * fragment requires a new tx control block to process, we need to
 609          * complete the current copy processing by syncing up the current
 610          * DMA buffer and saving the descriptor data.
 611          */
 612         if (copy_done) {
 613                 /*
 614                  * Sync the DMA buffer of the packet data
 615                  */
 616                 DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
 617 
 618                 tcb->tx_type = USE_COPY;
 619 
 620                 /*
 621                  * Save the address and length to the private data structure
 622                  * of the tx control block, which will be used to fill the
 623                  * tx descriptor ring after all the fragments are processed.
 624                  */
 625                 ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
 626                 desc_num++;
 627         }
 628 
 629         return (desc_num);
 630 }
 631 
 632 /*
 633  * ixgbe_tx_bind
 634  *
 635  * Bind the mblk fragment with DMA
 636  */
 637 static int
 638 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 639     uint32_t len)
 640 {
 641         int status, i;
 642         ddi_dma_cookie_t dma_cookie;
 643         uint_t ncookies;
 644         int desc_num;
 645 
 646         /*
 647          * Use DMA binding to process the mblk fragment
 648          */
 649         status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
 650             (caddr_t)mp->b_rptr, len,
 651             DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
 652             0, &dma_cookie, &ncookies);
 653 
 654         if (status != DDI_DMA_MAPPED) {
 655                 IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
 656                 return (-1);
 657         }
 658 
 659         tcb->frag_num++;
 660         tcb->tx_type = USE_DMA;
 661         /*
 662          * Each fragment can span several cookies. Each cookie is
 663          * transmitted with one tx descriptor.
 664          */
 665         desc_num = 0;
 666         for (i = ncookies; i > 0; i--) {
 667                 /*
 668                  * Save the address and length to the private data structure
 669                  * of the tx control block, which will be used to fill the
 670                  * tx descriptor ring after all the fragments are processed.
 671                  */
 672                 ixgbe_save_desc(tcb,
 673                     dma_cookie.dmac_laddress,
 674                     dma_cookie.dmac_size);
 675 
 676                 desc_num++;
 677 
 678                 if (i > 1)
 679                         ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
 680         }
 681 
 682         return (desc_num);
 683 }
 684 
 685 /*
 686  * ixgbe_get_context
 687  *
 688  * Get the context information from the mblk
 689  */
 690 static int
 691 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
 692 {
 693         uint32_t start;
 694         uint32_t hckflags;
 695         uint32_t lsoflags;
 696         uint32_t mss;
 697         uint32_t len;
 698         uint32_t size;
 699         uint32_t offset;
 700         unsigned char *pos;
 701         ushort_t etype;
 702         uint32_t mac_hdr_len;
 703         uint32_t l4_proto;
 704         uint32_t l4_hdr_len;
 705 
 706         ASSERT(mp != NULL);
 707 
 708         mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
 709         bzero(ctx, sizeof (ixgbe_tx_context_t));
 710 
 711         if (hckflags == 0) {
 712                 return (0);
 713         }
 714 
 715         ctx->hcksum_flags = hckflags;
 716 
 717         mac_lso_get(mp, &mss, &lsoflags);
 718         ctx->mss = mss;
 719         ctx->lso_flag = (lsoflags == HW_LSO);
 720 
 721         /*
 722          * LSO relies on tx h/w checksum, so the packet is dropped here
 723          * if the h/w checksum flags are not set.
 724          */
 725         if (ctx->lso_flag) {
 726                 if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
 727                     (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
 728                         IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
 729                             "checksum flags are not specified when doing LSO");
 730                         return (-1);
 731                 }
 732         }
 733 
 734         etype = 0;
 735         mac_hdr_len = 0;
 736         l4_proto = 0;
 737 
 738         /*
 739          * First get the position of the ether_type/ether_tpid.
 740          * Here we don't assume the ether (VLAN) header is fully included
 741          * in one mblk fragment, so we go through the fragments to parse
 742          * the ether type.
 743          */
 744         size = len = MBLKL(mp);
 745         offset = offsetof(struct ether_header, ether_type);
 746         while (size <= offset) {
 747                 mp = mp->b_cont;
 748                 ASSERT(mp != NULL);
 749                 len = MBLKL(mp);
 750                 size += len;
 751         }
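             /*
              * (size - len) bytes of the packet precede this mblk, so the
              * field at 'offset' is located (offset - (size - len)) bytes
              * past b_rptr, i.e. (offset + len - size).
              */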
 752         pos = mp->b_rptr + offset + len - size;
 753 
 754         etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 755         if (etype == ETHERTYPE_VLAN) {
 756                 /*
 757                  * Get the position of the ether_type in VLAN header
 758                  */
 759                 offset = offsetof(struct ether_vlan_header, ether_type);
 760                 while (size <= offset) {
 761                         mp = mp->b_cont;
 762                         ASSERT(mp != NULL);
 763                         len = MBLKL(mp);
 764                         size += len;
 765                 }
 766                 pos = mp->b_rptr + offset + len - size;
 767 
 768                 etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 769                 mac_hdr_len = sizeof (struct ether_vlan_header);
 770         } else {
 771                 mac_hdr_len = sizeof (struct ether_header);
 772         }
 773 
 774         /*
 775          * Here we don't assume the IP(V6) header is fully included in
 776          * one mblk fragment.
 777          */
 778         switch (etype) {
 779         case ETHERTYPE_IP:
 780                 if (ctx->lso_flag) {
 781                         offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
 782                         while (size <= offset) {
 783                                 mp = mp->b_cont;
 784                                 ASSERT(mp != NULL);
 785                                 len = MBLKL(mp);
 786                                 size += len;
 787                         }
 788                         pos = mp->b_rptr + offset + len - size;
 789                         *((uint16_t *)(uintptr_t)(pos)) = 0;
 790 
 791                         offset = offsetof(ipha_t, ipha_hdr_checksum) +
 792                             mac_hdr_len;
 793                         while (size <= offset) {
 794                                 mp = mp->b_cont;
 795                                 ASSERT(mp != NULL);
 796                                 len = MBLKL(mp);
 797                                 size += len;
 798                         }
 799                         pos = mp->b_rptr + offset + len - size;
 800                         *((uint16_t *)(uintptr_t)(pos)) = 0;
 801 
 802                         /*
 803                          * To perform ixgbe LSO, the tcp checksum field
 804                          * of the packet also needs to be filled with the
 805                          * following pseudo-header checksum:
 806                          * (ip_source_addr, ip_destination_addr, l4_proto)
 807                          * Currently the tcp/ip stack has already done it.
 808                          */
 809                 }
 810 
 811                 offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
 812                 while (size <= offset) {
 813                         mp = mp->b_cont;
 814                         ASSERT(mp != NULL);
 815                         len = MBLKL(mp);
 816                         size += len;
 817                 }
 818                 pos = mp->b_rptr + offset + len - size;
 819 
 820                 l4_proto = *(uint8_t *)pos;
 821                 break;
 822         case ETHERTYPE_IPV6:
 823                 offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
 824                 while (size <= offset) {
 825                         mp = mp->b_cont;
 826                         ASSERT(mp != NULL);
 827                         len = MBLKL(mp);
 828                         size += len;
 829                 }
 830                 pos = mp->b_rptr + offset + len - size;
 831 
 832                 l4_proto = *(uint8_t *)pos;
 833                 break;
 834         default:
 835                 /* Unrecoverable error */
 836                 IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
 837                 return (-2);
 838         }
 839 
 840         if (ctx->lso_flag) {
 841                 offset = mac_hdr_len + start;
 842                 while (size <= offset) {
 843                         mp = mp->b_cont;
 844                         ASSERT(mp != NULL);
 845                         len = MBLKL(mp);
 846                         size += len;
 847                 }
 848                 pos = mp->b_rptr + offset + len - size;
 849 
 850                 l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
 851         } else {
 852                 /*
 853                  * l4 header length is only required for LSO
 854                  */
 855                 l4_hdr_len = 0;
 856         }
 857 
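             /*
              * 'start' from mac_hcksum_get() is the checksum start offset,
              * which is used here as the IP header length.
              */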
 858         ctx->mac_hdr_len = mac_hdr_len;
 859         ctx->ip_hdr_len = start;
 860         ctx->l4_proto = l4_proto;
 861         ctx->l4_hdr_len = l4_hdr_len;
 862 
 863         return (0);
 864 }
 865 
 866 /*
 867  * ixgbe_check_context
 868  *
 869  * Check if a new context descriptor is needed
 870  */
 871 static boolean_t
 872 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
 873 {
 874         ixgbe_tx_context_t *last;
 875 
 876         if (ctx == NULL)
 877                 return (B_FALSE);
 878 
 879         /*
 880          * Compare the context data retrieved from the mblk with the
 881          * stored data of the last context descriptor. The data that
 882          * need to be checked are:
 883          *      hcksum_flags
 884          *      l4_proto
 885          *      mac_hdr_len
 886          *      ip_hdr_len
 887          *      lso_flag
 888          *      mss (only checked for LSO)
 889          *      l4_hdr_len (only checked for LSO)
 890          * If any one of the above items has changed, a new context
 891          * descriptor will be needed.
 892          */
 893         last = &tx_ring->tx_context;
 894 
 895         if ((ctx->hcksum_flags != last->hcksum_flags) ||
 896             (ctx->l4_proto != last->l4_proto) ||
 897             (ctx->mac_hdr_len != last->mac_hdr_len) ||
 898             (ctx->ip_hdr_len != last->ip_hdr_len) ||
 899             (ctx->lso_flag != last->lso_flag) ||
 900             (ctx->lso_flag && ((ctx->mss != last->mss) ||
 901             (ctx->l4_hdr_len != last->l4_hdr_len)))) {
 902                 return (B_TRUE);
 903         }
 904 
 905         return (B_FALSE);
 906 }
 907 
 908 /*
 909  * ixgbe_fill_context
 910  *
 911  * Fill the context descriptor with hardware checksum information
 912  */
 913 static void
 914 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
 915     ixgbe_tx_context_t *ctx)
 916 {
 917         /*
 918          * Fill the context descriptor with the checksum
 919          * context information we've got.
 920          */
 921         ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
 922         ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
 923             IXGBE_ADVTXD_MACLEN_SHIFT;
 924 
 925         ctx_tbd->type_tucmd_mlhl =
 926             IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
 927 
 928         if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
 929                 ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
 930 
 931         if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
 932                 switch (ctx->l4_proto) {
 933                 case IPPROTO_TCP:
 934                         ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
 935                         break;
 936                 case IPPROTO_UDP:
 937                         /*
 938                          * We don't have to explicitly set:
 939                          *      ctx_tbd->type_tucmd_mlhl |=
 940                          *          IXGBE_ADVTXD_TUCMD_L4T_UDP;
 941                          * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
 942                          */
 943                         break;
 944                 default:
 945                         /* Unrecoverable error */
 946                         IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
 947                         break;
 948                 }
 949         }
 950 
 951         ctx_tbd->seqnum_seed = 0;
 952 
 953         if (ctx->lso_flag) {
 954                 ctx_tbd->mss_l4len_idx =
 955                     (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
 956                     (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
 957         } else {
 958                 ctx_tbd->mss_l4len_idx = 0;
 959         }
 960 }
 961 
 962 /*
 963  * ixgbe_tx_fill_ring
 964  *
 965  * Fill the tx descriptor ring with the data
 966  */
 967 static int
 968 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
 969     ixgbe_tx_context_t *ctx, size_t mbsize)
 970 {
 971         struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
 972         boolean_t load_context;
 973         uint32_t index, tcb_index, desc_num;
 974         union ixgbe_adv_tx_desc *tbd, *first_tbd;
 975         tx_control_block_t *tcb, *first_tcb;
 976         uint32_t hcksum_flags;
 977         int i;
 978 
 979         ASSERT(mutex_owned(&tx_ring->tx_lock));
 980 
 981         tbd = NULL;
 982         first_tbd = NULL;
 983         first_tcb = NULL;
 984         desc_num = 0;
 985         hcksum_flags = 0;
 986         load_context = B_FALSE;
 987 
 988         /*
 989          * Get the index of the first tx descriptor that will be filled,
 990          * and the index of the first work list item that will be
 991          * associated with the first used tx control block in the
 992          * pending list. Note: the two indexes are the same.
 993          */
 994         index = tx_ring->tbd_tail;
 995         tcb_index = tx_ring->tbd_tail;
 996 
 997         if (ctx != NULL) {
 998                 hcksum_flags = ctx->hcksum_flags;
 999 
1000                 /*
1001                  * Check if a new context descriptor is needed for this packet
1002                  */
1003                 load_context = ixgbe_check_context(tx_ring, ctx);
1004 
1005                 if (load_context) {
1006                         tbd = &tx_ring->tbd_ring[index];
1007 
1008                         /*
1009                          * Fill the context descriptor with the
1010                          * hardware checksum offload information.
1011                          */
1012                         ixgbe_fill_context(
1013                             (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
1014 
1015                         index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1016                         desc_num++;
1017 
1018                         /*
1019                          * Store the checksum context data if
1020                          * a new context descriptor is added
1021                          */
1022                         tx_ring->tx_context = *ctx;
1023                 }
1024         }
1025 
1026         first_tbd = &tx_ring->tbd_ring[index];
1027 
1028         /*
1029          * Fill tx data descriptors with the data saved in the pending list.
1030          * The tx control blocks in the pending list are added to the work list
1031          * at the same time.
1032          *
1033          * The work list is strictly 1:1 corresponding to the descriptor ring.
1034          * One item of the work list corresponds to one tx descriptor. Because
1035          * one tx control block can span multiple tx descriptors, the tx
1036          * control block will be added to the first work list item that
1037          * corresponds to the first tx descriptor generated from that tx
1038          * control block.
1039          */
1040         tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1041         first_tcb = tcb;
1042         while (tcb != NULL) {
1043 
1044                 for (i = 0; i < tcb->desc_num; i++) {
1045                         tbd = &tx_ring->tbd_ring[index];
1046 
1047                         tbd->read.buffer_addr = tcb->desc[i].address;
1048                         tbd->read.cmd_type_len = tcb->desc[i].length;
1049 
1050                         tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
1051                             | IXGBE_ADVTXD_DTYP_DATA;
1052 
1053                         tbd->read.olinfo_status = 0;
1054 
1055                         index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1056                         desc_num++;
1057                 }
1058 
1059                 /*
1060                  * Add the tx control block to the work list
1061                  */
1062                 ASSERT(tx_ring->work_list[tcb_index] == NULL);
1063                 tx_ring->work_list[tcb_index] = tcb;
1064 
1065                 tcb_index = index;
1066                 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1067         }
1068 
1069         if (load_context) {
1070                 /*
1071                  * Count the context descriptor for
1072                  * the first tx control block.
1073                  */
1074                 first_tcb->desc_num++;
1075         }
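             /*
              * Remember the index of the last descriptor of this packet;
              * tx recycling uses it to check whether the whole packet
              * has been transmitted.
              */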
1076         first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
1077 
1078         /*
1079          * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
1080          * valid in the first descriptor of the packet.
1081          * The packet length (paylen) is set in first_tbd in all cases.
1082          * 82599 and X540 require the packet length in the paylen field with
1083          * or without LSO, and 82598 will ignore it in non-LSO mode.
1084          */
1085         ASSERT(first_tbd != NULL);
1086         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
1087 
1088         switch (hw->mac.type) {
1089         case ixgbe_mac_82598EB:
1090                 if (ctx != NULL && ctx->lso_flag) {
1091                         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1092                         first_tbd->read.olinfo_status |=
1093                             (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1094                             - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1095                 }
1096                 break;
1097 
1098         case ixgbe_mac_82599EB:
1099         case ixgbe_mac_X540:
1100                 if (ctx != NULL && ctx->lso_flag) {
1101                         first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1102                         first_tbd->read.olinfo_status |=
1103                             (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1104                             - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1105                 } else {
1106                         first_tbd->read.olinfo_status |=
1107                             (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1108                 }
1109                 break;
1110 
1111         default:
1112                 break;
1113         }
1114 
1115         /* Set hardware checksum bits */
1116         if (hcksum_flags != 0) {
1117                 if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1118                         first_tbd->read.olinfo_status |=
1119                             IXGBE_ADVTXD_POPTS_IXSM;
1120                 if (hcksum_flags & HCK_PARTIALCKSUM)
1121                         first_tbd->read.olinfo_status |=
1122                             IXGBE_ADVTXD_POPTS_TXSM;
1123         }
1124 
1125         /*
1126          * The last descriptor of the packet needs the End Of Packet (EOP)
1127          * and Report Status (RS) bits set
1128          */
1129         ASSERT(tbd != NULL);
1130         tbd->read.cmd_type_len |=
1131             IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1132 
1133         /*
1134          * Sync the DMA buffer of the tx descriptor ring
1135          */
1136         DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1137 
1138         /*
1139          * Update the number of the free tx descriptors.
1140          * The mutual exclusion between the transmission and the recycling
1141          * (for the tx descriptor ring and the work list) is implemented
1142          * with the atomic operation on the number of the free tx descriptors.
1143          *
1144          * Note: we must always decrement the counter tbd_free before
1145          * advancing the hardware TDT pointer, to avoid the race where the
1146          * hardware finishes transmitting the descriptors and tx recycling
1147          * increases tbd_free before the counter has been decremented
1148          * here.
1149          */
1150         i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1151         ASSERT(i >= 0);
1152 
1153         tx_ring->tbd_tail = index;
1154 
1155         /*
1156          * Advance the hardware TDT pointer of the tx descriptor ring
1157          */
1158         IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1159 
1160         if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1161             DDI_FM_OK) {
1162                 ddi_fm_service_impact(tx_ring->ixgbe->dip,
1163                     DDI_SERVICE_DEGRADED);
1164                 atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1165         }
1166 
1167         return (desc_num);
1168 }
1169 
1170 /*
1171  * ixgbe_save_desc
1172  *
1173  * Save the address/length pair to the private array
1174  * of the tx control block. The address/length pairs
1175  * will be filled into the tx descriptor ring later.
1176  */
1177 static void
1178 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1179 {
1180         sw_desc_t *desc;
1181 
1182         desc = &tcb->desc[tcb->desc_num];
1183         desc->address = address;
1184         desc->length = length;
1185 
1186         tcb->desc_num++;
1187 }
1188 
1189 /*
1190  * ixgbe_tx_recycle_legacy
1191  *
1192  * Recycle the tx descriptors and tx control blocks.
1193  *
1194  * The work list is traversed to check if the corresponding
1195  * tx descriptors have been transmitted. If so, the resources
1196  * bound to the tx control blocks will be freed, and those
1197  * tx control blocks will be returned to the free list.
1198  */
1199 uint32_t
1200 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1201 {
1202         uint32_t index, last_index, prev_index;
1203         int desc_num;
1204         boolean_t desc_done;
1205         tx_control_block_t *tcb;
1206         link_list_t pending_list;
1207         ixgbe_t *ixgbe = tx_ring->ixgbe;
1208 
1209         mutex_enter(&tx_ring->recycle_lock);
1210 
1211         ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1212 
1213         if (tx_ring->tbd_free == tx_ring->ring_size) {
1214                 tx_ring->recycle_fail = 0;
1215                 tx_ring->stall_watchdog = 0;
1216                 if (tx_ring->reschedule) {
1217                         tx_ring->reschedule = B_FALSE;
1218                         mac_tx_ring_update(ixgbe->mac_hdl,
1219                             tx_ring->ring_handle);
1220                 }
1221                 mutex_exit(&tx_ring->recycle_lock);
1222                 return (0);
1223         }
1224 
1225         /*
1226          * Sync the DMA buffer of the tx descriptor ring
1227          */
1228         DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1229 
1230         if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1231                 mutex_exit(&tx_ring->recycle_lock);
1232                 ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1233                 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1234                 return (0);
1235         }
1236 
1237         LINK_LIST_INIT(&pending_list);
1238         desc_num = 0;
1239         index = tx_ring->tbd_head;   /* Index of next tbd/tcb to recycle */
1240 
1241         tcb = tx_ring->work_list[index];
1242         ASSERT(tcb != NULL);
1243 
1244         while (tcb != NULL) {
1245                 /*
1246                  * Get the last tx descriptor of this packet.
1247                  * If the last tx descriptor is done, then we can
1248                  * recycle all descriptors of the packet, which usually
1249                  * includes several tx control blocks.
1250                  * For 82599, LSO descriptors cannot be recycled until
1251                  * the whole packet's transmission is done.
1252                  * That is why packet-level recycling is used here.
1253                  * 82598 has no such limitation.
1254                  */
1255                 last_index = tcb->last_index;
1256                 /*
1257                  * A last_index of MAX_TX_RING_SIZE indicates that
1258                  * the index is not a valid value yet.
1259                  */
1260                 if (last_index == MAX_TX_RING_SIZE)
1261                         break;
1262 
1263                 /*
1264                  * Check if the Descriptor Done bit is set
1265                  */
1266                 desc_done = tx_ring->tbd_ring[last_index].wb.status &
1267                     IXGBE_TXD_STAT_DD;
1268                 if (desc_done) {
1269                         /*
1270                          * recycle all descriptors of the packet
1271                          */
1272                         while (tcb != NULL) {
1273                                 /*
1274                                  * Strip off the tx control block from
1275                                  * the work list, and add it to the
1276                                  * pending list.
1277                                  */
1278                                 tx_ring->work_list[index] = NULL;
1279                                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1280 
1281                                 /*
1282                                  * Count the total number of the tx
1283                                  * descriptors recycled
1284                                  */
1285                                 desc_num += tcb->desc_num;
1286 
1287                                 index = NEXT_INDEX(index, tcb->desc_num,
1288                                     tx_ring->ring_size);
1289 
1290                                 tcb = tx_ring->work_list[index];
1291 
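                                     /*
                                      * Stop once the descriptor just before
                                      * 'index' is the last descriptor of the
                                      * packet, i.e. the whole packet has
                                      * been recycled.
                                      */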
1292                                 prev_index = PREV_INDEX(index, 1,
1293                                     tx_ring->ring_size);
1294                                 if (prev_index == last_index)
1295                                         break;
1296                         }
1297                 } else {
1298                         break;
1299                 }
1300         }
1301 
1302         /*
1303          * If no tx descriptors are recycled, no need to do more processing
1304          */
1305         if (desc_num == 0) {
1306                 tx_ring->recycle_fail++;
1307                 mutex_exit(&tx_ring->recycle_lock);
1308                 return (0);
1309         }
1310 
1311         tx_ring->recycle_fail = 0;
1312         tx_ring->stall_watchdog = 0;
1313 
1314         /*
1315          * Update the head index of the tx descriptor ring
1316          */
1317         tx_ring->tbd_head = index;
1318 
1319         /*
1320          * Update the number of the free tx descriptors with atomic operations
1321          */
1322         atomic_add_32(&tx_ring->tbd_free, desc_num);
1323 
1324         if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1325             (tx_ring->reschedule)) {
1326                 tx_ring->reschedule = B_FALSE;
1327                 mac_tx_ring_update(ixgbe->mac_hdl,
1328                     tx_ring->ring_handle);
1329         }
1330         mutex_exit(&tx_ring->recycle_lock);
1331 
1332         /*
1333          * Free the resources used by the tx control blocks
1334          * in the pending list
1335          */
1336         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1337         while (tcb != NULL) {
1338                 /*
1339                  * Release the resources occupied by the tx control block
1340                  */
1341                 ixgbe_free_tcb(tcb);
1342 
1343                 tcb = (tx_control_block_t *)
1344                     LIST_GET_NEXT(&pending_list, &tcb->link);
1345         }
1346 
1347         /*
1348          * Add the tx control blocks in the pending list to the free list.
1349          */
1350         ixgbe_put_free_list(tx_ring, &pending_list);
1351 
1352         return (desc_num);
1353 }
1354 
1355 /*
1356  * ixgbe_tx_recycle_head_wb
1357  *
1358  * Check the head write-back, and recycle all the transmitted
1359  * tx descriptors and tx control blocks.
1360  */
1361 uint32_t
1362 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1363 {
1364         uint32_t index;
1365         uint32_t head_wb;
1366         int desc_num;
1367         tx_control_block_t *tcb;
1368         link_list_t pending_list;
1369         ixgbe_t *ixgbe = tx_ring->ixgbe;
1370 
1371         mutex_enter(&tx_ring->recycle_lock);
1372 
1373         ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1374 
1375         if (tx_ring->tbd_free == tx_ring->ring_size) {
1376                 tx_ring->recycle_fail = 0;
1377                 tx_ring->stall_watchdog = 0;
1378                 if (tx_ring->reschedule) {
1379                         tx_ring->reschedule = B_FALSE;
1380                         mac_tx_ring_update(ixgbe->mac_hdl,
1381                             tx_ring->ring_handle);
1382                 }
1383                 mutex_exit(&tx_ring->recycle_lock);
1384                 return (0);
1385         }
1386 
1387         /*
1388          * Sync the DMA buffer of the tx descriptor ring
1389          *
1390          * Note: In head write-back mode the tx descriptors will not
1391          * be written back, but the head write-back value is stored in
1392          * the extra tbd at the end of the DMA area, so we still need
1393          * to sync the head write-back value for the kernel.
1394          *
1395          * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1396          */
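             /*
              * Only the 4-byte head write-back word, located immediately
              * after the ring_size tx descriptors, is synced here.
              */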
1397         (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1398             sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1399             sizeof (uint32_t),
1400             DDI_DMA_SYNC_FORKERNEL);
1401 
1402         if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1403                 mutex_exit(&tx_ring->recycle_lock);
1404                 ddi_fm_service_impact(ixgbe->dip,
1405                     DDI_SERVICE_DEGRADED);
1406                 atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1407                 return (0);
1408         }
1409 
1410         LINK_LIST_INIT(&pending_list);
1411         desc_num = 0;
1412         index = tx_ring->tbd_head;   /* Next index to clean */
1413 
1414         /*
1415          * Get the value of head write-back
1416          */
1417         head_wb = *tx_ring->tbd_head_wb;
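             /*
              * Walk the work list from the driver's head index toward the
              * hardware-reported head, reclaiming each tx control block
              * whose descriptors have all been transmitted.
              */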
1418         while (index != head_wb) {
1419                 tcb = tx_ring->work_list[index];
1420                 ASSERT(tcb != NULL);
1421 
1422                 if (OFFSET(index, head_wb, tx_ring->ring_size) <
1423                     tcb->desc_num) {
1424                         /*
1425                          * The current tx control block has not been
1426                          * completely transmitted; stop recycling.
1427                          */
1428                         break;
1429                 }
1430 
1431                 /*
1432                  * Strip off the tx control block from the work list,
1433                  * and add it to the pending list.
1434                  */
1435                 tx_ring->work_list[index] = NULL;
1436                 LIST_PUSH_TAIL(&pending_list, &tcb->link);
1437 
1438                 /*
1439                  * Advance the index of the tx descriptor ring
1440                  */
1441                 index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1442 
1443                 /*
1444                  * Count the total number of the tx descriptors recycled
1445                  */
1446                 desc_num += tcb->desc_num;
1447         }
1448 
1449         /*
1450          * If no tx descriptors are recycled, no need to do more processing
1451          */
1452         if (desc_num == 0) {
1453                 tx_ring->recycle_fail++;
1454                 mutex_exit(&tx_ring->recycle_lock);
1455                 return (0);
1456         }
1457 
1458         tx_ring->recycle_fail = 0;
1459         tx_ring->stall_watchdog = 0;
1460 
1461         /*
1462          * Update the head index of the tx descriptor ring
1463          */
1464         tx_ring->tbd_head = index;
1465 
1466         /*
1467          * Update the number of free tx descriptors with an atomic operation
1468          */
1469         atomic_add_32(&tx_ring->tbd_free, desc_num);
1470 
1471         if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1472             (tx_ring->reschedule)) {
1473                 tx_ring->reschedule = B_FALSE;
1474                 mac_tx_ring_update(ixgbe->mac_hdl,
1475                     tx_ring->ring_handle);
1476         }
1477         mutex_exit(&tx_ring->recycle_lock);
1478 
1479         /*
1480          * Free the resources used by the tx control blocks
1481          * in the pending list
1482          */
1483         tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1484         while (tcb != NULL) {
1485                 /*
1486                  * Release the resources occupied by the tx control block
1487                  */
1488                 ixgbe_free_tcb(tcb);
1489 
1490                 tcb = (tx_control_block_t *)
1491                     LIST_GET_NEXT(&pending_list, &tcb->link);
1492         }
1493 
1494         /*
1495          * Add the tx control blocks in the pending list to the free list.
1496          */
1497         ixgbe_put_free_list(tx_ring, &pending_list);
1498 
1499         return (desc_num);
1500 }
1501 
1502 /*
1503  * ixgbe_free_tcb - free up the tx control block
1504  *
1505  * Free the resources of the tx control block, including
1506  * unbinding the previously bound DMA handle, and resetting
1507  * the other control fields.
1508  */
1509 void
1510 ixgbe_free_tcb(tx_control_block_t *tcb)
1511 {
1512         switch (tcb->tx_type) {
1513         case USE_COPY:
1514                 /*
1515                  * Reset the buffer length that is used for copy
1516                  */
1517                 tcb->tx_buf.len = 0;
1518                 break;
1519         case USE_DMA:
1520                 /*
1521                  * Release the DMA resource that is used for
1522                  * DMA binding.
1523                  */
1524                 (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1525                 break;
1526         default:
1527                 break;
1528         }
1529 
1530         /*
1531          * Free the mblk
1532          */
1533         if (tcb->mp != NULL) {
1534                 freemsg(tcb->mp);
1535                 tcb->mp = NULL;
1536         }
1537 
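             /*
              * Reset the remaining control fields; last_index is set to
              * MAX_TX_RING_SIZE to mark that the tcb is no longer
              * associated with any tx descriptor.
              */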
1538         tcb->tx_type = USE_NONE;
1539         tcb->last_index = MAX_TX_RING_SIZE;
1540         tcb->frag_num = 0;
1541         tcb->desc_num = 0;
1542 }
1543 
1544 /*
1545  * ixgbe_get_free_list - Get a free tx control block from the free list
1546  *
1547  * The atomic operation on the number of available tx control blocks
1548  * in the free list is used to keep this routine mutually exclusive with
1549  * the routine ixgbe_put_free_list.
1550  */
1551 static tx_control_block_t *
1552 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
1553 {
1554         tx_control_block_t *tcb;
1555 
1556         /*
1557          * Atomically reserve one tx control block from the free list;
1558          * a negative return value means the free list is empty.
1559          */
1560         if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
1561                 return (NULL);
1562 
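             /*
              * tcb_head_lock serializes concurrent consumers of the free
              * list head; the reservation above guarantees that at least
              * one tx control block is present.
              */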
1563         mutex_enter(&tx_ring->tcb_head_lock);
1564 
1565         tcb = tx_ring->free_list[tx_ring->tcb_head];
1566         ASSERT(tcb != NULL);
1567         tx_ring->free_list[tx_ring->tcb_head] = NULL;
1568         tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1569             tx_ring->free_list_size);
1570 
1571         mutex_exit(&tx_ring->tcb_head_lock);
1572 
1573         return (tcb);
1574 }
1575 
1576 /*
1577  * ixgbe_put_free_list
1578  *
1579  * Put a list of used tx control blocks back to the free list
1580  *
1581  * A mutex is used here to serialize concurrent callers of this routine.
1582  * The mutual exclusion between ixgbe_get_free_list and ixgbe_put_free_list
1583  * is implemented with the atomic operation on the counter tcb_free.
1584  */
1585 void
1586 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1587 {
1588         uint32_t index;
1589         int tcb_num;
1590         tx_control_block_t *tcb;
1591 
1592         mutex_enter(&tx_ring->tcb_tail_lock);
1593 
1594         index = tx_ring->tcb_tail;
1595 
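             /*
              * Append each tx control block from the pending list to the
              * free list, starting at the current tail index.
              */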
1596         tcb_num = 0;
1597         tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1598         while (tcb != NULL) {
1599                 ASSERT(tx_ring->free_list[index] == NULL);
1600                 tx_ring->free_list[index] = tcb;
1601 
1602                 tcb_num++;
1603 
1604                 index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1605 
1606                 tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1607         }
1608 
1609         tx_ring->tcb_tail = index;
1610 
1611         /*
1612          * Update the number of free tx control blocks in the free
1613          * list. This must be done under the lock, after the control
1614          * blocks have actually been linked onto the free list.
1615          */
1616         atomic_add_32(&tx_ring->tcb_free, tcb_num);
1617 
1618         mutex_exit(&tx_ring->tcb_tail_lock);
1619 }