XXXX Intel X540 support
    
      
    
    
          --- old/usr/src/uts/common/io/ixgbe/ixgbe_tx.c
          +++ new/usr/src/uts/common/io/ixgbe/ixgbe_tx.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  28   28   */
  29   29  
  30   30  #include "ixgbe_sw.h"
  31   31  
  32   32  static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  33   33      uint32_t, boolean_t);
  34   34  static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  35   35      uint32_t);
  36   36  static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
  37   37      ixgbe_tx_context_t *, size_t);
  38   38  static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
  39   39  static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);
  40   40  
  41   41  static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
  42   42  static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
  43   43      ixgbe_tx_context_t *);
  44   44  static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
  45   45      ixgbe_tx_context_t *);
  46   46  
  47   47  #ifndef IXGBE_DEBUG
  48   48  #pragma inline(ixgbe_save_desc)
  49   49  #pragma inline(ixgbe_get_context)
  50   50  #pragma inline(ixgbe_check_context)
  51   51  #pragma inline(ixgbe_fill_context)
  52   52  #endif
  53   53  
  54   54  /*
  55   55   * ixgbe_ring_tx
  56   56   *
  57   57   * To transmit one mblk through one specified ring.
  58   58   *
   59   59   * One mblk can consist of several fragments; each fragment
   60   60   * will be processed with a different method based on its size.
  61   61   * For the fragments with size less than the bcopy threshold,
  62   62   * they will be processed by using bcopy; otherwise, they will
  63   63   * be processed by using DMA binding.
  64   64   *
   65   65   * To process the mblk, a tx control block is taken from the
  66   66   * free list. One tx control block contains one tx buffer, which
  67   67   * is used to copy mblk fragments' data; and one tx DMA handle,
  68   68   * which is used to bind a mblk fragment with DMA resource.
  69   69   *
  70   70   * Several small mblk fragments can be copied into one tx control
  71   71   * block's buffer, and then the buffer will be transmitted with
  72   72   * one tx descriptor.
  73   73   *
  74   74   * A large fragment only binds with one tx control block's DMA
  75   75   * handle, and it can span several tx descriptors for transmitting.
  76   76   *
  77   77   * So to transmit a packet (mblk), several tx control blocks can
  78   78   * be used. After the processing, those tx control blocks will
  79   79   * be put to the work list.
  80   80   */
  81   81  mblk_t *
  82   82  ixgbe_ring_tx(void *arg, mblk_t *mp)
  83   83  {
  84   84          ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
  85   85          ixgbe_t *ixgbe = tx_ring->ixgbe;
  86   86          tx_type_t current_flag, next_flag;
  87   87          uint32_t current_len, next_len;
  88   88          uint32_t desc_total;
  89   89          size_t mbsize;
  90   90          int desc_num;
  91   91          boolean_t copy_done, eop;
  92   92          mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
  93   93          tx_control_block_t *tcb;
  94   94          ixgbe_tx_context_t tx_context, *ctx;
  95   95          link_list_t pending_list;
  96   96          uint32_t len, hdr_frag_len, hdr_len;
  97   97          uint32_t copy_thresh;
  98   98          mblk_t *hdr_new_mp = NULL;
  99   99          mblk_t *hdr_pre_mp = NULL;
 100  100          mblk_t *hdr_nmp = NULL;
 101  101  
 102  102          ASSERT(mp->b_next == NULL);
 103  103  
 104  104          if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
 105  105              (ixgbe->ixgbe_state & IXGBE_ERROR) ||
 106  106              (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
 107  107              !(ixgbe->ixgbe_state & IXGBE_STARTED)) {
 108  108                  return (mp);
 109  109          }
 110  110  
 111  111          copy_thresh = ixgbe->tx_copy_thresh;
 112  112  
 113  113          /* Get the mblk size */
 114  114          mbsize = 0;
 115  115          for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
 116  116                  mbsize += MBLKL(nmp);
 117  117          }
 118  118  
 119  119          if (ixgbe->tx_hcksum_enable) {
 120  120                  /*
 121  121                   * Retrieve checksum context information from the mblk
 122  122                   * that will be used to decide whether/how to fill the
 123  123                   * context descriptor.
 124  124                   */
 125  125                  ctx = &tx_context;
 126  126                  if (ixgbe_get_context(mp, ctx) < 0) {
 127  127                          freemsg(mp);
 128  128                          return (NULL);
 129  129                  }
 130  130  
 131  131                  /*
 132  132                   * If the mblk size exceeds the max size ixgbe could
 133  133                   * process, then discard this mblk, and return NULL.
 134  134                   */
 135  135                  if ((ctx->lso_flag &&
 136  136                      ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
 137  137                      (!ctx->lso_flag &&
 138  138                      (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
 139  139                          freemsg(mp);
 140  140                          IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
 141  141                          return (NULL);
 142  142                  }
 143  143          } else {
 144  144                  ctx = NULL;
 145  145          }
 146  146  
 147  147          /*
 148  148           * Check and recycle tx descriptors.
 149  149           * The recycle threshold here should be selected carefully
 150  150           */
 151  151          if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
 152  152                  tx_ring->tx_recycle(tx_ring);
 153  153          }
 154  154  
 155  155          /*
  156  156           * After the recycling, if the tbd_free is still less than the
  157  157           * overload_threshold, assert overload and return mp;
  158  158           * the tx will need to be re-scheduled.
 159  159           */
 160  160          if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
 161  161                  tx_ring->reschedule = B_TRUE;
 162  162                  IXGBE_DEBUG_STAT(tx_ring->stat_overload);
 163  163                  return (mp);
 164  164          }
 165  165  
 166  166          /*
 167  167           * The pending_list is a linked list that is used to save
 168  168           * the tx control blocks that have packet data processed
 169  169           * but have not put the data to the tx descriptor ring.
 170  170           * It is used to reduce the lock contention of the tx_lock.
 171  171           */
 172  172          LINK_LIST_INIT(&pending_list);
 173  173          desc_num = 0;
 174  174          desc_total = 0;
 175  175  
 176  176          /*
  177  177           * The software should guarantee the LSO packet header (MAC+IP+TCP)
  178  178           * to be within one descriptor. Here we reallocate and refill the
  179  179           * header if it is not physically contiguous.
 180  180           */
 181  181          if ((ctx != NULL) && ctx->lso_flag) {
 182  182                  /* find the last fragment of the header */
 183  183                  len = MBLKL(mp);
 184  184                  ASSERT(len > 0);
 185  185                  hdr_nmp = mp;
 186  186                  hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
 187  187                  while (len < hdr_len) {
 188  188                          hdr_pre_mp = hdr_nmp;
 189  189                          hdr_nmp = hdr_nmp->b_cont;
 190  190                          len += MBLKL(hdr_nmp);
 191  191                  }
 192  192                  /*
 193  193                   * If the header and the payload are in different mblks,
 194  194                   * we simply force the header to be copied into pre-allocated
 195  195                   * page-aligned buffer.
 196  196                   */
 197  197                  if (len == hdr_len)
 198  198                          goto adjust_threshold;
 199  199  
 200  200                  hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
 201  201                  /*
  202  202                   * There are two cases in which we need to reallocate a mblk
  203  203                   * for the last header fragment:
  204  204                   * 1. the header is in multiple mblks and the last fragment
  205  205                   * shares the same mblk with the payload
  206  206                   * 2. the header is in a single mblk shared with the payload
  207  207                   * and the header is not physically contiguous
 208  208                   */
 209  209                  if ((hdr_nmp != mp) ||
 210  210                      (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
 211  211                      < hdr_len)) {
 212  212                          IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);
 213  213                          /*
  214  214                           * reallocate the mblk for the last header fragment,
  215  215                           * expecting it to be bcopied into the pre-allocated
  216  216                           * page-aligned buffer
 217  217                           */
 218  218                          hdr_new_mp = allocb(hdr_frag_len, NULL);
 219  219                          if (!hdr_new_mp)
 220  220                                  return (mp);
 221  221                          bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
 222  222                              hdr_frag_len);
 223  223                          /* link the new header fragment with the other parts */
 224  224                          hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
 225  225                          hdr_new_mp->b_cont = hdr_nmp;
 226  226                          if (hdr_pre_mp)
 227  227                                  hdr_pre_mp->b_cont = hdr_new_mp;
 228  228                          else
 229  229                                  mp = hdr_new_mp;
 230  230                          hdr_nmp->b_rptr += hdr_frag_len;
 231  231                  }
 232  232  adjust_threshold:
 233  233                  /*
  234  234                   * adjust the bcopy threshold to guarantee
  235  235                   * that the header is handled with bcopy
 236  236                   */
 237  237                  if (copy_thresh < hdr_len)
 238  238                          copy_thresh = hdr_len;
 239  239          }
 240  240  
 241  241          current_mp = mp;
 242  242          current_len = MBLKL(current_mp);
 243  243          /*
 244  244           * Decide which method to use for the first fragment
 245  245           */
 246  246          current_flag = (current_len <= copy_thresh) ?
 247  247              USE_COPY : USE_DMA;
 248  248          /*
 249  249           * If the mblk includes several contiguous small fragments,
 250  250           * they may be copied into one buffer. This flag is used to
 251  251           * indicate whether there are pending fragments that need to
 252  252           * be copied to the current tx buffer.
 253  253           *
 254  254           * If this flag is B_TRUE, it indicates that a new tx control
 255  255           * block is needed to process the next fragment using either
 256  256           * copy or DMA binding.
 257  257           *
 258  258           * Otherwise, it indicates that the next fragment will be
 259  259           * copied to the current tx buffer that is maintained by the
 260  260           * current tx control block. No new tx control block is needed.
 261  261           */
 262  262          copy_done = B_TRUE;
 263  263          while (current_mp) {
 264  264                  next_mp = current_mp->b_cont;
 265  265                  eop = (next_mp == NULL); /* Last fragment of the packet? */
 266  266                  next_len = eop ? 0: MBLKL(next_mp);
 267  267  
 268  268                  /*
 269  269                   * When the current fragment is an empty fragment, if
 270  270                   * the next fragment will still be copied to the current
  271  271                   * tx buffer, we cannot skip this fragment here, because
  272  272                   * the copy processing is still pending completion. We have
 273  273                   * to process this empty fragment in the tx_copy routine.
 274  274                   *
 275  275                   * If the copy processing is completed or a DMA binding
 276  276                   * processing is just completed, we can just skip this
 277  277                   * empty fragment.
 278  278                   */
 279  279                  if ((current_len == 0) && (copy_done)) {
 280  280                          current_mp = next_mp;
 281  281                          current_len = next_len;
 282  282                          current_flag = (current_len <= copy_thresh) ?
 283  283                              USE_COPY : USE_DMA;
 284  284                          continue;
 285  285                  }
 286  286  
 287  287                  if (copy_done) {
 288  288                          /*
 289  289                           * Get a new tx control block from the free list
 290  290                           */
 291  291                          tcb = ixgbe_get_free_list(tx_ring);
 292  292  
 293  293                          if (tcb == NULL) {
 294  294                                  IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 295  295                                  goto tx_failure;
 296  296                          }
 297  297  
 298  298                          /*
 299  299                           * Push the tx control block to the pending list
 300  300                           * to avoid using lock too early
 301  301                           */
 302  302                          LIST_PUSH_TAIL(&pending_list, &tcb->link);
 303  303                  }
 304  304  
 305  305                  if (current_flag == USE_COPY) {
 306  306                          /*
 307  307                           * Check whether to use bcopy or DMA binding to process
 308  308                           * the next fragment, and if using bcopy, whether we
 309  309                           * need to continue copying the next fragment into the
 310  310                           * current tx buffer.
 311  311                           */
 312  312                          ASSERT((tcb->tx_buf.len + current_len) <=
 313  313                              tcb->tx_buf.size);
 314  314  
 315  315                          if (eop) {
 316  316                                  /*
 317  317                                   * This is the last fragment of the packet, so
 318  318                                   * the copy processing will be completed with
 319  319                                   * this fragment.
 320  320                                   */
 321  321                                  next_flag = USE_NONE;
 322  322                                  copy_done = B_TRUE;
 323  323                          } else if ((tcb->tx_buf.len + current_len + next_len) >
 324  324                              tcb->tx_buf.size) {
 325  325                                  /*
 326  326                                   * If the next fragment is too large to be
 327  327                                   * copied to the current tx buffer, we need
 328  328                                   * to complete the current copy processing.
 329  329                                   */
 330  330                                  next_flag = (next_len > copy_thresh) ?
 331  331                                      USE_DMA: USE_COPY;
 332  332                                  copy_done = B_TRUE;
 333  333                          } else if (next_len > copy_thresh) {
 334  334                                  /*
 335  335                                   * The next fragment needs to be processed with
  336  336                                   * DMA binding. So the copy processing will be
 337  337                                   * completed with the current fragment.
 338  338                                   */
 339  339                                  next_flag = USE_DMA;
 340  340                                  copy_done = B_TRUE;
 341  341                          } else {
 342  342                                  /*
 343  343                                   * Continue to copy the next fragment to the
 344  344                                   * current tx buffer.
 345  345                                   */
 346  346                                  next_flag = USE_COPY;
 347  347                                  copy_done = B_FALSE;
 348  348                          }
 349  349  
 350  350                          desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
 351  351                              current_len, copy_done);
 352  352                  } else {
 353  353                          /*
 354  354                           * Check whether to use bcopy or DMA binding to process
 355  355                           * the next fragment.
 356  356                           */
 357  357                          next_flag = (next_len > copy_thresh) ?
 358  358                              USE_DMA: USE_COPY;
 359  359                          ASSERT(copy_done == B_TRUE);
 360  360  
 361  361                          desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
 362  362                              current_len);
 363  363                  }
 364  364  
 365  365                  if (desc_num > 0)
 366  366                          desc_total += desc_num;
 367  367                  else if (desc_num < 0)
 368  368                          goto tx_failure;
 369  369  
 370  370                  current_mp = next_mp;
 371  371                  current_len = next_len;
 372  372                  current_flag = next_flag;
 373  373          }
 374  374  
 375  375          /*
 376  376           * Attach the mblk to the last tx control block
 377  377           */
 378  378          ASSERT(tcb);
 379  379          ASSERT(tcb->mp == NULL);
 380  380          tcb->mp = mp;
 381  381  
 382  382          /*
  383  383           * The 82598/82599 chipsets have a limitation that no more than 32 tx
  384  384           * descriptors can be transmitted out at one time.
  385  385           *
  386  386           * Here is a workaround for it: pull up the mblk, then send it
  387  387           * out using DMA binding. By doing so, no more than MAX_COOKIE (18)
  388  388           * descriptors are needed.
 389  389           */
 390  390          if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
 391  391                  IXGBE_DEBUG_STAT(tx_ring->stat_break_tbd_limit);
 392  392  
 393  393                  /*
 394  394                   * Discard the mblk and free the used resources
 395  395                   */
 396  396                  tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 397  397                  while (tcb) {
 398  398                          tcb->mp = NULL;
 399  399                          ixgbe_free_tcb(tcb);
 400  400                          tcb = (tx_control_block_t *)
 401  401                              LIST_GET_NEXT(&pending_list, &tcb->link);
 402  402                  }
 403  403  
 404  404                  /*
 405  405                   * Return the tx control blocks in the pending list to
 406  406                   * the free list.
 407  407                   */
 408  408                  ixgbe_put_free_list(tx_ring, &pending_list);
 409  409  
 410  410                  /*
  411  411                   * pull up the mblk and send it out using DMA binding
 412  412                   */
 413  413                  if ((pull_mp = msgpullup(mp, -1)) == NULL) {
 414  414                          tx_ring->reschedule = B_TRUE;
 415  415  
 416  416                          /*
  417  417                           * If a new mblk has been allocated for the last header
  418  418                           * fragment of an LSO packet, we should restore the
 419  419                           * modified mp.
 420  420                           */
 421  421                          if (hdr_new_mp) {
 422  422                                  hdr_new_mp->b_cont = NULL;
 423  423                                  freeb(hdr_new_mp);
 424  424                                  hdr_nmp->b_rptr -= hdr_frag_len;
 425  425                                  if (hdr_pre_mp)
 426  426                                          hdr_pre_mp->b_cont = hdr_nmp;
 427  427                                  else
 428  428                                          mp = hdr_nmp;
 429  429                          }
 430  430                          return (mp);
 431  431                  }
 432  432  
 433  433                  LINK_LIST_INIT(&pending_list);
 434  434                  desc_total = 0;
 435  435  
 436  436                  /*
  437  437                   * if the packet is an LSO packet, we simply
  438  438                   * transmit the header in one descriptor using bcopy
 439  439                   */
 440  440                  if ((ctx != NULL) && ctx->lso_flag) {
 441  441                          hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
 442  442                              ctx->l4_hdr_len;
 443  443  
 444  444                          tcb = ixgbe_get_free_list(tx_ring);
 445  445                          if (tcb == NULL) {
 446  446                                  IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 447  447                                  goto tx_failure;
 448  448                          }
 449  449                          desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
 450  450                              hdr_len, B_TRUE);
 451  451                          LIST_PUSH_TAIL(&pending_list, &tcb->link);
 452  452                          desc_total  += desc_num;
 453  453  
 454  454                          pull_mp->b_rptr += hdr_len;
 455  455                  }
 456  456  
 457  457                  tcb = ixgbe_get_free_list(tx_ring);
 458  458                  if (tcb == NULL) {
 459  459                          IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 460  460                          goto tx_failure;
 461  461                  }
 462  462                  if ((ctx != NULL) && ctx->lso_flag) {
 463  463                          desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 464  464                              mbsize - hdr_len);
 465  465                  } else {
 466  466                          desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 467  467                              mbsize);
 468  468                  }
 469  469                  if (desc_num < 0) {
 470  470                          goto tx_failure;
 471  471                  }
 472  472                  LIST_PUSH_TAIL(&pending_list, &tcb->link);
 473  473  
 474  474                  desc_total += desc_num;
 475  475                  tcb->mp = pull_mp;
 476  476          }
 477  477  
 478  478          /*
  479  479           * Before filling the tx descriptor ring with the data, we need to
  480  480           * ensure there are adequate free descriptors for transmit
  481  481           * (including one context descriptor).
  482  482           * Do not use up all the tx descriptors.
  483  483           * Otherwise tx recycle will fail and cause a false hang.
 484  484           */
 485  485          if (tx_ring->tbd_free <= (desc_total + 1)) {
 486  486                  tx_ring->tx_recycle(tx_ring);
 487  487          }
 488  488  
 489  489          mutex_enter(&tx_ring->tx_lock);
 490  490          /*
 491  491           * If the number of free tx descriptors is not enough for transmit
 492  492           * then return mp.
 493  493           *
 494  494           * Note: we must put this check under the mutex protection to
 495  495           * ensure the correctness when multiple threads access it in
 496  496           * parallel.
 497  497           */
 498  498          if (tx_ring->tbd_free <= (desc_total + 1)) {
 499  499                  IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
 500  500                  mutex_exit(&tx_ring->tx_lock);
 501  501                  goto tx_failure;
 502  502          }
 503  503  
 504  504          desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
 505  505              mbsize);
 506  506  
 507  507          ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
 508  508  
 509  509          tx_ring->stat_obytes += mbsize;
 510  510          tx_ring->stat_opackets ++;
 511  511  
 512  512          mutex_exit(&tx_ring->tx_lock);
 513  513  
 514  514          /*
  515  515           * Now that the transmission has succeeded, we need to free the
  516  516           * original mp if we used the pulled-up mblk for transmission.
 517  517           */
 518  518          if (pull_mp) {
 519  519                  freemsg(mp);
 520  520          }
 521  521  
 522  522          return (NULL);
 523  523  
 524  524  tx_failure:
 525  525          /*
  526  526           * If transmission fails, we need to free the pulled-up mblk.
 527  527           */
 528  528          if (pull_mp) {
 529  529                  freemsg(pull_mp);
 530  530          }
 531  531  
 532  532          /*
  533  533           * If a new mblk has been allocated for the last header
  534  534           * fragment of an LSO packet, we should restore the
 535  535           * modified mp.
 536  536           */
 537  537          if (hdr_new_mp) {
 538  538                  hdr_new_mp->b_cont = NULL;
 539  539                  freeb(hdr_new_mp);
 540  540                  hdr_nmp->b_rptr -= hdr_frag_len;
 541  541                  if (hdr_pre_mp)
 542  542                          hdr_pre_mp->b_cont = hdr_nmp;
 543  543                  else
 544  544                          mp = hdr_nmp;
 545  545          }
 546  546          /*
 547  547           * Discard the mblk and free the used resources
 548  548           */
 549  549          tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 550  550          while (tcb) {
 551  551                  tcb->mp = NULL;
 552  552  
 553  553                  ixgbe_free_tcb(tcb);
 554  554  
 555  555                  tcb = (tx_control_block_t *)
 556  556                      LIST_GET_NEXT(&pending_list, &tcb->link);
 557  557          }
 558  558  
 559  559          /*
 560  560           * Return the tx control blocks in the pending list to the free list.
 561  561           */
 562  562          ixgbe_put_free_list(tx_ring, &pending_list);
 563  563  
  564  564          /* Transmit failed, do not drop the mblk, reschedule the transmit */
 565  565          tx_ring->reschedule = B_TRUE;
 566  566  
 567  567          return (mp);
 568  568  }
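
As a side note for review, here is a minimal user-space sketch (not driver code) of the copy-versus-bind decision that ixgbe_ring_tx() applies to each fragment; the fragment structure, the helper names and the 512-byte threshold are illustrative assumptions only.

    #include <stdio.h>
    #include <stddef.h>

    /* Illustrative fragment record; a real mblk also carries data pointers. */
    struct frag {
        size_t len;
        struct frag *next;
    };

    enum tx_method { METHOD_COPY, METHOD_BIND };

    /*
     * Walk a fragment chain and report, per fragment, whether it would be
     * copied into a staging buffer or DMA-bound, mirroring the threshold
     * test in ixgbe_ring_tx().
     */
    static void
    classify_fragments(struct frag *head, size_t copy_thresh)
    {
        int i = 0;

        for (struct frag *f = head; f != NULL; f = f->next, i++) {
            enum tx_method m = (f->len <= copy_thresh) ?
                METHOD_COPY : METHOD_BIND;
            printf("fragment %d: %zu bytes -> %s\n", i, f->len,
                (m == METHOD_COPY) ? "bcopy" : "DMA bind");
        }
    }

    int
    main(void)
    {
        struct frag c = { 4096, NULL };
        struct frag b = { 128, &c };
        struct frag a = { 64, &b };

        classify_fragments(&a, 512);    /* assumed 512-byte copy threshold */
        return (0);
    }
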
 569  569  
 570  570  /*
 571  571   * ixgbe_tx_copy
 572  572   *
 573  573   * Copy the mblk fragment to the pre-allocated tx buffer
 574  574   */
 575  575  static int
 576  576  ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 577  577      uint32_t len, boolean_t copy_done)
 578  578  {
 579  579          dma_buffer_t *tx_buf;
 580  580          uint32_t desc_num;
 581  581          _NOTE(ARGUNUSED(tx_ring));
 582  582  
 583  583          tx_buf = &tcb->tx_buf;
 584  584  
 585  585          /*
 586  586           * Copy the packet data of the mblk fragment into the
 587  587           * pre-allocated tx buffer, which is maintained by the
 588  588           * tx control block.
 589  589           *
 590  590           * Several mblk fragments can be copied into one tx buffer.
 591  591           * The destination address of the current copied fragment in
 592  592           * the tx buffer is next to the end of the previous copied
 593  593           * fragment.
 594  594           */
 595  595          if (len > 0) {
 596  596                  bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
 597  597  
 598  598                  tx_buf->len += len;
 599  599                  tcb->frag_num++;
 600  600          }
 601  601  
 602  602          desc_num = 0;
 603  603  
 604  604          /*
 605  605           * If it is the last fragment copied to the current tx buffer,
 606  606           * in other words, if there's no remaining fragment or the remaining
 607  607           * fragment requires a new tx control block to process, we need to
 608  608           * complete the current copy processing by syncing up the current
 609  609           * DMA buffer and saving the descriptor data.
 610  610           */
 611  611          if (copy_done) {
 612  612                  /*
 613  613                   * Sync the DMA buffer of the packet data
 614  614                   */
 615  615                  DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
 616  616  
 617  617                  tcb->tx_type = USE_COPY;
 618  618  
 619  619                  /*
 620  620                   * Save the address and length to the private data structure
 621  621                   * of the tx control block, which will be used to fill the
 622  622                   * tx descriptor ring after all the fragments are processed.
 623  623                   */
 624  624                  ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
 625  625                  desc_num++;
 626  626          }
 627  627  
 628  628          return (desc_num);
 629  629  }
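
A minimal sketch of the staging-buffer behaviour ixgbe_tx_copy() implements: several small fragments are appended to one buffer, and a single (address, length) descriptor is recorded once copying is done. The buffer size, the sw_desc stand-in and the helper names here are assumptions for illustration, not driver structures.

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define STAGE_SIZE 2048             /* assumed staging buffer size */

    struct stage_buf {
        uint8_t data[STAGE_SIZE];
        size_t len;                     /* bytes copied in so far */
    };

    struct sw_desc {                    /* stand-in for a saved descriptor */
        const void *address;
        size_t len;
    };

    /* Append one fragment; returns -1 if it would overflow the buffer. */
    static int
    stage_append(struct stage_buf *sb, const void *frag, size_t len)
    {
        if (sb->len + len > STAGE_SIZE)
            return (-1);
        memcpy(sb->data + sb->len, frag, len);
        sb->len += len;
        return (0);
    }

    /* When copying is done, one descriptor covers the whole staged buffer. */
    static struct sw_desc
    stage_finish(const struct stage_buf *sb)
    {
        struct sw_desc d = { sb->data, sb->len };
        return (d);
    }

    int
    main(void)
    {
        struct stage_buf sb = { .len = 0 };

        (void) stage_append(&sb, "hdr", 3);
        (void) stage_append(&sb, "payload", 7);
        struct sw_desc d = stage_finish(&sb);
        printf("one descriptor covering %zu bytes\n", d.len);
        return (0);
    }
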
 630  630  
 631  631  /*
 632  632   * ixgbe_tx_bind
 633  633   *
 634  634   * Bind the mblk fragment with DMA
 635  635   */
 636  636  static int
 637  637  ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 638  638      uint32_t len)
 639  639  {
 640  640          int status, i;
 641  641          ddi_dma_cookie_t dma_cookie;
 642  642          uint_t ncookies;
 643  643          int desc_num;
 644  644  
 645  645          /*
 646  646           * Use DMA binding to process the mblk fragment
 647  647           */
 648  648          status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
 649  649              (caddr_t)mp->b_rptr, len,
 650  650              DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
 651  651              0, &dma_cookie, &ncookies);
 652  652  
 653  653          if (status != DDI_DMA_MAPPED) {
 654  654                  IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
 655  655                  return (-1);
 656  656          }
 657  657  
 658  658          tcb->frag_num++;
 659  659          tcb->tx_type = USE_DMA;
 660  660          /*
 661  661           * Each fragment can span several cookies. One cookie will have
 662  662           * one tx descriptor to transmit.
 663  663           */
 664  664          desc_num = 0;
 665  665          for (i = ncookies; i > 0; i--) {
 666  666                  /*
 667  667                   * Save the address and length to the private data structure
 668  668                   * of the tx control block, which will be used to fill the
 669  669                   * tx descriptor ring after all the fragments are processed.
 670  670                   */
 671  671                  ixgbe_save_desc(tcb,
 672  672                      dma_cookie.dmac_laddress,
 673  673                      dma_cookie.dmac_size);
 674  674  
 675  675                  desc_num++;
 676  676  
 677  677                  if (i > 1)
 678  678                          ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
 679  679          }
 680  680  
 681  681          return (desc_num);
 682  682  }
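
The cookie loop in ixgbe_tx_bind() produces one descriptor per physically contiguous segment. A rough user-space analogue, with a made-up segment array standing in for the DMA cookies returned by the bind, might look like this:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    struct segment {                    /* stand-in for one DMA cookie */
        uint64_t addr;
        size_t len;
    };

    /* Record one (address, length) descriptor per segment. */
    static int
    save_segments(const struct segment *segs, int nsegs)
    {
        int desc_num = 0;

        for (int i = 0; i < nsegs; i++) {
            printf("desc %d: addr=0x%llx len=%zu\n", desc_num,
                (unsigned long long)segs[i].addr, segs[i].len);
            desc_num++;
        }
        return (desc_num);
    }

    int
    main(void)
    {
        struct segment segs[] = {
            { 0x1000, 4096 }, { 0x8000, 4096 }, { 0xc000, 1024 }
        };

        (void) save_segments(segs, 3);  /* three cookies -> three descriptors */
        return (0);
    }
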
 683  683  
 684  684  /*
 685  685   * ixgbe_get_context
 686  686   *
 687  687   * Get the context information from the mblk
 688  688   */
 689  689  static int
 690  690  ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
 691  691  {
 692  692          uint32_t start;
 693  693          uint32_t hckflags;
 694  694          uint32_t lsoflags;
 695  695          uint32_t mss;
 696  696          uint32_t len;
 697  697          uint32_t size;
 698  698          uint32_t offset;
 699  699          unsigned char *pos;
 700  700          ushort_t etype;
 701  701          uint32_t mac_hdr_len;
 702  702          uint32_t l4_proto;
 703  703          uint32_t l4_hdr_len;
 704  704  
 705  705          ASSERT(mp != NULL);
 706  706  
 707  707          mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
 708  708          bzero(ctx, sizeof (ixgbe_tx_context_t));
 709  709  
 710  710          if (hckflags == 0) {
 711  711                  return (0);
 712  712          }
 713  713  
 714  714          ctx->hcksum_flags = hckflags;
 715  715  
 716  716          mac_lso_get(mp, &mss, &lsoflags);
 717  717          ctx->mss = mss;
 718  718          ctx->lso_flag = (lsoflags == HW_LSO);
 719  719  
 720  720          /*
  721  721           * LSO relies on tx h/w checksum, so drop the packet here
  722  722           * if the h/w checksum flags are not declared.
 723  723           */
 724  724          if (ctx->lso_flag) {
 725  725                  if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
 726  726                      (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
 727  727                          IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
 728  728                              "checksum flags are not specified when doing LSO");
 729  729                          return (-1);
 730  730                  }
 731  731          }
 732  732  
 733  733          etype = 0;
 734  734          mac_hdr_len = 0;
 735  735          l4_proto = 0;
 736  736  
 737  737          /*
  738  738           * First, get the position of the ether_type/ether_tpid.
  739  739           * Here we don't assume the ether (VLAN) header is fully included
  740  740           * in one mblk fragment, so we go through the fragments to parse
 741  741           * the ether type.
 742  742           */
 743  743          size = len = MBLKL(mp);
 744  744          offset = offsetof(struct ether_header, ether_type);
 745  745          while (size <= offset) {
 746  746                  mp = mp->b_cont;
 747  747                  ASSERT(mp != NULL);
 748  748                  len = MBLKL(mp);
 749  749                  size += len;
 750  750          }
 751  751          pos = mp->b_rptr + offset + len - size;
 752  752  
 753  753          etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 754  754          if (etype == ETHERTYPE_VLAN) {
 755  755                  /*
 756  756                   * Get the position of the ether_type in VLAN header
 757  757                   */
 758  758                  offset = offsetof(struct ether_vlan_header, ether_type);
 759  759                  while (size <= offset) {
 760  760                          mp = mp->b_cont;
 761  761                          ASSERT(mp != NULL);
 762  762                          len = MBLKL(mp);
 763  763                          size += len;
 764  764                  }
 765  765                  pos = mp->b_rptr + offset + len - size;
 766  766  
 767  767                  etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 768  768                  mac_hdr_len = sizeof (struct ether_vlan_header);
 769  769          } else {
 770  770                  mac_hdr_len = sizeof (struct ether_header);
 771  771          }
 772  772  
 773  773          /*
 774  774           * Here we don't assume the IP(V6) header is fully included in
 775  775           * one mblk fragment.
 776  776           */
 777  777          switch (etype) {
 778  778          case ETHERTYPE_IP:
 779  779                  if (ctx->lso_flag) {
 780  780                          offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
 781  781                          while (size <= offset) {
 782  782                                  mp = mp->b_cont;
 783  783                                  ASSERT(mp != NULL);
 784  784                                  len = MBLKL(mp);
 785  785                                  size += len;
 786  786                          }
 787  787                          pos = mp->b_rptr + offset + len - size;
 788  788                          *((uint16_t *)(uintptr_t)(pos)) = 0;
 789  789  
 790  790                          offset = offsetof(ipha_t, ipha_hdr_checksum) +
 791  791                              mac_hdr_len;
 792  792                          while (size <= offset) {
 793  793                                  mp = mp->b_cont;
 794  794                                  ASSERT(mp != NULL);
 795  795                                  len = MBLKL(mp);
 796  796                                  size += len;
 797  797                          }
 798  798                          pos = mp->b_rptr + offset + len - size;
 799  799                          *((uint16_t *)(uintptr_t)(pos)) = 0;
 800  800  
 801  801                          /*
  802  802                           * To perform ixgbe LSO, the tcp checksum field of the
  803  803                           * packet also needs to be filled with the following
  804  804                           * pseudo-header checksum:
  805  805                           * (ip_source_addr, ip_destination_addr, l4_proto)
  806  806                           * Currently the tcp/ip stack has already done this.
 807  807                           */
 808  808                  }
 809  809  
 810  810                  offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
 811  811                  while (size <= offset) {
 812  812                          mp = mp->b_cont;
 813  813                          ASSERT(mp != NULL);
 814  814                          len = MBLKL(mp);
 815  815                          size += len;
 816  816                  }
 817  817                  pos = mp->b_rptr + offset + len - size;
 818  818  
 819  819                  l4_proto = *(uint8_t *)pos;
 820  820                  break;
 821  821          case ETHERTYPE_IPV6:
 822  822                  offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
 823  823                  while (size <= offset) {
 824  824                          mp = mp->b_cont;
 825  825                          ASSERT(mp != NULL);
 826  826                          len = MBLKL(mp);
 827  827                          size += len;
 828  828                  }
 829  829                  pos = mp->b_rptr + offset + len - size;
 830  830  
 831  831                  l4_proto = *(uint8_t *)pos;
 832  832                  break;
 833  833          default:
 834  834                  /* Unrecoverable error */
 835  835                  IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
 836  836                  return (-2);
 837  837          }
 838  838  
 839  839          if (ctx->lso_flag) {
 840  840                  offset = mac_hdr_len + start;
 841  841                  while (size <= offset) {
 842  842                          mp = mp->b_cont;
 843  843                          ASSERT(mp != NULL);
 844  844                          len = MBLKL(mp);
 845  845                          size += len;
 846  846                  }
 847  847                  pos = mp->b_rptr + offset + len - size;
 848  848  
 849  849                  l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
 850  850          } else {
 851  851                  /*
 852  852                   * l4 header length is only required for LSO
 853  853                   */
 854  854                  l4_hdr_len = 0;
 855  855          }
 856  856  
 857  857          ctx->mac_hdr_len = mac_hdr_len;
 858  858          ctx->ip_hdr_len = start;
 859  859          ctx->l4_proto = l4_proto;
 860  860          ctx->l4_hdr_len = l4_hdr_len;
 861  861  
 862  862          return (0);
 863  863  }
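
ixgbe_get_context() repeatedly locates a header field at a given byte offset even when the headers straddle mblk fragments. A small self-contained sketch of that walk, using a hypothetical fragment structure in place of mblk_t:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    struct frag {                       /* hypothetical stand-in for an mblk */
        const uint8_t *data;
        size_t len;
        struct frag *next;
    };

    /*
     * Return a pointer to the byte at 'offset' from the start of the chain,
     * walking fragments the same way the offset loops above do, or NULL if
     * the chain is shorter than offset + 1 bytes.
     */
    static const uint8_t *
    chain_byte_at(const struct frag *f, size_t offset)
    {
        size_t size = f->len;

        while (size <= offset) {
            f = f->next;
            if (f == NULL)
                return (NULL);
            size += f->len;
        }
        return (f->data + offset + f->len - size);
    }

    int
    main(void)
    {
        uint8_t a[] = { 0x00, 0x11, 0x22 };
        uint8_t b[] = { 0x33, 0x44, 0x55, 0x66 };
        struct frag f2 = { b, sizeof (b), NULL };
        struct frag f1 = { a, sizeof (a), &f2 };

        const uint8_t *p = chain_byte_at(&f1, 4);   /* byte 4 lives in f2 */
        if (p != NULL)
            printf("byte 4 = 0x%02x\n", *p);        /* prints 0x44 */
        return (0);
    }
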
 864  864  
 865  865  /*
 866  866   * ixgbe_check_context
 867  867   *
 868  868   * Check if a new context descriptor is needed
 869  869   */
 870  870  static boolean_t
 871  871  ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
 872  872  {
 873  873          ixgbe_tx_context_t *last;
 874  874  
 875  875          if (ctx == NULL)
 876  876                  return (B_FALSE);
 877  877  
 878  878          /*
 879  879           * Compare the context data retrieved from the mblk and the
  880  880           * stored data of the last context descriptor. The fields that
  881  881           * need to be checked are:
 882  882           *      hcksum_flags
 883  883           *      l4_proto
 884  884           *      mac_hdr_len
 885  885           *      ip_hdr_len
 886  886           *      lso_flag
 887  887           *      mss (only checked for LSO)
  888  888           *      l4_hdr_len (only checked for LSO)
  889  889           * If any one of the above fields has changed, a new context
  890  890           * descriptor will be needed.
 891  891           */
 892  892          last = &tx_ring->tx_context;
 893  893  
 894  894          if ((ctx->hcksum_flags != last->hcksum_flags) ||
 895  895              (ctx->l4_proto != last->l4_proto) ||
 896  896              (ctx->mac_hdr_len != last->mac_hdr_len) ||
 897  897              (ctx->ip_hdr_len != last->ip_hdr_len) ||
 898  898              (ctx->lso_flag != last->lso_flag) ||
 899  899              (ctx->lso_flag && ((ctx->mss != last->mss) ||
 900  900              (ctx->l4_hdr_len != last->l4_hdr_len)))) {
 901  901                  return (B_TRUE);
 902  902          }
 903  903  
 904  904          return (B_FALSE);
 905  905  }
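
The field comparison above lets the driver skip redundant context descriptors when consecutive packets share the same offload parameters. A compact illustration of that caching idea, with a simplified context record (the field set and whole-struct comparison below are assumptions, not the driver's exact logic):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <stdbool.h>

    struct offload_ctx {                /* simplified ixgbe_tx_context_t stand-in */
        uint32_t hcksum_flags;
        uint32_t l4_proto;
        uint32_t mac_hdr_len;
        uint32_t ip_hdr_len;
    };

    /* Emit a new context descriptor only when the parameters changed. */
    static bool
    need_new_context(const struct offload_ctx *last, const struct offload_ctx *cur)
    {
        return (memcmp(last, cur, sizeof (*cur)) != 0);
    }

    int
    main(void)
    {
        struct offload_ctx last = { 0 };
        struct offload_ctx pkt = { 0x1, 6, 14, 20 };    /* TCP over IPv4 */

        for (int i = 0; i < 3; i++) {
            if (need_new_context(&last, &pkt)) {
                printf("packet %d: load a context descriptor\n", i);
                last = pkt;             /* cached per tx ring in the driver */
            } else {
                printf("packet %d: reuse the cached context\n", i);
            }
        }
        return (0);
    }
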
 906  906  
 907  907  /*
 908  908   * ixgbe_fill_context
 909  909   *
  910  910   * Fill the context descriptor with hardware checksum information
 911  911   */
 912  912  static void
 913  913  ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
 914  914      ixgbe_tx_context_t *ctx)
 915  915  {
 916  916          /*
 917  917           * Fill the context descriptor with the checksum
 918  918           * context information we've got.
 919  919           */
 920  920          ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
 921  921          ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
 922  922              IXGBE_ADVTXD_MACLEN_SHIFT;
 923  923  
 924  924          ctx_tbd->type_tucmd_mlhl =
 925  925              IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
 926  926  
 927  927          if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
 928  928                  ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
 929  929  
 930  930          if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
 931  931                  switch (ctx->l4_proto) {
 932  932                  case IPPROTO_TCP:
 933  933                          ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
 934  934                          break;
 935  935                  case IPPROTO_UDP:
 936  936                          /*
 937  937                           * We don't have to explicitly set:
 938  938                           *      ctx_tbd->type_tucmd_mlhl |=
 939  939                           *          IXGBE_ADVTXD_TUCMD_L4T_UDP;
 940  940                           * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
 941  941                           */
 942  942                          break;
 943  943                  default:
 944  944                          /* Unrecoverable error */
 945  945                          IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
 946  946                          break;
 947  947                  }
 948  948          }
 949  949  
 950  950          ctx_tbd->seqnum_seed = 0;
 951  951  
 952  952          if (ctx->lso_flag) {
 953  953                  ctx_tbd->mss_l4len_idx =
 954  954                      (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
 955  955                      (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
 956  956          } else {
 957  957                  ctx_tbd->mss_l4len_idx = 0;
 958  958          }
 959  959  }
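
ixgbe_fill_context() packs several header lengths and the MSS into single 32-bit descriptor words by shifting each field into position. A generic illustration of that kind of field packing; the shift values below are invented for the example and are not the hardware's layout:

    #include <stdio.h>
    #include <stdint.h>

    /* Invented field positions, purely for illustration. */
    #define EX_IPLEN_SHIFT   0
    #define EX_MACLEN_SHIFT  9
    #define EX_L4LEN_SHIFT   8
    #define EX_MSS_SHIFT     16

    int
    main(void)
    {
        uint32_t ip_hdr_len = 20, mac_hdr_len = 14;
        uint32_t l4_hdr_len = 20, mss = 1448;

        uint32_t vlan_macip_lens =
            (ip_hdr_len << EX_IPLEN_SHIFT) |
            (mac_hdr_len << EX_MACLEN_SHIFT);
        uint32_t mss_l4len_idx =
            (l4_hdr_len << EX_L4LEN_SHIFT) |
            (mss << EX_MSS_SHIFT);

        printf("vlan_macip_lens = 0x%08x\n", (unsigned)vlan_macip_lens);
        printf("mss_l4len_idx   = 0x%08x\n", (unsigned)mss_l4len_idx);
        return (0);
    }
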
 960  960  
 961  961  /*
 962  962   * ixgbe_tx_fill_ring
 963  963   *
 964  964   * Fill the tx descriptor ring with the data
 965  965   */
 966  966  static int
 967  967  ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
 968  968      ixgbe_tx_context_t *ctx, size_t mbsize)
 969  969  {
 970  970          struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
 971  971          boolean_t load_context;
 972  972          uint32_t index, tcb_index, desc_num;
 973  973          union ixgbe_adv_tx_desc *tbd, *first_tbd;
 974  974          tx_control_block_t *tcb, *first_tcb;
 975  975          uint32_t hcksum_flags;
 976  976          int i;
 977  977  
 978  978          ASSERT(mutex_owned(&tx_ring->tx_lock));
 979  979  
 980  980          tbd = NULL;
 981  981          first_tbd = NULL;
 982  982          first_tcb = NULL;
 983  983          desc_num = 0;
 984  984          hcksum_flags = 0;
 985  985          load_context = B_FALSE;
 986  986  
 987  987          /*
 988  988           * Get the index of the first tx descriptor that will be filled,
 989  989           * and the index of the first work list item that will be attached
 990  990           * with the first used tx control block in the pending list.
 991  991           * Note: the two indexes are the same.
 992  992           */
 993  993          index = tx_ring->tbd_tail;
 994  994          tcb_index = tx_ring->tbd_tail;
 995  995  
 996  996          if (ctx != NULL) {
 997  997                  hcksum_flags = ctx->hcksum_flags;
 998  998  
 999  999                  /*
1000 1000                   * Check if a new context descriptor is needed for this packet
1001 1001                   */
1002 1002                  load_context = ixgbe_check_context(tx_ring, ctx);
1003 1003  
1004 1004                  if (load_context) {
1005 1005                          tbd = &tx_ring->tbd_ring[index];
1006 1006  
1007 1007                          /*
1008 1008                           * Fill the context descriptor with the
 1009 1009                           * hardware checksum offload information.
1010 1010                           */
1011 1011                          ixgbe_fill_context(
1012 1012                              (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
1013 1013  
1014 1014                          index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1015 1015                          desc_num++;
1016 1016  
1017 1017                          /*
1018 1018                           * Store the checksum context data if
1019 1019                           * a new context descriptor is added
1020 1020                           */
1021 1021                          tx_ring->tx_context = *ctx;
1022 1022                  }
1023 1023          }
1024 1024  
1025 1025          first_tbd = &tx_ring->tbd_ring[index];
1026 1026  
1027 1027          /*
1028 1028           * Fill tx data descriptors with the data saved in the pending list.
1029 1029           * The tx control blocks in the pending list are added to the work list
1030 1030           * at the same time.
1031 1031           *
1032 1032           * The work list is strictly 1:1 corresponding to the descriptor ring.
1033 1033           * One item of the work list corresponds to one tx descriptor. Because
1034 1034           * one tx control block can span multiple tx descriptors, the tx
1035 1035           * control block will be added to the first work list item that
1036 1036           * corresponds to the first tx descriptor generated from that tx
1037 1037           * control block.
1038 1038           */
1039 1039          tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1040 1040          first_tcb = tcb;
1041 1041          while (tcb != NULL) {
1042 1042  
1043 1043                  for (i = 0; i < tcb->desc_num; i++) {
1044 1044                          tbd = &tx_ring->tbd_ring[index];
1045 1045  
1046 1046                          tbd->read.buffer_addr = tcb->desc[i].address;
1047 1047                          tbd->read.cmd_type_len = tcb->desc[i].length;
1048 1048  
1049 1049                          tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
1050 1050                              | IXGBE_ADVTXD_DTYP_DATA;
1051 1051  
1052 1052                          tbd->read.olinfo_status = 0;
1053 1053  
1054 1054                          index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1055 1055                          desc_num++;
1056 1056                  }
1057 1057  
1058 1058                  /*
1059 1059                   * Add the tx control block to the work list
1060 1060                   */
1061 1061                  ASSERT(tx_ring->work_list[tcb_index] == NULL);
1062 1062                  tx_ring->work_list[tcb_index] = tcb;
1063 1063  
1064 1064                  tcb_index = index;
1065 1065                  tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1066 1066          }
1067 1067  
1068 1068          if (load_context) {
1069 1069                  /*
 1070 1070                   * Count the context descriptor for
1071 1071                   * the first tx control block.
1072 1072                   */
1073 1073                  first_tcb->desc_num++;
1074 1074          }
1075 1075          first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
1076 1076  
1077 1077          /*
1078 1078           * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
1079 1079           * valid in the first descriptor of the packet.
1080 1080           * Setting paylen in every first_tbd for all parts.
1081      -         * 82599 requires the packet length in paylen field with or without
1082      -         * LSO and 82598 will ignore it in non-LSO mode.
     1081 +         * 82599 and X540 require the packet length in paylen field with or
     1082 +         * without LSO and 82598 will ignore it in non-LSO mode.
1083 1083           */
1084 1084          ASSERT(first_tbd != NULL);
1085 1085          first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
1086 1086  
1087 1087          switch (hw->mac.type) {
1088 1088          case ixgbe_mac_82598EB:
1089 1089                  if (ctx != NULL && ctx->lso_flag) {
1090 1090                          first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1091 1091                          first_tbd->read.olinfo_status |=
1092 1092                              (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1093 1093                              - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1094 1094                  }
1095 1095                  break;
1096 1096  
1097 1097          case ixgbe_mac_82599EB:
     1098 +        case ixgbe_mac_X540:
1098 1099                  if (ctx != NULL && ctx->lso_flag) {
1099 1100                          first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1100 1101                          first_tbd->read.olinfo_status |=
1101 1102                              (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1102 1103                              - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1103 1104                  } else {
1104 1105                          first_tbd->read.olinfo_status |=
1105 1106                              (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1106 1107                  }
1107 1108                  break;
1108 1109  
1109 1110          default:
1110 1111                  break;
1111 1112          }
1112 1113  
1113 1114          /* Set hardware checksum bits */
1114 1115          if (hcksum_flags != 0) {
1115 1116                  if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1116 1117                          first_tbd->read.olinfo_status |=
1117 1118                              IXGBE_ADVTXD_POPTS_IXSM;
1118 1119                  if (hcksum_flags & HCK_PARTIALCKSUM)
1119 1120                          first_tbd->read.olinfo_status |=
1120 1121                              IXGBE_ADVTXD_POPTS_TXSM;
1121 1122          }
1122 1123  
1123 1124          /*
1124 1125           * The last descriptor of packet needs End Of Packet (EOP),
1125 1126           * and Report Status (RS) bits set
1126 1127           */
1127 1128          ASSERT(tbd != NULL);
1128 1129          tbd->read.cmd_type_len |=
1129 1130              IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1130 1131  
1131 1132          /*
1132 1133           * Sync the DMA buffer of the tx descriptor ring
1133 1134           */
1134 1135          DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1135 1136  
1136 1137          /*
1137 1138           * Update the number of the free tx descriptors.
1138 1139           * The mutual exclusion between the transmission and the recycling
1139 1140           * (for the tx descriptor ring and the work list) is implemented
1140 1141           * with the atomic operation on the number of the free tx descriptors.
1141 1142           *
1142 1143           * Note: we should always decrement the counter tbd_free before
1143 1144           * advancing the hardware TDT pointer to avoid the race condition -
 1144 1145           * before the counter tbd_free is decremented, the transmission of the
 1145 1146           * tx descriptors may already be done and the counter tbd_free increased by
1146 1147           * the tx recycling.
1147 1148           */
1148 1149          i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1149 1150          ASSERT(i >= 0);
1150 1151  
1151 1152          tx_ring->tbd_tail = index;
1152 1153  
1153 1154          /*
1154 1155           * Advance the hardware TDT pointer of the tx descriptor ring
1155 1156           */
1156 1157          IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1157 1158  
1158 1159          if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1159 1160              DDI_FM_OK) {
1160 1161                  ddi_fm_service_impact(tx_ring->ixgbe->dip,
1161 1162                      DDI_SERVICE_DEGRADED);
1162 1163                  atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1163 1164          }
1164 1165  
1165 1166          return (desc_num);
1166 1167  }
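
ixgbe_tx_fill_ring() advances through the descriptor ring with NEXT_INDEX() and steps back with PREV_INDEX(). Assuming those macros behave as plain modular wrap-around helpers (an assumption; the exact definitions live in the driver headers), the index arithmetic works like this:

    #include <stdio.h>
    #include <stdint.h>

    /* Assumed behaviour: advance/retreat an index on a ring of 'size' slots. */
    static uint32_t
    next_index(uint32_t index, uint32_t step, uint32_t size)
    {
        return ((index + step) % size);
    }

    static uint32_t
    prev_index(uint32_t index, uint32_t step, uint32_t size)
    {
        return ((index + size - step) % size);
    }

    int
    main(void)
    {
        uint32_t ring_size = 8, tail = 6;

        /* Filling three descriptors wraps past the end of the ring. */
        for (int i = 0; i < 3; i++) {
            printf("fill slot %u\n", (unsigned)tail);
            tail = next_index(tail, 1, ring_size);
        }
        printf("last filled slot: %u\n",
            (unsigned)prev_index(tail, 1, ring_size));
        return (0);
    }
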
1167 1168  
1168 1169  /*
1169 1170   * ixgbe_save_desc
1170 1171   *
1171 1172   * Save the address/length pair to the private array
1172 1173   * of the tx control block. The address/length pairs
1173 1174   * will be filled into the tx descriptor ring later.
1174 1175   */
1175 1176  static void
1176 1177  ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1177 1178  {
1178 1179          sw_desc_t *desc;
1179 1180  
1180 1181          desc = &tcb->desc[tcb->desc_num];
1181 1182          desc->address = address;
1182 1183          desc->length = length;
1183 1184  
1184 1185          tcb->desc_num++;
1185 1186  }
1186 1187  
1187 1188  /*
1188 1189   * ixgbe_tx_recycle_legacy
1189 1190   *
1190 1191   * Recycle the tx descriptors and tx control blocks.
1191 1192   *
1192 1193   * The work list is traversed to check if the corresponding
1193 1194   * tx descriptors have been transmitted. If so, the resources
1194 1195   * bound to the tx control blocks will be freed, and those
1195 1196   * tx control blocks will be returned to the free list.
1196 1197   */
1197 1198  uint32_t
1198 1199  ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1199 1200  {
1200 1201          uint32_t index, last_index, prev_index;
1201 1202          int desc_num;
1202 1203          boolean_t desc_done;
1203 1204          tx_control_block_t *tcb;
1204 1205          link_list_t pending_list;
1205 1206          ixgbe_t *ixgbe = tx_ring->ixgbe;
1206 1207  
1207 1208          mutex_enter(&tx_ring->recycle_lock);
1208 1209  
1209 1210          ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1210 1211  
1211 1212          if (tx_ring->tbd_free == tx_ring->ring_size) {
1212 1213                  tx_ring->recycle_fail = 0;
1213 1214                  tx_ring->stall_watchdog = 0;
1214 1215                  if (tx_ring->reschedule) {
1215 1216                          tx_ring->reschedule = B_FALSE;
1216 1217                          mac_tx_ring_update(ixgbe->mac_hdl,
1217 1218                              tx_ring->ring_handle);
1218 1219                  }
1219 1220                  mutex_exit(&tx_ring->recycle_lock);
1220 1221                  return (0);
1221 1222          }
1222 1223  
1223 1224          /*
1224 1225           * Sync the DMA buffer of the tx descriptor ring
1225 1226           */
1226 1227          DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1227 1228  
1228 1229          if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1229 1230                  mutex_exit(&tx_ring->recycle_lock);
1230 1231                  ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1231 1232                  atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1232 1233                  return (0);
1233 1234          }
1234 1235  
1235 1236          LINK_LIST_INIT(&pending_list);
1236 1237          desc_num = 0;
1237 1238          index = tx_ring->tbd_head;      /* Index of next tbd/tcb to recycle */
1238 1239  
1239 1240          tcb = tx_ring->work_list[index];
1240 1241          ASSERT(tcb != NULL);
1241 1242  
1242 1243          while (tcb != NULL) {
1243 1244                  /*
1244 1245                   * Get the last tx descriptor of this packet.
1245 1246                   * If the last tx descriptor is done, then
1246 1247                   * we can recycle all descriptors of the packet,
1247 1248                   * which usually spans several tx control blocks.
1248 1249                   * For 82599, LSO descriptors cannot be recycled
1249 1250                   * until the whole packet's transmission is done;
1250 1251                   * that is why packet-level recycling is used here.
1251 1252                   * For 82598, there is no such limit.
1252 1253                   */
1253 1254                  last_index = tcb->last_index;
1254 1255                  /*
1255 1256                   * MAX_TX_RING_SIZE is used to check whether
1256 1257                   * the index holds a valid value.
1257 1258                   */
1258 1259                  if (last_index == MAX_TX_RING_SIZE)
1259 1260                          break;
1260 1261  
1261 1262                  /*
1262 1263                   * Check if the Descriptor Done bit is set
1263 1264                   */
1264 1265                  desc_done = tx_ring->tbd_ring[last_index].wb.status &
1265 1266                      IXGBE_TXD_STAT_DD;
1266 1267                  if (desc_done) {
1267 1268                          /*
1268 1269                           * recycle all descriptors of the packet
1269 1270                           */
1270 1271                          while (tcb != NULL) {
1271 1272                                  /*
1272 1273                                   * Strip off the tx control block from
1273 1274                                   * the work list, and add it to the
1274 1275                                   * pending list.
1275 1276                                   */
1276 1277                                  tx_ring->work_list[index] = NULL;
1277 1278                                  LIST_PUSH_TAIL(&pending_list, &tcb->link);
1278 1279  
1279 1280                                  /*
1280 1281                                   * Count the total number of the tx
1281 1282                                   * descriptors recycled
1282 1283                                   */
1283 1284                                  desc_num += tcb->desc_num;
1284 1285  
1285 1286                                  index = NEXT_INDEX(index, tcb->desc_num,
1286 1287                                      tx_ring->ring_size);
1287 1288  
1288 1289                                  tcb = tx_ring->work_list[index];
1289 1290  
1290 1291                                  prev_index = PREV_INDEX(index, 1,
1291 1292                                      tx_ring->ring_size);
1292 1293                                  if (prev_index == last_index)
1293 1294                                          break;
1294 1295                          }
1295 1296                  } else {
1296 1297                          break;
1297 1298                  }
1298 1299          }
1299 1300  
1300 1301          /*
1301 1302           * If no tx descriptors are recycled, no need to do more processing
1302 1303           */
1303 1304          if (desc_num == 0) {
1304 1305                  tx_ring->recycle_fail++;
1305 1306                  mutex_exit(&tx_ring->recycle_lock);
1306 1307                  return (0);
1307 1308          }
1308 1309  
1309 1310          tx_ring->recycle_fail = 0;
1310 1311          tx_ring->stall_watchdog = 0;
1311 1312  
1312 1313          /*
1313 1314           * Update the head index of the tx descriptor ring
1314 1315           */
1315 1316          tx_ring->tbd_head = index;
1316 1317  
1317 1318          /*
1318 1319           * Update the number of the free tx descriptors with atomic operations
1319 1320           */
1320 1321          atomic_add_32(&tx_ring->tbd_free, desc_num);
1321 1322  
1322 1323          if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1323 1324              (tx_ring->reschedule)) {
1324 1325                  tx_ring->reschedule = B_FALSE;
1325 1326                  mac_tx_ring_update(ixgbe->mac_hdl,
1326 1327                      tx_ring->ring_handle);
1327 1328          }
1328 1329          mutex_exit(&tx_ring->recycle_lock);
1329 1330  
1330 1331          /*
1331 1332           * Free the resources used by the tx control blocks
1332 1333           * in the pending list
1333 1334           */
1334 1335          tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1335 1336          while (tcb != NULL) {
1336 1337                  /*
1337 1338                   * Release the resources occupied by the tx control block
1338 1339                   */
1339 1340                  ixgbe_free_tcb(tcb);
1340 1341  
1341 1342                  tcb = (tx_control_block_t *)
1342 1343                      LIST_GET_NEXT(&pending_list, &tcb->link);
1343 1344          }
1344 1345  
1345 1346          /*
1346 1347           * Add the tx control blocks in the pending list to the free list.
1347 1348           */
1348 1349          ixgbe_put_free_list(tx_ring, &pending_list);
1349 1350  
1350 1351          return (desc_num);
1351 1352  }
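/*
 * Illustrative sketch, not part of this webrev: a minimal model of
 * the packet-level recycling loop above.  Each packet records the
 * ring index of its last descriptor, and the whole packet is
 * reclaimed only once that descriptor's Descriptor Done (DD) bit is
 * set.  All demo_* names are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_RING_SIZE  8
#define DEMO_STAT_DD    0x01u

typedef struct {
        uint32_t status;                /* hardware write-back status */
} demo_desc_t;

typedef struct {
        int last_index;                 /* last descriptor of the packet */
        int desc_num;                   /* descriptors used by the packet */
} demo_pkt_t;

/*
 * Walk the ring from *head, reclaiming complete packets; return the
 * number of descriptors recycled and advance *head.
 */
static int
demo_recycle(demo_desc_t *ring, demo_pkt_t **work, int *head)
{
        int index = *head;
        int recycled = 0;
        demo_pkt_t *pkt = work[index];

        while (pkt != NULL &&
            (ring[pkt->last_index].status & DEMO_STAT_DD)) {
                work[index] = NULL;
                recycled += pkt->desc_num;
                index = (index + pkt->desc_num) % DEMO_RING_SIZE;
                pkt = work[index];
        }

        *head = index;
        return (recycled);
}

int
main(void)
{
        demo_desc_t ring[DEMO_RING_SIZE] = { { 0 } };
        demo_pkt_t pkt = { 2, 3 };      /* 3 descriptors, last one at 2 */
        demo_pkt_t *work[DEMO_RING_SIZE] = { &pkt };
        int head = 0;

        ring[2].status |= DEMO_STAT_DD; /* hardware reports completion */
        (void) printf("recycled %d descriptors\n",
            demo_recycle(ring, work, &head));
        return (0);
}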
1352 1353  
1353 1354  /*
1354 1355   * ixgbe_tx_recycle_head_wb
1355 1356   *
1356 1357   * Check the head write-back, and recycle all the transmitted
1357 1358   * tx descriptors and tx control blocks.
1358 1359   */
1359 1360  uint32_t
1360 1361  ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1361 1362  {
1362 1363          uint32_t index;
1363 1364          uint32_t head_wb;
1364 1365          int desc_num;
1365 1366          tx_control_block_t *tcb;
1366 1367          link_list_t pending_list;
1367 1368          ixgbe_t *ixgbe = tx_ring->ixgbe;
1368 1369  
1369 1370          mutex_enter(&tx_ring->recycle_lock);
1370 1371  
1371 1372          ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1372 1373  
1373 1374          if (tx_ring->tbd_free == tx_ring->ring_size) {
1374 1375                  tx_ring->recycle_fail = 0;
1375 1376                  tx_ring->stall_watchdog = 0;
1376 1377                  if (tx_ring->reschedule) {
1377 1378                          tx_ring->reschedule = B_FALSE;
1378 1379                          mac_tx_ring_update(ixgbe->mac_hdl,
1379 1380                              tx_ring->ring_handle);
1380 1381                  }
1381 1382                  mutex_exit(&tx_ring->recycle_lock);
1382 1383                  return (0);
1383 1384          }
1384 1385  
1385 1386          /*
1386 1387           * Sync the DMA buffer of the tx descriptor ring
1387 1388           *
1388 1389           * Note: in head write-back mode the tx descriptors themselves are
1389 1390           * not written back; instead, the head write-back value is stored
1390 1391           * in the extra tbd at the end of the DMA area, so we still need
1391 1392           * to sync that value for the kernel.
1392 1393           *
1393 1394           * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1394 1395           */
1395 1396          (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1396 1397              sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1397 1398              sizeof (uint32_t),
1398 1399              DDI_DMA_SYNC_FORKERNEL);
1399 1400  
1400 1401          if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1401 1402                  mutex_exit(&tx_ring->recycle_lock);
1402 1403                  ddi_fm_service_impact(ixgbe->dip,
1403 1404                      DDI_SERVICE_DEGRADED);
1404 1405                  atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1405 1406                  return (0);
1406 1407          }
1407 1408  
1408 1409          LINK_LIST_INIT(&pending_list);
1409 1410          desc_num = 0;
1410 1411          index = tx_ring->tbd_head;      /* Next index to clean */
1411 1412  
1412 1413          /*
1413 1414           * Get the value of head write-back
1414 1415           */
1415 1416          head_wb = *tx_ring->tbd_head_wb;
1416 1417          while (index != head_wb) {
1417 1418                  tcb = tx_ring->work_list[index];
1418 1419                  ASSERT(tcb != NULL);
1419 1420  
1420 1421                  if (OFFSET(index, head_wb, tx_ring->ring_size) <
1421 1422                      tcb->desc_num) {
1422 1423                          /*
1423 1424                           * The current tx control block is not
1424 1425                           * completely transmitted, stop recycling
1425 1426                           */
1426 1427                          break;
1427 1428                  }
1428 1429  
1429 1430                  /*
1430 1431                   * Strip off the tx control block from the work list,
1431 1432                   * and add it to the pending list.
1432 1433                   */
1433 1434                  tx_ring->work_list[index] = NULL;
1434 1435                  LIST_PUSH_TAIL(&pending_list, &tcb->link);
1435 1436  
1436 1437                  /*
1437 1438                   * Advance the index of the tx descriptor ring
1438 1439                   */
1439 1440                  index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1440 1441  
1441 1442                  /*
1442 1443                   * Count the total number of the tx descriptors recycled
1443 1444                   */
1444 1445                  desc_num += tcb->desc_num;
1445 1446          }
1446 1447  
1447 1448          /*
1448 1449           * If no tx descriptors are recycled, no need to do more processing
1449 1450           */
1450 1451          if (desc_num == 0) {
1451 1452                  tx_ring->recycle_fail++;
1452 1453                  mutex_exit(&tx_ring->recycle_lock);
1453 1454                  return (0);
1454 1455          }
1455 1456  
1456 1457          tx_ring->recycle_fail = 0;
1457 1458          tx_ring->stall_watchdog = 0;
1458 1459  
1459 1460          /*
1460 1461           * Update the head index of the tx descriptor ring
1461 1462           */
1462 1463          tx_ring->tbd_head = index;
1463 1464  
1464 1465          /*
1465 1466           * Update the number of the free tx descriptors with atomic operations
1466 1467           */
1467 1468          atomic_add_32(&tx_ring->tbd_free, desc_num);
1468 1469  
1469 1470          if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1470 1471              (tx_ring->reschedule)) {
1471 1472                  tx_ring->reschedule = B_FALSE;
1472 1473                  mac_tx_ring_update(ixgbe->mac_hdl,
1473 1474                      tx_ring->ring_handle);
1474 1475          }
1475 1476          mutex_exit(&tx_ring->recycle_lock);
1476 1477  
1477 1478          /*
1478 1479           * Free the resources used by the tx control blocks
1479 1480           * in the pending list
1480 1481           */
1481 1482          tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1482 1483          while (tcb != NULL) {
1483 1484                  /*
1484 1485                   * Release the resources occupied by the tx control block
1485 1486                   */
1486 1487                  ixgbe_free_tcb(tcb);
1487 1488  
1488 1489                  tcb = (tx_control_block_t *)
1489 1490                      LIST_GET_NEXT(&pending_list, &tcb->link);
1490 1491          }
1491 1492  
1492 1493          /*
1493 1494           * Add the tx control blocks in the pending list to the free list.
1494 1495           */
1495 1496          ixgbe_put_free_list(tx_ring, &pending_list);
1496 1497  
1497 1498          return (desc_num);
1498 1499  }
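/*
 * Illustrative sketch, not part of this webrev: in head write-back
 * mode the hardware stores the index of the last consumed descriptor
 * in a plain memory word (the extra slot past the ring), so software
 * learns progress by reading that word instead of polling DD bits.
 * demo_offset() mirrors what the driver's OFFSET() macro is assumed
 * to compute: the wrap-aware distance from the software head to the
 * hardware head.  All demo_* names are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_RING_SIZE  16u

/* Descriptors between 'start' and 'end' on a ring of 'size' slots. */
static uint32_t
demo_offset(uint32_t start, uint32_t end, uint32_t size)
{
        return ((end >= start) ? (end - start) : (end + size - start));
}

int
main(void)
{
        /* The word the hardware would update; set by hand here. */
        volatile uint32_t head_wb = 5;
        uint32_t sw_head = 14;          /* next index to clean */

        (void) printf("descriptors ready to recycle: %u\n",
            demo_offset(sw_head, head_wb, DEMO_RING_SIZE));
        return (0);
}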
1499 1500  
1500 1501  /*
1501 1502   * ixgbe_free_tcb - free up the tx control block
1502 1503   *
1503 1504   * Free the resources of the tx control block, including
1504 1505   * unbinding the previously bound DMA handle, and reset the
1505 1506   * other control fields.
1506 1507   */
1507 1508  void
1508 1509  ixgbe_free_tcb(tx_control_block_t *tcb)
1509 1510  {
1510 1511          switch (tcb->tx_type) {
1511 1512          case USE_COPY:
1512 1513                  /*
1513 1514                   * Reset the buffer length that is used for copy
1514 1515                   */
1515 1516                  tcb->tx_buf.len = 0;
1516 1517                  break;
1517 1518          case USE_DMA:
1518 1519                  /*
1519 1520                   * Release the DMA resource that is used for
1520 1521                   * DMA binding.
1521 1522                   */
1522 1523                  (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1523 1524                  break;
1524 1525          default:
1525 1526                  break;
1526 1527          }
1527 1528  
1528 1529          /*
1529 1530           * Free the mblk
1530 1531           */
1531 1532          if (tcb->mp != NULL) {
1532 1533                  freemsg(tcb->mp);
1533 1534                  tcb->mp = NULL;
1534 1535          }
1535 1536  
1536 1537          tcb->tx_type = USE_NONE;
1537 1538          tcb->last_index = MAX_TX_RING_SIZE;
1538 1539          tcb->frag_num = 0;
1539 1540          tcb->desc_num = 0;
1540 1541  }
1541 1542  
1542 1543  /*
1543 1544   * ixgbe_get_free_list - Get a free tx control block from the free list
1544 1545   *
1545 1546   * The atomic operation on the number of available tx control blocks
1546 1547   * in the free list is used to keep this routine mutually exclusive
1547 1548   * with the routine ixgbe_put_free_list.
1548 1549   */
1549 1550  static tx_control_block_t *
1550 1551  ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
1551 1552  {
1552 1553          tx_control_block_t *tcb;
1553 1554  
1554 1555          /*
1555 1556           * Check and update the number of free tx control blocks
1556 1557           * in the free list.
1557 1558           */
1558 1559          if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
1559 1560                  return (NULL);
1560 1561  
1561 1562          mutex_enter(&tx_ring->tcb_head_lock);
1562 1563  
1563 1564          tcb = tx_ring->free_list[tx_ring->tcb_head];
1564 1565          ASSERT(tcb != NULL);
1565 1566          tx_ring->free_list[tx_ring->tcb_head] = NULL;
1566 1567          tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1567 1568              tx_ring->free_list_size);
1568 1569  
1569 1570          mutex_exit(&tx_ring->tcb_head_lock);
1570 1571  
1571 1572          return (tcb);
1572 1573  }
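/*
 * Illustrative sketch, not part of this webrev: the get side first
 * reserves an entry with an atomic decrement that fails rather than
 * going negative, and only then takes the head lock, so the get and
 * put paths never contend on a common lock.  All demo_* names are
 * hypothetical; demo_reserve() again stands in for the role the
 * driver gives to ixgbe_atomic_reserve().
 */
#include <stdatomic.h>
#include <pthread.h>
#include <stddef.h>

#define DEMO_LIST_SIZE  4

typedef struct { int id; } demo_tcb_t;

static demo_tcb_t *demo_list[DEMO_LIST_SIZE];
static atomic_int demo_free;                    /* like tcb_free */
static int demo_head;                           /* like tcb_head */
static pthread_mutex_t demo_head_lock = PTHREAD_MUTEX_INITIALIZER;

/* Subtract n only if the counter stays non-negative; -1 on failure. */
static int
demo_reserve(atomic_int *cnt, int n)
{
        int old = atomic_load(cnt);

        do {
                if (old < n)
                        return (-1);
        } while (!atomic_compare_exchange_weak(cnt, &old, old - n));

        return (old - n);
}

/* Take one entry off the free list, or NULL if none is available. */
static demo_tcb_t *
demo_get(void)
{
        demo_tcb_t *tcb;

        if (demo_reserve(&demo_free, 1) < 0)
                return (NULL);

        (void) pthread_mutex_lock(&demo_head_lock);
        tcb = demo_list[demo_head];
        demo_list[demo_head] = NULL;
        demo_head = (demo_head + 1) % DEMO_LIST_SIZE;
        (void) pthread_mutex_unlock(&demo_head_lock);

        return (tcb);
}

int
main(void)
{
        static demo_tcb_t one = { 1 };

        demo_list[0] = &one;            /* seed the list with one entry */
        atomic_store(&demo_free, 1);

        return (demo_get() == &one ? 0 : 1);
}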
1573 1574  
1574 1575  /*
1575 1576   * ixgbe_put_free_list
1576 1577   *
1577 1578   * Put a list of used tx control blocks back to the free list
1578 1579   *
1579 1580   * A mutex is used here to ensure serialization. The mutual exclusion
1580 1581   * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
1581 1582   * the atomic operation on the counter tcb_free.
1582 1583   */
1583 1584  void
1584 1585  ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1585 1586  {
1586 1587          uint32_t index;
1587 1588          int tcb_num;
1588 1589          tx_control_block_t *tcb;
1589 1590  
1590 1591          mutex_enter(&tx_ring->tcb_tail_lock);
1591 1592  
1592 1593          index = tx_ring->tcb_tail;
1593 1594  
1594 1595          tcb_num = 0;
1595 1596          tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1596 1597          while (tcb != NULL) {
1597 1598                  ASSERT(tx_ring->free_list[index] == NULL);
1598 1599                  tx_ring->free_list[index] = tcb;
1599 1600  
1600 1601                  tcb_num++;
1601 1602  
1602 1603                  index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1603 1604  
1604 1605                  tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1605 1606          }
1606 1607  
1607 1608          tx_ring->tcb_tail = index;
1608 1609  
1609 1610          /*
1610 1611           * Update the number of free tx control blocks
1611 1612           * in the free list. This operation must be placed
1612 1613           * under the protection of the lock.
1613 1614           */
1614 1615          atomic_add_32(&tx_ring->tcb_free, tcb_num);
1615 1616  
1616 1617          mutex_exit(&tx_ring->tcb_tail_lock);
1617 1618  }
  