6064 ixgbe needs X550 support
Reviewed by: Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
    
      
    
          --- old/usr/src/uts/common/io/ixgbe/ixgbe_tx.c
          +++ new/usr/src/uts/common/io/ixgbe/ixgbe_tx.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
   18   18   *
   19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  28   28   * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
       29 + * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
  29   30   */
  30   31  
  31   32  #include "ixgbe_sw.h"
  32   33  
  33   34  static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  34   35      uint32_t, boolean_t);
  35   36  static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
  36   37      uint32_t);
  37   38  static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
  38   39      ixgbe_tx_context_t *, size_t);
  39   40  static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
  40   41  static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);
  41   42  
  42   43  static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
  43   44  static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
  44   45      ixgbe_tx_context_t *);
  45   46  static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
  46   47      ixgbe_tx_context_t *);
  47   48  
  48   49  #ifndef IXGBE_DEBUG
  49   50  #pragma inline(ixgbe_save_desc)
  50   51  #pragma inline(ixgbe_get_context)
  51   52  #pragma inline(ixgbe_check_context)
  52   53  #pragma inline(ixgbe_fill_context)
  53   54  #endif
  54   55  
  55   56  /*
  56   57   * ixgbe_ring_tx
  57   58   *
  58   59   * To transmit one mblk through one specified ring.
  59   60   *
   60   61   * One mblk can consist of several fragments; each fragment
  61   62   * will be processed with different methods based on the size.
  62   63   * For the fragments with size less than the bcopy threshold,
  63   64   * they will be processed by using bcopy; otherwise, they will
  64   65   * be processed by using DMA binding.
  65   66   *
   66   67   * To process the mblk, a tx control block is taken from the
  67   68   * free list. One tx control block contains one tx buffer, which
  68   69   * is used to copy mblk fragments' data; and one tx DMA handle,
  69   70   * which is used to bind a mblk fragment with DMA resource.
  70   71   *
  71   72   * Several small mblk fragments can be copied into one tx control
  72   73   * block's buffer, and then the buffer will be transmitted with
  73   74   * one tx descriptor.
  74   75   *
  75   76   * A large fragment only binds with one tx control block's DMA
  76   77   * handle, and it can span several tx descriptors for transmitting.
  77   78   *
  78   79   * So to transmit a packet (mblk), several tx control blocks can
  79   80   * be used. After the processing, those tx control blocks will
  80   81   * be put to the work list.
  81   82   */
  82   83  mblk_t *
  83   84  ixgbe_ring_tx(void *arg, mblk_t *mp)
  84   85  {
  85   86          ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
  86   87          ixgbe_t *ixgbe = tx_ring->ixgbe;
  87   88          tx_type_t current_flag, next_flag;
  88   89          uint32_t current_len, next_len;
  89   90          uint32_t desc_total;
  90   91          size_t mbsize;
  91   92          int desc_num;
  92   93          boolean_t copy_done, eop;
  93   94          mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
  94   95          tx_control_block_t *tcb;
  95   96          ixgbe_tx_context_t tx_context, *ctx;
  96   97          link_list_t pending_list;
  97   98          uint32_t len, hdr_frag_len, hdr_len;
  98   99          uint32_t copy_thresh;
  99  100          mblk_t *hdr_new_mp = NULL;
 100  101          mblk_t *hdr_pre_mp = NULL;
 101  102          mblk_t *hdr_nmp = NULL;
 102  103  
 103  104          ASSERT(mp->b_next == NULL);
 104  105  
 105  106          if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
 106  107              (ixgbe->ixgbe_state & IXGBE_ERROR) ||
 107  108              (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
 108  109              !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
 109  110              ixgbe->link_state != LINK_STATE_UP) {
 110  111                  freemsg(mp);
 111  112                  return (NULL);
 112  113          }
 113  114  
 114  115          copy_thresh = ixgbe->tx_copy_thresh;
 115  116  
 116  117          /* Get the mblk size */
 117  118          mbsize = 0;
 118  119          for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
 119  120                  mbsize += MBLKL(nmp);
 120  121          }
 121  122  
 122  123          if (ixgbe->tx_hcksum_enable) {
 123  124                  /*
 124  125                   * Retrieve checksum context information from the mblk
 125  126                   * that will be used to decide whether/how to fill the
 126  127                   * context descriptor.
 127  128                   */
 128  129                  ctx = &tx_context;
 129  130                  if (ixgbe_get_context(mp, ctx) < 0) {
 130  131                          freemsg(mp);
 131  132                          return (NULL);
 132  133                  }
 133  134  
 134  135                  /*
 135  136                   * If the mblk size exceeds the max size ixgbe could
 136  137                   * process, then discard this mblk, and return NULL.
 137  138                   */
 138  139                  if ((ctx->lso_flag &&
 139  140                      ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
 140  141                      (!ctx->lso_flag &&
 141  142                      (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
 142  143                          freemsg(mp);
 143  144                          IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
 144  145                          return (NULL);
 145  146                  }
 146  147          } else {
 147  148                  ctx = NULL;
 148  149          }
 149  150  
 150  151          /*
 151  152           * Check and recycle tx descriptors.
 152  153           * The recycle threshold here should be selected carefully
 153  154           */
 154  155          if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
 155  156                  tx_ring->tx_recycle(tx_ring);
 156  157          }
 157  158  
 158  159          /*
 159  160           * After the recycling, if the tbd_free is less than the
  160  161           * overload_threshold, assert overload and return mp;
  161  162           * the tx will need to be rescheduled.
 162  163           */
 163  164          if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
 164  165                  tx_ring->reschedule = B_TRUE;
 165  166                  IXGBE_DEBUG_STAT(tx_ring->stat_overload);
 166  167                  return (mp);
 167  168          }
 168  169  
 169  170          /*
 170  171           * The pending_list is a linked list that is used to save
  171  172           * the tx control blocks whose packet data has been processed
  172  173           * but not yet placed on the tx descriptor ring.
 173  174           * It is used to reduce the lock contention of the tx_lock.
 174  175           */
 175  176          LINK_LIST_INIT(&pending_list);
 176  177          desc_num = 0;
 177  178          desc_total = 0;
 178  179  
 179  180          /*
  180  181           * The software should guarantee the LSO packet header (MAC+IP+TCP)
  181  182           * fits within one descriptor. Here we reallocate and refill the
  182  183           * header if its physical memory is non-contiguous.
 183  184           */
 184  185          if ((ctx != NULL) && ctx->lso_flag) {
 185  186                  /* find the last fragment of the header */
 186  187                  len = MBLKL(mp);
 187  188                  ASSERT(len > 0);
 188  189                  hdr_nmp = mp;
 189  190                  hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
 190  191                  while (len < hdr_len) {
 191  192                          hdr_pre_mp = hdr_nmp;
 192  193                          hdr_nmp = hdr_nmp->b_cont;
 193  194                          len += MBLKL(hdr_nmp);
 194  195                  }
 195  196                  /*
 196  197                   * If the header and the payload are in different mblks,
 197  198                   * we simply force the header to be copied into pre-allocated
 198  199                   * page-aligned buffer.
 199  200                   */
 200  201                  if (len == hdr_len)
 201  202                          goto adjust_threshold;
 202  203  
 203  204                  hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
 204  205                  /*
 205  206                   * There are two cases we need to reallocate a mblk for the
 206  207                   * last header fragment:
  207  208                   * 1. the header is in multiple mblks and the last fragment
  208  209                   * shares the same mblk with the payload
  209  210                   * 2. the header is in a single mblk shared with the payload
  210  211                   * and the header's physical memory is non-contiguous
 211  212                   */
 212  213                  if ((hdr_nmp != mp) ||
 213  214                      (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
 214  215                      < hdr_len)) {
 215  216                          IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);
 216  217                          /*
  217  218                           * reallocate the mblk for the last header fragment,
  218  219                           * expecting to bcopy it into the pre-allocated
  219  220                           * page-aligned buffer
 220  221                           */
 221  222                          hdr_new_mp = allocb(hdr_frag_len, NULL);
 222  223                          if (!hdr_new_mp)
 223  224                                  return (mp);
 224  225                          bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
 225  226                              hdr_frag_len);
 226  227                          /* link the new header fragment with the other parts */
 227  228                          hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
 228  229                          hdr_new_mp->b_cont = hdr_nmp;
 229  230                          if (hdr_pre_mp)
 230  231                                  hdr_pre_mp->b_cont = hdr_new_mp;
 231  232                          else
 232  233                                  mp = hdr_new_mp;
 233  234                          hdr_nmp->b_rptr += hdr_frag_len;
 234  235                  }
 235  236  adjust_threshold:
 236  237                  /*
  237  238                   * adjust the bcopy threshold to guarantee
  238  239                   * that the header is processed with bcopy
 239  240                   */
 240  241                  if (copy_thresh < hdr_len)
 241  242                          copy_thresh = hdr_len;
 242  243          }
 243  244  
 244  245          current_mp = mp;
 245  246          current_len = MBLKL(current_mp);
 246  247          /*
 247  248           * Decide which method to use for the first fragment
 248  249           */
 249  250          current_flag = (current_len <= copy_thresh) ?
 250  251              USE_COPY : USE_DMA;
 251  252          /*
 252  253           * If the mblk includes several contiguous small fragments,
 253  254           * they may be copied into one buffer. This flag is used to
 254  255           * indicate whether there are pending fragments that need to
 255  256           * be copied to the current tx buffer.
 256  257           *
 257  258           * If this flag is B_TRUE, it indicates that a new tx control
 258  259           * block is needed to process the next fragment using either
 259  260           * copy or DMA binding.
 260  261           *
 261  262           * Otherwise, it indicates that the next fragment will be
 262  263           * copied to the current tx buffer that is maintained by the
 263  264           * current tx control block. No new tx control block is needed.
 264  265           */
 265  266          copy_done = B_TRUE;
 266  267          while (current_mp) {
 267  268                  next_mp = current_mp->b_cont;
 268  269                  eop = (next_mp == NULL); /* Last fragment of the packet? */
 269  270                  next_len = eop ? 0: MBLKL(next_mp);
 270  271  
 271  272                  /*
 272  273                   * When the current fragment is an empty fragment, if
 273  274                   * the next fragment will still be copied to the current
  274  275                   * tx buffer, we cannot skip this fragment here, because
  275  276                   * the copy processing is still pending completion. We have
 276  277                   * to process this empty fragment in the tx_copy routine.
 277  278                   *
 278  279                   * If the copy processing is completed or a DMA binding
 279  280                   * processing is just completed, we can just skip this
 280  281                   * empty fragment.
 281  282                   */
 282  283                  if ((current_len == 0) && (copy_done)) {
 283  284                          current_mp = next_mp;
 284  285                          current_len = next_len;
 285  286                          current_flag = (current_len <= copy_thresh) ?
 286  287                              USE_COPY : USE_DMA;
 287  288                          continue;
 288  289                  }
 289  290  
 290  291                  if (copy_done) {
 291  292                          /*
 292  293                           * Get a new tx control block from the free list
 293  294                           */
 294  295                          tcb = ixgbe_get_free_list(tx_ring);
 295  296  
 296  297                          if (tcb == NULL) {
 297  298                                  IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 298  299                                  goto tx_failure;
 299  300                          }
 300  301  
 301  302                          /*
 302  303                           * Push the tx control block to the pending list
 303  304                           * to avoid using lock too early
 304  305                           */
 305  306                          LIST_PUSH_TAIL(&pending_list, &tcb->link);
 306  307                  }
 307  308  
 308  309                  if (current_flag == USE_COPY) {
 309  310                          /*
 310  311                           * Check whether to use bcopy or DMA binding to process
 311  312                           * the next fragment, and if using bcopy, whether we
 312  313                           * need to continue copying the next fragment into the
 313  314                           * current tx buffer.
 314  315                           */
 315  316                          ASSERT((tcb->tx_buf.len + current_len) <=
 316  317                              tcb->tx_buf.size);
 317  318  
 318  319                          if (eop) {
 319  320                                  /*
 320  321                                   * This is the last fragment of the packet, so
 321  322                                   * the copy processing will be completed with
 322  323                                   * this fragment.
 323  324                                   */
 324  325                                  next_flag = USE_NONE;
 325  326                                  copy_done = B_TRUE;
 326  327                          } else if ((tcb->tx_buf.len + current_len + next_len) >
 327  328                              tcb->tx_buf.size) {
 328  329                                  /*
 329  330                                   * If the next fragment is too large to be
 330  331                                   * copied to the current tx buffer, we need
 331  332                                   * to complete the current copy processing.
 332  333                                   */
 333  334                                  next_flag = (next_len > copy_thresh) ?
 334  335                                      USE_DMA: USE_COPY;
 335  336                                  copy_done = B_TRUE;
 336  337                          } else if (next_len > copy_thresh) {
 337  338                                  /*
 338  339                                   * The next fragment needs to be processed with
  339  340                                   * DMA binding. So the copy processing will be
 340  341                                   * completed with the current fragment.
 341  342                                   */
 342  343                                  next_flag = USE_DMA;
 343  344                                  copy_done = B_TRUE;
 344  345                          } else {
 345  346                                  /*
 346  347                                   * Continue to copy the next fragment to the
 347  348                                   * current tx buffer.
 348  349                                   */
 349  350                                  next_flag = USE_COPY;
 350  351                                  copy_done = B_FALSE;
 351  352                          }
 352  353  
 353  354                          desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
 354  355                              current_len, copy_done);
 355  356                  } else {
 356  357                          /*
 357  358                           * Check whether to use bcopy or DMA binding to process
 358  359                           * the next fragment.
 359  360                           */
 360  361                          next_flag = (next_len > copy_thresh) ?
 361  362                              USE_DMA: USE_COPY;
 362  363                          ASSERT(copy_done == B_TRUE);
 363  364  
 364  365                          desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
 365  366                              current_len);
 366  367                  }
 367  368  
 368  369                  if (desc_num > 0)
 369  370                          desc_total += desc_num;
 370  371                  else if (desc_num < 0)
 371  372                          goto tx_failure;
 372  373  
 373  374                  current_mp = next_mp;
 374  375                  current_len = next_len;
 375  376                  current_flag = next_flag;
 376  377          }
 377  378  
 378  379          /*
 379  380           * Attach the mblk to the last tx control block
 380  381           */
 381  382          ASSERT(tcb);
 382  383          ASSERT(tcb->mp == NULL);
 383  384          tcb->mp = mp;
 384  385  
 385  386          /*
  386  387           * The 82598/82599 chipsets have a limitation that no more than 32 tx
  387  388           * descriptors can be transmitted out at one time.
  388  389           *
  389  390           * Here is a workaround for it: pull up the mblk then send it
  390  391           * out using DMA binding. By doing so, no more than MAX_COOKIE (18)
  391  392           * descriptors are needed.
 392  393           */
 393  394          if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
 394  395                  IXGBE_DEBUG_STAT(tx_ring->stat_break_tbd_limit);
 395  396  
 396  397                  /*
 397  398                   * Discard the mblk and free the used resources
 398  399                   */
 399  400                  tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 400  401                  while (tcb) {
 401  402                          tcb->mp = NULL;
 402  403                          ixgbe_free_tcb(tcb);
 403  404                          tcb = (tx_control_block_t *)
 404  405                              LIST_GET_NEXT(&pending_list, &tcb->link);
 405  406                  }
 406  407  
 407  408                  /*
 408  409                   * Return the tx control blocks in the pending list to
 409  410                   * the free list.
 410  411                   */
 411  412                  ixgbe_put_free_list(tx_ring, &pending_list);
 412  413  
 413  414                  /*
 414  415                   * pull up the mblk and send it out with bind way
  415  416           * pull up the mblk and send it out using DMA binding
 416  417                  if ((pull_mp = msgpullup(mp, -1)) == NULL) {
 417  418                          tx_ring->reschedule = B_TRUE;
 418  419  
 419  420                          /*
  420  421                           * If a new mblk has been allocated for the last header
  421  422                           * fragment of an LSO packet, we should restore the
 422  423                           * modified mp.
 423  424                           */
 424  425                          if (hdr_new_mp) {
 425  426                                  hdr_new_mp->b_cont = NULL;
 426  427                                  freeb(hdr_new_mp);
 427  428                                  hdr_nmp->b_rptr -= hdr_frag_len;
 428  429                                  if (hdr_pre_mp)
 429  430                                          hdr_pre_mp->b_cont = hdr_nmp;
 430  431                                  else
 431  432                                          mp = hdr_nmp;
 432  433                          }
 433  434                          return (mp);
 434  435                  }
 435  436  
 436  437                  LINK_LIST_INIT(&pending_list);
 437  438                  desc_total = 0;
 438  439  
 439  440                  /*
  440  441           * if the packet is an LSO packet, we simply
  441  442           * transmit the header in one descriptor using bcopy
 442  443                   */
 443  444                  if ((ctx != NULL) && ctx->lso_flag) {
 444  445                          hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
 445  446                              ctx->l4_hdr_len;
 446  447  
 447  448                          tcb = ixgbe_get_free_list(tx_ring);
 448  449                          if (tcb == NULL) {
 449  450                                  IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 450  451                                  goto tx_failure;
 451  452                          }
 452  453                          desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
 453  454                              hdr_len, B_TRUE);
 454  455                          LIST_PUSH_TAIL(&pending_list, &tcb->link);
  455  456                          desc_total += desc_num;
 456  457  
 457  458                          pull_mp->b_rptr += hdr_len;
 458  459                  }
 459  460  
 460  461                  tcb = ixgbe_get_free_list(tx_ring);
 461  462                  if (tcb == NULL) {
 462  463                          IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
 463  464                          goto tx_failure;
 464  465                  }
 465  466                  if ((ctx != NULL) && ctx->lso_flag) {
 466  467                          desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 467  468                              mbsize - hdr_len);
 468  469                  } else {
 469  470                          desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
 470  471                              mbsize);
 471  472                  }
 472  473                  if (desc_num < 0) {
 473  474                          goto tx_failure;
 474  475                  }
 475  476                  LIST_PUSH_TAIL(&pending_list, &tcb->link);
 476  477  
 477  478                  desc_total += desc_num;
 478  479                  tcb->mp = pull_mp;
 479  480          }
 480  481  
 481  482          /*
  482  483           * Before filling the tx descriptor ring with the data, we need to
  483  484           * ensure there are adequate free descriptors for transmit
  484  485           * (including one context descriptor).
  485  486           * Do not use up all the tx descriptors.
  486  487           * Otherwise tx recycle will fail and cause a false hang.
 487  488           */
 488  489          if (tx_ring->tbd_free <= (desc_total + 1)) {
 489  490                  tx_ring->tx_recycle(tx_ring);
 490  491          }
 491  492  
 492  493          mutex_enter(&tx_ring->tx_lock);
 493  494          /*
 494  495           * If the number of free tx descriptors is not enough for transmit
 495  496           * then return mp.
 496  497           *
 497  498           * Note: we must put this check under the mutex protection to
 498  499           * ensure the correctness when multiple threads access it in
 499  500           * parallel.
 500  501           */
 501  502          if (tx_ring->tbd_free <= (desc_total + 1)) {
 502  503                  IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
 503  504                  mutex_exit(&tx_ring->tx_lock);
 504  505                  goto tx_failure;
 505  506          }
 506  507  
 507  508          desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
 508  509              mbsize);
 509  510  
 510  511          ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
 511  512  
 512  513          tx_ring->stat_obytes += mbsize;
 513  514          tx_ring->stat_opackets ++;
 514  515  
 515  516          mutex_exit(&tx_ring->tx_lock);
 516  517  
 517  518          /*
  518  519           * Now that the transmission has succeeded, we need to free the
  519  520           * original mp if we used the pulled-up mblk for transmission.
 520  521           */
 521  522          if (pull_mp) {
 522  523                  freemsg(mp);
 523  524          }
 524  525  
 525  526          return (NULL);
 526  527  
 527  528  tx_failure:
 528  529          /*
  529  530           * If the transmission fails, we need to free the pulled-up mblk.
 530  531           */
 531  532          if (pull_mp) {
 532  533                  freemsg(pull_mp);
 533  534          }
 534  535  
 535  536          /*
  536  537           * If a new mblk has been allocated for the last header
  537  538           * fragment of an LSO packet, we should restore the
 538  539           * modified mp.
 539  540           */
 540  541          if (hdr_new_mp) {
 541  542                  hdr_new_mp->b_cont = NULL;
 542  543                  freeb(hdr_new_mp);
 543  544                  hdr_nmp->b_rptr -= hdr_frag_len;
 544  545                  if (hdr_pre_mp)
 545  546                          hdr_pre_mp->b_cont = hdr_nmp;
 546  547                  else
 547  548                          mp = hdr_nmp;
 548  549          }
 549  550          /*
 550  551           * Discard the mblk and free the used resources
 551  552           */
 552  553          tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
 553  554          while (tcb) {
 554  555                  tcb->mp = NULL;
 555  556  
 556  557                  ixgbe_free_tcb(tcb);
 557  558  
 558  559                  tcb = (tx_control_block_t *)
 559  560                      LIST_GET_NEXT(&pending_list, &tcb->link);
 560  561          }
 561  562  
 562  563          /*
 563  564           * Return the tx control blocks in the pending list to the free list.
 564  565           */
 565  566          ixgbe_put_free_list(tx_ring, &pending_list);
 566  567  
  567  568          /* Transmit failed, do not drop the mblk, reschedule the transmit */
 568  569          tx_ring->reschedule = B_TRUE;
 569  570  
 570  571          return (mp);
 571  572  }
 572  573  
 573  574  /*
 574  575   * ixgbe_tx_copy
 575  576   *
 576  577   * Copy the mblk fragment to the pre-allocated tx buffer
 577  578   */
 578  579  static int
 579  580  ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 580  581      uint32_t len, boolean_t copy_done)
 581  582  {
 582  583          dma_buffer_t *tx_buf;
 583  584          uint32_t desc_num;
 584  585          _NOTE(ARGUNUSED(tx_ring));
 585  586  
 586  587          tx_buf = &tcb->tx_buf;
 587  588  
 588  589          /*
 589  590           * Copy the packet data of the mblk fragment into the
 590  591           * pre-allocated tx buffer, which is maintained by the
 591  592           * tx control block.
 592  593           *
 593  594           * Several mblk fragments can be copied into one tx buffer.
 594  595           * The destination address of the current copied fragment in
 595  596           * the tx buffer is next to the end of the previous copied
 596  597           * fragment.
 597  598           */
 598  599          if (len > 0) {
 599  600                  bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
 600  601  
 601  602                  tx_buf->len += len;
 602  603                  tcb->frag_num++;
 603  604          }
 604  605  
 605  606          desc_num = 0;
 606  607  
 607  608          /*
 608  609           * If it is the last fragment copied to the current tx buffer,
 609  610           * in other words, if there's no remaining fragment or the remaining
 610  611           * fragment requires a new tx control block to process, we need to
 611  612           * complete the current copy processing by syncing up the current
 612  613           * DMA buffer and saving the descriptor data.
 613  614           */
 614  615          if (copy_done) {
 615  616                  /*
 616  617                   * Sync the DMA buffer of the packet data
 617  618                   */
 618  619                  DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
 619  620  
 620  621                  tcb->tx_type = USE_COPY;
 621  622  
 622  623                  /*
 623  624                   * Save the address and length to the private data structure
 624  625                   * of the tx control block, which will be used to fill the
 625  626                   * tx descriptor ring after all the fragments are processed.
 626  627                   */
 627  628                  ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
 628  629                  desc_num++;
 629  630          }
 630  631  
 631  632          return (desc_num);
 632  633  }
 633  634  
 634  635  /*
 635  636   * ixgbe_tx_bind
 636  637   *
 637  638   * Bind the mblk fragment with DMA
 638  639   */
 639  640  static int
 640  641  ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
 641  642      uint32_t len)
 642  643  {
 643  644          int status, i;
 644  645          ddi_dma_cookie_t dma_cookie;
 645  646          uint_t ncookies;
 646  647          int desc_num;
 647  648  
 648  649          /*
 649  650           * Use DMA binding to process the mblk fragment
 650  651           */
 651  652          status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
 652  653              (caddr_t)mp->b_rptr, len,
 653  654              DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
 654  655              0, &dma_cookie, &ncookies);
 655  656  
 656  657          if (status != DDI_DMA_MAPPED) {
 657  658                  IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
 658  659                  return (-1);
 659  660          }
 660  661  
 661  662          tcb->frag_num++;
 662  663          tcb->tx_type = USE_DMA;
 663  664          /*
  664  665           * Each fragment can span several cookies. Each cookie uses
  665  666           * one tx descriptor for the transmit.
 666  667           */
 667  668          desc_num = 0;
 668  669          for (i = ncookies; i > 0; i--) {
 669  670                  /*
 670  671                   * Save the address and length to the private data structure
 671  672                   * of the tx control block, which will be used to fill the
 672  673                   * tx descriptor ring after all the fragments are processed.
 673  674                   */
 674  675                  ixgbe_save_desc(tcb,
 675  676                      dma_cookie.dmac_laddress,
 676  677                      dma_cookie.dmac_size);
 677  678  
 678  679                  desc_num++;
 679  680  
 680  681                  if (i > 1)
 681  682                          ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
 682  683          }
 683  684  
 684  685          return (desc_num);
 685  686  }
 686  687  
 687  688  /*
 688  689   * ixgbe_get_context
 689  690   *
 690  691   * Get the context information from the mblk
 691  692   */
 692  693  static int
 693  694  ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
 694  695  {
 695  696          uint32_t start;
 696  697          uint32_t hckflags;
 697  698          uint32_t lsoflags;
 698  699          uint32_t mss;
 699  700          uint32_t len;
 700  701          uint32_t size;
 701  702          uint32_t offset;
 702  703          unsigned char *pos;
 703  704          ushort_t etype;
 704  705          uint32_t mac_hdr_len;
 705  706          uint32_t l4_proto;
 706  707          uint32_t l4_hdr_len;
 707  708  
 708  709          ASSERT(mp != NULL);
 709  710  
 710  711          mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
 711  712          bzero(ctx, sizeof (ixgbe_tx_context_t));
 712  713  
 713  714          if (hckflags == 0) {
 714  715                  return (0);
 715  716          }
 716  717  
 717  718          ctx->hcksum_flags = hckflags;
 718  719  
 719  720          mac_lso_get(mp, &mss, &lsoflags);
 720  721          ctx->mss = mss;
 721  722          ctx->lso_flag = (lsoflags == HW_LSO);
 722  723  
 723  724          /*
  724  725           * LSO relies on tx h/w checksum, so here we drop the packet
  725  726           * if the h/w checksum flags are not declared.
 726  727           */
 727  728          if (ctx->lso_flag) {
 728  729                  if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
 729  730                      (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
 730  731                          IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
 731  732                              "checksum flags are not specified when doing LSO");
 732  733                          return (-1);
 733  734                  }
 734  735          }
 735  736  
 736  737          etype = 0;
 737  738          mac_hdr_len = 0;
 738  739          l4_proto = 0;
 739  740  
 740  741          /*
  741  742           * First, get the position of the ether_type/ether_tpid.
  742  743           * Here we don't assume the ether (VLAN) header is fully included
  743  744           * in one mblk fragment, so we go through the fragments to parse
 744  745           * the ether type.
 745  746           */
 746  747          size = len = MBLKL(mp);
 747  748          offset = offsetof(struct ether_header, ether_type);
 748  749          while (size <= offset) {
 749  750                  mp = mp->b_cont;
 750  751                  ASSERT(mp != NULL);
 751  752                  len = MBLKL(mp);
 752  753                  size += len;
 753  754          }
 754  755          pos = mp->b_rptr + offset + len - size;
 755  756  
 756  757          etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 757  758          if (etype == ETHERTYPE_VLAN) {
 758  759                  /*
 759  760                   * Get the position of the ether_type in VLAN header
 760  761                   */
 761  762                  offset = offsetof(struct ether_vlan_header, ether_type);
 762  763                  while (size <= offset) {
 763  764                          mp = mp->b_cont;
 764  765                          ASSERT(mp != NULL);
 765  766                          len = MBLKL(mp);
 766  767                          size += len;
 767  768                  }
 768  769                  pos = mp->b_rptr + offset + len - size;
 769  770  
 770  771                  etype = ntohs(*(ushort_t *)(uintptr_t)pos);
 771  772                  mac_hdr_len = sizeof (struct ether_vlan_header);
 772  773          } else {
 773  774                  mac_hdr_len = sizeof (struct ether_header);
 774  775          }
 775  776  
 776  777          /*
 777  778           * Here we don't assume the IP(V6) header is fully included in
 778  779           * one mblk fragment.
 779  780           */
 780  781          switch (etype) {
 781  782          case ETHERTYPE_IP:
 782  783                  if (ctx->lso_flag) {
 783  784                          offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
 784  785                          while (size <= offset) {
 785  786                                  mp = mp->b_cont;
 786  787                                  ASSERT(mp != NULL);
 787  788                                  len = MBLKL(mp);
 788  789                                  size += len;
 789  790                          }
 790  791                          pos = mp->b_rptr + offset + len - size;
 791  792                          *((uint16_t *)(uintptr_t)(pos)) = 0;
 792  793  
 793  794                          offset = offsetof(ipha_t, ipha_hdr_checksum) +
 794  795                              mac_hdr_len;
 795  796                          while (size <= offset) {
 796  797                                  mp = mp->b_cont;
 797  798                                  ASSERT(mp != NULL);
 798  799                                  len = MBLKL(mp);
 799  800                                  size += len;
 800  801                          }
 801  802                          pos = mp->b_rptr + offset + len - size;
 802  803                          *((uint16_t *)(uintptr_t)(pos)) = 0;
 803  804  
 804  805                          /*
  805  806                           * To perform ixgbe LSO, we also need to fill
  806  807                           * the tcp checksum field of the packet with the
  807  808                           * following pseudo-header checksum:
  808  809                           * (ip_source_addr, ip_destination_addr, l4_proto)
  809  810                           * Currently the tcp/ip stack has already done it.
 810  811                           */
 811  812                  }
 812  813  
 813  814                  offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
 814  815                  while (size <= offset) {
 815  816                          mp = mp->b_cont;
 816  817                          ASSERT(mp != NULL);
 817  818                          len = MBLKL(mp);
 818  819                          size += len;
 819  820                  }
 820  821                  pos = mp->b_rptr + offset + len - size;
 821  822  
 822  823                  l4_proto = *(uint8_t *)pos;
 823  824                  break;
 824  825          case ETHERTYPE_IPV6:
 825  826                  offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
 826  827                  while (size <= offset) {
 827  828                          mp = mp->b_cont;
 828  829                          ASSERT(mp != NULL);
 829  830                          len = MBLKL(mp);
 830  831                          size += len;
 831  832                  }
 832  833                  pos = mp->b_rptr + offset + len - size;
 833  834  
 834  835                  l4_proto = *(uint8_t *)pos;
 835  836                  break;
 836  837          default:
 837  838                  /* Unrecoverable error */
 838  839                  IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
 839  840                  return (-2);
 840  841          }
 841  842  
 842  843          if (ctx->lso_flag) {
 843  844                  offset = mac_hdr_len + start;
 844  845                  while (size <= offset) {
 845  846                          mp = mp->b_cont;
 846  847                          ASSERT(mp != NULL);
 847  848                          len = MBLKL(mp);
 848  849                          size += len;
 849  850                  }
 850  851                  pos = mp->b_rptr + offset + len - size;
 851  852  
 852  853                  l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
 853  854          } else {
 854  855                  /*
 855  856                   * l4 header length is only required for LSO
 856  857                   */
 857  858                  l4_hdr_len = 0;
 858  859          }
 859  860  
 860  861          ctx->mac_hdr_len = mac_hdr_len;
 861  862          ctx->ip_hdr_len = start;
 862  863          ctx->l4_proto = l4_proto;
 863  864          ctx->l4_hdr_len = l4_hdr_len;
 864  865  
 865  866          return (0);
 866  867  }
 867  868  
 868  869  /*
 869  870   * ixgbe_check_context
 870  871   *
 871  872   * Check if a new context descriptor is needed
 872  873   */
 873  874  static boolean_t
 874  875  ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
 875  876  {
 876  877          ixgbe_tx_context_t *last;
 877  878  
 878  879          if (ctx == NULL)
 879  880                  return (B_FALSE);
 880  881  
 881  882          /*
 882  883           * Compare the context data retrieved from the mblk and the
  883  884           * stored data of the last context descriptor. The data that need
  884  885           * to be checked are:
 885  886           *      hcksum_flags
 886  887           *      l4_proto
 887  888           *      mac_hdr_len
 888  889           *      ip_hdr_len
 889  890           *      lso_flag
 890  891           *      mss (only checked for LSO)
  891  892           *      l4_hdr_len (only checked for LSO)
  892  893           * If any one of the above data is changed, a new context descriptor
  893  894           * will be needed.
 894  895           */
 895  896          last = &tx_ring->tx_context;
 896  897  
 897  898          if ((ctx->hcksum_flags != last->hcksum_flags) ||
 898  899              (ctx->l4_proto != last->l4_proto) ||
 899  900              (ctx->mac_hdr_len != last->mac_hdr_len) ||
 900  901              (ctx->ip_hdr_len != last->ip_hdr_len) ||
 901  902              (ctx->lso_flag != last->lso_flag) ||
 902  903              (ctx->lso_flag && ((ctx->mss != last->mss) ||
 903  904              (ctx->l4_hdr_len != last->l4_hdr_len)))) {
 904  905                  return (B_TRUE);
 905  906          }
 906  907  
 907  908          return (B_FALSE);
 908  909  }
 909  910  
 910  911  /*
 911  912   * ixgbe_fill_context
 912  913   *
  913  914   * Fill the context descriptor with hardware checksum information
 914  915   */
 915  916  static void
 916  917  ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
 917  918      ixgbe_tx_context_t *ctx)
 918  919  {
 919  920          /*
 920  921           * Fill the context descriptor with the checksum
 921  922           * context information we've got.
 922  923           */
 923  924          ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
 924  925          ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
 925  926              IXGBE_ADVTXD_MACLEN_SHIFT;
 926  927  
 927  928          ctx_tbd->type_tucmd_mlhl =
 928  929              IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
 929  930  
 930  931          if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
 931  932                  ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
 932  933  
 933  934          if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
 934  935                  switch (ctx->l4_proto) {
 935  936                  case IPPROTO_TCP:
 936  937                          ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
 937  938                          break;
 938  939                  case IPPROTO_UDP:
 939  940                          /*
 940  941                           * We don't have to explicitly set:
 941  942                           *      ctx_tbd->type_tucmd_mlhl |=
 942  943                           *          IXGBE_ADVTXD_TUCMD_L4T_UDP;
 943  944                           * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
 944  945                           */
 945  946                          break;
 946  947                  default:
 947  948                          /* Unrecoverable error */
 948  949                          IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
 949  950                          break;
 950  951                  }
 951  952          }
 952  953  
 953  954          ctx_tbd->seqnum_seed = 0;
 954  955  
 955  956          if (ctx->lso_flag) {
 956  957                  ctx_tbd->mss_l4len_idx =
 957  958                      (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
 958  959                      (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
 959  960          } else {
 960  961                  ctx_tbd->mss_l4len_idx = 0;
 961  962          }
 962  963  }
 963  964  
 964  965  /*
 965  966   * ixgbe_tx_fill_ring
 966  967   *
 967  968   * Fill the tx descriptor ring with the data
 968  969   */
 969  970  static int
 970  971  ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
 971  972      ixgbe_tx_context_t *ctx, size_t mbsize)
 972  973  {
 973  974          struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
 974  975          boolean_t load_context;
 975  976          uint32_t index, tcb_index, desc_num;
 976  977          union ixgbe_adv_tx_desc *tbd, *first_tbd;
 977  978          tx_control_block_t *tcb, *first_tcb;
 978  979          uint32_t hcksum_flags;
 979  980          int i;
 980  981  
 981  982          ASSERT(mutex_owned(&tx_ring->tx_lock));
 982  983  
 983  984          tbd = NULL;
 984  985          first_tbd = NULL;
 985  986          first_tcb = NULL;
 986  987          desc_num = 0;
 987  988          hcksum_flags = 0;
 988  989          load_context = B_FALSE;
 989  990  
 990  991          /*
 991  992           * Get the index of the first tx descriptor that will be filled,
 992  993           * and the index of the first work list item that will be attached
 993  994           * with the first used tx control block in the pending list.
 994  995           * Note: the two indexes are the same.
 995  996           */
 996  997          index = tx_ring->tbd_tail;
 997  998          tcb_index = tx_ring->tbd_tail;
 998  999  
 999 1000          if (ctx != NULL) {
1000 1001                  hcksum_flags = ctx->hcksum_flags;
1001 1002  
1002 1003                  /*
1003 1004                   * Check if a new context descriptor is needed for this packet
1004 1005                   */
1005 1006                  load_context = ixgbe_check_context(tx_ring, ctx);
1006 1007  
1007 1008                  if (load_context) {
1008 1009                          tbd = &tx_ring->tbd_ring[index];
1009 1010  
1010 1011                          /*
1011 1012                           * Fill the context descriptor with the
 1012 1013                           * hardware checksum offload information.
1013 1014                           */
1014 1015                          ixgbe_fill_context(
1015 1016                              (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
1016 1017  
1017 1018                          index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1018 1019                          desc_num++;
1019 1020  
1020 1021                          /*
1021 1022                           * Store the checksum context data if
1022 1023                           * a new context descriptor is added
1023 1024                           */
1024 1025                          tx_ring->tx_context = *ctx;
1025 1026                  }
1026 1027          }
1027 1028  
1028 1029          first_tbd = &tx_ring->tbd_ring[index];
1029 1030  
1030 1031          /*
1031 1032           * Fill tx data descriptors with the data saved in the pending list.
1032 1033           * The tx control blocks in the pending list are added to the work list
1033 1034           * at the same time.
1034 1035           *
 1035 1036           * The work list corresponds strictly 1:1 to the descriptor ring.
1036 1037           * One item of the work list corresponds to one tx descriptor. Because
1037 1038           * one tx control block can span multiple tx descriptors, the tx
1038 1039           * control block will be added to the first work list item that
1039 1040           * corresponds to the first tx descriptor generated from that tx
1040 1041           * control block.
1041 1042           */
1042 1043          tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1043 1044          first_tcb = tcb;
1044 1045          while (tcb != NULL) {
1045 1046  
1046 1047                  for (i = 0; i < tcb->desc_num; i++) {
1047 1048                          tbd = &tx_ring->tbd_ring[index];
1048 1049  
1049 1050                          tbd->read.buffer_addr = tcb->desc[i].address;
1050 1051                          tbd->read.cmd_type_len = tcb->desc[i].length;
1051 1052  
1052 1053                          tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
1053 1054                              | IXGBE_ADVTXD_DTYP_DATA;
1054 1055  
1055 1056                          tbd->read.olinfo_status = 0;
1056 1057  
1057 1058                          index = NEXT_INDEX(index, 1, tx_ring->ring_size);
1058 1059                          desc_num++;
1059 1060                  }
1060 1061  
1061 1062                  /*
1062 1063                   * Add the tx control block to the work list
1063 1064                   */
1064 1065                  ASSERT(tx_ring->work_list[tcb_index] == NULL);
1065 1066                  tx_ring->work_list[tcb_index] = tcb;
1066 1067  
1067 1068                  tcb_index = index;
1068 1069                  tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1069 1070          }
1070 1071  
1071 1072          if (load_context) {
1072 1073                  /*
 1073 1074                   * Count the context descriptor for
 1074 1075                   * the first tx control block.
1075 1076                   */
1076 1077                  first_tcb->desc_num++;
1077 1078          }
1078 1079          first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
1079 1080  
1080 1081          /*
1081 1082           * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
1082 1083           * valid in the first descriptor of the packet.
1083 1084           * Setting paylen in every first_tbd for all parts.
1084      -         * 82599 and X540 require the packet length in paylen field with or
1085      -         * without LSO and 82598 will ignore it in non-LSO mode.
     1085 +         * 82599, X540 and X550 require the packet length in paylen field
     1086 +         * with or without LSO and 82598 will ignore it in non-LSO mode.
1086 1087           */
1087 1088          ASSERT(first_tbd != NULL);
1088 1089          first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
1089 1090  
1090 1091          switch (hw->mac.type) {
1091 1092          case ixgbe_mac_82598EB:
1092 1093                  if (ctx != NULL && ctx->lso_flag) {
1093 1094                          first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1094 1095                          first_tbd->read.olinfo_status |=
1095 1096                              (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1096 1097                              - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1097 1098                  }
1098 1099                  break;
1099 1100  
1100 1101          case ixgbe_mac_82599EB:
1101 1102          case ixgbe_mac_X540:
     1103 +        case ixgbe_mac_X550:
     1104 +        case ixgbe_mac_X550EM_x:
1102 1105                  if (ctx != NULL && ctx->lso_flag) {
1103 1106                          first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1104 1107                          first_tbd->read.olinfo_status |=
1105 1108                              (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1106 1109                              - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1107 1110                  } else {
1108 1111                          first_tbd->read.olinfo_status |=
1109 1112                              (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1110 1113                  }
1111 1114                  break;
1112 1115  
1113 1116          default:
1114 1117                  break;
1115 1118          }
1116 1119  
1117 1120          /* Set hardware checksum bits */
1118 1121          if (hcksum_flags != 0) {
1119 1122                  if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1120 1123                          first_tbd->read.olinfo_status |=
1121 1124                              IXGBE_ADVTXD_POPTS_IXSM;
1122 1125                  if (hcksum_flags & HCK_PARTIALCKSUM)
1123 1126                          first_tbd->read.olinfo_status |=
1124 1127                              IXGBE_ADVTXD_POPTS_TXSM;
1125 1128          }
1126 1129  
1127 1130          /*
 1128 1131           * The last descriptor of the packet needs the End Of Packet (EOP)
 1129 1132           * and Report Status (RS) bits set
1130 1133           */
1131 1134          ASSERT(tbd != NULL);
1132 1135          tbd->read.cmd_type_len |=
1133 1136              IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1134 1137  
1135 1138          /*
1136 1139           * Sync the DMA buffer of the tx descriptor ring
1137 1140           */
1138 1141          DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1139 1142  
1140 1143          /*
1141 1144           * Update the number of the free tx descriptors.
1142 1145           * The mutual exclusion between the transmission and the recycling
1143 1146           * (for the tx descriptor ring and the work list) is implemented
1144 1147           * with the atomic operation on the number of the free tx descriptors.
1145 1148           *
1146 1149           * Note: we should always decrement the counter tbd_free before
1147 1150           * advancing the hardware TDT pointer to avoid the race condition -
 1148 1151           * before the counter tbd_free is decremented, the transmit of the
 1149 1152           * tx descriptors is done and the counter tbd_free is increased by
1150 1153           * the tx recycling.
1151 1154           */
1152 1155          i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1153 1156          ASSERT(i >= 0);
1154 1157  
1155 1158          tx_ring->tbd_tail = index;
1156 1159  
1157 1160          /*
1158 1161           * Advance the hardware TDT pointer of the tx descriptor ring
1159 1162           */
1160 1163          IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1161 1164  
1162 1165          if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1163 1166              DDI_FM_OK) {
1164 1167                  ddi_fm_service_impact(tx_ring->ixgbe->dip,
1165 1168                      DDI_SERVICE_DEGRADED);
1166 1169                  atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1167 1170          }
1168 1171  
1169 1172          return (desc_num);
1170 1173  }
1171 1174  
1172 1175  /*
1173 1176   * ixgbe_save_desc
1174 1177   *
1175 1178   * Save the address/length pair to the private array
1176 1179   * of the tx control block. The address/length pairs
1177 1180   * will be filled into the tx descriptor ring later.
1178 1181   */
1179 1182  static void
1180 1183  ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1181 1184  {
1182 1185          sw_desc_t *desc;
1183 1186  
1184 1187          desc = &tcb->desc[tcb->desc_num];
1185 1188          desc->address = address;
1186 1189          desc->length = length;
1187 1190  
1188 1191          tcb->desc_num++;
1189 1192  }
1190 1193  
1191 1194  /*
1192 1195   * ixgbe_tx_recycle_legacy
1193 1196   *
1194 1197   * Recycle the tx descriptors and tx control blocks.
1195 1198   *
1196 1199   * The work list is traversed to check if the corresponding
1197 1200   * tx descriptors have been transmitted. If so, the resources
1198 1201   * bound to the tx control blocks will be freed, and those
1199 1202   * tx control blocks will be returned to the free list.
1200 1203   */
1201 1204  uint32_t
1202 1205  ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1203 1206  {
1204 1207          uint32_t index, last_index, prev_index;
1205 1208          int desc_num;
1206 1209          boolean_t desc_done;
1207 1210          tx_control_block_t *tcb;
1208 1211          link_list_t pending_list;
1209 1212          ixgbe_t *ixgbe = tx_ring->ixgbe;
1210 1213  
1211 1214          mutex_enter(&tx_ring->recycle_lock);
1212 1215  
1213 1216          ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1214 1217  
1215 1218          if (tx_ring->tbd_free == tx_ring->ring_size) {
1216 1219                  tx_ring->recycle_fail = 0;
1217 1220                  tx_ring->stall_watchdog = 0;
1218 1221                  if (tx_ring->reschedule) {
1219 1222                          tx_ring->reschedule = B_FALSE;
1220 1223                          mac_tx_ring_update(ixgbe->mac_hdl,
1221 1224                              tx_ring->ring_handle);
1222 1225                  }
1223 1226                  mutex_exit(&tx_ring->recycle_lock);
1224 1227                  return (0);
1225 1228          }
1226 1229  
1227 1230          /*
1228 1231           * Sync the DMA buffer of the tx descriptor ring
1229 1232           */
1230 1233          DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1231 1234  
1232 1235          if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1233 1236                  mutex_exit(&tx_ring->recycle_lock);
1234 1237                  ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1235 1238                  atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1236 1239                  return (0);
1237 1240          }
1238 1241  
1239 1242          LINK_LIST_INIT(&pending_list);
1240 1243          desc_num = 0;
1241 1244          index = tx_ring->tbd_head;      /* Index of next tbd/tcb to recycle */
1242 1245  
1243 1246          tcb = tx_ring->work_list[index];
1244 1247          ASSERT(tcb != NULL);
1245 1248  
1246 1249          while (tcb != NULL) {
1247 1250                  /*
1248 1251                   * Get the last tx descriptor of this packet.
1249 1252                   * If the last tx descriptor is done, then
1250 1253                   * we can recycle all descriptors of a packet
1251 1254                   * which usually includes several tx control blocks.
1252 1255                   * For 82599, LSO descriptors can not be recycled
1253 1256                   * unless the whole packet's transmission is done.
1254 1257                   * That's why packet level recycling is used here.
1255 1258                   * For 82598, there is no such limit.
1256 1259                   */
1257 1260                  last_index = tcb->last_index;
1258 1261                  /*
1259 1262                   * MAX_TX_RING_SIZE is used to indicate whether
1260 1263                   * the index is valid.
1261 1264                   */
1262 1265                  if (last_index == MAX_TX_RING_SIZE)
1263 1266                          break;
1264 1267  
1265 1268                  /*
1266 1269                   * Check if the Descriptor Done bit is set
1267 1270                   */
1268 1271                  desc_done = tx_ring->tbd_ring[last_index].wb.status &
1269 1272                      IXGBE_TXD_STAT_DD;
1270 1273                  if (desc_done) {
1271 1274                          /*
1272 1275                           * recycle all descriptors of the packet
1273 1276                           */
1274 1277                          while (tcb != NULL) {
1275 1278                                  /*
1276 1279                                   * Strip off the tx control block from
1277 1280                                   * the work list, and add it to the
1278 1281                                   * pending list.
1279 1282                                   */
1280 1283                                  tx_ring->work_list[index] = NULL;
1281 1284                                  LIST_PUSH_TAIL(&pending_list, &tcb->link);
1282 1285  
1283 1286                                  /*
1284 1287                                   * Count the total number of the tx
1285 1288                                   * descriptors recycled
1286 1289                                   */
1287 1290                                  desc_num += tcb->desc_num;
1288 1291  
1289 1292                                  index = NEXT_INDEX(index, tcb->desc_num,
1290 1293                                      tx_ring->ring_size);
1291 1294  
1292 1295                                  tcb = tx_ring->work_list[index];
1293 1296  
1294 1297                                  prev_index = PREV_INDEX(index, 1,
1295 1298                                      tx_ring->ring_size);
1296 1299                                  if (prev_index == last_index)
1297 1300                                          break;
1298 1301                          }
1299 1302                  } else {
1300 1303                          break;
1301 1304                  }
1302 1305          }
1303 1306  
1304 1307          /*
1305 1308           * If no tx descriptors are recycled, no need to do more processing
1306 1309           */
1307 1310          if (desc_num == 0) {
1308 1311                  tx_ring->recycle_fail++;
1309 1312                  mutex_exit(&tx_ring->recycle_lock);
1310 1313                  return (0);
1311 1314          }
1312 1315  
1313 1316          tx_ring->recycle_fail = 0;
1314 1317          tx_ring->stall_watchdog = 0;
1315 1318  
1316 1319          /*
1317 1320           * Update the head index of the tx descriptor ring
1318 1321           */
1319 1322          tx_ring->tbd_head = index;
1320 1323  
1321 1324          /*
1322 1325           * Update the number of the free tx descriptors with atomic operations
1323 1326           */
1324 1327          atomic_add_32(&tx_ring->tbd_free, desc_num);
1325 1328  
1326 1329          if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1327 1330              (tx_ring->reschedule)) {
1328 1331                  tx_ring->reschedule = B_FALSE;
1329 1332                  mac_tx_ring_update(ixgbe->mac_hdl,
1330 1333                      tx_ring->ring_handle);
1331 1334          }
1332 1335          mutex_exit(&tx_ring->recycle_lock);
1333 1336  
1334 1337          /*
1335 1338           * Free the resources used by the tx control blocks
1336 1339           * in the pending list
1337 1340           */
1338 1341          tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1339 1342          while (tcb != NULL) {
1340 1343                  /*
1341 1344                   * Release the resources occupied by the tx control block
1342 1345                   */
1343 1346                  ixgbe_free_tcb(tcb);
1344 1347  
1345 1348                  tcb = (tx_control_block_t *)
1346 1349                      LIST_GET_NEXT(&pending_list, &tcb->link);
1347 1350          }
1348 1351  
1349 1352          /*
1350 1353           * Add the tx control blocks in the pending list to the free list.
1351 1354           */
1352 1355          ixgbe_put_free_list(tx_ring, &pending_list);
1353 1356  
1354 1357          return (desc_num);
1355 1358  }
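The index arithmetic above relies on the NEXT_INDEX() and PREV_INDEX() macros (and OFFSET() in the head write-back path below), which come from the driver's headers and are not part of this diff. Their intended wrap-around behaviour is roughly the following; these helpers are illustrative stand-ins, not the driver's macros:

#include <stdint.h>

/* Advance 'index' by 'step' slots around a ring of 'size' slots. */
static inline uint32_t
ring_next(uint32_t index, uint32_t step, uint32_t size)
{
        return ((index + step) % size);
}

/* Step 'index' backwards by 'step' slots, wrapping at zero. */
static inline uint32_t
ring_prev(uint32_t index, uint32_t step, uint32_t size)
{
        return ((index + size - step) % size);
}

/* Number of slots walked forward from 'start' to reach 'end'. */
static inline uint32_t
ring_offset(uint32_t start, uint32_t end, uint32_t size)
{
        return ((end + size - start) % size);
}

With these, the PREV_INDEX(index, 1, ...) == last_index test above simply asks whether the descriptor just stepped past was the packet's last one.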
1356 1359  
1357 1360  /*
1358 1361   * ixgbe_tx_recycle_head_wb
1359 1362   *
1360 1363   * Check the head write-back, and recycle all the transmitted
1361 1364   * tx descriptors and tx control blocks.
1362 1365   */
1363 1366  uint32_t
1364 1367  ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1365 1368  {
1366 1369          uint32_t index;
1367 1370          uint32_t head_wb;
1368 1371          int desc_num;
1369 1372          tx_control_block_t *tcb;
1370 1373          link_list_t pending_list;
1371 1374          ixgbe_t *ixgbe = tx_ring->ixgbe;
1372 1375  
1373 1376          mutex_enter(&tx_ring->recycle_lock);
1374 1377  
1375 1378          ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1376 1379  
1377 1380          if (tx_ring->tbd_free == tx_ring->ring_size) {
1378 1381                  tx_ring->recycle_fail = 0;
1379 1382                  tx_ring->stall_watchdog = 0;
1380 1383                  if (tx_ring->reschedule) {
1381 1384                          tx_ring->reschedule = B_FALSE;
1382 1385                          mac_tx_ring_update(ixgbe->mac_hdl,
1383 1386                              tx_ring->ring_handle);
1384 1387                  }
1385 1388                  mutex_exit(&tx_ring->recycle_lock);
1386 1389                  return (0);
1387 1390          }
1388 1391  
1389 1392          /*
1390 1393           * Sync the DMA buffer of the tx descriptor ring
1391 1394           *
1392 1395           * Note: in head write-back mode the tx descriptors will not
1393 1396           * be written back, but the head write-back value is stored in
1394 1397           * the extra tbd at the end of the DMA area, so we still need
1395 1398           * to sync that value for the kernel.
1396 1399           *
1397 1400           * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1398 1401           */
1399 1402          (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1400 1403              sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1401 1404              sizeof (uint32_t),
1402 1405              DDI_DMA_SYNC_FORKERNEL);
1403 1406  
1404 1407          if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1405 1408                  mutex_exit(&tx_ring->recycle_lock);
1406 1409                  ddi_fm_service_impact(ixgbe->dip,
1407 1410                      DDI_SERVICE_DEGRADED);
1408 1411                  atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1409 1412                  return (0);
1410 1413          }
1411 1414  
1412 1415          LINK_LIST_INIT(&pending_list);
1413 1416          desc_num = 0;
1414 1417          index = tx_ring->tbd_head;      /* Next index to clean */
1415 1418  
1416 1419          /*
1417 1420           * Get the value of head write-back
1418 1421           */
1419 1422          head_wb = *tx_ring->tbd_head_wb;
1420 1423          while (index != head_wb) {
1421 1424                  tcb = tx_ring->work_list[index];
1422 1425                  ASSERT(tcb != NULL);
1423 1426  
1424 1427                  if (OFFSET(index, head_wb, tx_ring->ring_size) <
1425 1428                      tcb->desc_num) {
1426 1429                          /*
1427 1430                           * The current tx control block is not
1428 1431                           * completely transmitted, stop recycling
1429 1432                           */
1430 1433                          break;
1431 1434                  }
1432 1435  
1433 1436                  /*
1434 1437                   * Strip off the tx control block from the work list,
1435 1438                   * and add it to the pending list.
1436 1439                   */
1437 1440                  tx_ring->work_list[index] = NULL;
1438 1441                  LIST_PUSH_TAIL(&pending_list, &tcb->link);
1439 1442  
1440 1443                  /*
1441 1444                   * Advance the index of the tx descriptor ring
1442 1445                   */
1443 1446                  index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1444 1447  
1445 1448                  /*
1446 1449                   * Count the total number of the tx descriptors recycled
1447 1450                   */
1448 1451                  desc_num += tcb->desc_num;
1449 1452          }
1450 1453  
1451 1454          /*
1452 1455           * If no tx descriptors are recycled, no need to do more processing
1453 1456           */
1454 1457          if (desc_num == 0) {
1455 1458                  tx_ring->recycle_fail++;
1456 1459                  mutex_exit(&tx_ring->recycle_lock);
1457 1460                  return (0);
1458 1461          }
1459 1462  
1460 1463          tx_ring->recycle_fail = 0;
1461 1464          tx_ring->stall_watchdog = 0;
1462 1465  
1463 1466          /*
1464 1467           * Update the head index of the tx descriptor ring
1465 1468           */
1466 1469          tx_ring->tbd_head = index;
1467 1470  
1468 1471          /*
1469 1472           * Update the number of the free tx descriptors with atomic operations
1470 1473           */
1471 1474          atomic_add_32(&tx_ring->tbd_free, desc_num);
1472 1475  
1473 1476          if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1474 1477              (tx_ring->reschedule)) {
1475 1478                  tx_ring->reschedule = B_FALSE;
1476 1479                  mac_tx_ring_update(ixgbe->mac_hdl,
1477 1480                      tx_ring->ring_handle);
1478 1481          }
1479 1482          mutex_exit(&tx_ring->recycle_lock);
1480 1483  
1481 1484          /*
1482 1485           * Free the resources used by the tx control blocks
1483 1486           * in the pending list
1484 1487           */
1485 1488          tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
1486 1489          while (tcb) {
1487 1490                  /*
1488 1491                   * Release the resources occupied by the tx control block
1489 1492                   */
1490 1493                  ixgbe_free_tcb(tcb);
1491 1494  
1492 1495                  tcb = (tx_control_block_t *)
1493 1496                      LIST_GET_NEXT(&pending_list, &tcb->link);
1494 1497          }
1495 1498  
1496 1499          /*
1497 1500           * Add the tx control blocks in the pending list to the free list.
1498 1501           */
1499 1502          ixgbe_put_free_list(tx_ring, &pending_list);
1500 1503  
1501 1504          return (desc_num);
1502 1505  }
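The partial ddi_dma_sync() above covers only the single uint32_t into which the hardware writes the head value, one descriptor-sized slot past the end of the real ring. Purely as an illustration of that layout (assumed here from the comment and the sync offsets, not taken from the driver's setup code):

#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical sketch, not driver code: with one extra descriptor slot
 * reserved past the ring, the head write-back word sits immediately after
 * the last real descriptor, at the same offset the partial sync above uses.
 */
static uint32_t *
head_wb_address(void *tbd_area_base, size_t desc_size, uint32_t ring_size)
{
        return ((uint32_t *)((uint8_t *)tbd_area_base +
            (size_t)ring_size * desc_size));
}

Once that word is synced, the distance from tbd_head forward to the written head value (the ring_offset() style arithmetic sketched earlier) bounds how many descriptors the hardware has finished.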
1503 1506  
1504 1507  /*
1505 1508   * ixgbe_free_tcb - free up the tx control block
1506 1509   *
1507 1510   * Free the resources of the tx control block: unbind the
1508 1511   * previously bound DMA handle and reset the other control
1509 1512   * fields.
1510 1513   */
1511 1514  void
1512 1515  ixgbe_free_tcb(tx_control_block_t *tcb)
1513 1516  {
1514 1517          switch (tcb->tx_type) {
1515 1518          case USE_COPY:
1516 1519                  /*
1517 1520                   * Reset the buffer length that is used for copy
1518 1521                   */
1519 1522                  tcb->tx_buf.len = 0;
1520 1523                  break;
1521 1524          case USE_DMA:
1522 1525                  /*
1523 1526                   * Release the DMA resource that is used for
1524 1527                   * DMA binding.
1525 1528                   */
1526 1529                  (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1527 1530                  break;
1528 1531          default:
1529 1532                  break;
1530 1533          }
1531 1534  
1532 1535          /*
1533 1536           * Free the mblk
1534 1537           */
1535 1538          if (tcb->mp != NULL) {
1536 1539                  freemsg(tcb->mp);
1537 1540                  tcb->mp = NULL;
1538 1541          }
1539 1542  
1540 1543          tcb->tx_type = USE_NONE;
1541 1544          tcb->last_index = MAX_TX_RING_SIZE;
1542 1545          tcb->frag_num = 0;
1543 1546          tcb->desc_num = 0;
1544 1547  }
1545 1548  
1546 1549  /*
1547 1550   * ixgbe_get_free_list - Get a free tx control block from the free list
1548 1551   *
1549 1552   * The atomic operation on the number of available tx control blocks
1550 1553   * in the free list is used to keep this routine mutually exclusive with
1551 1554   * the routine ixgbe_put_free_list.
1552 1555   */
1553 1556  static tx_control_block_t *
1554 1557  ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
1555 1558  {
1556 1559          tx_control_block_t *tcb;
1557 1560  
1558 1561          /*
1559 1562           * Check and update the number of free tx control blocks
1560 1563           * in the free list.
1561 1564           */
1562 1565          if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
1563 1566                  return (NULL);
1564 1567  
1565 1568          mutex_enter(&tx_ring->tcb_head_lock);
1566 1569  
1567 1570          tcb = tx_ring->free_list[tx_ring->tcb_head];
1568 1571          ASSERT(tcb != NULL);
1569 1572          tx_ring->free_list[tx_ring->tcb_head] = NULL;
1570 1573          tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1571 1574              tx_ring->free_list_size);
1572 1575  
1573 1576          mutex_exit(&tx_ring->tcb_head_lock);
1574 1577  
1575 1578          return (tcb);
1576 1579  }
1577 1580  
1578 1581  /*
1579 1582   * ixgbe_put_free_list
1580 1583   *
1581 1584   * Put a list of used tx control blocks back to the free list
1582 1585   *
1583 1586   * A mutex is used here to ensure the serialization. The mutual exclusion
1584 1587   * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
1585 1588   * the atomic operation on the counter tcb_free.
1586 1589   */
1587 1590  void
1588 1591  ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1589 1592  {
1590 1593          uint32_t index;
1591 1594          int tcb_num;
1592 1595          tx_control_block_t *tcb;
1593 1596  
1594 1597          mutex_enter(&tx_ring->tcb_tail_lock);
1595 1598  
1596 1599          index = tx_ring->tcb_tail;
1597 1600  
1598 1601          tcb_num = 0;
1599 1602          tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1600 1603          while (tcb != NULL) {
1601 1604                  ASSERT(tx_ring->free_list[index] == NULL);
1602 1605                  tx_ring->free_list[index] = tcb;
1603 1606  
1604 1607                  tcb_num++;
1605 1608  
1606 1609                  index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1607 1610  
1608 1611                  tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1609 1612          }
1610 1613  
1611 1614          tx_ring->tcb_tail = index;
1612 1615  
1613 1616          /*
1614 1617           * Update the number of free tx control blocks
1615 1618           * in the free list. This operation must be done
1616 1619           * under the protection of the lock.
1617 1620           */
1618 1621          atomic_add_32(&tx_ring->tcb_free, tcb_num);
1619 1622  
1620 1623          mutex_exit(&tx_ring->tcb_tail_lock);
1621 1624  }
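Taken together, ixgbe_get_free_list() and ixgbe_put_free_list() form a ring-buffer free list: the get side consumes at tcb_head under tcb_head_lock, the put side appends at tcb_tail under tcb_tail_lock, and the atomic tcb_free counter keeps the two ends from colliding. Stripped of the driver's types, the shape of that scheme looks roughly like the sketch below (hypothetical and simplified to one item per call; it is not the driver's code, and lock initialization is omitted):

#include <stdint.h>
#include <stdatomic.h>
#include <pthread.h>

/* Hypothetical, simplified model of the two-lock free list used above. */
typedef struct free_list {
        void            **slots;        /* ring of free items */
        uint32_t        size;           /* number of slots */
        uint32_t        head;           /* consumer index (get side) */
        uint32_t        tail;           /* producer index (put side) */
        _Atomic uint32_t free_cnt;      /* items currently available */
        pthread_mutex_t head_lock;
        pthread_mutex_t tail_lock;
} free_list_t;

static void *
free_list_get(free_list_t *fl)
{
        void *item;
        uint32_t cnt = atomic_load(&fl->free_cnt);

        /* Reserve one item first; this excludes the put side. */
        do {
                if (cnt == 0)
                        return (NULL);
        } while (!atomic_compare_exchange_weak(&fl->free_cnt, &cnt, cnt - 1));

        (void) pthread_mutex_lock(&fl->head_lock);
        item = fl->slots[fl->head];
        fl->slots[fl->head] = NULL;
        fl->head = (fl->head + 1) % fl->size;
        (void) pthread_mutex_unlock(&fl->head_lock);

        return (item);
}

static void
free_list_put(free_list_t *fl, void *item)
{
        (void) pthread_mutex_lock(&fl->tail_lock);
        fl->slots[fl->tail] = item;
        fl->tail = (fl->tail + 1) % fl->size;
        /* Publish the new item only after its slot is filled. */
        atomic_fetch_add(&fl->free_cnt, 1);
        (void) pthread_mutex_unlock(&fl->tail_lock);
}

The put side never checks for overflow, which matches the driver's ASSERT() that the target slot is empty; presumably the free list is sized to hold every tx control block that exists.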
  