918 Wdiff usr/src/uts/common/io/mac/mac_sched.c

Print this page

918 Need better IP fanout (esp. with VLANs present)

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/io/mac/mac_sched.c
          +++ new/usr/src/uts/common/io/mac/mac_sched.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each

↓ open down ↓

13 lines elided

↑ open up ↑

  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24      - */
  25      -/*
  26   24   * Copyright 2011 Joyent, Inc.  All rights reserved.
       25 + * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  27   26   */
  28   27  
  29   28  #include <sys/types.h>
  30   29  #include <sys/callb.h>
  31   30  #include <sys/sdt.h>
  32   31  #include <sys/strsubr.h>
  33   32  #include <sys/strsun.h>
  34   33  #include <sys/vlan.h>
  35   34  #include <sys/stack.h>
  36   35  #include <sys/archsystm.h>

  37   36  #include <inet/ipsec_impl.h>
  38   37  #include <inet/ip_impl.h>
  39   38  #include <inet/sadb.h>
  40   39  #include <inet/ipsecesp.h>
  41   40  #include <inet/ipsecah.h>
  42   41  #include <inet/ip6.h>
  43   42  
  44   43  #include <sys/mac_impl.h>
  45   44  #include <sys/mac_client_impl.h>
  46   45  #include <sys/mac_client_priv.h>
  47   46  #include <sys/mac_soft_ring.h>
  48   47  #include <sys/mac_flow_impl.h>
  49   48  
  50   49  static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
  51   50      uintptr_t, uint16_t, mblk_t **);
  52   51  static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
  53   52      uintptr_t, uint16_t, mblk_t **);
  54   53  static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
  55   54      uintptr_t, uint16_t, mblk_t **);
  56   55  static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
  57   56      uintptr_t, uint16_t, mblk_t **);
  58   57  static mac_tx_cookie_t mac_tx_aggr_mode(mac_soft_ring_set_t *, mblk_t *,
  59   58      uintptr_t, uint16_t, mblk_t **);
  60   59  
  61   60  typedef struct mac_tx_mode_s {
  62   61          mac_tx_srs_mode_t       mac_tx_mode;
  63   62          mac_tx_func_t           mac_tx_func;
  64   63  } mac_tx_mode_t;
  65   64  
  66   65  /*
  67   66   * There are seven modes of operation on the Tx side. These modes get set
  68   67   * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
  69   68   * none of the other modes are user configurable. They get selected by
  70   69   * the system depending upon whether the link (or flow) has multiple Tx
  71   70   * rings or a bandwidth configured, or if the link is an aggr, etc.
  72   71   *
  73   72   * When the Tx SRS is operating in aggr mode (st_mode) or if there are
  74   73   * multiple Tx rings owned by Tx SRS, then each Tx ring (pseudo or
  75   74   * otherwise) will have a soft ring associated with it. These soft rings
  76   75   * are stored in srs_tx_soft_rings[] array.
  77   76   *
  78   77   * Additionally in the case of aggr, there is the st_soft_rings[] array
  79   78   * in the mac_srs_tx_t structure. This array is used to store the same
  80   79   * set of soft rings that are present in srs_tx_soft_rings[] array but
  81   80   * in a different manner. The soft ring associated with the pseudo Tx
  82   81   * ring is saved at mr_index (of the pseudo ring) in st_soft_rings[]
  83   82   * array. This helps in quickly getting the soft ring associated with the
  84   83   * Tx ring when aggr_find_tx_ring() returns the pseudo Tx ring that is to
  85   84   * be used for transmit.
  86   85   */
  87   86  mac_tx_mode_t mac_tx_mode_list[] = {
  88   87          {SRS_TX_DEFAULT,        mac_tx_single_ring_mode},
  89   88          {SRS_TX_SERIALIZE,      mac_tx_serializer_mode},
  90   89          {SRS_TX_FANOUT,         mac_tx_fanout_mode},
  91   90          {SRS_TX_BW,             mac_tx_bw_mode},
  92   91          {SRS_TX_BW_FANOUT,      mac_tx_bw_mode},
  93   92          {SRS_TX_AGGR,           mac_tx_aggr_mode},
  94   93          {SRS_TX_BW_AGGR,        mac_tx_bw_mode}
  95   94  };
  96   95  
  97   96  /*
  98   97   * Soft Ring Set (SRS) - The Run time code that deals with
  99   98   * dynamic polling from the hardware, bandwidth enforcement,
 100   99   * fanout etc.
 101  100   *
 102  101   * We try to use H/W classification on NIC and assign traffic for
 103  102   * a MAC address to a particular Rx ring or ring group. There is a
 104  103   * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
 105  104   * switches the underlying Rx ring between interrupt and
 106  105   * polling mode and enforces any specified B/W control.
 107  106   *
 108  107   * There is always a SRS created and tied to each H/W and S/W rule.
 109  108   * Whenever we create a H/W rule, we always add the same rule to
 110  109   * S/W classifier and tie a SRS to it.
 111  110   *
 112  111   * In case a B/W control is specified, it is broken into bytes
 113  112   * per ticks and as soon as the quota for a tick is exhausted,
 114  113   * the underlying Rx ring is forced into poll mode for remainder of
 115  114   * the tick. The SRS poll thread only polls for bytes that are
 116  115   * allowed to come in the SRS. We typically let 4x the configured
 117  116   * B/W worth of packets to come in the SRS (to prevent unnecessary
 118  117   * drops due to bursts) but only process the specified amount.
 119  118   *
 120  119   * A MAC client (e.g. a VNIC or aggr) can have 1 or more
 121  120   * Rx rings (and corresponding SRSs) assigned to it. The SRS
 122  121   * in turn can have softrings to do protocol level fanout or
 123  122   * softrings to do S/W based fanout or both. In case the NIC
 124  123   * has no Rx rings, we do S/W classification to respective SRS.
 125  124   * The S/W classification rule is always setup and ready. This
 126  125   * allows the MAC layer to reassign Rx rings whenever needed
 127  126   * but packets still continue to flow via the default path and
 128  127   * getting S/W classified to correct SRS.
 129  128   *
 130  129   * The SRS's are used on both Tx and Rx side. They use the same
 131  130   * data structure but the processing routines have slightly different
 132  131   * semantics due to the fact that Rx side needs to do dynamic
 133  132   * polling etc.
 134  133   *
 135  134   * Dynamic Polling Notes
 136  135   * =====================
 137  136   *
 138  137   * Each Soft ring set is capable of switching its Rx ring between
 139  138   * interrupt and poll mode and actively 'polls' for packets in
 140  139   * poll mode. If the SRS is implementing a B/W limit, it makes
 141  140   * sure that only Max allowed packets are pulled in poll mode
 142  141   * and goes to poll mode as soon as B/W limit is exceeded. As
 143  142   * such, there are no overheads to implement B/W limits.
 144  143   *
 145  144   * In poll mode, it's better to keep the pipeline going where the
 146  145   * SRS worker thread keeps processing packets and poll thread
 147  146   * keeps bringing more packets (especially if they get to run
 148  147   * on different CPUs). This also prevents the overheads associated
 149  148   * with excessive signalling (on NUMA machines, this can be
 150  149   * pretty devastating). The exception is latency optimized case
 151  150   * where worker thread does no work and interrupt and poll thread
 152  151   * are allowed to do their own drain.
 153  152   *
 154  153   * We use the following policy to control Dynamic Polling:
 155  154   * 1) We switch to poll mode anytime the processing
 156  155   *    thread causes a backlog to build up in SRS and
 157  156   *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
 158  157   * 2) As long as the backlog stays under the low water
 159  158   *    mark (sr_lowat), we poll the H/W for more packets.
 160  159   * 3) If the backlog (sr_poll_pkt_cnt) exceeds low
 161  160   *    water mark, we stay in poll mode but don't poll
 162  161   *    the H/W for more packets.
 163  162   * 4) Anytime in polling mode, if we poll the H/W for
 164  163   *    packets and find nothing plus we have an existing
 165  164   *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
 166  165   *    mode but don't poll the H/W for packets anymore
 167  166   *    (let the polling thread go to sleep).
 168  167   * 5) Once the backlog is relieved (packets are processed)
 169  168   *    we reenable polling (by signalling the poll thread)
 170  169   *    only when the backlog dips below sr_poll_thres.
 171  170   * 6) sr_hiwat is used exclusively when we are not
 172  171   *    polling capable and is used to decide when to
 173  172   *    drop packets so the SRS queue length doesn't grow
 174  173   *    infinitely.
 175  174   *
 176  175   * NOTE: Also see the block level comment on top of mac_soft_ring.c
 177  176   */
 178  177  
 179  178  /*
 180  179   * mac_latency_optimize
 181  180   *
 182  181   * Controls whether the poll thread can process the packets inline
 183  182   * or let the SRS worker thread do the processing. This applies if
 184  183   * the SRS was not being processed. For latency sensitive traffic,
 185  184   * this needs to be true to allow inline processing. For throughput
 186  185   * under load, this should be false.
 187  186   *
 188  187   * This (and other similar) tunable should be rolled into a link
 189  188   * or flow specific workload hint that can be set using dladm
 190  189   * linkprop (instead of multiple such tunables).
 191  190   */
 192  191  boolean_t mac_latency_optimize = B_TRUE;
 193  192  
 194  193  /*
 195  194   * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
 196  195   *
 197  196   * queue a mp or chain in soft ring set and increment the
 198  197   * local count (srs_count) for the SRS and the shared counter
 199  198   * (sr_poll_pkt_cnt - shared between SRS and its soft rings
 200  199   * to track the total unprocessed packets for polling to work
 201  200   * correctly).
 202  201   *
 203  202   * The size (total bytes queued) counters are incremented only
 204  203   * if we are doing B/W control.
 205  204   */
 206  205  #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {         \
 207  206          ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 208  207          if ((mac_srs)->srs_last != NULL)                                \
 209  208                  (mac_srs)->srs_last->b_next = (head);                   \
 210  209          else                                                            \
 211  210                  (mac_srs)->srs_first = (head);                          \
 212  211          (mac_srs)->srs_last = (tail);                                   \
 213  212          (mac_srs)->srs_count += count;                                  \
 214  213  }
 215  214  
 216  215  #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {      \
 217  216          mac_srs_rx_t    *srs_rx = &(mac_srs)->srs_rx;                   \
 218  217                                                                          \
 219  218          MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);          \
 220  219          srs_rx->sr_poll_pkt_cnt += count;                               \
 221  220          ASSERT(srs_rx->sr_poll_pkt_cnt > 0);                            \
 222  221          if ((mac_srs)->srs_type & SRST_BW_CONTROL) {                    \
 223  222                  (mac_srs)->srs_size += (sz);                            \
 224  223                  mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock);           \
 225  224                  (mac_srs)->srs_bw->mac_bw_sz += (sz);                   \
 226  225                  mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock);            \
 227  226          }                                                               \
 228  227  }
 229  228  
 230  229  #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {      \
 231  230          mac_srs->srs_state |= SRS_ENQUEUED;                             \
 232  231          MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);          \
 233  232          if ((mac_srs)->srs_type & SRST_BW_CONTROL) {                    \
 234  233                  (mac_srs)->srs_size += (sz);                            \
 235  234                  (mac_srs)->srs_bw->mac_bw_sz += (sz);                   \
 236  235          }                                                               \
 237  236  }
 238  237  
 239  238  /*
 240  239   * Turn polling on routines
 241  240   */
 242  241  #define MAC_SRS_POLLING_ON(mac_srs) {                                   \
 243  242          ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 244  243          if (((mac_srs)->srs_state &                                     \
 245  244              (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) {    \
 246  245                  (mac_srs)->srs_state |= SRS_POLLING;                    \
 247  246                  (void) mac_hwring_disable_intr((mac_ring_handle_t)      \
 248  247                      (mac_srs)->srs_ring);                               \
 249  248                  (mac_srs)->srs_rx.sr_poll_on++;                         \
 250  249          }                                                               \
 251  250  }
 252  251  
 253  252  #define MAC_SRS_WORKER_POLLING_ON(mac_srs) {                            \
 254  253          ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 255  254          if (((mac_srs)->srs_state &                                     \
 256  255              (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) ==              \
 257  256              (SRS_POLLING_CAPAB|SRS_WORKER)) {                           \
 258  257                  (mac_srs)->srs_state |= SRS_POLLING;                    \
 259  258                  (void) mac_hwring_disable_intr((mac_ring_handle_t)      \
 260  259                      (mac_srs)->srs_ring);                               \
 261  260                  (mac_srs)->srs_rx.sr_worker_poll_on++;                  \
 262  261          }                                                               \
 263  262  }
 264  263  
 265  264  /*
 266  265   * MAC_SRS_POLL_RING
 267  266   *
 268  267   * Signal the SRS poll thread to poll the underlying H/W ring
 269  268   * provided it wasn't already polling (SRS_GET_PKTS was set).
 270  269   *
 271  270   * Poll thread gets to run only from mac_rx_srs_drain() and only
 272  271   * if the drain was being done by the worker thread.
 273  272   */
 274  273  #define MAC_SRS_POLL_RING(mac_srs) {                                    \
 275  274          mac_srs_rx_t    *srs_rx = &(mac_srs)->srs_rx;                   \
 276  275                                                                          \
 277  276          ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 278  277          srs_rx->sr_poll_thr_sig++;                                      \
 279  278          if (((mac_srs)->srs_state &                                     \
 280  279              (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) ==             \
 281  280                  (SRS_WORKER|SRS_POLLING_CAPAB)) {                       \
 282  281                  (mac_srs)->srs_state |= SRS_GET_PKTS;                   \
 283  282                  cv_signal(&(mac_srs)->srs_cv);                          \
 284  283          } else {                                                        \
 285  284                  srs_rx->sr_poll_thr_busy++;                             \
 286  285          }                                                               \
 287  286  }
 288  287  
 289  288  /*
 290  289   * MAC_SRS_CHECK_BW_CONTROL
 291  290   *
 292  291   * Check to see if next tick has started so we can reset the
 293  292   * SRS_BW_ENFORCED flag and allow more packets to come in the
 294  293   * system.
 295  294   */
 296  295  #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) {                             \
 297  296          ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 298  297          ASSERT(((mac_srs)->srs_type & SRST_TX) ||                       \
 299  298              MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock));               \
 300  299          clock_t now = ddi_get_lbolt();                                  \
 301  300          if ((mac_srs)->srs_bw->mac_bw_curr_time != now) {               \
 302  301                  (mac_srs)->srs_bw->mac_bw_curr_time = now;              \
 303  302                  (mac_srs)->srs_bw->mac_bw_used = 0;                     \
 304  303                  if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED)  \
 305  304                          (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
 306  305          }                                                               \
 307  306  }
 308  307  
 309  308  /*
 310  309   * MAC_SRS_WORKER_WAKEUP
 311  310   *
 312  311   * Wake up the SRS worker thread to process the queue as long as
 313  312   * no one else is processing the queue. If we are optimizing for
 314  313   * latency, we wake up the worker thread immediately or else we
 315  314   * wait mac_srs_worker_wakeup_ticks before worker thread gets
 316  315   * woken up.
 317  316   */
 318  317  int mac_srs_worker_wakeup_ticks = 0;
 319  318  #define MAC_SRS_WORKER_WAKEUP(mac_srs) {                                \
 320  319          ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 321  320          if (!((mac_srs)->srs_state & SRS_PROC) &&                       \
 322  321                  (mac_srs)->srs_tid == NULL) {                           \
 323  322                  if (((mac_srs)->srs_state & SRS_LATENCY_OPT) ||         \
 324  323                          (mac_srs_worker_wakeup_ticks == 0))             \
 325  324                          cv_signal(&(mac_srs)->srs_async);               \
 326  325                  else                                                    \
 327  326                          (mac_srs)->srs_tid =                            \
 328  327                                  timeout(mac_srs_fire, (mac_srs),        \
 329  328                                          mac_srs_worker_wakeup_ticks);   \
 330  329          }                                                               \
 331  330  }
 332  331  
 333  332  #define TX_BANDWIDTH_MODE(mac_srs)                              \
 334  333          ((mac_srs)->srs_tx.st_mode == SRS_TX_BW ||              \
 335  334              (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT ||    \
 336  335              (mac_srs)->srs_tx.st_mode == SRS_TX_BW_AGGR)
 337  336  
 338  337  #define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) {                      \
 339  338          if (tx_mode == SRS_TX_BW_FANOUT)                                \
 340  339                  (void) mac_tx_fanout_mode(mac_srs, head, hint, 0, NULL);\
 341  340          else                                                            \
 342  341                  (void) mac_tx_aggr_mode(mac_srs, head, hint, 0, NULL);  \
 343  342  }
 344  343  
 345  344  /*
 346  345   * MAC_TX_SRS_BLOCK
 347  346   *
 348  347   * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
 349  348   * will be set only if srs_tx.st_woken_up is FALSE. If
 350  349   * srs_tx.st_woken_up is TRUE, it indicates that the wakeup arrived
 351  350   * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
 352  351   * attempt to transmit again and not setting SRS_TX_BLOCKED does
 353  352   * that.
 354  353   */
 355  354  #define MAC_TX_SRS_BLOCK(srs, mp)       {                       \
 356  355          ASSERT(MUTEX_HELD(&(srs)->srs_lock));                   \
 357  356          if ((srs)->srs_tx.st_woken_up) {                        \
 358  357                  (srs)->srs_tx.st_woken_up = B_FALSE;            \
 359  358          } else {                                                \
 360  359                  ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED));   \
 361  360                  (srs)->srs_state |= SRS_TX_BLOCKED;             \
 362  361                  (srs)->srs_tx.st_stat.mts_blockcnt++;           \
 363  362          }                                                       \
 364  363  }
 365  364  
 366  365  /*
 367  366   * MAC_TX_SRS_TEST_HIWAT
 368  367   *
 369  368   * Called before queueing a packet onto Tx SRS to test and set
 370  369   * SRS_TX_HIWAT if srs_count exceeds srs_tx.st_hiwat.
 371  370   */
 372  371  #define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) {         \
 373  372          boolean_t enqueue = 1;                                          \
 374  373                                                                          \
 375  374          if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) {                \
 376  375                  /*                                                      \
 377  376                   * flow-controlled. Store srs in cookie so that it      \
 378  377                   * can be returned as mac_tx_cookie_t to client         \
 379  378                   */                                                     \
 380  379                  (srs)->srs_state |= SRS_TX_HIWAT;                       \
 381  380                  cookie = (mac_tx_cookie_t)srs;                          \
 382  381                  (srs)->srs_tx.st_hiwat_cnt++;                           \
 383  382                  if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) {    \
 384  383                          /* increment freed stats */                     \
 385  384                          (srs)->srs_tx.st_stat.mts_sdrops += cnt;        \
 386  385                          /*                                              \
 387  386                           * b_prev may be set to the fanout hint         \
 388  387                           * hence can't use freemsg directly             \
 389  388                           */                                             \
 390  389                          mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);    \
 391  390                          DTRACE_PROBE1(tx_queued_hiwat,                  \
 392  391                              mac_soft_ring_set_t *, srs);                \
 393  392                          enqueue = 0;                                    \
 394  393                  }                                                       \
 395  394          }                                                               \
 396  395          if (enqueue)                                                    \
 397  396                  MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz);       \
 398  397  }
 399  398  
 400  399  /* Some utility macros */
 401  400  #define MAC_SRS_BW_LOCK(srs)                                            \
 402  401          if (!(srs->srs_type & SRST_TX))                                 \
 403  402                  mutex_enter(&srs->srs_bw->mac_bw_lock);
 404  403  
 405  404  #define MAC_SRS_BW_UNLOCK(srs)                                          \
 406  405          if (!(srs->srs_type & SRST_TX))                                 \
 407  406                  mutex_exit(&srs->srs_bw->mac_bw_lock);
 408  407  
 409  408  #define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) {              \
 410  409          mac_pkt_drop(NULL, NULL, mp, B_FALSE);                  \
 411  410          /* increment freed stats */                             \
 412  411          mac_srs->srs_tx.st_stat.mts_sdrops++;                   \
 413  412          cookie = (mac_tx_cookie_t)srs;                          \
 414  413  }
 415  414  
 416  415  #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) {          \
 417  416          mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT;                     \
 418  417          cookie = (mac_tx_cookie_t)srs;                                  \
 419  418          *ret_mp = mp_chain;                                             \
 420  419  }
 421  420  
 422  421  /*
 423  422   * MAC_RX_SRS_TOODEEP
 424  423   *
 425  424   * Macro called as part of receive-side processing to determine if handling
 426  425   * can occur in situ (in the interrupt thread) or if it should be left to a
 427  426   * worker thread.  Note that the constant used to make this determination is
 428  427   * not entirely made-up, and is a result of some empirical validation. That
 429  428   * said, the constant is left as a static variable to allow it to be
 430  429   * dynamically tuned in the field if and as needed.
 431  430   */
 432  431  static uintptr_t mac_rx_srs_stack_needed = 10240;
 433  432  static uint_t mac_rx_srs_stack_toodeep;
 434  433  
 435  434  #ifndef STACK_GROWTH_DOWN
 436  435  #error Downward stack growth assumed.
 437  436  #endif
 438  437  
 439  438  #define MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
 440  439          (uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \
 441  440          ++mac_rx_srs_stack_toodeep)
 442  441  
 443  442  
 444  443  /*
 445  444   * Drop a single, unchained rx packet and bump the SRS drop counter.
 446  445   */
 447  446  static void
 448  447  mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
 449  448  {
 450  449          mac_srs_rx_t    *srs_rx = &srs->srs_rx;
 451  450  
 452  451          ASSERT(mp->b_next == NULL);
 453  452          mutex_enter(&srs->srs_lock);
 454  453          MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
 455  454          MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
 456  455          mutex_exit(&srs->srs_lock);
 457  456  
 458  457          srs_rx->sr_stat.mrs_sdrops++;
 459  458          freemsg(mp);
 460  459  }
 461  460  
 462  461  /* DATAPATH RUNTIME ROUTINES */
 463  462  
 464  463  /*
 465  464   * mac_srs_fire
 466  465   *
 467  466   * Timer callback routine for waking up the SRS worker thread.
 468  467   */
 469  468  static void
 470  469  mac_srs_fire(void *arg)
 471  470  {
 472  471          mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
 473  472  
 474  473          mutex_enter(&mac_srs->srs_lock);
 475  474          if (mac_srs->srs_tid == 0) {
 476  475                  mutex_exit(&mac_srs->srs_lock);
 477  476                  return;
 478  477          }
 479  478  
 480  479          mac_srs->srs_tid = 0;
 481  480          if (!(mac_srs->srs_state & SRS_PROC))
 482  481                  cv_signal(&mac_srs->srs_async);
 483  482  
 484  483          mutex_exit(&mac_srs->srs_lock);
 485  484  }
 486  485  
 487  486  /*
 488  487   * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack,
 489  488   * and it is used on the TX path.
 490  489   */
 491  490  #define HASH_HINT(hint) \
 492  491          ((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))
 493  492  
 494  493  
 495  494  /*
 496  495   * hash based on the src address and the port information.
 497  496   */
 498  497  #define HASH_ADDR(src, ports)                                   \
 499  498          (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^     \
 500  499          ((ports) >> 8) ^ (ports))
 501  500  
 502  501  #define COMPUTE_INDEX(key, sz)  (key % sz)
 503  502  
 504  503  #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) {       \
 505  504          if ((tail) != NULL) {                                           \
 506  505                  ASSERT((tail)->b_next == NULL);                         \
 507  506                  (tail)->b_next = (mp);                                  \
 508  507          } else {                                                        \
 509  508                  ASSERT((head) == NULL);                                 \
 510  509                  (head) = (mp);                                          \
 511  510          }                                                               \
 512  511          (tail) = (mp);                                                  \
 513  512          (cnt)++;                                                        \
 514  513          if ((bw_ctl))                                                   \
 515  514                  (sz) += (sz0);                                          \
 516  515  }
 517  516  
 518  517  #define MAC_FANOUT_DEFAULT      0
 519  518  #define MAC_FANOUT_RND_ROBIN    1
 520  519  int mac_fanout_type = MAC_FANOUT_DEFAULT;
 521  520  
 522  521  #define MAX_SR_TYPES    3

↓ open down ↓

486 lines elided

↑ open up ↑

 523  522  /* fanout types for port based hashing */
 524  523  enum pkt_type {
 525  524          V4_TCP = 0,
 526  525          V4_UDP,
 527  526          OTH,
 528  527          UNDEF
 529  528  };
 530  529  
 531  530  /*
 532  531   * In general we do port based hashing to spread traffic over different
 533      - * softrings. The below tunable allows to override that behavior. Setting it
 534      - * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior
 535      - * is also the applicable to ipv6 packets carrying multiple optional headers
 536      - * and other uncommon packet types.
      532 + * softrings. The below tunables allow overriding that behavior. Setting one
      533 + * (depending on IPv6 or IPv4) to B_TRUE allows a fanout based on src
      534 + * IPv6 or IPv4 address. This behavior is also applicable to IPv6 packets
      535 + * carrying multiple optional headers and other uncommon packet types.
 537  536   */
 538  537  boolean_t mac_src_ipv6_fanout = B_FALSE;
      538 +boolean_t mac_src_ipv4_fanout = B_FALSE;
 539  539  
 540  540  /*
 541  541   * Pair of local and remote ports in the transport header
 542  542   */
 543  543  #define PORTS_SIZE 4
 544  544  
 545  545  /*
 546  546   * mac_rx_srs_proto_fanout
 547  547   *
 548  548   * This routine delivers packets destined to an SRS into one of the

 549  549   * protocol soft rings.
 550  550   *
 551  551   * Given a chain of packets we need to split it up into multiple sub chains
 552  552   * destined into TCP, UDP or OTH soft ring. Instead of entering
 553  553   * the soft ring one packet at a time, we want to enter it in the form of a
 554  554   * chain otherwise we get this start/stop behaviour where the worker thread
 555  555   * goes to sleep and then the next packet comes in forcing it to wake up etc.
 556  556   */
 557  557  static void
 558  558  mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
 559  559  {
 560  560          struct ether_header             *ehp;
 561  561          struct ether_vlan_header        *evhp;
 562  562          uint32_t                        sap;
 563  563          ipha_t                          *ipha;
 564  564          uint8_t                         *dstaddr;
 565  565          size_t                          hdrsize;
 566  566          mblk_t                          *mp;
 567  567          mblk_t                          *headmp[MAX_SR_TYPES];
 568  568          mblk_t                          *tailmp[MAX_SR_TYPES];
 569  569          int                             cnt[MAX_SR_TYPES];
 570  570          size_t                          sz[MAX_SR_TYPES];
 571  571          size_t                          sz1;
 572  572          boolean_t                       bw_ctl;
 573  573          boolean_t                       hw_classified;
 574  574          boolean_t                       dls_bypass;
 575  575          boolean_t                       is_ether;
 576  576          boolean_t                       is_unicast;
 577  577          enum pkt_type                   type;
 578  578          mac_client_impl_t               *mcip = mac_srs->srs_mcip;
 579  579  
 580  580          is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
 581  581          bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
 582  582  
 583  583          /*
 584  584           * If we don't have a Rx ring, S/W classification would have done
 585  585           * its job and it's a packet meant for us. If we were polling on
 586  586           * the default ring (i.e. there was a ring assigned to this SRS),
 587  587           * then we need to make sure that the mac address really belongs
 588  588           * to us.
 589  589           */
 590  590          hw_classified = mac_srs->srs_ring != NULL &&
 591  591              mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
 592  592  
 593  593          /*
 594  594           * Special clients (eg. VLAN, non ether, etc) need DLS
 595  595           * processing in the Rx path. SRST_DLS_BYPASS will be clear for
 596  596           * such SRSs. Another way of disabling bypass is to set the
 597  597           * MCIS_RX_BYPASS_DISABLE flag.
 598  598           */
 599  599          dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
 600  600              ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
 601  601  
 602  602          bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
 603  603          bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
 604  604          bzero(cnt, MAX_SR_TYPES * sizeof (int));
 605  605          bzero(sz, MAX_SR_TYPES * sizeof (size_t));
 606  606  
 607  607          /*
 608  608           * We got a chain from SRS that we need to send to the soft rings.
 609  609           * Since squeues for TCP & IPv4 sap poll their soft rings (for
 610  610           * performance reasons), we need to separate out v4_tcp, v4_udp
 611  611           * and the rest goes in other.
 612  612           */
 613  613          while (head != NULL) {
 614  614                  mp = head;
 615  615                  head = head->b_next;
 616  616                  mp->b_next = NULL;
 617  617  
 618  618                  type = OTH;
 619  619                  sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
 620  620  
 621  621                  if (is_ether) {
 622  622                          /*
 623  623                           * At this point we can be sure the packet at least
 624  624                           * has an ether header.
 625  625                           */
 626  626                          if (sz1 < sizeof (struct ether_header)) {
 627  627                                  mac_rx_drop_pkt(mac_srs, mp);
 628  628                                  continue;
 629  629                          }
 630  630                          ehp = (struct ether_header *)mp->b_rptr;
 631  631  
 632  632                          /*
 633  633                           * Determine if this is a VLAN or non-VLAN packet.
 634  634                           */
 635  635                          if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
 636  636                                  evhp = (struct ether_vlan_header *)mp->b_rptr;
 637  637                                  sap = ntohs(evhp->ether_type);
 638  638                                  hdrsize = sizeof (struct ether_vlan_header);
 639  639                                  /*
 640  640                                   * Check if the VID of the packet, if any,
 641  641                                   * belongs to this client.
 642  642                                   */
 643  643                                  if (!mac_client_check_flow_vid(mcip,
 644  644                                      VLAN_ID(ntohs(evhp->ether_tci)))) {
 645  645                                          mac_rx_drop_pkt(mac_srs, mp);
 646  646                                          continue;
 647  647                                  }
 648  648                          } else {
 649  649                                  hdrsize = sizeof (struct ether_header);
 650  650                          }
 651  651                          is_unicast =
 652  652                              ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
 653  653                          dstaddr = (uint8_t *)&ehp->ether_dhost;
 654  654                  } else {
 655  655                          mac_header_info_t               mhi;
 656  656  
 657  657                          if (mac_header_info((mac_handle_t)mcip->mci_mip,
 658  658                              mp, &mhi) != 0) {
 659  659                                  mac_rx_drop_pkt(mac_srs, mp);
 660  660                                  continue;
 661  661                          }
 662  662                          hdrsize = mhi.mhi_hdrsize;
 663  663                          sap = mhi.mhi_bindsap;
 664  664                          is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
 665  665                          dstaddr = (uint8_t *)mhi.mhi_daddr;
 666  666                  }
 667  667  
 668  668                  if (!dls_bypass) {
 669  669                          FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
 670  670                              cnt[type], bw_ctl, sz[type], sz1, mp);
 671  671                          continue;
 672  672                  }
 673  673  
 674  674                  if (sap == ETHERTYPE_IP) {
 675  675                          /*
 676  676                           * If we are H/W classified, but we have promisc
 677  677                           * on, then we need to check for the unicast address.
 678  678                           */
 679  679                          if (hw_classified && mcip->mci_promisc_list != NULL) {
 680  680                                  mac_address_t           *map;
 681  681  
 682  682                                  rw_enter(&mcip->mci_rw_lock, RW_READER);
 683  683                                  map = mcip->mci_unicast;
 684  684                                  if (bcmp(dstaddr, map->ma_addr,
 685  685                                      map->ma_len) == 0)
 686  686                                          type = UNDEF;
 687  687                                  rw_exit(&mcip->mci_rw_lock);
 688  688                          } else if (is_unicast) {
 689  689                                  type = UNDEF;
 690  690                          }
 691  691                  }
 692  692  
 693  693                  /*
 694  694                   * This needs to become a contract with the driver for
 695  695                   * the fast path.
 696  696                   *
 697  697                   * In the normal case the packet will have at least the L2
 698  698                   * header and the IP + Transport header in the same mblk.
 699  699                   * This is usually the case when the NIC driver sends up
 700  700                   * the packet. This is also true when the stack generates
 701  701                   * a packet that is looped back and when the stack uses the
 702  702                   * fastpath mechanism. The normal case is optimized for
 703  703                   * performance and may bypass DLS. All other cases go through
 704  704                   * the 'OTH' type path without DLS bypass.
 705  705                   */
 706  706  
 707  707                  ipha = (ipha_t *)(mp->b_rptr + hdrsize);
 708  708                  if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
 709  709                          type = OTH;
 710  710  
 711  711                  if (type == OTH) {
 712  712                          FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
 713  713                              cnt[type], bw_ctl, sz[type], sz1, mp);
 714  714                          continue;
 715  715                  }
 716  716  
 717  717                  ASSERT(type == UNDEF);
 718  718                  /*
 719  719                   * We look for at least 4 bytes past the IP header to get
 720  720                   * the port information. If we get an IP fragment, we don't
 721  721                   * have the port information, and we use just the protocol
 722  722                   * information.
 723  723                   */
 724  724                  switch (ipha->ipha_protocol) {
 725  725                  case IPPROTO_TCP:
 726  726                          type = V4_TCP;
 727  727                          mp->b_rptr += hdrsize;
 728  728                          break;
 729  729                  case IPPROTO_UDP:
 730  730                          type = V4_UDP;
 731  731                          mp->b_rptr += hdrsize;
 732  732                          break;
 733  733                  default:
 734  734                          type = OTH;
 735  735                          break;
 736  736                  }
 737  737  
 738  738                  FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
 739  739                      bw_ctl, sz[type], sz1, mp);
 740  740          }
 741  741  
 742  742          for (type = V4_TCP; type < UNDEF; type++) {
 743  743                  if (headmp[type] != NULL) {
 744  744                          mac_soft_ring_t                 *softring;
 745  745  
 746  746                          ASSERT(tailmp[type]->b_next == NULL);
 747  747                          switch (type) {
 748  748                          case V4_TCP:
 749  749                                  softring = mac_srs->srs_tcp_soft_rings[0];
 750  750                                  break;
 751  751                          case V4_UDP:
 752  752                                  softring = mac_srs->srs_udp_soft_rings[0];

↓ open down ↓

204 lines elided

↑ open up ↑

 753  753                                  break;
 754  754                          case OTH:
 755  755                                  softring = mac_srs->srs_oth_soft_rings[0];
 756  756                          }
 757  757                          mac_rx_soft_ring_process(mcip, softring,
 758  758                              headmp[type], tailmp[type], cnt[type], sz[type]);
 759  759                  }
 760  760          }
 761  761  }
 762  762  
 763      -int     fanout_unalligned = 0;
      763 +int     fanout_unaligned = 0;
 764  764  
 765  765  /*
 766  766   * mac_rx_srs_long_fanout
 767  767   *
 768      - * The fanout routine for IPv6
      768 + * The fanout routine for VLANs, and for anything else that isn't performing
      769 + * explicit dls bypass.  Returns -1 on an error (drop the packet due to a
      770 + * malformed packet), 0 on success, with values written in *indx and *type.
 769  771   */
 770  772  static int
 771  773  mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
 772  774      uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
 773  775  {
 774  776          ip6_t           *ip6h;
      777 +        ipha_t          *ipha;
 775  778          uint8_t         *whereptr;
 776  779          uint_t          hash;
 777  780          uint16_t        remlen;
 778  781          uint8_t         nexthdr;
 779  782          uint16_t        hdr_len;
      783 +        uint32_t        src_val;
      784 +        boolean_t       modifiable = B_TRUE;
      785 +        boolean_t       v6;
 780  786  
      787 +        ASSERT(MBLKL(mp) >= hdrsize);
      788 +
 781  789          if (sap == ETHERTYPE_IPV6) {
 782      -                boolean_t       modifiable = B_TRUE;
      790 +                v6 = B_TRUE;
      791 +                hdr_len = IPV6_HDR_LEN;
      792 +        } else if (sap == ETHERTYPE_IP) {
      793 +                v6 = B_FALSE;
      794 +                hdr_len = IP_SIMPLE_HDR_LENGTH;
      795 +        } else {
      796 +                *indx = 0;
      797 +                *type = OTH;
      798 +                return (0);
      799 +        }
 783  800  
 784      -                ASSERT(MBLKL(mp) >= hdrsize);
      801 +        ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
      802 +        ipha = (ipha_t *)ip6h;
 785  803  
 786      -                ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
 787      -                if ((unsigned char *)ip6h == mp->b_wptr) {
 788      -                        /*
 789      -                         * The first mblk_t only includes the mac header.
 790      -                         * Note that it is safe to change the mp pointer here,
 791      -                         * as the subsequent operation does not assume mp
 792      -                         * points to the start of the mac header.
 793      -                         */
 794      -                        mp = mp->b_cont;
      804 +        if ((uint8_t *)ip6h == mp->b_wptr) {
      805 +                /*
      806 +                 * The first mblk_t only includes the mac header.
      807 +                 * Note that it is safe to change the mp pointer here,
      808 +                 * as the subsequent operation does not assume mp
      809 +                 * points to the start of the mac header.
      810 +                 */
      811 +                mp = mp->b_cont;
 795  812  
 796      -                        /*
 797      -                         * Make sure ip6h holds the full ip6_t structure.
 798      -                         */
 799      -                        if (mp == NULL)
 800      -                                return (-1);
      813 +                /*
      814 +                 * Make sure there is an entire IP header to examine.
      815 +                 */
      816 +                if (mp == NULL)
      817 +                        return (-1);
 801  818  
 802      -                        if (MBLKL(mp) < IPV6_HDR_LEN) {
 803      -                                modifiable = (DB_REF(mp) == 1);
      819 +                if (MBLKL(mp) < hdr_len) {
      820 +                        modifiable = (DB_REF(mp) == 1);
 804  821  
 805      -                                if (modifiable &&
 806      -                                    !pullupmsg(mp, IPV6_HDR_LEN)) {
 807      -                                        return (-1);
 808      -                                }
 809      -                        }
 810      -
 811      -                        ip6h = (ip6_t *)mp->b_rptr;
      822 +                        if (modifiable && !pullupmsg(mp, hdr_len))
      823 +                                return (-1);
 812  824                  }
 813  825  
 814      -                if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
 815      -                    ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
 816      -                        /*
 817      -                         * If either ip6h is not alligned, or ip6h does not
 818      -                         * hold the complete ip6_t structure (a pullupmsg()
 819      -                         * is not an option since it would result in an
 820      -                         * unalligned ip6h), fanout to the default ring. Note
 821      -                         * that this may cause packets reordering.
 822      -                         */
 823      -                        *indx = 0;
 824      -                        *type = OTH;
 825      -                        fanout_unalligned++;
 826      -                        return (0);
 827      -                }
      826 +                ip6h = (ip6_t *)mp->b_rptr;
      827 +                ipha = (ipha_t *)ip6h;
      828 +        }
 828  829  
      830 +        if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
      831 +            ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
      832 +                /*
      833 +                 * If either the IP header is not aligned, or it does not hold
      834 +                 * the complete simple structure (a pullupmsg() is not an
      835 +                 * option since it would result in an unaligned IP header),
      836 +                 * fanout to the default ring.
      837 +                 *
      838 +                 * Note that this may cause packet reordering.
      839 +                 */
      840 +                *indx = 0;
      841 +                *type = OTH;
      842 +                fanout_unaligned++;
      843 +                return (0);
      844 +        }
      845 +
      846 +        /*
      847 +         * Extract next-header, full header length, and source-hash value
      848 +         * using v4/v6 specific fields.
      849 +         */
      850 +        if (v6) {
 829  851                  remlen = ntohs(ip6h->ip6_plen);
 830  852                  nexthdr = ip6h->ip6_nxt;
 831      -
 832      -                if (remlen < MIN_EHDR_LEN)
 833      -                        return (-1);
      853 +                src_val = V4_PART_OF_V6(ip6h->ip6_src);
 834  854                  /*
 835  855                   * Do src based fanout if below tunable is set to B_TRUE or
 836  856                   * when mac_ip_hdr_length_v6() fails because of malformed
 837      -                 * packets or because mblk's need to be concatenated using
      857 +                 * packets or because mblks need to be concatenated using
 838  858                   * pullupmsg().
 839  859                   */
 840  860                  if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
 841  861                      mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
 842  862                          goto src_based_fanout;
 843  863                  }
 844      -                whereptr = (uint8_t *)ip6h + hdr_len;
 845      -
 846      -                /* If the transport is one of below, we do port based fanout */
 847      -                switch (nexthdr) {
 848      -                case IPPROTO_TCP:
 849      -                case IPPROTO_UDP:
 850      -                case IPPROTO_SCTP:
 851      -                case IPPROTO_ESP:
 852      -                        /*
 853      -                         * If the ports in the transport header is not part of
 854      -                         * the mblk, do src_based_fanout, instead of calling
 855      -                         * pullupmsg().
 856      -                         */
 857      -                        if (mp->b_cont != NULL &&
 858      -                            whereptr + PORTS_SIZE > mp->b_wptr) {
 859      -                                goto src_based_fanout;
 860      -                        }
 861      -                        break;
 862      -                default:
 863      -                        break;
      864 +        } else {
      865 +                hdr_len = IPH_HDR_LENGTH(ipha);
      866 +                remlen = ntohs(ipha->ipha_length) - hdr_len;
      867 +                nexthdr = ipha->ipha_protocol;
      868 +                src_val = (uint32_t)ipha->ipha_src;
      869 +                /*
      870 +                 * Catch IPv4 fragment case here.  IPv6 has nexthdr == FRAG
      871 +                 * for its equivalent case.
      872 +                 */
      873 +                if (mac_src_ipv4_fanout ||
      874 +                    (ntohs(ipha->ipha_fragment_offset_and_flags) &
      875 +                    (IPH_MF | IPH_OFFSET)) != 0) {
      876 +                        goto src_based_fanout;
 864  877                  }
      878 +        }
      879 +        if (remlen < MIN_EHDR_LEN)
      880 +                return (-1);
      881 +        whereptr = (uint8_t *)ip6h + hdr_len;
 865  882  
 866      -                switch (nexthdr) {
 867      -                case IPPROTO_TCP:
 868      -                        hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
 869      -                            *(uint32_t *)whereptr);
 870      -                        *indx = COMPUTE_INDEX(hash,
 871      -                            mac_srs->srs_tcp_ring_count);
 872      -                        *type = OTH;
 873      -                        break;
      883 +        /* If the transport is one of below, we do port/SPI based fanout */
      884 +        switch (nexthdr) {
      885 +        case IPPROTO_TCP:
      886 +        case IPPROTO_UDP:
      887 +        case IPPROTO_SCTP:
      888 +        case IPPROTO_ESP:
      889 +                /*
      890 +                 * If the ports or SPI in the transport header are not part of
      891 +                 * the mblk, do src_based_fanout, instead of calling
      892 +                 * pullupmsg().
      893 +                 */
      894 +                if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr)
      895 +                        break;  /* out of switch... */
      896 +                /* FALLTHRU */
      897 +        default:
      898 +                goto src_based_fanout;
      899 +        }
 874  900  
 875      -                case IPPROTO_UDP:
 876      -                case IPPROTO_SCTP:
 877      -                case IPPROTO_ESP:
 878      -                        if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
 879      -                                hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
 880      -                                    *(uint32_t *)whereptr);
 881      -                                *indx = COMPUTE_INDEX(hash,
 882      -                                    mac_srs->srs_udp_ring_count);
 883      -                        } else {
 884      -                                *indx = mac_srs->srs_ind %
 885      -                                    mac_srs->srs_udp_ring_count;
 886      -                                mac_srs->srs_ind++;
 887      -                        }
 888      -                        *type = OTH;
 889      -                        break;
 890      -
 891      -                        /* For all other protocol, do source based fanout */
 892      -                default:
 893      -                        goto src_based_fanout;
      901 +        switch (nexthdr) {
      902 +        case IPPROTO_TCP:
      903 +                hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
      904 +                *indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
      905 +                *type = OTH;
      906 +                break;
      907 +        case IPPROTO_UDP:
      908 +        case IPPROTO_SCTP:
      909 +        case IPPROTO_ESP:
      910 +                if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
      911 +                        hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
      912 +                        *indx = COMPUTE_INDEX(hash,
      913 +                            mac_srs->srs_udp_ring_count);
      914 +                } else {
      915 +                        *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
      916 +                        mac_srs->srs_ind++;
 894  917                  }
 895      -        } else {
 896      -                *indx = 0;
 897  918                  *type = OTH;
      919 +                break;
 898  920          }
 899  921          return (0);
 900  922  
 901  923  src_based_fanout:
 902      -        hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
      924 +        hash = HASH_ADDR(src_val, (uint32_t)0);
 903  925          *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
 904  926          *type = OTH;
 905  927          return (0);
 906  928  }
 907  929  
 908  930  /*
 909  931   * mac_rx_srs_fanout
 910  932   *
 911  933   * This routine delivers packets destined to an SRS into a soft ring member
 912  934   * of the set.

 913  935   *
 914  936   * Given a chain of packets we need to split it up into multiple sub chains
 915  937   * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
 916  938   * the soft ring one packet at a time, we want to enter it in the form of a
 917  939   * chain otherwise we get this start/stop behaviour where the worker thread
 918  940   * goes to sleep and then the next packet comes in forcing it to wake up etc.
 919  941   *
 920  942   * Note:
 921  943   * Since we know what is the maximum fanout possible, we create a 2D array
 922  944   * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
 923  945   * variables so that we can enter the softrings with chain. We need the
 924  946   * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
 925  947   * for each packet would be expensive). If we ever want to have the
 926  948   * ability to have unlimited fanout, we should probably declare a head,
 927  949   * tail, cnt, sz with each soft ring (a data struct which contains a softring
 928  950   * along with these members) and create an array of this uber struct so we
 929  951   * don't have to do kmem_alloc.
 930  952   */
 931  953  int     fanout_oth1 = 0;
 932  954  int     fanout_oth2 = 0;
 933  955  int     fanout_oth3 = 0;
 934  956  int     fanout_oth4 = 0;
 935  957  int     fanout_oth5 = 0;
 936  958  
 937  959  static void
 938  960  mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
 939  961  {
 940  962          struct ether_header             *ehp;
 941  963          struct ether_vlan_header        *evhp;
 942  964          uint32_t                        sap;
 943  965          ipha_t                          *ipha;
 944  966          uint8_t                         *dstaddr;
 945  967          uint_t                          indx;
 946  968          size_t                          ports_offset;
 947  969          size_t                          ipha_len;
 948  970          size_t                          hdrsize;
 949  971          uint_t                          hash;
 950  972          mblk_t                          *mp;
 951  973          mblk_t                          *headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
 952  974          mblk_t                          *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
 953  975          int                             cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
 954  976          size_t                          sz[MAX_SR_TYPES][MAX_SR_FANOUT];
 955  977          size_t                          sz1;
 956  978          boolean_t                       bw_ctl;
 957  979          boolean_t                       hw_classified;
 958  980          boolean_t                       dls_bypass;
 959  981          boolean_t                       is_ether;
 960  982          boolean_t                       is_unicast;
 961  983          int                             fanout_cnt;
 962  984          enum pkt_type                   type;
 963  985          mac_client_impl_t               *mcip = mac_srs->srs_mcip;
 964  986  
 965  987          is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
 966  988          bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
 967  989  
 968  990          /*
 969  991           * If we don't have a Rx ring, S/W classification would have done
 970  992           * its job and it's a packet meant for us. If we were polling on
 971  993           * the default ring (i.e. there was a ring assigned to this SRS),
 972  994           * then we need to make sure that the mac address really belongs
 973  995           * to us.
 974  996           */
 975  997          hw_classified = mac_srs->srs_ring != NULL &&
 976  998              mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
 977  999  
 978 1000          /*
 979 1001           * Special clients (eg. VLAN, non ether, etc) need DLS
 980 1002           * processing in the Rx path. SRST_DLS_BYPASS will be clear for
 981 1003           * such SRSs. Another way of disabling bypass is to set the
 982 1004           * MCIS_RX_BYPASS_DISABLE flag.
 983 1005           */
 984 1006          dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
 985 1007              ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
 986 1008  
 987 1009          /*
 988 1010           * Since the softrings are never destroyed and we always
 989 1011           * create equal number of softrings for TCP, UDP and rest,
 990 1012           * it's OK to check one of them for count and use it without
 991 1013           * any lock. In future, if soft rings get destroyed because
 992 1014           * of reduction in fanout, we will need to ensure that happens
 993 1015           * behind the SRS_PROC.
 994 1016           */
 995 1017          fanout_cnt = mac_srs->srs_tcp_ring_count;
 996 1018  
 997 1019          bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
 998 1020          bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
 999 1021          bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
1000 1022          bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
1001 1023  
1002 1024          /*
1003 1025           * We got a chain from SRS that we need to send to the soft rings.
1004 1026           * Since squeues for TCP & IPv4 sap poll their soft rings (for
1005 1027           * performance reasons), we need to separate out v4_tcp, v4_udp
1006 1028           * and the rest goes in other.
1007 1029           */
1008 1030          while (head != NULL) {
1009 1031                  mp = head;
1010 1032                  head = head->b_next;
1011 1033                  mp->b_next = NULL;
1012 1034  
1013 1035                  type = OTH;
1014 1036                  sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1015 1037  
1016 1038                  if (is_ether) {
1017 1039                          /*
1018 1040                           * At this point we can be sure the packet at least
1019 1041                           * has an ether header.
1020 1042                           */
1021 1043                          if (sz1 < sizeof (struct ether_header)) {
1022 1044                                  mac_rx_drop_pkt(mac_srs, mp);
1023 1045                                  continue;
1024 1046                          }
1025 1047                          ehp = (struct ether_header *)mp->b_rptr;
1026 1048  
1027 1049                          /*
1028 1050                           * Determine if this is a VLAN or non-VLAN packet.
1029 1051                           */
1030 1052                          if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1031 1053                                  evhp = (struct ether_vlan_header *)mp->b_rptr;
1032 1054                                  sap = ntohs(evhp->ether_type);
1033 1055                                  hdrsize = sizeof (struct ether_vlan_header);
1034 1056                                  /*
1035 1057                                   * Check if the VID of the packet, if any,
1036 1058                                   * belongs to this client.
1037 1059                                   */
1038 1060                                  if (!mac_client_check_flow_vid(mcip,
1039 1061                                      VLAN_ID(ntohs(evhp->ether_tci)))) {
1040 1062                                          mac_rx_drop_pkt(mac_srs, mp);
1041 1063                                          continue;
1042 1064                                  }
1043 1065                          } else {
1044 1066                                  hdrsize = sizeof (struct ether_header);
1045 1067                          }
1046 1068                          is_unicast =
1047 1069                              ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
1048 1070                          dstaddr = (uint8_t *)&ehp->ether_dhost;
1049 1071                  } else {
1050 1072                          mac_header_info_t               mhi;
1051 1073  
1052 1074                          if (mac_header_info((mac_handle_t)mcip->mci_mip,
1053 1075                              mp, &mhi) != 0) {
1054 1076                                  mac_rx_drop_pkt(mac_srs, mp);
1055 1077                                  continue;
1056 1078                          }
1057 1079                          hdrsize = mhi.mhi_hdrsize;
1058 1080                          sap = mhi.mhi_bindsap;
1059 1081                          is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
1060 1082                          dstaddr = (uint8_t *)mhi.mhi_daddr;
1061 1083                  }
1062 1084  
1063 1085                  if (!dls_bypass) {
1064 1086                          if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1065 1087                              hdrsize, &type, &indx) == -1) {
1066 1088                                  mac_rx_drop_pkt(mac_srs, mp);
1067 1089                                  continue;
1068 1090                          }
1069 1091  
1070 1092                          FANOUT_ENQUEUE_MP(headmp[type][indx],
1071 1093                              tailmp[type][indx], cnt[type][indx], bw_ctl,
1072 1094                              sz[type][indx], sz1, mp);
1073 1095                          continue;
1074 1096                  }
1075 1097  
1076 1098  
1077 1099                  /*
1078 1100                   * If we are using the default Rx ring where H/W or S/W
1079 1101                   * classification has not happened, we need to verify if
1080 1102                   * this unicast packet really belongs to us.
1081 1103                   */
1082 1104                  if (sap == ETHERTYPE_IP) {
1083 1105                          /*
1084 1106                           * If we are H/W classified, but we have promisc
1085 1107                           * on, then we need to check for the unicast address.
1086 1108                           */
1087 1109                          if (hw_classified && mcip->mci_promisc_list != NULL) {
1088 1110                                  mac_address_t           *map;
1089 1111  
1090 1112                                  rw_enter(&mcip->mci_rw_lock, RW_READER);
1091 1113                                  map = mcip->mci_unicast;
1092 1114                                  if (bcmp(dstaddr, map->ma_addr,
1093 1115                                      map->ma_len) == 0)
1094 1116                                          type = UNDEF;
1095 1117                                  rw_exit(&mcip->mci_rw_lock);
1096 1118                          } else if (is_unicast) {
1097 1119                                  type = UNDEF;
1098 1120                          }
1099 1121                  }
1100 1122  
1101 1123                  /*
1102 1124                   * This needs to become a contract with the driver for
1103 1125                   * the fast path.
1104 1126                   */
1105 1127  
1106 1128                  ipha = (ipha_t *)(mp->b_rptr + hdrsize);
1107 1129                  if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
1108 1130                          type = OTH;
1109 1131                          fanout_oth1++;
1110 1132                  }
1111 1133  
1112 1134                  if (type != OTH) {
1113 1135                          uint16_t        frag_offset_flags;
1114 1136  
1115 1137                          switch (ipha->ipha_protocol) {
1116 1138                          case IPPROTO_TCP:
1117 1139                          case IPPROTO_UDP:
1118 1140                          case IPPROTO_SCTP:
1119 1141                          case IPPROTO_ESP:
1120 1142                                  ipha_len = IPH_HDR_LENGTH(ipha);
1121 1143                                  if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
1122 1144                                      mp->b_wptr) {
1123 1145                                          type = OTH;
1124 1146                                          break;
1125 1147                                  }
1126 1148                                  frag_offset_flags =
1127 1149                                      ntohs(ipha->ipha_fragment_offset_and_flags);
1128 1150                                  if ((frag_offset_flags &
1129 1151                                      (IPH_MF | IPH_OFFSET)) != 0) {
1130 1152                                          type = OTH;
1131 1153                                          fanout_oth3++;
1132 1154                                          break;
1133 1155                                  }
1134 1156                                  ports_offset = hdrsize + ipha_len;
1135 1157                                  break;
1136 1158                          default:
1137 1159                                  type = OTH;
1138 1160                                  fanout_oth4++;
1139 1161                                  break;
1140 1162                          }
1141 1163                  }
1142 1164  
1143 1165                  if (type == OTH) {
1144 1166                          if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1145 1167                              hdrsize, &type, &indx) == -1) {
1146 1168                                  mac_rx_drop_pkt(mac_srs, mp);
1147 1169                                  continue;
1148 1170                          }
1149 1171  
1150 1172                          FANOUT_ENQUEUE_MP(headmp[type][indx],
1151 1173                              tailmp[type][indx], cnt[type][indx], bw_ctl,
1152 1174                              sz[type][indx], sz1, mp);
1153 1175                          continue;
1154 1176                  }
1155 1177  
1156 1178                  ASSERT(type == UNDEF);
1157 1179  
1158 1180                  /*
1159 1181                   * XXX-Sunay: We should hold srs_lock since ring_count
1160 1182                   * below can change. But if we are always called from
1161 1183                   * mac_rx_srs_drain and SRS_PROC is set, then we can
1162 1184                   * enforce that ring_count can't be changed i.e.
1163 1185                   * to change fanout type or ring count, the calling
1164 1186                   * thread needs to be behind SRS_PROC.
1165 1187                   */
1166 1188                  switch (ipha->ipha_protocol) {
1167 1189                  case IPPROTO_TCP:
1168 1190                          /*
1169 1191                           * Note that for ESP, we fanout on SPI and it is at the
1170 1192                           * same offset as the 2x16-bit ports. So it is clumped
1171 1193                           * along with TCP, UDP and SCTP.
1172 1194                           */
1173 1195                          hash = HASH_ADDR(ipha->ipha_src,
1174 1196                              *(uint32_t *)(mp->b_rptr + ports_offset));
1175 1197                          indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
1176 1198                          type = V4_TCP;
1177 1199                          mp->b_rptr += hdrsize;
1178 1200                          break;
1179 1201                  case IPPROTO_UDP:
1180 1202                  case IPPROTO_SCTP:
1181 1203                  case IPPROTO_ESP:
1182 1204                          if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
1183 1205                                  hash = HASH_ADDR(ipha->ipha_src,
1184 1206                                      *(uint32_t *)(mp->b_rptr + ports_offset));
1185 1207                                  indx = COMPUTE_INDEX(hash,
1186 1208                                      mac_srs->srs_udp_ring_count);
1187 1209                          } else {
1188 1210                                  indx = mac_srs->srs_ind %
1189 1211                                      mac_srs->srs_udp_ring_count;
1190 1212                                  mac_srs->srs_ind++;
1191 1213                          }
1192 1214                          type = V4_UDP;
1193 1215                          mp->b_rptr += hdrsize;
1194 1216                          break;
1195 1217                  default:
1196 1218                          indx = 0;
1197 1219                          type = OTH;
1198 1220                  }
1199 1221  
1200 1222                  FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
1201 1223                      cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
1202 1224          }
1203 1225  
1204 1226          for (type = V4_TCP; type < UNDEF; type++) {
1205 1227                  int     i;
1206 1228  
1207 1229                  for (i = 0; i < fanout_cnt; i++) {
1208 1230                          if (headmp[type][i] != NULL) {
1209 1231                                  mac_soft_ring_t *softring;
1210 1232  
1211 1233                                  ASSERT(tailmp[type][i]->b_next == NULL);
1212 1234                                  switch (type) {
1213 1235                                  case V4_TCP:
1214 1236                                          softring =
1215 1237                                              mac_srs->srs_tcp_soft_rings[i];
1216 1238                                          break;
1217 1239                                  case V4_UDP:
1218 1240                                          softring =
1219 1241                                              mac_srs->srs_udp_soft_rings[i];
1220 1242                                          break;
1221 1243                                  case OTH:
1222 1244                                          softring =
1223 1245                                              mac_srs->srs_oth_soft_rings[i];
1224 1246                                          break;
1225 1247                                  }
1226 1248                                  mac_rx_soft_ring_process(mcip,
1227 1249                                      softring, headmp[type][i], tailmp[type][i],
1228 1250                                      cnt[type][i], sz[type][i]);
1229 1251                          }
1230 1252                  }
1231 1253          }
1232 1254  }
1233 1255  
1234 1256  #define SRS_BYTES_TO_PICKUP     150000  /* default poll budget, in bytes */
1235 1257  ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;      /* used w/o B/W ctl */
1236 1258  
1237 1259  /*
1238 1260   * mac_rx_srs_poll_ring
1239 1261   *
1240 1262   * The SRS poll thread uses this routine to poll the underlying hardware
1241 1263   * Rx ring to get a chain of packets. It can inline process that chain
1242 1264   * if mac_latency_optimize is set (default) or signal the SRS worker thread
1243 1265   * to do the remaining processing.
1244 1266   *
1245 1267   * Since packets come in the system via interrupt or poll path, we also
1246 1268   * update the stats and deal with promiscuous clients here.
            *
            * The code below tests the SRS_LATENCY_OPT bit in srs_state to decide
            * between inline processing and waking the worker; presumably that bit
            * mirrors mac_latency_optimize (confirm against where it is set).
            *
            * Thread lifecycle: the thread waits on srs_cv under srs_lock and polls
            * each time it is woken. When SRS_PAUSE is seen it sets
            * SRS_POLL_THR_QUIESCED, signals srs_async and waits for either
            * SRS_POLL_THR_RESTART (clear the flags and go back to the main loop) or
            * SRS_CONDEMNED (set SRS_POLL_THR_EXITED and call thread_exit()).
1247 1269   */
1248 1270  void
1249 1271  mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
1250 1272  {
1251 1273          kmutex_t                *lock = &mac_srs->srs_lock;
1252 1274          kcondvar_t              *async = &mac_srs->srs_cv;
1253 1275          mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
1254 1276          mblk_t                  *head, *tail, *mp;
1255 1277          callb_cpr_t             cprinfo;
1256 1278          ssize_t                 bytes_to_pickup;
1257 1279          size_t                  sz;
1258 1280          int                     count;
1259 1281          mac_client_impl_t       *smcip;
1260 1282  
1261 1283          CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
1262 1284          mutex_enter(lock);
1263 1285  
1264 1286  start:
1265 1287          for (;;) {
1266 1288                  if (mac_srs->srs_state & SRS_PAUSE)
1267 1289                          goto done;
1268 1290  
1269 1291                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
1270 1292                  cv_wait(async, lock);
1271 1293                  CALLB_CPR_SAFE_END(&cprinfo, lock);
1272 1294  
1273 1295                  if (mac_srs->srs_state & SRS_PAUSE)
1274 1296                          goto done;
1275 1297  
1276 1298  check_again:
1277 1299                  if (mac_srs->srs_type & SRST_BW_CONTROL) {
1278 1300                          /*
1279 1301                           * We pick as many bytes as we are allowed to queue.
1280 1302                           * It's possible that we will exceed the total
1281 1303                           * packets queued in case this SRS is part of the
1282 1304                           * Rx ring group since > 1 poll thread can be pulling
1283 1305                           * up to the max allowed packets at the same time
1284 1306                           * but that should be OK.
1285 1307                           */
1286 1308                          mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1287 1309                          bytes_to_pickup =
1288 1310                              mac_srs->srs_bw->mac_bw_drop_threshold -
1289 1311                              mac_srs->srs_bw->mac_bw_sz;
1290 1312                          /*
1291 1313                           * We shouldn't have been signalled if we
1292 1314                           * have 0 or less bytes to pick but since
1293 1315                           * some of the bytes accounting is driver
1294 1316                           * dependent, we do the safety check.
1295 1317                           */
1296 1318                          if (bytes_to_pickup < 0)
1297 1319                                  bytes_to_pickup = 0;
1298 1320                          mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1299 1321                  } else {
1300 1322                          /*
1301 1323                           * TODO: Need to change the polling API
1302 1324                           * to add a packet count and a flag which
1303 1325                           * tells the driver whether we want packets
1304 1326                           * based on a count, or bytes, or all the
1305 1327                           * packets queued in the driver/HW. This
1306 1328                           * way, we never have to check the limits
1307 1329                           * on poll path. We truly let only as many
1308 1330                           * packets enter the system as we are willing
1309 1331                           * to process or queue.
1310 1332                           *
1311 1333                           * Something along the lines of
1312 1334                           * pkts_to_pickup = mac_soft_ring_max_q_cnt -
1313 1335                           *      mac_srs->srs_poll_pkt_cnt
1314 1336                           */
1315 1337  
1316 1338                          /*
1317 1339                           * Since we are not doing B/W control, pick
1318 1340                           * as many packets as allowed.
1319 1341                           */
1320 1342                          bytes_to_pickup = max_bytes_to_pickup;
1321 1343                  }
1322 1344  
1323 1345                  /* Poll the underlying hardware with srs_lock dropped */
1324 1346                  mutex_exit(lock);
1325 1347                  head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
1326 1348                  mutex_enter(lock);
1327 1349  
1328 1350                  ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
1329 1351                      SRS_POLL_THR_OWNER);
1330 1352  
                           /*
                            * Walk the polled chain to find its tail and to total up
                            * the packet count and the msgdsize() of every packet.
                            */
1331 1353                  mp = tail = head;
1332 1354                  count = 0;
1333 1355                  sz = 0;
1334 1356                  while (mp != NULL) {
1335 1357                          tail = mp;
1336 1358                          sz += msgdsize(mp);
1337 1359                          mp = mp->b_next;
1338 1360                          count++;
1339 1361                  }
1340 1362  
1341 1363                  if (head != NULL) {
1342 1364                          tail->b_next = NULL;
1343 1365                          smcip = mac_srs->srs_mcip;
1344 1366  
1345 1367                          SRS_RX_STAT_UPDATE(mac_srs, pollbytes, sz);
1346 1368                          SRS_RX_STAT_UPDATE(mac_srs, pollcnt, count);
1347 1369  
1348 1370                          /*
1349 1371                           * If there are any promiscuous mode callbacks
1350 1372                           * defined for this MAC client, pass them a copy
1351 1373                           * if appropriate and also update the counters.
                                    * srs_lock is dropped around the dispatch call.
1352 1374                           */
1353 1375                          if (smcip != NULL) {
1354 1376                                  if (smcip->mci_mip->mi_promisc_list != NULL) {
1355 1377                                          mutex_exit(lock);
1356 1378                                          mac_promisc_dispatch(smcip->mci_mip,
1357 1379                                              head, NULL);
1358 1380                                          mutex_enter(lock);
1359 1381                                  }
1360 1382                          }
1361 1383                          if (mac_srs->srs_type & SRST_BW_CONTROL) {
1362 1384                                  mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1363 1385                                  mac_srs->srs_bw->mac_bw_polled += sz;
1364 1386                                  mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1365 1387                          }
1366 1388                          MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
1367 1389                              count, sz);
                                   /* Bucket the chain length for the Rx statistics */
1368 1390                          if (count <= 10)
1369 1391                                  srs_rx->sr_stat.mrs_chaincntundr10++;
1370 1392                          else if (count > 10 && count <= 50)
1371 1393                                  srs_rx->sr_stat.mrs_chaincnt10to50++;
1372 1394                          else
1373 1395                                  srs_rx->sr_stat.mrs_chaincntover50++;
1374 1396                  }
1375 1397  
1376 1398                  /*
1377 1399                   * We are guaranteed that SRS_PROC will be set if we
1378 1400                   * are here. Also, poll thread gets to run only if
1379 1401                   * the drain was being done by a worker thread although
1380 1402                   * it's possible that worker thread is still running
1381 1403                   * and poll thread was sent down to keep the pipeline
1382 1404                   * going instead of doing a complete drain and then
1383 1405                   * trying to poll the NIC.
1384 1406                   *
1385 1407                   * So we need to check SRS_WORKER flag to make sure
1386 1408                   * that the worker thread is not processing the queue
1387 1409                   * in parallel to us. The flags and conditions are
1388 1410                   * protected by the srs_lock to prevent any race. We
1389 1411                   * ensure that we don't drop the srs_lock from now
1390 1412                   * till the end and similarly we don't drop the srs_lock
1391 1413                   * in mac_rx_srs_drain() till similar condition checks
1392 1414                   * are complete. The mac_rx_srs_drain() needs to ensure
1393 1415                   * that SRS_WORKER flag remains set as long as it's
1394 1416                   * processing the queue.
1395 1417                   */
1396 1418                  if (!(mac_srs->srs_state & SRS_WORKER) &&
1397 1419                      (mac_srs->srs_first != NULL)) {
1398 1420                          /*
1399 1421                           * We have packets to process and worker thread
1400 1422                           * is not running. Check to see if poll thread is
1401 1423                           * allowed to process.
1402 1424                           */
1403 1425                          if (mac_srs->srs_state & SRS_LATENCY_OPT) {
1404 1426                                  mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
1405 1427                                  if (!(mac_srs->srs_state & SRS_PAUSE) &&
1406 1428                                      srs_rx->sr_poll_pkt_cnt <=
1407 1429                                      srs_rx->sr_lowat) {
1408 1430                                          srs_rx->sr_poll_again++;
1409 1431                                          goto check_again;
1410 1432                                  }
1411 1433                                  /*
1412 1434                                   * We are already above low water mark
1413 1435                                   * so stay in the polling mode but no
1414 1436                                   * need to poll. Once we dip below
1415 1437                                   * the polling threshold, the processing
1416 1438                                   * thread (soft ring) will signal us
1417 1439                                   * to poll again (MAC_UPDATE_SRS_COUNT)
1418 1440                                   */
1419 1441                                  srs_rx->sr_poll_drain_no_poll++;
1420 1442                                  mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1421 1443                                  /*
1422 1444                                   * In B/W control case, it's possible
1423 1445                                   * that the backlog built up due to
1424 1446                                   * B/W limit being reached and packets
1425 1447                                   * are queued only in SRS. In this case,
1426 1448                                   * we should schedule worker thread
1427 1449                                   * since no one else will wake us up.
1428 1450                                   */
1429 1451                                  if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
1430 1452                                      (mac_srs->srs_tid == NULL)) {
1431 1453                                          mac_srs->srs_tid =
1432 1454                                              timeout(mac_srs_fire, mac_srs, 1);
1433 1455                                          srs_rx->sr_poll_worker_wakeup++;
1434 1456                                  }
1435 1457                          } else {
1436 1458                                  /*
1437 1459                                   * Wakeup the worker thread for more processing.
1438 1460                                   * We optimize for throughput in this case.
1439 1461                                   */
1440 1462                                  mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1441 1463                                  MAC_SRS_WORKER_WAKEUP(mac_srs);
1442 1464                                  srs_rx->sr_poll_sig_worker++;
1443 1465                          }
1444 1466                  } else if ((mac_srs->srs_first == NULL) &&
1445 1467                      !(mac_srs->srs_state & SRS_WORKER)) {
1446 1468                          /*
1447 1469                           * There is nothing queued in SRS and
1448 1470                           * no worker thread running. Plus we
1449 1471                           * didn't get anything from the H/W
1450 1472                           * as well (head == NULL);
1451 1473                           */
1452 1474                          ASSERT(head == NULL);
1453 1475                          mac_srs->srs_state &=
1454 1476                              ~(SRS_PROC|SRS_GET_PKTS);
1455 1477  
1456 1478                          /*
1457 1479                           * If we have packets in the soft rings, don't allow
1458 1480                           * more packets to come into this SRS by keeping the
1459 1481                           * interrupts off but not polling the H/W. The
1460 1482                           * poll thread will get signaled as soon as
1461 1483                           * srs_poll_pkt_cnt dips below poll threshold.
1462 1484                           */
1463 1485                          if (srs_rx->sr_poll_pkt_cnt == 0) {
1464 1486                                  srs_rx->sr_poll_intr_enable++;
1465 1487                                  MAC_SRS_POLLING_OFF(mac_srs);
1466 1488                          } else {
1467 1489                                  /*
1468 1490                                   * We know nothing is queued in SRS
1469 1491                                   * since we are here after checking
1470 1492                                   * srs_first is NULL. The backlog
1471 1493                                   * is entirely due to packets queued
1472 1494                                   * in Soft ring which will wake us up
1473 1495                                   * and get the interface out of polling
1474 1496                                   * mode once the backlog dips below
1475 1497                                   * sr_poll_thres.
1476 1498                                   */
1477 1499                                  srs_rx->sr_poll_no_poll++;
1478 1500                          }
1479 1501                  } else {
1480 1502                          /*
1481 1503                           * Worker thread is already running.
1482 1504                           * Nothing much to do. If the polling
1483 1505                           * was enabled, worker thread will deal
1484 1506                           * with that.
1485 1507                           */
1486 1508                          mac_srs->srs_state &= ~SRS_GET_PKTS;
1487 1509                          srs_rx->sr_poll_goto_sleep++;
1488 1510                  }
1489 1511          }
1490 1512  done:
1491 1513          mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
1492 1514          cv_signal(&mac_srs->srs_async);
1493 1515          /*
1494 1516           * If this is a temporary quiesce then wait for the restart signal
1495 1517           * from the srs worker. Then clear the flags and signal the srs worker
1496 1518           * to ensure a positive handshake and go back to start.
1497 1519           */
1498 1520          while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
1499 1521                  cv_wait(async, lock);
1500 1522          if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
1501 1523                  ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
1502 1524                  mac_srs->srs_state &=
1503 1525                      ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
1504 1526                  cv_signal(&mac_srs->srs_async);
1505 1527                  goto start;
1506 1528          } else {
                           /* SRS_CONDEMNED: announce the exit and end this thread */
1507 1529                  mac_srs->srs_state |= SRS_POLL_THR_EXITED;
1508 1530                  cv_signal(&mac_srs->srs_async);
1509 1531                  CALLB_CPR_EXIT(&cprinfo);
1510 1532                  thread_exit();
1511 1533          }
1512 1534  }
1513 1535  
1514 1536  /*
1515 1537   * mac_srs_pick_chain
1516 1538   *
1517 1539   * In the bandwidth control case, checks how many queued packets can be
1518 1540   * processed and returns them in a sub-chain.
            *
            * The caller must hold srs_lock (asserted below). If mac_bw_limit is 0,
            * or the whole SRS backlog (srs_size) fits on top of mac_bw_used within
            * mac_bw_limit, the entire queue is detached and returned. Otherwise
            * packets are taken from the front of the queue until the next one would
            * push the running total past mac_bw_limit, at which point
            * SRS_BW_ENFORCED is set in mac_bw_state and picking stops.
            *
            * On return *chain_tail, *chain_sz and *chain_cnt describe the returned
            * chain (its last packet, its accumulated size and its packet count).
            * The return value is the head of the chain, or NULL if no packet was
            * picked.
1519 1541   */
1520 1542  static mblk_t *
1521 1543  mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
1522 1544      size_t *chain_sz, int *chain_cnt)
1523 1545  {
1524 1546          mblk_t                  *head = NULL;
1525 1547          mblk_t                  *tail = NULL;
1526 1548          size_t                  sz;
1527 1549          size_t                  tsz = 0;
1528 1550          int                     cnt = 0;
1529 1551          mblk_t                  *mp;
1530 1552  
1531 1553          ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1532 1554          mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
                   /* Fast path: no limit configured, or the whole backlog fits */
1533 1555          if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
1534 1556              mac_srs->srs_bw->mac_bw_limit) ||
1535 1557              (mac_srs->srs_bw->mac_bw_limit == 0)) {
1536 1558                  mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1537 1559                  head = mac_srs->srs_first;
1538 1560                  mac_srs->srs_first = NULL;
1539 1561                  *chain_tail = mac_srs->srs_last;
1540 1562                  mac_srs->srs_last = NULL;
1541 1563                  *chain_sz = mac_srs->srs_size;
1542 1564                  *chain_cnt = mac_srs->srs_count;
1543 1565                  mac_srs->srs_count = 0;
1544 1566                  mac_srs->srs_size = 0;
1545 1567                  return (head);
1546 1568          }
1547 1569  
1548 1570          /*
1549 1571           * Can't clear the entire backlog.
1550 1572           * Need to find how many packets to pick.
1551 1573           */
1552 1574          ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
1553 1575          while ((mp = mac_srs->srs_first) != NULL) {
1554 1576                  sz = msgdsize(mp);
                           /*
                            * Stop as soon as this packet would take us over the limit
                            * and record that the limit is now being enforced.
                            */
1555 1577                  if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
1556 1578                      mac_srs->srs_bw->mac_bw_limit) {
1557 1579                          if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
1558 1580                                  mac_srs->srs_bw->mac_bw_state |=
1559 1581                                      SRS_BW_ENFORCED;
1560 1582                          break;
1561 1583                  }
1562 1584  
1563 1585                  /*
1564 1586                   * The _size & cnt are decremented from the softrings
1565 1587                   * when they send up the packet for polling to work
1566 1588                   * properly.
1567 1589                   */
1568 1590                  tsz += sz;
1569 1591                  cnt++;
1570 1592                  mac_srs->srs_count--;
1571 1593                  mac_srs->srs_size -= sz;
                           /* Append mp to the picked chain and advance the SRS queue */
1572 1594                  if (tail != NULL)
1573 1595                          tail->b_next = mp;
1574 1596                  else
1575 1597                          head = mp;
1576 1598                  tail = mp;
1577 1599                  mac_srs->srs_first = mac_srs->srs_first->b_next;
1578 1600          }
1579 1601          mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
                   /* The SRS queue was fully consumed, so it has no last packet */
1580 1602          if (mac_srs->srs_first == NULL)
1581 1603                  mac_srs->srs_last = NULL;
1582 1604  
                   /* Cut the picked chain loose from what remains queued */
1583 1605          if (tail != NULL)
1584 1606                  tail->b_next = NULL;
1585 1607          *chain_tail = tail;
1586 1608          *chain_cnt = cnt;
1587 1609          *chain_sz = tsz;
1588 1610  
1589 1611          return (head);
1590 1612  }
1591 1613  
1592 1614  /*
1593 1615   * mac_rx_srs_drain
1594 1616   *
1595 1617   * The SRS drain routine. Gets to run to clear the queue. Any thread
1596 1618   * (worker, interrupt, poll) can call this based on processing model.
1597 1619   * The first thing we do is disable interrupts if possible and then
1598 1620   * drain the queue. We also try to poll the underlying hardware if
1599 1621   * there is a dedicated hardware Rx ring assigned to this SRS.
1600 1622   *
1601 1623   * There is an equivalent drain routine in bandwidth control mode
1602 1624   * mac_rx_srs_drain_bw. There is some code duplication between the two
1603 1625   * routines but they are highly performance sensitive and are easier
1604 1626   * to read/debug if they stay separate. Any code changes here might
1605 1627   * also apply to mac_rx_srs_drain_bw as well.
1606 1628   */
1607 1629  void
1608 1630  mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1609 1631  {
1610 1632          mblk_t                  *head;
1611 1633          mblk_t                  *tail;
1612 1634          timeout_id_t            tid;
1613 1635          int                     cnt = 0;
1614 1636          mac_client_impl_t       *mcip = mac_srs->srs_mcip;
1615 1637          mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
1616 1638  
1617 1639          ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1618 1640          ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));
1619 1641  
1620 1642          /* If we are blanked i.e. can't do upcalls, then we are done */
1621 1643          if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1622 1644                  ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1623 1645                      (mac_srs->srs_state & SRS_PAUSE));
1624 1646                  goto out;
1625 1647          }
1626 1648  
1627 1649          if (mac_srs->srs_first == NULL)
1628 1650                  goto out;
1629 1651  
1630 1652          if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
1631 1653              (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
1632 1654                  /*
1633 1655                   * In the normal case, the SRS worker thread does no
1634 1656                   * work and we wait for a backlog to build up before
1635 1657                   * we switch into polling mode. In case we are
1636 1658                   * optimizing for throughput, we use the worker thread
1637 1659                   * as well. The goal is to let worker thread process
1638 1660                   * the queue and poll thread to feed packets into
1639 1661                   * the queue. As such, we should signal the poll
1640 1662                   * thread to try and get more packets.
1641 1663                   *
1642 1664                   * We could have pulled this check in the POLL_RING
1643 1665                   * macro itself but keeping it explicit here makes
1644 1666                   * the architecture more human understandable.
1645 1667                   */
1646 1668                  MAC_SRS_POLL_RING(mac_srs);
1647 1669          }
1648 1670  
1649 1671  again:
1650 1672          head = mac_srs->srs_first;
1651 1673          mac_srs->srs_first = NULL;
1652 1674          tail = mac_srs->srs_last;
1653 1675          mac_srs->srs_last = NULL;
1654 1676          cnt = mac_srs->srs_count;
1655 1677          mac_srs->srs_count = 0;
1656 1678  
1657 1679          ASSERT(head != NULL);
1658 1680          ASSERT(tail != NULL);
1659 1681  
1660 1682          if ((tid = mac_srs->srs_tid) != 0)
1661 1683                  mac_srs->srs_tid = 0;
1662 1684  
1663 1685          mac_srs->srs_state |= (SRS_PROC|proc_type);
1664 1686  
1665 1687  
1666 1688          /*
1667 1689           * mcip is NULL for broadcast and multicast flows. The promisc
1668 1690           * callbacks for broadcast and multicast packets are delivered from
1669 1691           * mac_rx() and we don't need to worry about that case in this path
1670 1692           */
1671 1693          if (mcip != NULL) {
1672 1694                  if (mcip->mci_promisc_list != NULL) {
1673 1695                          mutex_exit(&mac_srs->srs_lock);
1674 1696                          mac_promisc_client_dispatch(mcip, head);
1675 1697                          mutex_enter(&mac_srs->srs_lock);
1676 1698                  }
1677 1699                  if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
1678 1700                          mutex_exit(&mac_srs->srs_lock);
1679 1701                          mac_protect_intercept_dhcp(mcip, head);
1680 1702                          mutex_enter(&mac_srs->srs_lock);
1681 1703                  }
1682 1704          }
1683 1705  
1684 1706          /*
1685 1707           * Check if SRS itself is doing the processing
1686 1708           * This direct path does not apply when subflows are present. In this
1687 1709           * case, packets need to be dispatched to a soft ring according to the
1688 1710           * flow's bandwidth and other resources contraints.
1689 1711           */
1690 1712          if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1691 1713                  mac_direct_rx_t         proc;
1692 1714                  void                    *arg1;
1693 1715                  mac_resource_handle_t   arg2;
1694 1716  
1695 1717                  /*
1696 1718                   * This is the case when a Rx is directly
1697 1719                   * assigned and we have a fully classified
1698 1720                   * protocol chain. We can deal with it in
1699 1721                   * one shot.
1700 1722                   */
1701 1723                  proc = srs_rx->sr_func;
1702 1724                  arg1 = srs_rx->sr_arg1;
1703 1725                  arg2 = srs_rx->sr_arg2;
1704 1726  
1705 1727                  mac_srs->srs_state |= SRS_CLIENT_PROC;
1706 1728                  mutex_exit(&mac_srs->srs_lock);
1707 1729                  if (tid != 0) {
1708 1730                          (void) untimeout(tid);
1709 1731                          tid = 0;
1710 1732                  }
1711 1733  
1712 1734                  proc(arg1, arg2, head, NULL);
1713 1735                  /*
1714 1736                   * Decrement the size and count here itelf
1715 1737                   * since the packet has been processed.
1716 1738                   */
1717 1739                  mutex_enter(&mac_srs->srs_lock);
1718 1740                  MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
1719 1741                  if (mac_srs->srs_state & SRS_CLIENT_WAIT)
1720 1742                          cv_signal(&mac_srs->srs_client_cv);
1721 1743                  mac_srs->srs_state &= ~SRS_CLIENT_PROC;
1722 1744          } else {
1723 1745                  /* Some kind of softrings based fanout is required */
1724 1746                  mutex_exit(&mac_srs->srs_lock);
1725 1747                  if (tid != 0) {
1726 1748                          (void) untimeout(tid);
1727 1749                          tid = 0;
1728 1750                  }
1729 1751  
1730 1752                  /*
1731 1753                   * Since the fanout routines can deal with chains,
1732 1754                   * shoot the entire chain up.
1733 1755                   */
1734 1756                  if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
1735 1757                          mac_rx_srs_fanout(mac_srs, head);
1736 1758                  else
1737 1759                          mac_rx_srs_proto_fanout(mac_srs, head);
1738 1760                  mutex_enter(&mac_srs->srs_lock);
1739 1761          }
1740 1762  
1741 1763          if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
1742 1764              (mac_srs->srs_first != NULL)) {
1743 1765                  /*
1744 1766                   * More packets arrived while we were clearing the
1745 1767                   * SRS. This can be possible because of one of
1746 1768                   * three conditions below:
1747 1769                   * 1) The driver is using multiple worker threads
1748 1770                   *    to send the packets to us.
1749 1771                   * 2) The driver has a race in switching
1750 1772                   *    between interrupt and polling mode or
1751 1773                   * 3) Packets are arriving in this SRS via the
1752 1774                   *    S/W classification as well.
1753 1775                   *
1754 1776                   * We should switch to polling mode and see if we
1755 1777                   * need to send the poll thread down. Also, signal
1756 1778                   * the worker thread to process whats just arrived.
1757 1779                   */
1758 1780                  MAC_SRS_POLLING_ON(mac_srs);
1759 1781                  if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
1760 1782                          srs_rx->sr_drain_poll_sig++;
1761 1783                          MAC_SRS_POLL_RING(mac_srs);
1762 1784                  }
1763 1785  
1764 1786                  /*
1765 1787                   * If we didn't signal the poll thread, we need
1766 1788                   * to deal with the pending packets ourselves.
1767 1789                   */
1768 1790                  if (proc_type == SRS_WORKER) {
1769 1791                          srs_rx->sr_drain_again++;
1770 1792                          goto again;
1771 1793                  } else {
1772 1794                          srs_rx->sr_drain_worker_sig++;
1773 1795                          cv_signal(&mac_srs->srs_async);
1774 1796                  }
1775 1797          }
1776 1798  
1777 1799  out:
1778 1800          if (mac_srs->srs_state & SRS_GET_PKTS) {
1779 1801                  /*
1780 1802                   * Poll thread is already running. Leave the
1781 1803                   * SRS_RPOC set and hand over the control to
1782 1804                   * poll thread.
1783 1805                   */
1784 1806                  mac_srs->srs_state &= ~proc_type;
1785 1807                  srs_rx->sr_drain_poll_running++;
1786 1808                  return;
1787 1809          }
1788 1810  
1789 1811          /*
1790 1812           * Even if there are no packets queued in SRS, we
1791 1813           * need to make sure that the shared counter is
1792 1814           * clear and any associated softrings have cleared
1793 1815           * all the backlog. Otherwise, leave the interface
1794 1816           * in polling mode and the poll thread will get
1795 1817           * signalled once the count goes down to zero.
1796 1818           *
1797 1819           * If someone is already draining the queue (SRS_PROC is
1798 1820           * set) when the srs_poll_pkt_cnt goes down to zero,
1799 1821           * then it means that drain is already running and we
1800 1822           * will turn off polling at that time if there is
1801 1823           * no backlog.
1802 1824           *
1803 1825           * As long as there are packets queued either
1804 1826           * in soft ring set or its soft rings, we will leave
1805 1827           * the interface in polling mode (even if the drain
1806 1828           * was done being the interrupt thread). We signal
1807 1829           * the poll thread as well if we have dipped below
1808 1830           * low water mark.
1809 1831           *
1810 1832           * NOTE: We can't use the MAC_SRS_POLLING_ON macro
1811 1833           * since that turn polling on only for worker thread.
1812 1834           * Its not worth turning polling on for interrupt
1813 1835           * thread (since NIC will not issue another interrupt)
1814 1836           * unless a backlog builds up.
1815 1837           */
1816 1838          if ((srs_rx->sr_poll_pkt_cnt > 0) &&
1817 1839              (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
1818 1840                  mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1819 1841                  srs_rx->sr_drain_keep_polling++;
1820 1842                  MAC_SRS_POLLING_ON(mac_srs);
1821 1843                  if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
1822 1844                          MAC_SRS_POLL_RING(mac_srs);
1823 1845                  return;
1824 1846          }
1825 1847  
1826 1848          /* Nothing else to do. Get out of poll mode */
1827 1849          MAC_SRS_POLLING_OFF(mac_srs);
1828 1850          mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1829 1851          srs_rx->sr_drain_finish_intr++;
1830 1852  }
1831 1853  
1832 1854  /*
1833 1855   * mac_rx_srs_drain_bw
1834 1856   *
1835 1857   * The SRS BW drain routine. Gets to run to clear the queue. Any thread
1836 1858   * (worker, interrupt, poll) can call this based on processing model.
1837 1859   * The first thing we do is disable interrupts if possible and then
1838 1860   * drain the queue. We also try to poll the underlying hardware if
1839 1861   * there is a dedicated hardware Rx ring assigned to this SRS.
1840 1862   *
1841 1863   * There is an equivalent drain routine in non bandwidth control mode
1842 1864   * mac_rx_srs_drain. There is some code duplication between the two
1843 1865   * routines but they are highly performance sensitive and are easier
1844 1866   * to read/debug if they stay separate. Any code changes here might
1845 1867   * also apply to mac_rx_srs_drain as well.
1846 1868   */
1847 1869  void
1848 1870  mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1849 1871  {
1850 1872          mblk_t                  *head;
1851 1873          mblk_t                  *tail;
1852 1874          timeout_id_t            tid;
1853 1875          size_t                  sz = 0;
1854 1876          int                     cnt = 0;
1855 1877          mac_client_impl_t       *mcip = mac_srs->srs_mcip;
1856 1878          mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
1857 1879          clock_t                 now;
1858 1880  
1859 1881          ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1860 1882          ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
1861 1883  again:
1862 1884          /* Check if we are doing B/W control */
1863 1885          mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1864 1886          now = ddi_get_lbolt();
1865 1887          if (mac_srs->srs_bw->mac_bw_curr_time != now) {
1866 1888                  mac_srs->srs_bw->mac_bw_curr_time = now;
1867 1889                  mac_srs->srs_bw->mac_bw_used = 0;
1868 1890                  if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
1869 1891                          mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
1870 1892          } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
1871 1893                  mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1872 1894                  goto done;
1873 1895          } else if (mac_srs->srs_bw->mac_bw_used >
1874 1896              mac_srs->srs_bw->mac_bw_limit) {
1875 1897                  mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
1876 1898                  mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1877 1899                  goto done;
1878 1900          }
1879 1901          mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1880 1902  
1881 1903          /* If we are blanked i.e. can't do upcalls, then we are done */
1882 1904          if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1883 1905                  ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1884 1906                      (mac_srs->srs_state & SRS_PAUSE));
1885 1907                  goto done;
1886 1908          }
1887 1909  
1888 1910          sz = 0;
1889 1911          cnt = 0;
1890 1912          if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
1891 1913                  /*
1892 1914                   * We couldn't pick up a single packet.
1893 1915                   */
1894 1916                  mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1895 1917                  if ((mac_srs->srs_bw->mac_bw_used == 0) &&
1896 1918                      (mac_srs->srs_size != 0) &&
1897 1919                      !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
1898 1920                          /*
1899 1921                           * Seems like configured B/W doesn't
1900 1922                           * even allow processing of 1 packet
1901 1923                           * per tick.
1902 1924                           *
1903 1925                           * XXX: double the limit and drop threshold,
1904 1926                           * aiming for at least 1 packet per tick.
1905 1927                           */
1906 1928                          mac_srs->srs_bw->mac_bw_limit +=
1907 1929                              mac_srs->srs_bw->mac_bw_limit;
1908 1930                          mac_srs->srs_bw->mac_bw_drop_threshold +=
1909 1931                              mac_srs->srs_bw->mac_bw_drop_threshold;
1910 1932                          cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
1911 1933                              "raised B/W limit to %d since not even a "
1912 1934                              "single packet can be processed per "
1913 1935                              "tick %d\n", (void *)mac_srs,
1914 1936                              (int)mac_srs->srs_bw->mac_bw_limit,
1915 1937                              (int)msgdsize(mac_srs->srs_first));
1916 1938                  }
1917 1939                  mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1918 1940                  goto done;
1919 1941          }
1920 1942  
1921 1943          ASSERT(head != NULL);
1922 1944          ASSERT(tail != NULL);
1923 1945  
1924 1946          /* zero bandwidth: drop all and return to interrupt mode */
1925 1947          mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1926 1948          if (mac_srs->srs_bw->mac_bw_limit == 0) {
1927 1949                  srs_rx->sr_stat.mrs_sdrops += cnt;
1928 1950                  ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
1929 1951                  mac_srs->srs_bw->mac_bw_sz -= sz;
1930 1952                  mac_srs->srs_bw->mac_bw_drop_bytes += sz;
1931 1953                  mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1932 1954                  mac_pkt_drop(NULL, NULL, head, B_FALSE);
1933 1955                  goto leave_poll;
1934 1956          } else {
1935 1957                  mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1936 1958          }
1937 1959  
1938 1960          if ((tid = mac_srs->srs_tid) != 0)
1939 1961                  mac_srs->srs_tid = 0;
1940 1962  
1941 1963          mac_srs->srs_state |= (SRS_PROC|proc_type);
1942 1964          MAC_SRS_WORKER_POLLING_ON(mac_srs);
1943 1965  
1944 1966          /*
1945 1967           * mcip is NULL for broadcast and multicast flows. The promisc
1946 1968           * callbacks for broadcast and multicast packets are delivered from
1947 1969           * mac_rx() and we don't need to worry about that case in this path.
1948 1970           */
1949 1971          if (mcip != NULL) {
1950 1972                  if (mcip->mci_promisc_list != NULL) {
1951 1973                          mutex_exit(&mac_srs->srs_lock);
1952 1974                          mac_promisc_client_dispatch(mcip, head);
1953 1975                          mutex_enter(&mac_srs->srs_lock);
1954 1976                  }
1955 1977                  if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
1956 1978                          mutex_exit(&mac_srs->srs_lock);
1957 1979                          mac_protect_intercept_dhcp(mcip, head);
1958 1980                          mutex_enter(&mac_srs->srs_lock);
1959 1981                  }
1960 1982          }
1961 1983  
1962 1984          /*
1963 1985           * Check if SRS itself is doing the processing.
1964 1986           * This direct path does not apply when subflows are present. In this
1965 1987           * case, packets need to be dispatched to a soft ring according to the
1966 1988           * flow's bandwidth and other resource constraints.
1967 1989           */
1968 1990          if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1969 1991                  mac_direct_rx_t         proc;
1970 1992                  void                    *arg1;
1971 1993                  mac_resource_handle_t   arg2;
1972 1994  
1973 1995                  /*
1974 1996                   * This is the case when a Rx is directly
1975 1997                   * assigned and we have a fully classified
1976 1998                   * protocol chain. We can deal with it in
1977 1999                   * one shot.
1978 2000                   */
1979 2001                  proc = srs_rx->sr_func;
1980 2002                  arg1 = srs_rx->sr_arg1;
1981 2003                  arg2 = srs_rx->sr_arg2;
1982 2004  
1983 2005                  mac_srs->srs_state |= SRS_CLIENT_PROC;
1984 2006                  mutex_exit(&mac_srs->srs_lock);
1985 2007                  if (tid != 0) {
1986 2008                          (void) untimeout(tid);
1987 2009                          tid = 0;
1988 2010                  }
1989 2011  
1990 2012                  proc(arg1, arg2, head, NULL);
1991 2013                  /*
1992 2014                   * Decrement the size and count here itself
1993 2015                   * since the packet has been processed.
1994 2016                   */
1995 2017                  mutex_enter(&mac_srs->srs_lock);
1996 2018                  MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
1997 2019                  MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
1998 2020  
1999 2021                  if (mac_srs->srs_state & SRS_CLIENT_WAIT)
2000 2022                          cv_signal(&mac_srs->srs_client_cv);
2001 2023                  mac_srs->srs_state &= ~SRS_CLIENT_PROC;
2002 2024          } else {
2003 2025                  /* Some kind of softrings based fanout is required */
2004 2026                  mutex_exit(&mac_srs->srs_lock);
2005 2027                  if (tid != 0) {
2006 2028                          (void) untimeout(tid);
2007 2029                          tid = 0;
2008 2030                  }
2009 2031  
2010 2032                  /*
2011 2033                   * Since the fanout routines can deal with chains,
2012 2034                   * shoot the entire chain up.
2013 2035                   */
2014 2036                  if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
2015 2037                          mac_rx_srs_fanout(mac_srs, head);
2016 2038                  else
2017 2039                          mac_rx_srs_proto_fanout(mac_srs, head);
2018 2040                  mutex_enter(&mac_srs->srs_lock);
2019 2041          }
2020 2042  
2021 2043          /*
2022 2044           * Send the poll thread to pick up any packets arrived
2023 2045           * so far. This also serves as the last check in case
2024 2046           * nothing else is queued in the SRS. The poll thread
2025 2047           * is signalled only in the case the drain was done
2026 2048           * by the worker thread and SRS_WORKER is set. The
2027 2049           * worker thread can run in parallel as long as the
2028 2050           * SRS_WORKER flag is set. When we have nothing else to
2029 2051           * process, we can exit while leaving SRS_PROC set
2030 2052           * which gives the poll thread control to process and
2031 2053           * cleanup once it returns from the NIC.
2032 2054           *
2033 2055           * If we have nothing else to process, we need to
2034 2056           * ensure that we keep holding the srs_lock till
2035 2057           * all the checks below are done and control is
2036 2058           * handed to the poll thread if it was running.
2037 2059           */
2038 2060          mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2039 2061          if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2040 2062                  if (mac_srs->srs_first != NULL) {
2041 2063                          if (proc_type == SRS_WORKER) {
2042 2064                                  mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2043 2065                                  if (srs_rx->sr_poll_pkt_cnt <=
2044 2066                                      srs_rx->sr_lowat)
2045 2067                                          MAC_SRS_POLL_RING(mac_srs);
2046 2068                                  goto again;
2047 2069                          } else {
2048 2070                                  cv_signal(&mac_srs->srs_async);
2049 2071                          }
2050 2072                  }
2051 2073          }
2052 2074          mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2053 2075  
2054 2076  done:
2055 2077  
2056 2078          if (mac_srs->srs_state & SRS_GET_PKTS) {
2057 2079                  /*
2058 2080                   * Poll thread is already running. Leave the
2059 2081                   * SRS_PROC set and hand over the control to
2060 2082                   * poll thread.
2061 2083                   */
2062 2084                  mac_srs->srs_state &= ~proc_type;
2063 2085                  return;
2064 2086          }
2065 2087  
2066 2088          /*
2067 2089           * If we can't process packets because we have exceeded
2068 2090           * B/W limit for this tick, just set the timeout
2069 2091           * and leave.
2070 2092           *
2071 2093           * Even if there are no packets queued in SRS, we
2072 2094           * need to make sure that the shared counter is
2073 2095           * clear and any associated softrings have cleared
2074 2096           * all the backlog. Otherwise, leave the interface
2075 2097           * in polling mode and the poll thread will get
2076 2098           * signalled once the count goes down to zero.
2077 2099           *
2078 2100           * If someone is already draining the queue (SRS_PROC is
2079 2101           * set) when the srs_poll_pkt_cnt goes down to zero,
2080 2102           * then it means that drain is already running and we
2081 2103           * will turn off polling at that time if there is
2082 2104           * no backlog. As long as there are packets queued either
2083 2105           * in soft ring set or its soft rings, we will leave
2084 2106           * the interface in polling mode.
2085 2107           */
2086 2108          mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2087 2109          if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
2088 2110              ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
2089 2111              (srs_rx->sr_poll_pkt_cnt > 0))) {
2090 2112                  MAC_SRS_POLLING_ON(mac_srs);
2091 2113                  mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2092 2114                  if ((mac_srs->srs_first != NULL) &&
2093 2115                      (mac_srs->srs_tid == NULL))
2094 2116                          mac_srs->srs_tid = timeout(mac_srs_fire,
2095 2117                              mac_srs, 1);
2096 2118                  mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2097 2119                  return;
2098 2120          }
2099 2121          mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2100 2122  
2101 2123  leave_poll:
2102 2124  
2103 2125          /* Nothing else to do. Get out of poll mode */
2104 2126          MAC_SRS_POLLING_OFF(mac_srs);
2105 2127          mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2106 2128  }
2107 2129  
2108 2130  /*
2109 2131   * mac_srs_worker
2110 2132   *
2111 2133   * The SRS worker routine. Drains the queue when no one else is
2112 2134   * processing it.
2113 2135   */
2114 2136  void
2115 2137  mac_srs_worker(mac_soft_ring_set_t *mac_srs)
2116 2138  {
2117 2139          kmutex_t                *lock = &mac_srs->srs_lock;
2118 2140          kcondvar_t              *async = &mac_srs->srs_async;
2119 2141          callb_cpr_t             cprinfo;
2120 2142          boolean_t               bw_ctl_flag;
2121 2143  
2122 2144          CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
2123 2145          mutex_enter(lock);
2124 2146  
2125 2147  start:
2126 2148          for (;;) {
2127 2149                  bw_ctl_flag = B_FALSE;
2128 2150                  if (mac_srs->srs_type & SRST_BW_CONTROL) {
2129 2151                          MAC_SRS_BW_LOCK(mac_srs);
2130 2152                          MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2131 2153                          if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
2132 2154                                  bw_ctl_flag = B_TRUE;
2133 2155                          MAC_SRS_BW_UNLOCK(mac_srs);
2134 2156                  }
2135 2157                  /*
2136 2158                   * The SRS_BW_ENFORCED flag may change since we have dropped
2137 2159                   * the mac_bw_lock. However the drain function can handle both
2138 2160                   * a drainable SRS or a bandwidth controlled SRS, and the
2139 2161                   * effect of scheduling a timeout is to wakeup the worker
2140 2162                   * thread which in turn will call the drain function. Since
2141 2163                   * we release the srs_lock atomically only in the cv_wait there
2142 2164                   * isn't a fear of waiting forever.
2143 2165                   */
2144 2166                  while (((mac_srs->srs_state & SRS_PROC) ||
2145 2167                      (mac_srs->srs_first == NULL) || bw_ctl_flag ||
2146 2168                      (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
2147 2169                      !(mac_srs->srs_state & SRS_PAUSE)) {
2148 2170                          /*
2149 2171                           * If we have packets queued and we are here
2150 2172                           * because B/W control is in place, we better
2151 2173                           * schedule the worker wakeup after 1 tick
2152 2174                           * to see if bandwidth control can be relaxed.
2153 2175                           */
2154 2176                          if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
2155 2177                                  /*
2156 2178                                   * We need to ensure that a timer is already
2157 2179                                   * scheduled or we force schedule one for
2158 2180                                   * later so that we can continue processing
2159 2181                                   * after this quantum is over.
2160 2182                                   */
2161 2183                                  mac_srs->srs_tid = timeout(mac_srs_fire,
2162 2184                                      mac_srs, 1);
2163 2185                          }
2164 2186  wait:
2165 2187                          CALLB_CPR_SAFE_BEGIN(&cprinfo);
2166 2188                          cv_wait(async, lock);
2167 2189                          CALLB_CPR_SAFE_END(&cprinfo, lock);
2168 2190  
2169 2191                          if (mac_srs->srs_state & SRS_PAUSE)
2170 2192                                  goto done;
2171 2193                          if (mac_srs->srs_state & SRS_PROC)
2172 2194                                  goto wait;
2173 2195  
2174 2196                          if (mac_srs->srs_first != NULL &&
2175 2197                              mac_srs->srs_type & SRST_BW_CONTROL) {
2176 2198                                  MAC_SRS_BW_LOCK(mac_srs);
2177 2199                                  if (mac_srs->srs_bw->mac_bw_state &
2178 2200                                      SRS_BW_ENFORCED) {
2179 2201                                          MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2180 2202                                  }
2181 2203                                  bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
2182 2204                                      SRS_BW_ENFORCED;
2183 2205                                  MAC_SRS_BW_UNLOCK(mac_srs);
2184 2206                          }
2185 2207                  }
2186 2208  
2187 2209                  if (mac_srs->srs_state & SRS_PAUSE)
2188 2210                          goto done;
2189 2211                  mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
2190 2212          }
2191 2213  done:
2192 2214          /*
2193 2215           * The Rx SRS quiesce logic first cuts off packet supply to the SRS
2194 2216           * from both hard and soft classifications and waits for such threads
2195 2217           * to finish before signaling the worker. So at this point the only
2196 2218           * thread left that could be competing with the worker is the poll
2197 2219           * thread. In the case of Tx, there shouldn't be any thread holding
2198 2220           * SRS_PROC at this point.
2199 2221           */
2200 2222          if (!(mac_srs->srs_state & SRS_PROC)) {
2201 2223                  mac_srs->srs_state |= SRS_PROC;
2202 2224          } else {
2203 2225                  ASSERT((mac_srs->srs_type & SRST_TX) == 0);
2204 2226                  /*
2205 2227                   * Poll thread still owns the SRS and is still running
2206 2228                   */
2207 2229                  ASSERT((mac_srs->srs_poll_thr == NULL) ||
2208 2230                      ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
2209 2231                      SRS_POLL_THR_OWNER));
2210 2232          }
2211 2233          mac_srs_worker_quiesce(mac_srs);
2212 2234          /*
2213 2235           * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
2214 2236           * of the quiesce operation
2215 2237           */
2216 2238          while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
2217 2239                  cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
2218 2240  
2219 2241          if (mac_srs->srs_state & SRS_RESTART) {
2220 2242                  ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
2221 2243                  mac_srs_worker_restart(mac_srs);
2222 2244                  mac_srs->srs_state &= ~SRS_PROC;
2223 2245                  goto start;
2224 2246          }
2225 2247  
2226 2248          if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
2227 2249                  mac_srs_worker_quiesce(mac_srs);
2228 2250  
2229 2251          mac_srs->srs_state &= ~SRS_PROC;
2230 2252          /* The macro drops the srs_lock */
2231 2253          CALLB_CPR_EXIT(&cprinfo);
2232 2254          thread_exit();
2233 2255  }
2234 2256  
2235 2257  /*
2236 2258   * mac_rx_srs_subflow_process
2237 2259   *
2238 2260   * Receive side routine called from interrupt path when there are
2239 2261   * sub flows present on this SRS.
2240 2262   */
2241 2263  /* ARGSUSED */
2242 2264  void
2243 2265  mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
2244 2266      mblk_t *mp_chain, boolean_t loopback)
2245 2267  {
2246 2268          flow_entry_t            *flent = NULL;
2247 2269          flow_entry_t            *prev_flent = NULL;
2248 2270          mblk_t                  *mp = NULL;
2249 2271          mblk_t                  *tail = NULL;
2250 2272          mac_soft_ring_set_t     *mac_srs = (mac_soft_ring_set_t *)srs;
2251 2273          mac_client_impl_t       *mcip;
2252 2274  
2253 2275          mcip = mac_srs->srs_mcip;
2254 2276          ASSERT(mcip != NULL);
2255 2277  
2256 2278          /*
2257 2279           * We need to determine the SRS for every packet
2258 2280           * by walking the flow table, if we don't get any,
2259 2281           * then we proceed using the SRS we came with.
2260 2282           */
2261 2283          mp = tail = mp_chain;
2262 2284          while (mp != NULL) {
2263 2285  
2264 2286                  /*
2265 2287                   * We will increment the stats for the mactching subflow.
2266 2288                   * when we get the bytes/pkt count for the classified packets
2267 2289                   * later in mac_rx_srs_process.
2268 2290                   */
2269 2291                  (void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
2270 2292                      FLOW_INBOUND, &flent);
2271 2293  
2272 2294                  if (mp == mp_chain || flent == prev_flent) {
2273 2295                          if (prev_flent != NULL)
2274 2296                                  FLOW_REFRELE(prev_flent);
2275 2297                          prev_flent = flent;
2276 2298                          flent = NULL;
2277 2299                          tail = mp;
2278 2300                          mp = mp->b_next;
2279 2301                          continue;
2280 2302                  }
2281 2303                  tail->b_next = NULL;
2282 2304                  /*
2283 2305                   * A null indicates, this is for the mac_srs itself.
2284 2306                   * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
2285 2307                   */
2286 2308                  if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2287 2309                          mac_rx_srs_process(arg,
2288 2310                              (mac_resource_handle_t)mac_srs, mp_chain,
2289 2311                              loopback);
2290 2312                  } else {
2291 2313                          (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2292 2314                              prev_flent->fe_cb_arg2, mp_chain, loopback);
2293 2315                          FLOW_REFRELE(prev_flent);
2294 2316                  }
2295 2317                  prev_flent = flent;
2296 2318                  flent = NULL;
2297 2319                  mp_chain = mp;
2298 2320                  tail = mp;
2299 2321                  mp = mp->b_next;
2300 2322          }
2301 2323          /* Last chain */
2302 2324          ASSERT(mp_chain != NULL);
2303 2325          if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2304 2326                  mac_rx_srs_process(arg,
2305 2327                      (mac_resource_handle_t)mac_srs, mp_chain, loopback);
2306 2328          } else {
2307 2329                  (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2308 2330                      prev_flent->fe_cb_arg2, mp_chain, loopback);
2309 2331                  FLOW_REFRELE(prev_flent);
2310 2332          }
2311 2333  }
2312 2334  
2313 2335  /*
2314 2336   * mac_rx_srs_process
2315 2337   *
2316 2338   * Receive side routine called from the interrupt path.
2317 2339   *
2318 2340   * loopback is set to force a context switch on the loopback
2319 2341   * path between MAC clients.
2320 2342   */
2321 2343  /* ARGSUSED */
2322 2344  void
2323 2345  mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
2324 2346      boolean_t loopback)
2325 2347  {
2326 2348          mac_soft_ring_set_t     *mac_srs = (mac_soft_ring_set_t *)srs;
2327 2349          mblk_t                  *mp, *tail, *head;
2328 2350          int                     count = 0;
2329 2351          int                     count1;
2330 2352          size_t                  sz = 0;
2331 2353          size_t                  chain_sz, sz1;
2332 2354          mac_bw_ctl_t            *mac_bw;
2333 2355          mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
2334 2356  
2335 2357          /*
2336 2358           * Set the tail, count and sz. We set the sz irrespective
2337 2359           * of whether we are doing B/W control or not for the
2338 2360           * purpose of updating the stats.
2339 2361           */
2340 2362          mp = tail = mp_chain;
2341 2363          while (mp != NULL) {
2342 2364                  tail = mp;
2343 2365                  count++;
2344 2366                  sz += msgdsize(mp);
2345 2367                  mp = mp->b_next;
2346 2368          }
2347 2369  
2348 2370          mutex_enter(&mac_srs->srs_lock);
2349 2371  
2350 2372          if (loopback) {
2351 2373                  SRS_RX_STAT_UPDATE(mac_srs, lclbytes, sz);
2352 2374                  SRS_RX_STAT_UPDATE(mac_srs, lclcnt, count);
2353 2375  
2354 2376          } else {
2355 2377                  SRS_RX_STAT_UPDATE(mac_srs, intrbytes, sz);
2356 2378                  SRS_RX_STAT_UPDATE(mac_srs, intrcnt, count);
2357 2379          }
2358 2380  
2359 2381          /*
2360 2382           * If the SRS in already being processed; has been blanked;
2361 2383           * can be processed by worker thread only; or the B/W limit
2362 2384           * has been reached, then queue the chain and check if
2363 2385           * worker thread needs to be awakend.
2364 2386           */
2365 2387          if (mac_srs->srs_type & SRST_BW_CONTROL) {
2366 2388                  mac_bw = mac_srs->srs_bw;
2367 2389                  ASSERT(mac_bw != NULL);
2368 2390                  mutex_enter(&mac_bw->mac_bw_lock);
2369 2391                  mac_bw->mac_bw_intr += sz;
2370 2392                  if (mac_bw->mac_bw_limit == 0) {
2371 2393                          /* zero bandwidth: drop all */
2372 2394                          srs_rx->sr_stat.mrs_sdrops += count;
2373 2395                          mac_bw->mac_bw_drop_bytes += sz;
2374 2396                          mutex_exit(&mac_bw->mac_bw_lock);
2375 2397                          mutex_exit(&mac_srs->srs_lock);
2376 2398                          mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
2377 2399                          return;
2378 2400                  } else {
2379 2401                          if ((mac_bw->mac_bw_sz + sz) <=
2380 2402                              mac_bw->mac_bw_drop_threshold) {
2381 2403                                  mutex_exit(&mac_bw->mac_bw_lock);
2382 2404                                  MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
2383 2405                                      tail, count, sz);
2384 2406                          } else {
2385 2407                                  mp = mp_chain;
2386 2408                                  chain_sz = 0;
2387 2409                                  count1 = 0;
2388 2410                                  tail = NULL;
2389 2411                                  head = NULL;
2390 2412                                  while (mp != NULL) {
2391 2413                                          sz1 = msgdsize(mp);
2392 2414                                          if (mac_bw->mac_bw_sz + chain_sz + sz1 >
2393 2415                                              mac_bw->mac_bw_drop_threshold)
2394 2416                                                  break;
2395 2417                                          chain_sz += sz1;
2396 2418                                          count1++;
2397 2419                                          tail = mp;
2398 2420                                          mp = mp->b_next;
2399 2421                                  }
2400 2422                                  mutex_exit(&mac_bw->mac_bw_lock);
2401 2423                                  if (tail != NULL) {
2402 2424                                          head = tail->b_next;
2403 2425                                          tail->b_next = NULL;
2404 2426                                          MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
2405 2427                                              mp_chain, tail, count1, chain_sz);
2406 2428                                          sz -= chain_sz;
2407 2429                                          count -= count1;
2408 2430                                  } else {
2409 2431                                          /* Can't pick up any */
2410 2432                                          head = mp_chain;
2411 2433                                  }
2412 2434                                  if (head != NULL) {
2413 2435                                          /* Drop any packet over the threshold */
2414 2436                                          srs_rx->sr_stat.mrs_sdrops += count;
2415 2437                                          mutex_enter(&mac_bw->mac_bw_lock);
2416 2438                                          mac_bw->mac_bw_drop_bytes += sz;
2417 2439                                          mutex_exit(&mac_bw->mac_bw_lock);
2418 2440                                          freemsgchain(head);
2419 2441                                  }
2420 2442                          }
2421 2443                          MAC_SRS_WORKER_WAKEUP(mac_srs);
2422 2444                          mutex_exit(&mac_srs->srs_lock);
2423 2445                          return;
2424 2446                  }
2425 2447          }
2426 2448  
2427 2449          /*
2428 2450           * If the total number of packets queued in the SRS and
2429 2451           * its associated soft rings exceeds the max allowed,
2430 2452           * then drop the chain. If we are polling capable, this
2431 2453           * shouldn't be happening.
2432 2454           */
2433 2455          if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
2434 2456              (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
2435 2457                  mac_bw = mac_srs->srs_bw;
2436 2458                  srs_rx->sr_stat.mrs_sdrops += count;
2437 2459                  mutex_enter(&mac_bw->mac_bw_lock);
2438 2460                  mac_bw->mac_bw_drop_bytes += sz;
2439 2461                  mutex_exit(&mac_bw->mac_bw_lock);
2440 2462                  freemsgchain(mp_chain);
2441 2463                  mutex_exit(&mac_srs->srs_lock);
2442 2464                  return;
2443 2465          }
2444 2466  
2445 2467          MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
2446 2468  
2447 2469          if (!(mac_srs->srs_state & SRS_PROC)) {
2448 2470                  /*
2449 2471                   * If we are coming via loopback, if we are not optimizing for
2450 2472                   * latency, or if our stack is running deep, we should signal
2451 2473                   * the worker thread.
2452 2474                   */
2453 2475                  if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT) ||
2454 2476                      MAC_RX_SRS_TOODEEP()) {
2455 2477                          /*
2456 2478                           * For loopback, We need to let the worker take
2457 2479                           * over as we don't want to continue in the same
2458 2480                           * thread even if we can. This could lead to stack
2459 2481                           * overflows and may also end up using
2460 2482                           * resources (cpu) incorrectly.
2461 2483                           */
2462 2484                          cv_signal(&mac_srs->srs_async);
2463 2485                  } else {
2464 2486                          /*
2465 2487                           * Seems like no one is processing the SRS and
2466 2488                           * there is no backlog. We also inline process
2467 2489                           * our packet if its a single packet in non
2468 2490                           * latency optimized case (in latency optimized
2469 2491                           * case, we inline process chains of any size).
2470 2492                           */
2471 2493                          mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
2472 2494                  }
2473 2495          }
2474 2496          mutex_exit(&mac_srs->srs_lock);
2475 2497  }
2476 2498  
2477 2499  /* TX SIDE ROUTINES (RUNTIME) */
2478 2500  
2479 2501  /*
2480 2502   * mac_tx_srs_no_desc
2481 2503   *
2482 2504   * This routine is called by Tx single ring default mode
2483 2505   * when Tx ring runs out of descs.
2484 2506   */
2485 2507  mac_tx_cookie_t
2486 2508  mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2487 2509      uint16_t flag, mblk_t **ret_mp)
2488 2510  {
2489 2511          mac_tx_cookie_t cookie = NULL;
2490 2512          mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2491 2513          boolean_t wakeup_worker = B_TRUE;
2492 2514          uint32_t tx_mode = srs_tx->st_mode;
2493 2515          int cnt, sz;
2494 2516          mblk_t *tail;
2495 2517  
2496 2518          ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
2497 2519          if (flag & MAC_DROP_ON_NO_DESC) {
2498 2520                  MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2499 2521          } else {
2500 2522                  if (mac_srs->srs_first != NULL)
2501 2523                          wakeup_worker = B_FALSE;
2502 2524                  MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2503 2525                  if (flag & MAC_TX_NO_ENQUEUE) {
2504 2526                          /*
2505 2527                           * If TX_QUEUED is not set, queue the
2506 2528                           * packet and let mac_tx_srs_drain()
2507 2529                           * set the TX_BLOCKED bit for the
2508 2530                           * reasons explained above. Otherwise,
2509 2531                           * return the mblks.
2510 2532                           */
2511 2533                          if (wakeup_worker) {
2512 2534                                  MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2513 2535                                      mp_chain, tail, cnt, sz);
2514 2536                          } else {
2515 2537                                  MAC_TX_SET_NO_ENQUEUE(mac_srs,
2516 2538                                      mp_chain, ret_mp, cookie);
2517 2539                          }
2518 2540                  } else {
2519 2541                          MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2520 2542                              tail, cnt, sz, cookie);
2521 2543                  }
2522 2544                  if (wakeup_worker)
2523 2545                          cv_signal(&mac_srs->srs_async);
2524 2546          }
2525 2547          return (cookie);
2526 2548  }
2527 2549  
2528 2550  /*
2529 2551   * mac_tx_srs_enqueue
2530 2552   *
2531 2553   * This routine is called when Tx SRS is operating in either serializer
2532 2554   * or bandwidth mode. In serializer mode, a packet will get enqueued
2533 2555   * when a thread cannot enter SRS exclusively. In bandwidth mode,
2534 2556   * packets gets queued if allowed byte-count limit for a tick is
2535 2557   * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
2536 2558   * MAC_TX_NO_ENQUEUE is set is different than when operaing in either
2537 2559   * the default mode or fanout mode. Here packets get dropped or
2538 2560   * returned back to the caller only after hi-watermark worth of data
2539 2561   * is queued.
2540 2562   */
2541 2563  static mac_tx_cookie_t
2542 2564  mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2543 2565      uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
2544 2566  {
2545 2567          mac_tx_cookie_t cookie = NULL;
2546 2568          int cnt, sz;
2547 2569          mblk_t *tail;
2548 2570          boolean_t wakeup_worker = B_TRUE;
2549 2571  
2550 2572          /*
2551 2573           * Ignore fanout hint if we don't have multiple tx rings.
2552 2574           */
2553 2575          if (!MAC_TX_SOFT_RINGS(mac_srs))
2554 2576                  fanout_hint = 0;
2555 2577  
2556 2578          if (mac_srs->srs_first != NULL)
2557 2579                  wakeup_worker = B_FALSE;
2558 2580          MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2559 2581          if (flag & MAC_DROP_ON_NO_DESC) {
2560 2582                  if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
2561 2583                          MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2562 2584                  } else {
2563 2585                          MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2564 2586                              mp_chain, tail, cnt, sz);
2565 2587                  }
2566 2588          } else if (flag & MAC_TX_NO_ENQUEUE) {
2567 2589                  if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
2568 2590                      (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
2569 2591                          MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
2570 2592                              ret_mp, cookie);
2571 2593                  } else {
2572 2594                          mp_chain->b_prev = (mblk_t *)fanout_hint;
2573 2595                          MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2574 2596                              mp_chain, tail, cnt, sz);
2575 2597                  }
2576 2598          } else {
2577 2599                  /*
2578 2600                   * If you are BW_ENFORCED, just enqueue the
2579 2601                   * packet. srs_worker will drain it at the
2580 2602                   * prescribed rate. Before enqueueing, save
2581 2603                   * the fanout hint.
2582 2604                   */
2583 2605                  mp_chain->b_prev = (mblk_t *)fanout_hint;
2584 2606                  MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2585 2607                      tail, cnt, sz, cookie);
2586 2608          }
2587 2609          if (wakeup_worker)
2588 2610                  cv_signal(&mac_srs->srs_async);
2589 2611          return (cookie);
2590 2612  }
2591 2613  
2592 2614  /*
2593 2615   * There are seven tx modes:
2594 2616   *
2595 2617   * 1) Default mode (SRS_TX_DEFAULT)
2596 2618   * 2) Serialization mode (SRS_TX_SERIALIZE)
2597 2619   * 3) Fanout mode (SRS_TX_FANOUT)
2598 2620   * 4) Bandwidth mode (SRS_TX_BW)
2599 2621   * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
2600 2622   * 6) aggr Tx mode (SRS_TX_AGGR)
2601 2623   * 7) aggr Tx bw mode (SRS_TX_BW_AGGR)
2602 2624   *
2603 2625   * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
2604 2626   * based on the number of Tx rings requested for an SRS and whether
2605 2627   * bandwidth control is requested or not.
2606 2628   *
2607 2629   * The default mode (i.e., no fanout/no bandwidth) is used when the
2608 2630   * underlying NIC does not have Tx rings or just one Tx ring. In this mode,
2609 2631   * the SRS acts as a pass-thru. Packets will go directly to mac_tx_send().
2610 2632   * When the underlying Tx ring runs out of Tx descs, it starts queueing up
2611 2633   * packets in SRS. When flow-control is relieved, the srs_worker drains
2612 2634   * the queued packets and informs blocked clients to restart sending
2613 2635   * packets.
2614 2636   *
2615 2637   * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. This
2616 2638   * mode is used when the link has no Tx rings or only one Tx ring.
2617 2639   *
2618 2640   * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
2619 2641   * Tx rings. Each Tx ring will have a soft ring associated with it.
2620 2642   * These soft rings will be hung off the Tx SRS. Queueing if it happens
2621 2643   * due to lack of Tx desc will be in individual soft ring (and not srs)
2622 2644   * associated with Tx ring.
2623 2645   *
2624 2646   * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
2625 2647   * only if bw is available. Otherwise the packets will be queued in
2626 2648   * SRS. If fanout to multiple Tx rings is configured, the packets will
2627 2649   * be fanned out among the soft rings associated with the Tx rings.
2628 2650   *
2629 2651   * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
2630 2652   * invokes an aggr function, aggr_find_tx_ring(), to find a pseudo Tx ring
2631 2653   * belonging to a port on which the packet has to be sent. Aggr will
2632 2654   * always have a pseudo Tx ring associated with it even when it is an
2633 2655   * aggregation over a single NIC that has no Tx rings. Even in such a
2634 2656   * case, the single pseudo Tx ring will have a soft ring associated with
2635 2657   * it and the soft ring will hang off the SRS.
2636 2658   *
2637 2659   * If a bandwidth is specified for an aggr, SRS_TX_BW_AGGR mode is used.
2638 2660   * In this mode, the bandwidth is first applied on the outgoing packets
2639 2661   * and later mac_tx_aggr_mode() function is called to send the packet out
2640 2662   * of one of the pseudo Tx rings.
2641 2663   *
2642 2664   * Three flags are used in srs_state for indicating flow control
2643 2665   * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
2644 2666   * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
2645 2667   * driver below.
2646 2668   * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
2647 2669   * and flow-control pressure is applied back to clients. The clients expect
2648 2670   * wakeup when flow-control is relieved.
2649 2671   * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk
2650 2672   * got returned back to client either due to lack of Tx descs or due to bw
2651 2673   * control reasons. The clients expect a wakeup when condition is relieved.
2652 2674   *
2653 2675   * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
2654 2676   * some clients set the following values too: MAC_DROP_ON_NO_DESC,
2655 2677   * MAC_TX_NO_ENQUEUE
2656 2678   * Mac clients that do not want packets to be enqueued in the mac layer set
2657 2679   * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
2658 2680   * Tx soft rings but instead get dropped when the NIC runs out of desc. The
2659 2681   * behaviour of this flag is different when the Tx is running in serializer
2660 2682   * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packet
2661 2683   * get dropped when Tx high watermark is reached.
2662 2684   * There are some mac clients like vsw, aggr that want the mblks to be
2663 2685   * returned back to clients instead of being queued in Tx SRS (or Tx soft
2664 2686   * rings) under flow-control (i.e., out of desc or exceeding bw limits)
2665 2687   * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
2666 2688   * In the default and Tx fanout mode, the un-transmitted mblks will be
2667 2689   * returned back to the clients when the driver runs out of Tx descs.
2668 2690   * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
2669 2691   * soft ring) so that the clients can be woken up when Tx desc become
2670 2692   * available. When running in serializer or bandwidth mode,
2671 2693   * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
2672 2694   */
2673 2695  
2674 2696  mac_tx_func_t
2675 2697  mac_tx_get_func(uint32_t mode)
2676 2698  {
2677 2699          return (mac_tx_mode_list[mode].mac_tx_func);
2678 2700  }
2679 2701  
2680 2702  /* ARGSUSED */
2681 2703  static mac_tx_cookie_t
2682 2704  mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2683 2705      uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2684 2706  {
2685 2707          mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
2686 2708          mac_tx_stats_t          stats;
2687 2709          mac_tx_cookie_t         cookie = NULL;
2688 2710  
2689 2711          ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
2690 2712  
2691 2713          /* Regular case with a single Tx ring */
2692 2714          /*
2693 2715           * SRS_TX_BLOCKED is set when underlying NIC runs
2694 2716           * out of Tx descs and messages start getting
2695 2717           * queued. It won't get reset until
2696 2718           * tx_srs_drain() completely drains out the
2697 2719           * messages.
2698 2720           */
2699 2721          if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2700 2722                  /* Tx descs/resources not available */
2701 2723                  mutex_enter(&mac_srs->srs_lock);
2702 2724                  if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2703 2725                          cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
2704 2726                              flag, ret_mp);
2705 2727                          mutex_exit(&mac_srs->srs_lock);
2706 2728                          return (cookie);
2707 2729                  }
2708 2730                  /*
2709 2731                   * While we were computing mblk count, the
2710 2732                   * flow control condition got relieved.
2711 2733                   * Continue with the transmission.
2712 2734                   */
2713 2735                  mutex_exit(&mac_srs->srs_lock);
2714 2736          }
2715 2737  
2716 2738          mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2717 2739              mp_chain, &stats);
2718 2740  
2719 2741          /*
2720 2742           * Multiple threads could be here sending packets.
2721 2743           * Under such conditions, it is not possible to
2722 2744           * automically set SRS_TX_BLOCKED bit to indicate
2723 2745           * out of tx desc condition. To atomically set
2724 2746           * this, we queue the returned packet and do
2725 2747           * the setting of SRS_TX_BLOCKED in
2726 2748           * mac_tx_srs_drain().
2727 2749           */
2728 2750          if (mp_chain != NULL) {
2729 2751                  mutex_enter(&mac_srs->srs_lock);
2730 2752                  cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
2731 2753                  mutex_exit(&mac_srs->srs_lock);
2732 2754                  return (cookie);
2733 2755          }
2734 2756          SRS_TX_STATS_UPDATE(mac_srs, &stats);
2735 2757  
2736 2758          return (NULL);
2737 2759  }
2738 2760  
2739 2761  /*
2740 2762   * mac_tx_serialize_mode
2741 2763   *
2742 2764   * This is an experimental mode implemented as per the request of PAE.
2743 2765   * In this mode, all callers attempting to send a packet to the NIC
2744 2766   * will get serialized. Only one thread at any time will access the
2745 2767   * NIC to send the packet out.
2746 2768   */
2747 2769  /* ARGSUSED */
2748 2770  static mac_tx_cookie_t
2749 2771  mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2750 2772      uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2751 2773  {
2752 2774          mac_tx_stats_t          stats;
2753 2775          mac_tx_cookie_t         cookie = NULL;
2754 2776          mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
2755 2777  
2756 2778          /* Single ring, serialize below */
2757 2779          ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
2758 2780          mutex_enter(&mac_srs->srs_lock);
2759 2781          if ((mac_srs->srs_first != NULL) ||
2760 2782              (mac_srs->srs_state & SRS_PROC)) {
2761 2783                  /*
2762 2784                   * In serialization mode, queue all packets until
2763 2785                   * TX_HIWAT is set.
2764 2786                   * If drop bit is set, drop if TX_HIWAT is set.
2765 2787                   * If no_enqueue is set, still enqueue until hiwat
2766 2788                   * is set and return mblks after TX_HIWAT is set.
2767 2789                   */
2768 2790                  cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
2769 2791                      flag, NULL, ret_mp);
2770 2792                  mutex_exit(&mac_srs->srs_lock);
2771 2793                  return (cookie);
2772 2794          }
2773 2795          /*
2774 2796           * No packets queued, nothing on proc and no flow
2775 2797           * control condition. Fast-path, ok. Do inline
2776 2798           * processing.
2777 2799           */
2778 2800          mac_srs->srs_state |= SRS_PROC;
2779 2801          mutex_exit(&mac_srs->srs_lock);
2780 2802  
2781 2803          mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2782 2804              mp_chain, &stats);
2783 2805  
2784 2806          mutex_enter(&mac_srs->srs_lock);
2785 2807          mac_srs->srs_state &= ~SRS_PROC;
2786 2808          if (mp_chain != NULL) {
2787 2809                  cookie = mac_tx_srs_enqueue(mac_srs,
2788 2810                      mp_chain, flag, NULL, ret_mp);
2789 2811          }
2790 2812          if (mac_srs->srs_first != NULL) {
2791 2813                  /*
2792 2814                   * We processed inline our packet and a new
2793 2815                   * packet/s got queued while we were
2794 2816                   * processing. Wakeup srs worker
2795 2817                   */
2796 2818                  cv_signal(&mac_srs->srs_async);
2797 2819          }
2798 2820          mutex_exit(&mac_srs->srs_lock);
2799 2821  
2800 2822          if (cookie == NULL)
2801 2823                  SRS_TX_STATS_UPDATE(mac_srs, &stats);
2802 2824  
2803 2825          return (cookie);
2804 2826  }
2805 2827  
2806 2828  /*
2807 2829   * mac_tx_fanout_mode
2808 2830   *
2809 2831   * In this mode, the SRS will have access to multiple Tx rings to send
2810 2832   * the packet out. The fanout hint that is passed as an argument is
2811 2833   * used to find an appropriate ring to fanout the traffic. Each Tx
2812 2834   * ring, in turn,  will have a soft ring associated with it. If a Tx
2813 2835   * ring runs out of Tx desc's the returned packet will be queued in
2814 2836   * the soft ring associated with that Tx ring. The srs itself will not
2815 2837   * queue any packets.
2816 2838   */
2817 2839  
/*
 * Map `hash' to one of the SRS's Tx soft rings and hand `chain' to it.
 * The expansion relies on the caller having mac_srs, hash, index,
 * softring, cookie, flag and ret_mp in scope; index, softring and
 * cookie are assigned.
 */
#define	MAC_TX_SOFT_RING_PROCESS(chain) {				\
	index = COMPUTE_INDEX(hash, mac_srs->srs_tx_ring_count);	\
	softring = mac_srs->srs_tx_soft_rings[index];			\
	cookie = mac_tx_soft_ring_process(softring, (chain), flag, ret_mp); \
	DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index);	\
}
2824 2846  
2825 2847  static mac_tx_cookie_t
2826 2848  mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2827 2849      uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2828 2850  {
2829 2851          mac_soft_ring_t         *softring;
2830 2852          uint64_t                hash;
2831 2853          uint_t                  index;
2832 2854          mac_tx_cookie_t         cookie = NULL;
2833 2855  
2834 2856          ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
2835 2857              mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
2836 2858          if (fanout_hint != 0) {
2837 2859                  /*
2838 2860                   * The hint is specified by the caller, simply pass the
2839 2861                   * whole chain to the soft ring.
2840 2862                   */
2841 2863                  hash = HASH_HINT(fanout_hint);
2842 2864                  MAC_TX_SOFT_RING_PROCESS(mp_chain);
2843 2865          } else {
2844 2866                  mblk_t *last_mp, *cur_mp, *sub_chain;
2845 2867                  uint64_t last_hash = 0;
2846 2868                  uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media;
2847 2869  
2848 2870                  /*
2849 2871                   * Compute the hash from the contents (headers) of the
2850 2872                   * packets of the mblk chain. Split the chains into
2851 2873                   * subchains of the same conversation.
2852 2874                   *
2853 2875                   * Since there may be more than one ring used for
2854 2876                   * sub-chains of the same call, and since the caller
2855 2877                   * does not maintain per conversation state since it
2856 2878                   * passed a zero hint, unsent subchains will be
2857 2879                   * dropped.
2858 2880                   */
2859 2881  
2860 2882                  flag |= MAC_DROP_ON_NO_DESC;
2861 2883                  ret_mp = NULL;
2862 2884  
2863 2885                  ASSERT(ret_mp == NULL);
2864 2886  
2865 2887                  sub_chain = NULL;
2866 2888                  last_mp = NULL;
2867 2889  
2868 2890                  for (cur_mp = mp_chain; cur_mp != NULL;
2869 2891                      cur_mp = cur_mp->b_next) {
2870 2892                          hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4,
2871 2893                              B_TRUE);
2872 2894                          if (last_hash != 0 && hash != last_hash) {
2873 2895                                  /*
2874 2896                                   * Starting a different subchain, send current
2875 2897                                   * chain out.
2876 2898                                   */
2877 2899                                  ASSERT(last_mp != NULL);
2878 2900                                  last_mp->b_next = NULL;
2879 2901                                  MAC_TX_SOFT_RING_PROCESS(sub_chain);
2880 2902                                  sub_chain = NULL;
2881 2903                          }
2882 2904  
2883 2905                          /* add packet to subchain */
2884 2906                          if (sub_chain == NULL)
2885 2907                                  sub_chain = cur_mp;
2886 2908                          last_mp = cur_mp;
2887 2909                          last_hash = hash;
2888 2910                  }
2889 2911  
2890 2912                  if (sub_chain != NULL) {
2891 2913                          /* send last subchain */
2892 2914                          ASSERT(last_mp != NULL);
2893 2915                          last_mp->b_next = NULL;
2894 2916                          MAC_TX_SOFT_RING_PROCESS(sub_chain);
2895 2917                  }
2896 2918  
2897 2919                  cookie = NULL;
2898 2920          }
2899 2921  
2900 2922          return (cookie);
2901 2923  }
2902 2924  
2903 2925  /*
2904 2926   * mac_tx_bw_mode
2905 2927   *
2906 2928   * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
2907 2929   * only if bw is available. Otherwise the packets will be queued in
2908 2930   * SRS. If the SRS has multiple Tx rings, then packets will get fanned
2909 2931   * out to a Tx rings.
2910 2932   */
2911 2933  static mac_tx_cookie_t
2912 2934  mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2913 2935      uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2914 2936  {
2915 2937          int                     cnt, sz;
2916 2938          mblk_t                  *tail;
2917 2939          mac_tx_cookie_t         cookie = NULL;
2918 2940          mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
2919 2941          clock_t                 now;
2920 2942  
2921 2943          ASSERT(TX_BANDWIDTH_MODE(mac_srs));
2922 2944          ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
2923 2945          mutex_enter(&mac_srs->srs_lock);
2924 2946          if (mac_srs->srs_bw->mac_bw_limit == 0) {
2925 2947                  /*
2926 2948                   * zero bandwidth, no traffic is sent: drop the packets,
2927 2949                   * or return the whole chain if the caller requests all
2928 2950                   * unsent packets back.
2929 2951                   */
2930 2952                  if (flag & MAC_TX_NO_ENQUEUE) {
2931 2953                          cookie = (mac_tx_cookie_t)mac_srs;
2932 2954                          *ret_mp = mp_chain;
2933 2955                  } else {
2934 2956                          MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2935 2957                  }
2936 2958                  mutex_exit(&mac_srs->srs_lock);
2937 2959                  return (cookie);
2938 2960          } else if ((mac_srs->srs_first != NULL) ||
2939 2961              (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2940 2962                  cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2941 2963                      fanout_hint, ret_mp);
2942 2964                  mutex_exit(&mac_srs->srs_lock);
2943 2965                  return (cookie);
2944 2966          }
2945 2967          MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2946 2968          now = ddi_get_lbolt();
2947 2969          if (mac_srs->srs_bw->mac_bw_curr_time != now) {
2948 2970                  mac_srs->srs_bw->mac_bw_curr_time = now;
2949 2971                  mac_srs->srs_bw->mac_bw_used = 0;
2950 2972          } else if (mac_srs->srs_bw->mac_bw_used >
2951 2973              mac_srs->srs_bw->mac_bw_limit) {
2952 2974                  mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
2953 2975                  MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2954 2976                      mp_chain, tail, cnt, sz);
2955 2977                  /*
2956 2978                   * Wakeup worker thread. Note that worker
2957 2979                   * thread has to be woken up so that it
2958 2980                   * can fire up the timer to be woken up
2959 2981                   * on the next tick. Also once
2960 2982                   * BW_ENFORCED is set, it can only be
2961 2983                   * reset by srs_worker thread. Until then
2962 2984                   * all packets will get queued up in SRS
2963 2985                   * and hence this this code path won't be
2964 2986                   * entered until BW_ENFORCED is reset.
2965 2987                   */
2966 2988                  cv_signal(&mac_srs->srs_async);
2967 2989                  mutex_exit(&mac_srs->srs_lock);
2968 2990                  return (cookie);
2969 2991          }
2970 2992  
2971 2993          mac_srs->srs_bw->mac_bw_used += sz;
2972 2994          mutex_exit(&mac_srs->srs_lock);
2973 2995  
2974 2996          if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
2975 2997                  mac_soft_ring_t *softring;
2976 2998                  uint_t indx, hash;
2977 2999  
2978 3000                  hash = HASH_HINT(fanout_hint);
2979 3001                  indx = COMPUTE_INDEX(hash,
2980 3002                      mac_srs->srs_tx_ring_count);
2981 3003                  softring = mac_srs->srs_tx_soft_rings[indx];
2982 3004                  return (mac_tx_soft_ring_process(softring, mp_chain, flag,
2983 3005                      ret_mp));
2984 3006          } else if (srs_tx->st_mode == SRS_TX_BW_AGGR) {
2985 3007                  return (mac_tx_aggr_mode(mac_srs, mp_chain,
2986 3008                      fanout_hint, flag, ret_mp));
2987 3009          } else {
2988 3010                  mac_tx_stats_t          stats;
2989 3011  
2990 3012                  mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2991 3013                      mp_chain, &stats);
2992 3014  
2993 3015                  if (mp_chain != NULL) {
2994 3016                          mutex_enter(&mac_srs->srs_lock);
2995 3017                          MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2996 3018                          if (mac_srs->srs_bw->mac_bw_used > sz)
2997 3019                                  mac_srs->srs_bw->mac_bw_used -= sz;
2998 3020                          else
2999 3021                                  mac_srs->srs_bw->mac_bw_used = 0;
3000 3022                          cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
3001 3023                              fanout_hint, ret_mp);
3002 3024                          mutex_exit(&mac_srs->srs_lock);
3003 3025                          return (cookie);
3004 3026                  }
3005 3027                  SRS_TX_STATS_UPDATE(mac_srs, &stats);
3006 3028  
3007 3029                  return (NULL);
3008 3030          }
3009 3031  }
3010 3032  
3011 3033  /*
3012 3034   * mac_tx_aggr_mode
3013 3035   *
3014 3036   * This routine invokes an aggr function, aggr_find_tx_ring(), to find
3015 3037   * a (pseudo) Tx ring belonging to a port on which the packet has to
3016 3038   * be sent. aggr_find_tx_ring() first finds the outgoing port based on
3017 3039   * L2/L3/L4 policy and then uses the fanout_hint passed to it to pick
3018 3040   * a Tx ring from the selected port.
3019 3041   *
3020 3042   * Note that a port can be deleted from the aggregation. In such a case,
3021 3043   * the aggregation layer first separates the port from the rest of the
3022 3044   * ports making sure that port (and thus any Tx rings associated with
3023 3045   * it) won't get selected in the call to aggr_find_tx_ring() function.
3024 3046   * Later calls are made to mac_group_rem_ring() passing pseudo Tx ring
3025 3047   * handles one by one which in turn will quiesce the Tx SRS and remove
3026 3048   * the soft ring associated with the pseudo Tx ring. Unlike Rx side
3027 3049   * where a cookie is used to protect against mac_rx_ring() calls on
3028 3050   * rings that have been removed, no such cookie is needed on the Tx
3029 3051   * side as the pseudo Tx ring won't be available anymore to
3030 3052   * aggr_find_tx_ring() once the port has been removed.
3031 3053   */
3032 3054  static mac_tx_cookie_t
3033 3055  mac_tx_aggr_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
3034 3056      uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
3035 3057  {
3036 3058          mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
3037 3059          mac_tx_ring_fn_t        find_tx_ring_fn;
3038 3060          mac_ring_handle_t       ring = NULL;
3039 3061          void                    *arg;
3040 3062          mac_soft_ring_t         *sringp;
3041 3063  
3042 3064          find_tx_ring_fn = srs_tx->st_capab_aggr.mca_find_tx_ring_fn;
3043 3065          arg = srs_tx->st_capab_aggr.mca_arg;
3044 3066          if (find_tx_ring_fn(arg, mp_chain, fanout_hint, &ring) == NULL)
3045 3067                  return (NULL);
3046 3068          sringp = srs_tx->st_soft_rings[((mac_ring_t *)ring)->mr_index];
3047 3069          return (mac_tx_soft_ring_process(sringp, mp_chain, flag, ret_mp));
3048 3070  }
3049 3071  
3050 3072  void
3051 3073  mac_tx_invoke_callbacks(mac_client_impl_t *mcip, mac_tx_cookie_t cookie)
3052 3074  {
3053 3075          mac_cb_t *mcb;
3054 3076          mac_tx_notify_cb_t *mtnfp;
3055 3077  
3056 3078          /* Wakeup callback registered clients */
3057 3079          MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
3058 3080          for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
3059 3081              mcb = mcb->mcb_nextp) {
3060 3082                  mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
3061 3083                  mtnfp->mtnf_fn(mtnfp->mtnf_arg, cookie);
3062 3084          }
3063 3085          MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
3064 3086              &mcip->mci_tx_notify_cb_list);
3065 3087  }
3066 3088  
3067 3089  /* ARGSUSED */
3068 3090  void
3069 3091  mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
3070 3092  {
3071 3093          mblk_t                  *head, *tail;
3072 3094          size_t                  sz;
3073 3095          uint32_t                tx_mode;
3074 3096          uint_t                  saved_pkt_count;
3075 3097          mac_tx_stats_t          stats;
3076 3098          mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
3077 3099          clock_t                 now;
3078 3100  
3079 3101          saved_pkt_count = 0;
3080 3102          ASSERT(mutex_owned(&mac_srs->srs_lock));
3081 3103          ASSERT(!(mac_srs->srs_state & SRS_PROC));
3082 3104  
3083 3105          mac_srs->srs_state |= SRS_PROC;
3084 3106  
3085 3107          tx_mode = srs_tx->st_mode;
3086 3108          if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
3087 3109                  if (mac_srs->srs_first != NULL) {
3088 3110                          head = mac_srs->srs_first;
3089 3111                          tail = mac_srs->srs_last;
3090 3112                          saved_pkt_count = mac_srs->srs_count;
3091 3113                          mac_srs->srs_first = NULL;
3092 3114                          mac_srs->srs_last = NULL;
3093 3115                          mac_srs->srs_count = 0;
3094 3116                          mutex_exit(&mac_srs->srs_lock);
3095 3117  
3096 3118                          head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3097 3119                              head, &stats);
3098 3120  
3099 3121                          mutex_enter(&mac_srs->srs_lock);
3100 3122                          if (head != NULL) {
3101 3123                                  /* Device out of tx desc, set block */
3102 3124                                  if (head->b_next == NULL)
3103 3125                                          VERIFY(head == tail);
3104 3126                                  tail->b_next = mac_srs->srs_first;
3105 3127                                  mac_srs->srs_first = head;
3106 3128                                  mac_srs->srs_count +=
3107 3129                                      (saved_pkt_count - stats.mts_opackets);
3108 3130                                  if (mac_srs->srs_last == NULL)
3109 3131                                          mac_srs->srs_last = tail;
3110 3132                                  MAC_TX_SRS_BLOCK(mac_srs, head);
3111 3133                          } else {
3112 3134                                  srs_tx->st_woken_up = B_FALSE;
3113 3135                                  SRS_TX_STATS_UPDATE(mac_srs, &stats);
3114 3136                          }
3115 3137                  }
3116 3138          } else if (tx_mode == SRS_TX_BW) {
3117 3139                  /*
3118 3140                   * We are here because the timer fired and we have some data
3119 3141                   * to tranmit. Also mac_tx_srs_worker should have reset
3120 3142                   * SRS_BW_ENFORCED flag
3121 3143                   */
3122 3144                  ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
3123 3145                  head = tail = mac_srs->srs_first;
3124 3146                  while (mac_srs->srs_first != NULL) {
3125 3147                          tail = mac_srs->srs_first;
3126 3148                          tail->b_prev = NULL;
3127 3149                          mac_srs->srs_first = tail->b_next;
3128 3150                          if (mac_srs->srs_first == NULL)
3129 3151                                  mac_srs->srs_last = NULL;
3130 3152                          mac_srs->srs_count--;
3131 3153                          sz = msgdsize(tail);
3132 3154                          mac_srs->srs_size -= sz;
3133 3155                          saved_pkt_count++;
3134 3156                          MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
3135 3157  
3136 3158                          if (mac_srs->srs_bw->mac_bw_used <
3137 3159                              mac_srs->srs_bw->mac_bw_limit)
3138 3160                                  continue;
3139 3161  
3140 3162                          now = ddi_get_lbolt();
3141 3163                          if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3142 3164                                  mac_srs->srs_bw->mac_bw_curr_time = now;
3143 3165                                  mac_srs->srs_bw->mac_bw_used = sz;
3144 3166                                  continue;
3145 3167                          }
3146 3168                          mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3147 3169                          break;
3148 3170                  }
3149 3171  
3150 3172                  ASSERT((head == NULL && tail == NULL) ||
3151 3173                      (head != NULL && tail != NULL));
3152 3174                  if (tail != NULL) {
3153 3175                          tail->b_next = NULL;
3154 3176                          mutex_exit(&mac_srs->srs_lock);
3155 3177  
3156 3178                          head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3157 3179                              head, &stats);
3158 3180  
3159 3181                          mutex_enter(&mac_srs->srs_lock);
3160 3182                          if (head != NULL) {
3161 3183                                  uint_t size_sent;
3162 3184  
3163 3185                                  /* Device out of tx desc, set block */
3164 3186                                  if (head->b_next == NULL)
3165 3187                                          VERIFY(head == tail);
3166 3188                                  tail->b_next = mac_srs->srs_first;
3167 3189                                  mac_srs->srs_first = head;
3168 3190                                  mac_srs->srs_count +=
3169 3191                                      (saved_pkt_count - stats.mts_opackets);
3170 3192                                  if (mac_srs->srs_last == NULL)
3171 3193                                          mac_srs->srs_last = tail;
3172 3194                                  size_sent = sz - stats.mts_obytes;
3173 3195                                  mac_srs->srs_size += size_sent;
3174 3196                                  mac_srs->srs_bw->mac_bw_sz += size_sent;
3175 3197                                  if (mac_srs->srs_bw->mac_bw_used > size_sent) {
3176 3198                                          mac_srs->srs_bw->mac_bw_used -=
3177 3199                                              size_sent;
3178 3200                                  } else {
3179 3201                                          mac_srs->srs_bw->mac_bw_used = 0;
3180 3202                                  }
3181 3203                                  MAC_TX_SRS_BLOCK(mac_srs, head);
3182 3204                          } else {
3183 3205                                  srs_tx->st_woken_up = B_FALSE;
3184 3206                                  SRS_TX_STATS_UPDATE(mac_srs, &stats);
3185 3207                          }
3186 3208                  }
3187 3209          } else if (tx_mode == SRS_TX_BW_FANOUT || tx_mode == SRS_TX_BW_AGGR) {
3188 3210                  mblk_t *prev;
3189 3211                  uint64_t hint;
3190 3212  
3191 3213                  /*
3192 3214                   * We are here because the timer fired and we
3193 3215                   * have some quota to tranmit.
3194 3216                   */
3195 3217                  prev = NULL;
3196 3218                  head = tail = mac_srs->srs_first;
3197 3219                  while (mac_srs->srs_first != NULL) {
3198 3220                          tail = mac_srs->srs_first;
3199 3221                          mac_srs->srs_first = tail->b_next;
3200 3222                          if (mac_srs->srs_first == NULL)
3201 3223                                  mac_srs->srs_last = NULL;
3202 3224                          mac_srs->srs_count--;
3203 3225                          sz = msgdsize(tail);
3204 3226                          mac_srs->srs_size -= sz;
3205 3227                          mac_srs->srs_bw->mac_bw_used += sz;
3206 3228                          if (prev == NULL)
3207 3229                                  hint = (ulong_t)tail->b_prev;
3208 3230                          if (hint != (ulong_t)tail->b_prev) {
3209 3231                                  prev->b_next = NULL;
3210 3232                                  mutex_exit(&mac_srs->srs_lock);
3211 3233                                  TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3212 3234                                  head = tail;
3213 3235                                  hint = (ulong_t)tail->b_prev;
3214 3236                                  mutex_enter(&mac_srs->srs_lock);
3215 3237                          }
3216 3238  
3217 3239                          prev = tail;
3218 3240                          tail->b_prev = NULL;
3219 3241                          if (mac_srs->srs_bw->mac_bw_used <
3220 3242                              mac_srs->srs_bw->mac_bw_limit)
3221 3243                                  continue;
3222 3244  
3223 3245                          now = ddi_get_lbolt();
3224 3246                          if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3225 3247                                  mac_srs->srs_bw->mac_bw_curr_time = now;
3226 3248                                  mac_srs->srs_bw->mac_bw_used = 0;
3227 3249                                  continue;
3228 3250                          }
3229 3251                          mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3230 3252                          break;
3231 3253                  }
3232 3254                  ASSERT((head == NULL && tail == NULL) ||
3233 3255                      (head != NULL && tail != NULL));
3234 3256                  if (tail != NULL) {
3235 3257                          tail->b_next = NULL;
3236 3258                          mutex_exit(&mac_srs->srs_lock);
3237 3259                          TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3238 3260                          mutex_enter(&mac_srs->srs_lock);
3239 3261                  }
3240 3262          }
3241 3263          /*
3242 3264           * SRS_TX_FANOUT case not considered here because packets
3243 3265           * won't be queued in the SRS for this case. Packets will
3244 3266           * be sent directly to soft rings underneath and if there
3245 3267           * is any queueing at all, it would be in Tx side soft
3246 3268           * rings.
3247 3269           */
3248 3270  
3249 3271          /*
3250 3272           * When srs_count becomes 0, reset SRS_TX_HIWAT and
3251 3273           * SRS_TX_WAKEUP_CLIENT and wakeup registered clients.
3252 3274           */
3253 3275          if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
3254 3276              (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
3255 3277                  mac_client_impl_t *mcip = mac_srs->srs_mcip;
3256 3278                  boolean_t wakeup_required = B_FALSE;
3257 3279  
3258 3280                  if (mac_srs->srs_state &
3259 3281                      (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
3260 3282                          wakeup_required = B_TRUE;
3261 3283                  }
3262 3284                  mac_srs->srs_state &= ~(SRS_TX_HIWAT |
3263 3285                      SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
3264 3286                  mutex_exit(&mac_srs->srs_lock);
3265 3287                  if (wakeup_required) {
3266 3288                          mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)mac_srs);
3267 3289                          /*
3268 3290                           * If the client is not the primary MAC client, then we
3269 3291                           * need to send the notification to the clients upper
3270 3292                           * MAC, i.e. mci_upper_mip.
3271 3293                           */
3272 3294                          mac_tx_notify(mcip->mci_upper_mip != NULL ?
3273 3295                              mcip->mci_upper_mip : mcip->mci_mip);
3274 3296                  }
3275 3297                  mutex_enter(&mac_srs->srs_lock);
3276 3298          }
3277 3299          mac_srs->srs_state &= ~SRS_PROC;
3278 3300  }
3279 3301  
3280 3302  /*
3281 3303   * Given a packet, get the flow_entry that identifies the flow
3282 3304   * to which that packet belongs. The flow_entry will contain
3283 3305   * the transmit function to be used to send the packet. If the
3284 3306   * function returns NULL, the packet should be sent using the
3285 3307   * underlying NIC.
3286 3308   */
3287 3309  static flow_entry_t *
3288 3310  mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
3289 3311  {
3290 3312          flow_entry_t            *flent = NULL;
3291 3313          mac_client_impl_t       *mcip;
3292 3314          int     err;
3293 3315  
3294 3316          /*
3295 3317           * Do classification on the packet.
3296 3318           */
3297 3319          err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
3298 3320          if (err != 0)
3299 3321                  return (NULL);
3300 3322  
3301 3323          /*
3302 3324           * This flent might just be an additional one on the MAC client,
3303 3325           * i.e. for classification purposes (different fdesc), however
3304 3326           * the resources, SRS et. al., are in the mci_flent, so if
3305 3327           * this isn't the mci_flent, we need to get it.
3306 3328           */
3307 3329          if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
3308 3330                  FLOW_REFRELE(flent);
3309 3331                  flent = mcip->mci_flent;
3310 3332                  FLOW_TRY_REFHOLD(flent, err);
3311 3333                  if (err != 0)
3312 3334                          return (NULL);
3313 3335          }
3314 3336  
3315 3337          return (flent);
3316 3338  }
3317 3339  
/*
 * This macro is only meant to be used by mac_tx_send().
 *
 * It relies on the caller's `vid_check', `add_tag', `vid', `src_mcip',
 * `next' and `oerrors', and it must be expanded inside a loop: when the
 * VID check fails (the packet is freed) or mac_add_vlan_tag() returns
 * NULL, it counts an output error, advances `mp' to `next' and
 * `continue's with the next iteration.
 */
#define CHECK_VID_AND_ADD_TAG(mp) {                             \
        if (vid_check) {                                        \
                int vid_err = 0;                                \
                                                                \
                MAC_VID_CHECK(src_mcip, (mp), vid_err);         \
                if (vid_err != 0) {                             \
                        freemsg(mp);                            \
                        (mp) = next;                            \
                        oerrors++;                              \
                        continue;                               \
                }                                               \
        }                                                       \
        if (add_tag) {                                          \
                (mp) = mac_add_vlan_tag((mp), 0, vid);          \
                if ((mp) == NULL) {                             \
                        (mp) = next;                            \
                        oerrors++;                              \
                        continue;                               \
                }                                               \
        }                                                       \
}
3342 3364  
3343 3365  mblk_t *
3344 3366  mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
3345 3367      mac_tx_stats_t *stats)
3346 3368  {
3347 3369          mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
3348 3370          mac_impl_t *mip = src_mcip->mci_mip;
3349 3371          uint_t obytes = 0, opackets = 0, oerrors = 0;
3350 3372          mblk_t *mp = NULL, *next;
3351 3373          boolean_t vid_check, add_tag;
3352 3374          uint16_t vid = 0;
3353 3375  
3354 3376          if (mip->mi_nclients > 1) {
3355 3377                  vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
3356 3378                  add_tag = MAC_TAG_NEEDED(src_mcip);
3357 3379                  if (add_tag)
3358 3380                          vid = mac_client_vid(mch);
3359 3381          } else {
3360 3382                  ASSERT(mip->mi_nclients == 1);
3361 3383                  vid_check = add_tag = B_FALSE;
3362 3384          }
3363 3385  
3364 3386          /*
3365 3387           * Fastpath: if there's only one client, we simply send
3366 3388           * the packet down to the underlying NIC.
3367 3389           */
3368 3390          if (mip->mi_nactiveclients == 1) {
3369 3391                  DTRACE_PROBE2(fastpath,
3370 3392                      mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);
3371 3393  
3372 3394                  mp = mp_chain;
3373 3395                  while (mp != NULL) {
3374 3396                          next = mp->b_next;
3375 3397                          mp->b_next = NULL;
3376 3398                          opackets++;
3377 3399                          obytes += (mp->b_cont == NULL ? MBLKL(mp) :
3378 3400                              msgdsize(mp));
3379 3401  
3380 3402                          CHECK_VID_AND_ADD_TAG(mp);
3381 3403                          MAC_TX(mip, ring, mp, src_mcip);
3382 3404  
3383 3405                          /*
3384 3406                           * If the driver is out of descriptors and does a
3385 3407                           * partial send it will return a chain of unsent
3386 3408                           * mblks. Adjust the accounting stats.
3387 3409                           */
3388 3410                          if (mp != NULL) {
3389 3411                                  opackets--;
3390 3412                                  obytes -= msgdsize(mp);
3391 3413                                  mp->b_next = next;
3392 3414                                  break;
3393 3415                          }
3394 3416                          mp = next;
3395 3417                  }
3396 3418                  goto done;
3397 3419          }
3398 3420  
3399 3421          /*
3400 3422           * No fastpath, we either have more than one MAC client
3401 3423           * defined on top of the same MAC, or one or more MAC
3402 3424           * client promiscuous callbacks.
3403 3425           */
3404 3426          DTRACE_PROBE3(slowpath, mac_client_impl_t *,
3405 3427              src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);
3406 3428  
3407 3429          mp = mp_chain;
3408 3430          while (mp != NULL) {
3409 3431                  flow_entry_t *dst_flow_ent;
3410 3432                  void *flow_cookie;
3411 3433                  size_t  pkt_size;
3412 3434                  mblk_t *mp1;
3413 3435  
3414 3436                  next = mp->b_next;
3415 3437                  mp->b_next = NULL;
3416 3438                  opackets++;
3417 3439                  pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
3418 3440                  obytes += pkt_size;
3419 3441                  CHECK_VID_AND_ADD_TAG(mp);
3420 3442  
3421 3443                  /*
3422 3444                   * Find the destination.
3423 3445                   */
3424 3446                  dst_flow_ent = mac_tx_classify(mip, mp);
3425 3447  
3426 3448                  if (dst_flow_ent != NULL) {
3427 3449                          size_t  hdrsize;
3428 3450                          int     err = 0;
3429 3451  
3430 3452                          if (mip->mi_info.mi_nativemedia == DL_ETHER) {
3431 3453                                  struct ether_vlan_header *evhp =
3432 3454                                      (struct ether_vlan_header *)mp->b_rptr;
3433 3455  
3434 3456                                  if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
3435 3457                                          hdrsize = sizeof (*evhp);
3436 3458                                  else
3437 3459                                          hdrsize = sizeof (struct ether_header);
3438 3460                          } else {
3439 3461                                  mac_header_info_t       mhi;
3440 3462  
3441 3463                                  err = mac_header_info((mac_handle_t)mip,
3442 3464                                      mp, &mhi);
3443 3465                                  if (err == 0)
3444 3466                                          hdrsize = mhi.mhi_hdrsize;
3445 3467                          }
3446 3468  
3447 3469                          /*
3448 3470                           * Got a matching flow. It's either another
3449 3471                           * MAC client, or a broadcast/multicast flow.
3450 3472                           * Make sure the packet size is within the
3451 3473                           * allowed size. If not drop the packet and
3452 3474                           * move to next packet.
3453 3475                           */
3454 3476                          if (err != 0 ||
3455 3477                              (pkt_size - hdrsize) > mip->mi_sdu_max) {
3456 3478                                  oerrors++;
3457 3479                                  DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
3458 3480                                      mblk_t *, mp);
3459 3481                                  freemsg(mp);
3460 3482                                  mp = next;
3461 3483                                  FLOW_REFRELE(dst_flow_ent);
3462 3484                                  continue;
3463 3485                          }
3464 3486                          flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
3465 3487                          if (flow_cookie != NULL) {
3466 3488                                  /*
3467 3489                                   * The vnic_bcast_send function expects
3468 3490                                   * to receive the sender MAC client
3469 3491                                   * as value for arg2.
3470 3492                                   */
3471 3493                                  mac_bcast_send(flow_cookie, src_mcip, mp,
3472 3494                                      B_TRUE);
3473 3495                          } else {
3474 3496                                  /*
3475 3497                                   * loopback the packet to a local MAC
3476 3498                                   * client. We force a context switch
3477 3499                                   * if both source and destination MAC
3478 3500                                   * clients are used by IP, i.e.
3479 3501                                   * bypass is set.
3480 3502                                   */
3481 3503                                  boolean_t do_switch;
3482 3504                                  mac_client_impl_t *dst_mcip =
3483 3505                                      dst_flow_ent->fe_mcip;
3484 3506  
3485 3507                                  /*
3486 3508                                   * Check if there are promiscuous mode
3487 3509                                   * callbacks defined. This check is
3488 3510                                   * done here in the 'else' case and
3489 3511                                   * not in other cases because this
3490 3512                                   * path is for local loopback
3491 3513                                   * communication which does not go
3492 3514                                   * through MAC_TX(). For paths that go
3493 3515                                   * through MAC_TX(), the promisc_list
3494 3516                                   * check is done inside the MAC_TX()
3495 3517                                   * macro.
3496 3518                                   */
3497 3519                                  if (mip->mi_promisc_list != NULL)
3498 3520                                          mac_promisc_dispatch(mip, mp, src_mcip);
3499 3521  
3500 3522                                  do_switch = ((src_mcip->mci_state_flags &
3501 3523                                      dst_mcip->mci_state_flags &
3502 3524                                      MCIS_CLIENT_POLL_CAPABLE) != 0);
3503 3525  
3504 3526                                  if ((mp1 = mac_fix_cksum(mp)) != NULL) {
3505 3527                                          (dst_flow_ent->fe_cb_fn)(
3506 3528                                              dst_flow_ent->fe_cb_arg1,
3507 3529                                              dst_flow_ent->fe_cb_arg2,
3508 3530                                              mp1, do_switch);
3509 3531                                  }
3510 3532                          }
3511 3533                          FLOW_REFRELE(dst_flow_ent);
3512 3534                  } else {
3513 3535                          /*
3514 3536                           * Unknown destination, send via the underlying
3515 3537                           * NIC.
3516 3538                           */
3517 3539                          MAC_TX(mip, ring, mp, src_mcip);
3518 3540                          if (mp != NULL) {
3519 3541                                  /*
3520 3542                                   * Adjust for the last packet that
3521 3543                                   * could not be transmitted
3522 3544                                   */
3523 3545                                  opackets--;
3524 3546                                  obytes -= pkt_size;
3525 3547                                  mp->b_next = next;
3526 3548                                  break;
3527 3549                          }
3528 3550                  }
3529 3551                  mp = next;
3530 3552          }
3531 3553  
3532 3554  done:
3533 3555          stats->mts_obytes = obytes;
3534 3556          stats->mts_opackets = opackets;
3535 3557          stats->mts_oerrors = oerrors;
3536 3558          return (mp);
3537 3559  }
3538 3560  
3539 3561  /*
3540 3562   * mac_tx_srs_ring_present
3541 3563   *
3542 3564   * Returns whether the specified ring is part of the specified SRS.
3543 3565   */
3544 3566  boolean_t
3545 3567  mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3546 3568  {
3547 3569          int i;
3548 3570          mac_soft_ring_t *soft_ring;
3549 3571  
3550 3572          if (srs->srs_tx.st_arg2 == tx_ring)
3551 3573                  return (B_TRUE);
3552 3574  
3553 3575          for (i = 0; i < srs->srs_tx_ring_count; i++) {
3554 3576                  soft_ring =  srs->srs_tx_soft_rings[i];
3555 3577                  if (soft_ring->s_ring_tx_arg2 == tx_ring)
3556 3578                          return (B_TRUE);
3557 3579          }
3558 3580  
3559 3581          return (B_FALSE);
3560 3582  }
3561 3583  
3562 3584  /*
3563 3585   * mac_tx_srs_get_soft_ring
3564 3586   *
3565 3587   * Returns the TX soft ring associated with the given ring, if present.
3566 3588   */
3567 3589  mac_soft_ring_t *
3568 3590  mac_tx_srs_get_soft_ring(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3569 3591  {
3570 3592          int             i;
3571 3593          mac_soft_ring_t *soft_ring;
3572 3594  
3573 3595          if (srs->srs_tx.st_arg2 == tx_ring)
3574 3596                  return (NULL);
3575 3597  
3576 3598          for (i = 0; i < srs->srs_tx_ring_count; i++) {
3577 3599                  soft_ring =  srs->srs_tx_soft_rings[i];
3578 3600                  if (soft_ring->s_ring_tx_arg2 == tx_ring)
3579 3601                          return (soft_ring);
3580 3602          }
3581 3603  
3582 3604          return (NULL);
3583 3605  }
3584 3606  
3585 3607  /*
3586 3608   * mac_tx_srs_wakeup
3587 3609   *
3588 3610   * Called when Tx desc become available. Wakeup the appropriate worker
3589 3611   * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
3590 3612   * state field.
3591 3613   */
3592 3614  void
3593 3615  mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
3594 3616  {
3595 3617          int i;
3596 3618          mac_soft_ring_t *sringp;
3597 3619          mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3598 3620  
3599 3621          mutex_enter(&mac_srs->srs_lock);
3600 3622          /*
3601 3623           * srs_tx_ring_count == 0 is the single ring mode case. In
3602 3624           * this mode, there will not be Tx soft rings associated
3603 3625           * with the SRS.
3604 3626           */
3605 3627          if (!MAC_TX_SOFT_RINGS(mac_srs)) {
3606 3628                  if (srs_tx->st_arg2 == ring &&
3607 3629                      mac_srs->srs_state & SRS_TX_BLOCKED) {
3608 3630                          mac_srs->srs_state &= ~SRS_TX_BLOCKED;
3609 3631                          srs_tx->st_stat.mts_unblockcnt++;
3610 3632                          cv_signal(&mac_srs->srs_async);
3611 3633                  }
3612 3634                  /*
3613 3635                   * A wakeup can come before tx_srs_drain() could
3614 3636                   * grab srs lock and set SRS_TX_BLOCKED. So
3615 3637                   * always set woken_up flag when we come here.
3616 3638                   */
3617 3639                  srs_tx->st_woken_up = B_TRUE;
3618 3640                  mutex_exit(&mac_srs->srs_lock);
3619 3641                  return;
3620 3642          }
3621 3643  
3622 3644          /*
3623 3645           * If you are here, it is for FANOUT, BW_FANOUT,
3624 3646           * AGGR_MODE or AGGR_BW_MODE case
3625 3647           */
3626 3648          for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
3627 3649                  sringp = mac_srs->srs_tx_soft_rings[i];
3628 3650                  mutex_enter(&sringp->s_ring_lock);
3629 3651                  if (sringp->s_ring_tx_arg2 == ring) {
3630 3652                          if (sringp->s_ring_state & S_RING_BLOCK) {
3631 3653                                  sringp->s_ring_state &= ~S_RING_BLOCK;
3632 3654                                  sringp->s_st_stat.mts_unblockcnt++;
3633 3655                                  cv_signal(&sringp->s_ring_async);
3634 3656                          }
3635 3657                          sringp->s_ring_tx_woken_up = B_TRUE;
3636 3658                  }
3637 3659                  mutex_exit(&sringp->s_ring_lock);
3638 3660          }
3639 3661          mutex_exit(&mac_srs->srs_lock);
3640 3662  }
3641 3663  
3642 3664  /*
3643 3665   * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
3644 3666   * the blocked clients again.
3645 3667   */
3646 3668  void
3647 3669  mac_tx_notify(mac_impl_t *mip)
3648 3670  {
3649 3671          i_mac_notify(mip, MAC_NOTE_TX);
3650 3672  }
3651 3673  
3652 3674  /*
3653 3675   * RX SOFTRING RELATED FUNCTIONS
3654 3676   *
3655 3677   * These functions really belong in mac_soft_ring.c and are here
3656 3678   * only for a short period.
3657 3679   */
3658 3680  
/*
 * SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz)
 *
 * Appends the mblk chain that starts at `mp' and ends at `tail' to the
 * soft ring's queue (s_ring_first/s_ring_last) and adds `cnt' to
 * s_ring_count. When the ring has ST_RING_BW_CTL set, `sz' is also
 * added to s_ring_size.
 *
 * The caller must hold ringp->s_ring_lock (asserted). `ringp' is
 * expanded several times, so it must not have side effects. Every
 * argument is parenthesized in the expansion so that arbitrary
 * expressions can be passed safely.
 */
#define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {             \
        ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));                      \
                                                                        \
        if ((ringp)->s_ring_last != NULL)                               \
                (ringp)->s_ring_last->b_next = (mp);                    \
        else                                                            \
                (ringp)->s_ring_first = (mp);                           \
        (ringp)->s_ring_last = (tail);                                  \
        (ringp)->s_ring_count += (cnt);                                 \
        ASSERT((ringp)->s_ring_count > 0);                              \
        if ((ringp)->s_ring_type & ST_RING_BW_CTL) {                    \
                (ringp)->s_ring_size += (sz);                           \
        }                                                               \
}
3676 3698  
3677 3699  /*
3678 3700   * Default entry point to deliver a packet chain to a MAC client.
3679 3701   * If the MAC client has flows, do the classification with these
3680 3702   * flows as well.
3681 3703   */
3682 3704  /* ARGSUSED */
3683 3705  void
3684 3706  mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
3685 3707      mac_header_info_t *arg3)
3686 3708  {
3687 3709          mac_client_impl_t *mcip = arg1;
3688 3710  
3689 3711          if (mcip->mci_nvids == 1 &&
3690 3712              !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
3691 3713                  /*
3692 3714                   * If the client has exactly one VID associated with it
3693 3715                   * and striping of VLAN header is not disabled,
3694 3716                   * remove the VLAN tag from the packet before
3695 3717                   * passing it on to the client's receive callback.
3696 3718                   * Note that this needs to be done after we dispatch
3697 3719                   * the packet to the promiscuous listeners of the
3698 3720                   * client, since they expect to see the whole
3699 3721                   * frame including the VLAN headers.
3700 3722                   */
3701 3723                  mp_chain = mac_strip_vlan_tag_chain(mp_chain);
3702 3724          }
3703 3725  
3704 3726          mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
3705 3727  }
3706 3728  
3707 3729  /*
3708 3730   * mac_rx_soft_ring_process
3709 3731   *
3710 3732   * process a chain for a given soft ring. The number of packets queued
3711 3733   * in the SRS and its associated soft rings (including this one) is
3712 3734   * very small (tracked by srs_poll_pkt_cnt), then allow the entering
3713 3735   * thread (interrupt or poll thread) to do inline processing. This
3714 3736   * helps keep the latency down under low load.
3715 3737   *
3716 3738   * The proc and arg for each mblk is already stored in the mblk in
3717 3739   * appropriate places.
3718 3740   */
3719 3741  /* ARGSUSED */
3720 3742  void
3721 3743  mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
3722 3744      mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
3723 3745  {
3724 3746          mac_direct_rx_t         proc;
3725 3747          void                    *arg1;
3726 3748          mac_resource_handle_t   arg2;
3727 3749          mac_soft_ring_set_t     *mac_srs = ringp->s_ring_set;
3728 3750  
3729 3751          ASSERT(ringp != NULL);
3730 3752          ASSERT(mp_chain != NULL);
3731 3753          ASSERT(tail != NULL);
3732 3754          ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3733 3755  
3734 3756          mutex_enter(&ringp->s_ring_lock);
3735 3757          ringp->s_ring_total_inpkt += cnt;
3736 3758          ringp->s_ring_total_rbytes += sz;
3737 3759          if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
3738 3760              !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
3739 3761                  /* If on processor or blanking on, then enqueue and return */
3740 3762                  if (ringp->s_ring_state & S_RING_BLANK ||
3741 3763                      ringp->s_ring_state & S_RING_PROC) {
3742 3764                          SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3743 3765                          mutex_exit(&ringp->s_ring_lock);
3744 3766                          return;
3745 3767                  }
3746 3768                  proc = ringp->s_ring_rx_func;
3747 3769                  arg1 = ringp->s_ring_rx_arg1;
3748 3770                  arg2 = ringp->s_ring_rx_arg2;
3749 3771                  /*
3750 3772                   * See if anything is already queued. If we are the
3751 3773                   * first packet, do inline processing else queue the
3752 3774                   * packet and do the drain.
3753 3775                   */
3754 3776                  if (ringp->s_ring_first == NULL) {
3755 3777                          /*
3756 3778                           * Fast-path, ok to process and nothing queued.
3757 3779                           */
3758 3780                          ringp->s_ring_run = curthread;
3759 3781                          ringp->s_ring_state |= (S_RING_PROC);
3760 3782  
3761 3783                          mutex_exit(&ringp->s_ring_lock);
3762 3784  
3763 3785                          /*
3764 3786                           * We are the chain of 1 packet so
3765 3787                           * go through this fast path.
3766 3788                           */
3767 3789                          ASSERT(mp_chain->b_next == NULL);
3768 3790  
3769 3791                          (*proc)(arg1, arg2, mp_chain, NULL);
3770 3792  
3771 3793                          ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3772 3794                          /*
3773 3795                           * If we have a soft ring set which is doing
3774 3796                           * bandwidth control, we need to decrement
3775 3797                           * srs_size and count so it the SRS can have a
3776 3798                           * accurate idea of what is the real data
3777 3799                           * queued between SRS and its soft rings. We
3778 3800                           * decrement the counters only when the packet
3779 3801                           * gets processed by both SRS and the soft ring.
3780 3802                           */
3781 3803                          mutex_enter(&mac_srs->srs_lock);
3782 3804                          MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
3783 3805                          MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
3784 3806                          mutex_exit(&mac_srs->srs_lock);
3785 3807  
3786 3808                          mutex_enter(&ringp->s_ring_lock);
3787 3809                          ringp->s_ring_run = NULL;
3788 3810                          ringp->s_ring_state &= ~S_RING_PROC;
3789 3811                          if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
3790 3812                                  cv_signal(&ringp->s_ring_client_cv);
3791 3813  
3792 3814                          if ((ringp->s_ring_first == NULL) ||
3793 3815                              (ringp->s_ring_state & S_RING_BLANK)) {
3794 3816                                  /*
3795 3817                                   * We processed inline our packet and
3796 3818                                   * nothing new has arrived or our
3797 3819                                   * receiver doesn't want to receive
3798 3820                                   * any packets. We are done.
3799 3821                                   */
3800 3822                                  mutex_exit(&ringp->s_ring_lock);
3801 3823                                  return;
3802 3824                          }
3803 3825                  } else {
3804 3826                          SOFT_RING_ENQUEUE_CHAIN(ringp,
3805 3827                              mp_chain, tail, cnt, sz);
3806 3828                  }
3807 3829  
3808 3830                  /*
3809 3831                   * We are here because either we couldn't do inline
3810 3832                   * processing (because something was already
3811 3833                   * queued), or we had a chain of more than one
3812 3834                   * packet, or something else arrived after we were
3813 3835                   * done with inline processing.
3814 3836                   */
3815 3837                  ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3816 3838                  ASSERT(ringp->s_ring_first != NULL);
3817 3839  
3818 3840                  ringp->s_ring_drain_func(ringp);
3819 3841                  mutex_exit(&ringp->s_ring_lock);
3820 3842                  return;
3821 3843          } else {
3822 3844                  /* ST_RING_WORKER_ONLY case */
3823 3845                  SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3824 3846                  mac_soft_ring_worker_wakeup(ringp);
3825 3847                  mutex_exit(&ringp->s_ring_lock);
3826 3848          }
3827 3849  }
3828 3850  
3829 3851  /*
3830 3852   * TX SOFTRING RELATED FUNCTIONS
3831 3853   *
3832 3854   * These functions really belong in mac_soft_ring.c and are here
3833 3855   * only for a short period.
3834 3856   */
3835 3857  
/*
 * TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz)
 *
 * Tx flavour of SOFT_RING_ENQUEUE_CHAIN: marks the soft ring with
 * S_RING_ENQUEUED and then appends the chain [mp .. tail] to it.
 * The caller must hold ringp->s_ring_lock (asserted).
 *
 * The chain head is taken from the `mp' argument. The macro previously
 * expanded the identifier `mp_chain' from the caller's scope instead,
 * which only worked because every caller happened to use that name.
 * `ringp' is evaluated more than once and must be free of side effects.
 */
#define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {          \
        ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));                      \
        (ringp)->s_ring_state |= S_RING_ENQUEUED;                       \
        SOFT_RING_ENQUEUE_CHAIN((ringp), (mp), (tail), (cnt), (sz));    \
}
3841 3863  
3842 3864  /*
3843 3865   * mac_tx_sring_queued
3844 3866   *
3845 3867   * When we are out of transmit descriptors and we already have a
3846 3868   * queue that exceeds hiwat (or the client called us with
3847 3869   * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
3848 3870   * soft ring pointer as the opaque cookie for the client enable
3849 3871   * flow control.
3850 3872   */
3851 3873  static mac_tx_cookie_t
3852 3874  mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
3853 3875      mblk_t **ret_mp)
3854 3876  {
3855 3877          int cnt;
3856 3878          size_t sz;
3857 3879          mblk_t *tail;
3858 3880          mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3859 3881          mac_tx_cookie_t cookie = NULL;
3860 3882          boolean_t wakeup_worker = B_TRUE;
3861 3883  
3862 3884          ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3863 3885          MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3864 3886          if (flag & MAC_DROP_ON_NO_DESC) {
3865 3887                  mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
3866 3888                  /* increment freed stats */
3867 3889                  ringp->s_ring_drops += cnt;
3868 3890                  cookie = (mac_tx_cookie_t)ringp;
3869 3891          } else {
3870 3892                  if (ringp->s_ring_first != NULL)
3871 3893                          wakeup_worker = B_FALSE;
3872 3894  
3873 3895                  if (flag & MAC_TX_NO_ENQUEUE) {
3874 3896                          /*
3875 3897                           * If QUEUED is not set, queue the packet
3876 3898                           * and let mac_tx_soft_ring_drain() set
3877 3899                           * the TX_BLOCKED bit for the reasons
3878 3900                           * explained above. Otherwise, return the
3879 3901                           * mblks.
3880 3902                           */
3881 3903                          if (wakeup_worker) {
3882 3904                                  TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
3883 3905                                      mp_chain, tail, cnt, sz);
3884 3906                          } else {
3885 3907                                  ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
3886 3908                                  cookie = (mac_tx_cookie_t)ringp;
3887 3909                                  *ret_mp = mp_chain;
3888 3910                          }
3889 3911                  } else {
3890 3912                          boolean_t enqueue = B_TRUE;
3891 3913  
3892 3914                          if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3893 3915                                  /*
3894 3916                                   * flow-controlled. Store ringp in cookie
3895 3917                                   * so that it can be returned as
3896 3918                                   * mac_tx_cookie_t to client
3897 3919                                   */
3898 3920                                  ringp->s_ring_state |= S_RING_TX_HIWAT;
3899 3921                                  cookie = (mac_tx_cookie_t)ringp;
3900 3922                                  ringp->s_ring_hiwat_cnt++;
3901 3923                                  if (ringp->s_ring_count >
3902 3924                                      ringp->s_ring_tx_max_q_cnt) {
3903 3925                                          /* increment freed stats */
3904 3926                                          ringp->s_ring_drops += cnt;
3905 3927                                          /*
3906 3928                                           * b_prev may be set to the fanout hint
3907 3929                                           * hence can't use freemsg directly
3908 3930                                           */
3909 3931                                          mac_pkt_drop(NULL, NULL,
3910 3932                                              mp_chain, B_FALSE);
3911 3933                                          DTRACE_PROBE1(tx_queued_hiwat,
3912 3934                                              mac_soft_ring_t *, ringp);
3913 3935                                          enqueue = B_FALSE;
3914 3936                                  }
3915 3937                          }
3916 3938                          if (enqueue) {
3917 3939                                  TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
3918 3940                                      tail, cnt, sz);
3919 3941                          }
3920 3942                  }
3921 3943                  if (wakeup_worker)
3922 3944                          cv_signal(&ringp->s_ring_async);
3923 3945          }
3924 3946          return (cookie);
3925 3947  }
3926 3948  
3927 3949  
3928 3950  /*
3929 3951   * mac_tx_soft_ring_process
3930 3952   *
3931 3953   * This routine is called when fanning out outgoing traffic among
3932 3954   * multipe Tx rings.
3933 3955   * Note that a soft ring is associated with a h/w Tx ring.
3934 3956   */
3935 3957  mac_tx_cookie_t
3936 3958  mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
3937 3959      uint16_t flag, mblk_t **ret_mp)
3938 3960  {
3939 3961          mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3940 3962          int     cnt;
3941 3963          size_t  sz;
3942 3964          mblk_t  *tail;
3943 3965          mac_tx_cookie_t cookie = NULL;
3944 3966  
3945 3967          ASSERT(ringp != NULL);
3946 3968          ASSERT(mp_chain != NULL);
3947 3969          ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3948 3970          /*
3949 3971           * The following modes can come here: SRS_TX_BW_FANOUT,
3950 3972           * SRS_TX_FANOUT, SRS_TX_AGGR, SRS_TX_BW_AGGR.
3951 3973           */
3952 3974          ASSERT(MAC_TX_SOFT_RINGS(mac_srs));
3953 3975          ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
3954 3976              mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT ||
3955 3977              mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
3956 3978              mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
3957 3979  
3958 3980          if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
3959 3981                  /* Serialization mode */
3960 3982  
3961 3983                  mutex_enter(&ringp->s_ring_lock);
3962 3984                  if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3963 3985                          cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3964 3986                              flag, ret_mp);
3965 3987                          mutex_exit(&ringp->s_ring_lock);
3966 3988                          return (cookie);
3967 3989                  }
3968 3990                  MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3969 3991                  TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3970 3992                  if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
3971 3993                          /*
3972 3994                           * If ring is blocked due to lack of Tx
3973 3995                           * descs, just return. Worker thread
3974 3996                           * will get scheduled when Tx desc's
3975 3997                           * become available.
3976 3998                           */
3977 3999                          mutex_exit(&ringp->s_ring_lock);
3978 4000                          return (cookie);
3979 4001                  }
3980 4002                  mac_soft_ring_worker_wakeup(ringp);
3981 4003                  mutex_exit(&ringp->s_ring_lock);
3982 4004                  return (cookie);
3983 4005          } else {
3984 4006                  /* Default fanout mode */
3985 4007                  /*
3986 4008                   * S_RING_BLOCKED is set when underlying NIC runs
3987 4009                   * out of Tx descs and messages start getting
3988 4010                   * queued. It won't get reset until
3989 4011                   * tx_srs_drain() completely drains out the
3990 4012                   * messages.
3991 4013                   */
3992 4014                  mac_tx_stats_t          stats;
3993 4015  
3994 4016                  if (ringp->s_ring_state & S_RING_ENQUEUED) {
3995 4017                          /* Tx descs/resources not available */
3996 4018                          mutex_enter(&ringp->s_ring_lock);
3997 4019                          if (ringp->s_ring_state & S_RING_ENQUEUED) {
3998 4020                                  cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3999 4021                                      flag, ret_mp);
4000 4022                                  mutex_exit(&ringp->s_ring_lock);
4001 4023                                  return (cookie);
4002 4024                          }
4003 4025                          /*
4004 4026                           * While we were computing mblk count, the
4005 4027                           * flow control condition got relieved.
4006 4028                           * Continue with the transmission.
4007 4029                           */
4008 4030                          mutex_exit(&ringp->s_ring_lock);
4009 4031                  }
4010 4032  
4011 4033                  mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
4012 4034                      ringp->s_ring_tx_arg2, mp_chain, &stats);
4013 4035  
4014 4036                  /*
4015 4037                   * Multiple threads could be here sending packets.
4016 4038                   * Under such conditions, it is not possible to
4017 4039                   * automically set S_RING_BLOCKED bit to indicate
4018 4040                   * out of tx desc condition. To atomically set
4019 4041                   * this, we queue the returned packet and do
4020 4042                   * the setting of S_RING_BLOCKED in
4021 4043                   * mac_tx_soft_ring_drain().
4022 4044                   */
4023 4045                  if (mp_chain != NULL) {
4024 4046                          mutex_enter(&ringp->s_ring_lock);
4025 4047                          cookie =
4026 4048                              mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
4027 4049                          mutex_exit(&ringp->s_ring_lock);
4028 4050                          return (cookie);
4029 4051                  }
4030 4052                  SRS_TX_STATS_UPDATE(mac_srs, &stats);
4031 4053                  SOFTRING_TX_STATS_UPDATE(ringp, &stats);
4032 4054  
4033 4055                  return (NULL);
4034 4056          }
4035 4057  }

↓ open down ↓

3123 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX