1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/callb.h>
  28 #include <sys/sdt.h>
  29 #include <sys/strsubr.h>
  30 #include <sys/strsun.h>
  31 #include <sys/vlan.h>
  32 #include <inet/ipsec_impl.h>
  33 #include <inet/ip_impl.h>
  34 #include <inet/sadb.h>
  35 #include <inet/ipsecesp.h>
  36 #include <inet/ipsecah.h>
  37 #include <inet/ip6.h>
  38 
  39 #include <sys/mac_impl.h>
  40 #include <sys/mac_client_impl.h>
  41 #include <sys/mac_client_priv.h>
  42 #include <sys/mac_soft_ring.h>
  43 #include <sys/mac_flow_impl.h>
  44 
  45 static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
  46     uintptr_t, uint16_t, mblk_t **);
  47 static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
  48     uintptr_t, uint16_t, mblk_t **);
  49 static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
  50     uintptr_t, uint16_t, mblk_t **);
  51 static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
  52     uintptr_t, uint16_t, mblk_t **);
  53 static mac_tx_cookie_t mac_tx_aggr_mode(mac_soft_ring_set_t *, mblk_t *,
  54     uintptr_t, uint16_t, mblk_t **);
  55 
/*
 * One entry of mac_tx_mode_list[]: pairs a Tx SRS mode with the
 * function that performs transmit in that mode.
 */
typedef struct mac_tx_mode_s {
	mac_tx_srs_mode_t	mac_tx_mode;	/* Tx SRS mode identifier */
	mac_tx_func_t		mac_tx_func;	/* Tx routine for that mode */
} mac_tx_mode_t;
  60 
  61 /*
  62  * There are seven modes of operation on the Tx side. These modes get set
  63  * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
  64  * none of the other modes are user configurable. They get selected by
  65  * the system depending upon whether the link (or flow) has multiple Tx
  66  * rings or a bandwidth configured, or if the link is an aggr, etc.
  67  *
  68  * When the Tx SRS is operating in aggr mode (st_mode) or if there are
  69  * multiple Tx rings owned by Tx SRS, then each Tx ring (pseudo or
  70  * otherwise) will have a soft ring associated with it. These soft rings
  71  * are stored in srs_tx_soft_rings[] array.
  72  *
  73  * Additionally in the case of aggr, there is the st_soft_rings[] array
  74  * in the mac_srs_tx_t structure. This array is used to store the same
  75  * set of soft rings that are present in srs_tx_soft_rings[] array but
  76  * in a different manner. The soft ring associated with the pseudo Tx
  77  * ring is saved at mr_index (of the pseudo ring) in st_soft_rings[]
  78  * array. This helps in quickly getting the soft ring associated with the
  79  * Tx ring when aggr_find_tx_ring() returns the pseudo Tx ring that is to
  80  * be used for transmit.
  81  */
  82 mac_tx_mode_t mac_tx_mode_list[] = {
  83         {SRS_TX_DEFAULT,        mac_tx_single_ring_mode},
  84         {SRS_TX_SERIALIZE,      mac_tx_serializer_mode},
  85         {SRS_TX_FANOUT,         mac_tx_fanout_mode},
  86         {SRS_TX_BW,             mac_tx_bw_mode},
  87         {SRS_TX_BW_FANOUT,      mac_tx_bw_mode},
  88         {SRS_TX_AGGR,           mac_tx_aggr_mode},
  89         {SRS_TX_BW_AGGR,        mac_tx_bw_mode}
  90 };
  91 
  92 /*
  93  * Soft Ring Set (SRS) - The Run time code that deals with
  94  * dynamic polling from the hardware, bandwidth enforcement,
  95  * fanout etc.
  96  *
  97  * We try to use H/W classification on NIC and assign traffic for
  98  * a MAC address to a particular Rx ring or ring group. There is a
  99  * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
 100  * switches the underlying Rx ring between interrupt and
 101  * polling mode and enforces any specified B/W control.
 102  *
 103  * There is always a SRS created and tied to each H/W and S/W rule.
 104  * Whenever we create a H/W rule, we always add the same rule to
 105  * S/W classifier and tie a SRS to it.
 106  *
 107  * In case a B/W control is specified, it is broken into bytes
 108  * per ticks and as soon as the quota for a tick is exhausted,
 109  * the underlying Rx ring is forced into poll mode for remainder of
 110  * the tick. The SRS poll thread only polls for bytes that are
 111  * allowed to come in the SRS. We typically let 4x the configured
 112  * B/W worth of packets to come in the SRS (to prevent unnecessary
 113  * drops due to bursts) but only process the specified amount.
 114  *
 115  * A MAC client (e.g. a VNIC or aggr) can have 1 or more
 116  * Rx rings (and corresponding SRSs) assigned to it. The SRS
 117  * in turn can have softrings to do protocol level fanout or
 118  * softrings to do S/W based fanout or both. In case the NIC
 119  * has no Rx rings, we do S/W classification to respective SRS.
 120  * The S/W classification rule is always setup and ready. This
 121  * allows the MAC layer to reassign Rx rings whenever needed
 122  * but packets still continue to flow via the default path and
 123  * getting S/W classified to correct SRS.
 124  *
 125  * The SRS's are used on both Tx and Rx side. They use the same
 126  * data structure but the processing routines have slightly different
 127  * semantics due to the fact that Rx side needs to do dynamic
 128  * polling etc.
 129  *
 130  * Dynamic Polling Notes
 131  * =====================
 132  *
 133  * Each Soft ring set is capable of switching its Rx ring between
 134  * interrupt and poll mode and actively 'polls' for packets in
 135  * poll mode. If the SRS is implementing a B/W limit, it makes
 136  * sure that only Max allowed packets are pulled in poll mode
 137  * and goes to poll mode as soon as B/W limit is exceeded. As
 138  * such, there are no overheads to implement B/W limits.
 139  *
 140  * In poll mode, it's better to keep the pipeline going where the
 141  * SRS worker thread keeps processing packets and poll thread
 142  * keeps bringing more packets (especially if they get to run
 143  * on different CPUs). This also prevents the overheads associated
 144  * with excessive signalling (on NUMA machines, this can be
 145  * pretty devastating). The exception is latency optimized case
 146  * where worker thread does no work and interrupt and poll thread
 147  * are allowed to do their own drain.
 148  *
 149  * We use the following policy to control Dynamic Polling:
 150  * 1) We switch to poll mode anytime the processing
 151  *    thread causes a backlog to build up in SRS and
 152  *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
 153  * 2) As long as the backlog stays under the low water
 154  *    mark (sr_lowat), we poll the H/W for more packets.
 155  * 3) If the backlog (sr_poll_pkt_cnt) exceeds low
 156  *    water mark, we stay in poll mode but don't poll
 157  *    the H/W for more packets.
 158  * 4) Anytime in polling mode, if we poll the H/W for
 159  *    packets and find nothing plus we have an existing
 160  *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
 161  *    mode but don't poll the H/W for packets anymore
 162  *    (let the polling thread go to sleep).
 163  * 5) Once the backlog is relieved (packets are processed)
 164  *    we reenable polling (by signalling the poll thread)
 165  *    only when the backlog dips below sr_poll_thres.
 166  * 6) sr_hiwat is used exclusively when we are not
 167  *    polling capable and is used to decide when to
 168  *    drop packets so the SRS queue length doesn't grow
 169  *    infinitely.
 170  *
 171  * NOTE: Also see the block level comment on top of mac_soft_ring.c
 172  */
 173 
/*
 * mac_latency_optimize
 *
 * Controls whether the poll thread can process the packets inline
 * or must let the SRS worker thread do the processing. This applies if
 * the SRS was not being processed. For latency sensitive traffic,
 * this needs to be true to allow inline processing. For throughput
 * under load, this should be false. The value compiled in here is
 * B_TRUE, i.e. inline processing is allowed.
 *
 * This (and other similar) tunable should be rolled into a link
 * or flow specific workload hint that can be set using dladm
 * linkprop (instead of multiple such tunables).
 */
boolean_t mac_latency_optimize = B_TRUE;
 188 
 189 /*
 190  * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
 191  *
 192  * queue a mp or chain in soft ring set and increment the
 193  * local count (srs_count) for the SRS and the shared counter
 194  * (sr_poll_pkt_cnt - shared between SRS and its soft rings
 195  * to track the total unprocessed packets for polling to work
 196  * correctly).
 197  *
 198  * The size (total bytes queued) counters are incremented only
 199  * if we are doing B/W control.
 200  */
 201 #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {         \
 202         ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                        \
 203         if ((mac_srs)->srs_last != NULL)                             \
 204                 (mac_srs)->srs_last->b_next = (head);                     \
 205         else                                                            \
 206                 (mac_srs)->srs_first = (head);                               \
 207         (mac_srs)->srs_last = (tail);                                        \
 208         (mac_srs)->srs_count += count;                                       \
 209 }
 210 
 211 #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {      \
 212         mac_srs_rx_t    *srs_rx = &(mac_srs)->srs_rx;                    \
 213                                                                         \
 214         MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);          \
 215         srs_rx->sr_poll_pkt_cnt += count;                            \
 216         ASSERT(srs_rx->sr_poll_pkt_cnt > 0);                              \
 217         if ((mac_srs)->srs_type & SRST_BW_CONTROL) {                     \
 218                 (mac_srs)->srs_size += (sz);                         \
 219                 mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock);         \
 220                 (mac_srs)->srs_bw->mac_bw_sz += (sz);                     \
 221                 mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock);          \
 222         }                                                               \
 223 }
 224 
 225 #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {      \
 226         mac_srs->srs_state |= SRS_ENQUEUED;                          \
 227         MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);          \
 228         if ((mac_srs)->srs_type & SRST_BW_CONTROL) {                     \
 229                 (mac_srs)->srs_size += (sz);                         \
 230                 (mac_srs)->srs_bw->mac_bw_sz += (sz);                     \
 231         }                                                               \
 232 }
 233 
/*
 * Turn polling on routines
 *
 * MAC_SRS_POLLING_ON: with srs_lock held, if the SRS state has
 * SRS_POLLING_CAPAB set and SRS_POLLING clear, set SRS_POLLING, call
 * mac_hwring_disable_intr() on the SRS's ring and bump the sr_poll_on
 * counter. In any other state the macro does nothing.
 */
#define	MAC_SRS_POLLING_ON(mac_srs) {					\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if (((mac_srs)->srs_state &					\
	    (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) {	\
		(mac_srs)->srs_state |= SRS_POLLING;			\
		/* return value of the disable call is ignored */	\
		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
		    (mac_srs)->srs_ring);				\
		(mac_srs)->srs_rx.sr_poll_on++;				\
	}								\
}
 247 
/*
 * MAC_SRS_WORKER_POLLING_ON: same as MAC_SRS_POLLING_ON, but it also
 * requires SRS_WORKER to be set in the SRS state, and it counts the
 * transition in sr_worker_poll_on instead of sr_poll_on.
 */
#define	MAC_SRS_WORKER_POLLING_ON(mac_srs) {				\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if (((mac_srs)->srs_state &					\
	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) ==		\
	    (SRS_POLLING_CAPAB|SRS_WORKER)) {				\
		(mac_srs)->srs_state |= SRS_POLLING;			\
		/* return value of the disable call is ignored */	\
		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
		    (mac_srs)->srs_ring);				\
		(mac_srs)->srs_rx.sr_worker_poll_on++;			\
	}								\
}
 259 
 260 /*
 261  * MAC_SRS_POLL_RING
 262  *
 263  * Signal the SRS poll thread to poll the underlying H/W ring
 264  * provided it wasn't already polling (SRS_GET_PKTS was set).
 265  *
 266  * Poll thread gets to run only from mac_rx_srs_drain() and only
 267  * if the drain was being done by the worker thread.
 268  */
 269 #define MAC_SRS_POLL_RING(mac_srs) {                                    \
 270         mac_srs_rx_t    *srs_rx = &(mac_srs)->srs_rx;                    \
 271                                                                         \
 272         ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                        \
 273         srs_rx->sr_poll_thr_sig++;                                   \
 274         if (((mac_srs)->srs_state &                                      \
 275             (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) ==             \
 276                 (SRS_WORKER|SRS_POLLING_CAPAB)) {                       \
 277                 (mac_srs)->srs_state |= SRS_GET_PKTS;                        \
 278                 cv_signal(&(mac_srs)->srs_cv);                           \
 279         } else {                                                        \
 280                 srs_rx->sr_poll_thr_busy++;                          \
 281         }                                                               \
 282 }
 283 
 284 /*
 285  * MAC_SRS_CHECK_BW_CONTROL
 286  *
 287  * Check to see if next tick has started so we can reset the
 288  * SRS_BW_ENFORCED flag and allow more packets to come in the
 289  * system.
 290  */
 291 #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) {                             \
 292         ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                        \
 293         ASSERT(((mac_srs)->srs_type & SRST_TX) ||                        \
 294             MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock));             \
 295         clock_t now = ddi_get_lbolt();                                  \
 296         if ((mac_srs)->srs_bw->mac_bw_curr_time != now) {         \
 297                 (mac_srs)->srs_bw->mac_bw_curr_time = now;                \
 298                 (mac_srs)->srs_bw->mac_bw_used = 0;                       \
 299                 if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED)        \
 300                         (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
 301         }                                                               \
 302 }
 303 
 304 /*
 305  * MAC_SRS_WORKER_WAKEUP
 306  *
 307  * Wake up the SRS worker thread to process the queue as long as
 308  * no one else is processing the queue. If we are optimizing for
 309  * latency, we wake up the worker thread immediately or else we
 310  * wait mac_srs_worker_wakeup_ticks before worker thread gets
 311  * woken up.
 312  */
 313 int mac_srs_worker_wakeup_ticks = 0;
 314 #define MAC_SRS_WORKER_WAKEUP(mac_srs) {                                \
 315         ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                        \
 316         if (!((mac_srs)->srs_state & SRS_PROC) &&                        \
 317                 (mac_srs)->srs_tid == NULL) {                                \
 318                 if (((mac_srs)->srs_state & SRS_LATENCY_OPT) ||          \
 319                         (mac_srs_worker_wakeup_ticks == 0))             \
 320                         cv_signal(&(mac_srs)->srs_async);                \
 321                 else                                                    \
 322                         (mac_srs)->srs_tid =                         \
 323                                 timeout(mac_srs_fire, (mac_srs),        \
 324                                         mac_srs_worker_wakeup_ticks);   \
 325         }                                                               \
 326 }
 327 
/*
 * TX_BANDWIDTH_MODE
 *
 * Evaluates to non-zero when the Tx SRS mode (srs_tx.st_mode) is one of
 * SRS_TX_BW, SRS_TX_BW_FANOUT or SRS_TX_BW_AGGR, i.e. the three modes
 * that mac_tx_mode_list[] maps to mac_tx_bw_mode(). Note that 'mac_srs'
 * may be evaluated up to three times.
 */
#define	TX_BANDWIDTH_MODE(mac_srs)				\
	((mac_srs)->srs_tx.st_mode == SRS_TX_BW ||		\
	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT ||	\
	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_AGGR)
 332 
 333 #define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) {                      \
 334         if (tx_mode == SRS_TX_BW_FANOUT)                                \
 335                 (void) mac_tx_fanout_mode(mac_srs, head, hint, 0, NULL);\
 336         else                                                            \
 337                 (void) mac_tx_aggr_mode(mac_srs, head, hint, 0, NULL);  \
 338 }
 339 
 340 /*
 341  * MAC_TX_SRS_BLOCK
 342  *
 343  * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
 344  * will be set only if srs_tx_woken_up is FALSE. If
 345  * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
 346  * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
 347  * attempt to transmit again and not setting SRS_TX_BLOCKED does
 348  * that.
 349  */
 350 #define MAC_TX_SRS_BLOCK(srs, mp)       {                       \
 351         ASSERT(MUTEX_HELD(&(srs)->srs_lock));                    \
 352         if ((srs)->srs_tx.st_woken_up) {                     \
 353                 (srs)->srs_tx.st_woken_up = B_FALSE;         \
 354         } else {                                                \
 355                 ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED));    \
 356                 (srs)->srs_state |= SRS_TX_BLOCKED;          \
 357                 (srs)->srs_tx.st_stat.mts_blockcnt++;                \
 358         }                                                       \
 359 }
 360 
 361 /*
 362  * MAC_TX_SRS_TEST_HIWAT
 363  *
 364  * Called before queueing a packet onto Tx SRS to test and set
 365  * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
 366  */
 367 #define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) {         \
 368         boolean_t enqueue = 1;                                          \
 369                                                                         \
 370         if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) {               \
 371                 /*                                                      \
 372                  * flow-controlled. Store srs in cookie so that it      \
 373                  * can be returned as mac_tx_cookie_t to client         \
 374                  */                                                     \
 375                 (srs)->srs_state |= SRS_TX_HIWAT;                    \
 376                 cookie = (mac_tx_cookie_t)srs;                          \
 377                 (srs)->srs_tx.st_hiwat_cnt++;                                \
 378                 if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) {   \
 379                         /* increment freed stats */                     \
 380                         (srs)->srs_tx.st_stat.mts_sdrops += cnt;     \
 381                         /*                                              \
 382                          * b_prev may be set to the fanout hint         \
 383                          * hence can't use freemsg directly             \
 384                          */                                             \
 385                         mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);    \
 386                         DTRACE_PROBE1(tx_queued_hiwat,                  \
 387                             mac_soft_ring_set_t *, srs);                \
 388                         enqueue = 0;                                    \
 389                 }                                                       \
 390         }                                                               \
 391         if (enqueue)                                                    \
 392                 MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz);       \
 393 }
 394 
 395 /* Some utility macros */
 396 #define MAC_SRS_BW_LOCK(srs)                                            \
 397         if (!(srs->srs_type & SRST_TX))                                  \
 398                 mutex_enter(&srs->srs_bw->mac_bw_lock);
 399 
 400 #define MAC_SRS_BW_UNLOCK(srs)                                          \
 401         if (!(srs->srs_type & SRST_TX))                                  \
 402                 mutex_exit(&srs->srs_bw->mac_bw_lock);
 403 
 404 #define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) {              \
 405         mac_pkt_drop(NULL, NULL, mp, B_FALSE);                  \
 406         /* increment freed stats */                             \
 407         mac_srs->srs_tx.st_stat.mts_sdrops++;                        \
 408         cookie = (mac_tx_cookie_t)srs;                          \
 409 }
 410 
 411 #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) {          \
 412         mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT;                  \
 413         cookie = (mac_tx_cookie_t)srs;                                  \
 414         *ret_mp = mp_chain;                                             \
 415 }
 416 
 417 /*
 418  * Drop the rx packet and advance to the next one in the chain.
 419  */
 420 static void
 421 mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
 422 {
 423         mac_srs_rx_t    *srs_rx = &srs->srs_rx;
 424 
 425         ASSERT(mp->b_next == NULL);
 426         mutex_enter(&srs->srs_lock);
 427         MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
 428         MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
 429         mutex_exit(&srs->srs_lock);
 430 
 431         srs_rx->sr_stat.mrs_sdrops++;
 432         freemsg(mp);
 433 }
 434 
 435 /* DATAPATH RUNTIME ROUTINES */
 436 
 437 /*
 438  * mac_srs_fire
 439  *
 440  * Timer callback routine for waking up the SRS worker thread.
 441  */
 442 static void
 443 mac_srs_fire(void *arg)
 444 {
 445         mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
 446 
 447         mutex_enter(&mac_srs->srs_lock);
 448         if (mac_srs->srs_tid == 0) {
 449                 mutex_exit(&mac_srs->srs_lock);
 450                 return;
 451         }
 452 
 453         mac_srs->srs_tid = 0;
 454         if (!(mac_srs->srs_state & SRS_PROC))
 455                 cv_signal(&mac_srs->srs_async);
 456 
 457         mutex_exit(&mac_srs->srs_lock);
 458 }
 459 
 460 /*
 461  * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack,
 462  * and it is used on the TX path.
 463  */
 464 #define HASH_HINT(hint) \
 465         ((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))
 466 
 467 
 468 /*
 469  * hash based on the src address and the port information.
 470  */
 471 #define HASH_ADDR(src, ports)                                   \
 472         (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
 473         ((ports) >> 8) ^ (ports))
 474 
 475 #define COMPUTE_INDEX(key, sz)  (key % sz)
 476 
 477 #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) {       \
 478         if ((tail) != NULL) {                                           \
 479                 ASSERT((tail)->b_next == NULL);                              \
 480                 (tail)->b_next = (mp);                                       \
 481         } else {                                                        \
 482                 ASSERT((head) == NULL);                                 \
 483                 (head) = (mp);                                          \
 484         }                                                               \
 485         (tail) = (mp);                                                  \
 486         (cnt)++;                                                        \
 487         if ((bw_ctl))                                                   \
 488                 (sz) += (sz0);                                          \
 489 }
 490 
/*
 * Values for the mac_fanout_type tunable; the default is
 * MAC_FANOUT_DEFAULT.
 * NOTE(review): the code that reads mac_fanout_type is outside this
 * view; MAC_FANOUT_RND_ROBIN presumably selects round-robin spreading
 * over the soft rings instead of hashing -- confirm against its users.
 */
#define	MAC_FANOUT_DEFAULT	0
#define	MAC_FANOUT_RND_ROBIN	1
int mac_fanout_type = MAC_FANOUT_DEFAULT;

/*
 * Number of per-type sub-chains kept by mac_rx_srs_proto_fanout(); it
 * sizes that function's headmp/tailmp/cnt/sz arrays.
 */
#define	MAX_SR_TYPES	3
/* fanout types for port based hashing */
enum pkt_type {
	V4_TCP = 0,
	V4_UDP,
	OTH,
	UNDEF	/* == MAX_SR_TYPES, so not usable as an index into them */
};

/*
 * In general we do port based hashing to spread traffic over different
 * softrings. The below tunable allows to override that behavior. Setting it
 * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior
 * is also applicable to ipv6 packets carrying multiple optional headers
 * and other uncommon packet types.
 */
boolean_t mac_src_ipv6_fanout = B_FALSE;

/*
 * Pair of local and remote ports in the transport header
 */
#define	PORTS_SIZE 4
 517 
 518 /*
 519  * mac_rx_srs_proto_fanout
 520  *
 521  * This routine delivers packets destined to an SRS into one of the
 522  * protocol soft rings.
 523  *
 524  * Given a chain of packets we need to split it up into multiple sub chains
 525  * destined into TCP, UDP or OTH soft ring. Instead of entering
 526  * the soft ring one packet at a time, we want to enter it in the form of a
 527  * chain otherwise we get this start/stop behaviour where the worker thread
 528  * goes to sleep and then next packets comes in forcing it to wake up etc.
 529  */
 530 static void
 531 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
 532 {
 533         struct ether_header             *ehp;
 534         struct ether_vlan_header        *evhp;
 535         uint32_t                        sap;
 536         ipha_t                          *ipha;
 537         uint8_t                         *dstaddr;
 538         size_t                          hdrsize;
 539         mblk_t                          *mp;
 540         mblk_t                          *headmp[MAX_SR_TYPES];
 541         mblk_t                          *tailmp[MAX_SR_TYPES];
 542         int                             cnt[MAX_SR_TYPES];
 543         size_t                          sz[MAX_SR_TYPES];
 544         size_t                          sz1;
 545         boolean_t                       bw_ctl;
 546         boolean_t                       hw_classified;
 547         boolean_t                       dls_bypass;
 548         boolean_t                       is_ether;
 549         boolean_t                       is_unicast;
 550         enum pkt_type                   type;
 551         mac_client_impl_t               *mcip = mac_srs->srs_mcip;
 552 
 553         is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
 554         bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
 555 
 556         /*
 557          * If we don't have a Rx ring, S/W classification would have done
 558          * its job and its a packet meant for us. If we were polling on
 559          * the default ring (i.e. there was a ring assigned to this SRS),
 560          * then we need to make sure that the mac address really belongs
 561          * to us.
 562          */
 563         hw_classified = mac_srs->srs_ring != NULL &&
 564             mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
 565 
 566         /*
 567          * Special clients (eg. VLAN, non ether, etc) need DLS
 568          * processing in the Rx path. SRST_DLS_BYPASS will be clear for
 569          * such SRSs. Another way of disabling bypass is to set the
 570          * MCIS_RX_BYPASS_DISABLE flag.
 571          */
 572         dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
 573             ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
 574 
 575         bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
 576         bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
 577         bzero(cnt, MAX_SR_TYPES * sizeof (int));
 578         bzero(sz, MAX_SR_TYPES * sizeof (size_t));
 579 
 580         /*
 581          * We got a chain from SRS that we need to send to the soft rings.
 582          * Since squeues for TCP & IPv4 sap poll their soft rings (for
 583          * performance reasons), we need to separate out v4_tcp, v4_udp
 584          * and the rest goes in other.
 585          */
 586         while (head != NULL) {
 587                 mp = head;
 588                 head = head->b_next;
 589                 mp->b_next = NULL;
 590 
 591                 type = OTH;
 592                 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
 593 
 594                 if (is_ether) {
 595                         /*
 596                          * At this point we can be sure the packet at least
 597                          * has an ether header.
 598                          */
 599                         if (sz1 < sizeof (struct ether_header)) {
 600                                 mac_rx_drop_pkt(mac_srs, mp);
 601                                 continue;
 602                         }
 603                         ehp = (struct ether_header *)mp->b_rptr;
 604 
 605                         /*
 606                          * Determine if this is a VLAN or non-VLAN packet.
 607                          */
 608                         if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
 609                                 evhp = (struct ether_vlan_header *)mp->b_rptr;
 610                                 sap = ntohs(evhp->ether_type);
 611                                 hdrsize = sizeof (struct ether_vlan_header);
 612                                 /*
 613                                  * Check if the VID of the packet, if any,
 614                                  * belongs to this client.
 615                                  */
 616                                 if (!mac_client_check_flow_vid(mcip,
 617                                     VLAN_ID(ntohs(evhp->ether_tci)))) {
 618                                         mac_rx_drop_pkt(mac_srs, mp);
 619                                         continue;
 620                                 }
 621                         } else {
 622                                 hdrsize = sizeof (struct ether_header);
 623                         }
 624                         is_unicast =
 625                             ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
 626                         dstaddr = (uint8_t *)&ehp->ether_dhost;
 627                 } else {
 628                         mac_header_info_t               mhi;
 629 
 630                         if (mac_header_info((mac_handle_t)mcip->mci_mip,
 631                             mp, &mhi) != 0) {
 632                                 mac_rx_drop_pkt(mac_srs, mp);
 633                                 continue;
 634                         }
 635                         hdrsize = mhi.mhi_hdrsize;
 636                         sap = mhi.mhi_bindsap;
 637                         is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
 638                         dstaddr = (uint8_t *)mhi.mhi_daddr;
 639                 }
 640 
 641                 if (!dls_bypass) {
 642                         FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
 643                             cnt[type], bw_ctl, sz[type], sz1, mp);
 644                         continue;
 645                 }
 646 
 647                 if (sap == ETHERTYPE_IP) {
 648                         /*
 649                          * If we are H/W classified, but we have promisc
 650                          * on, then we need to check for the unicast address.
 651                          */
 652                         if (hw_classified && mcip->mci_promisc_list != NULL) {
 653                                 mac_address_t           *map;
 654 
 655                                 rw_enter(&mcip->mci_rw_lock, RW_READER);
 656                                 map = mcip->mci_unicast;
 657                                 if (bcmp(dstaddr, map->ma_addr,
 658                                     map->ma_len) == 0)
 659                                         type = UNDEF;
 660                                 rw_exit(&mcip->mci_rw_lock);
 661                         } else if (is_unicast) {
 662                                 type = UNDEF;
 663                         }
 664                 }
 665 
 666                 /*
 667                  * This needs to become a contract with the driver for
 668                  * the fast path.
 669                  *
 670                  * In the normal case the packet will have at least the L2
 671                  * header and the IP + Transport header in the same mblk.
 672                  * This is usually the case when the NIC driver sends up
 673                  * the packet. This is also true when the stack generates
 674                  * a packet that is looped back and when the stack uses the
 675                  * fastpath mechanism. The normal case is optimized for
 676                  * performance and may bypass DLS. All other cases go through
 677                  * the 'OTH' type path without DLS bypass.
 678                  */
 679 
 680                 ipha = (ipha_t *)(mp->b_rptr + hdrsize);
 681                 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
 682                         type = OTH;
 683 
 684                 if (type == OTH) {
 685                         FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
 686                             cnt[type], bw_ctl, sz[type], sz1, mp);
 687                         continue;
 688                 }
 689 
 690                 ASSERT(type == UNDEF);
 691                 /*
 692                  * We look for at least 4 bytes past the IP header to get
 693                  * the port information. If we get an IP fragment, we don't
 694                  * have the port information, and we use just the protocol
 695                  * information.
 696                  */
 697                 switch (ipha->ipha_protocol) {
 698                 case IPPROTO_TCP:
 699                         type = V4_TCP;
 700                         mp->b_rptr += hdrsize;
 701                         break;
 702                 case IPPROTO_UDP:
 703                         type = V4_UDP;
 704                         mp->b_rptr += hdrsize;
 705                         break;
 706                 default:
 707                         type = OTH;
 708                         break;
 709                 }
 710 
 711                 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
 712                     bw_ctl, sz[type], sz1, mp);
 713         }
 714 
 715         for (type = V4_TCP; type < UNDEF; type++) {
 716                 if (headmp[type] != NULL) {
 717                         mac_soft_ring_t                 *softring;
 718 
 719                         ASSERT(tailmp[type]->b_next == NULL);
 720                         switch (type) {
 721                         case V4_TCP:
 722                                 softring = mac_srs->srs_tcp_soft_rings[0];
 723                                 break;
 724                         case V4_UDP:
 725                                 softring = mac_srs->srs_udp_soft_rings[0];
 726                                 break;
 727                         case OTH:
 728                                 softring = mac_srs->srs_oth_soft_rings[0];
 729                         }
 730                         mac_rx_soft_ring_process(mcip, softring,
 731                             headmp[type], tailmp[type], cnt[type], sz[type]);
 732                 }
 733         }
 734 }
 735 
 736 int     fanout_unalligned = 0;
 737
 738 /*
 739  * mac_rx_srs_long_fanout
 740  *
 741  * The fanout routine for IPv6
      *
      * Selects the soft ring index (*indx) for a packet that is delivered
      * through the OTH soft rings. *type is set to OTH on every successful
      * return.
      *
      *   mac_srs  SRS supplying the ring counts and, when mac_fanout_type is
      *            not MAC_FANOUT_DEFAULT, the round-robin cursor srs_ind
      *            (which is advanced here).
      *   mp       Packet whose first mblk starts with the MAC header. When
      *            that mblk holds nothing but the MAC header, mp->b_cont may
      *            be pulled up in place with pullupmsg().
      *   sap      SAP taken from the MAC header. Anything other than
      *            ETHERTYPE_IPV6 is assigned ring 0.
      *   hdrsize  Length of the MAC header found at mp->b_rptr.
      *   type     Out: packet type, always OTH.
      *   indx     Out: index of the chosen soft ring.
      *
      * Ring selection for IPv6:
      *   - TCP hashes the source address together with the first 32 bits of
      *     the transport header, modulo srs_tcp_ring_count.
      *   - UDP, SCTP and ESP do the same modulo srs_udp_ring_count when
      *     mac_fanout_type is MAC_FANOUT_DEFAULT, else go round-robin.
      *   - Everything else, and any packet whose transport header cannot be
      *     read from this mblk, hashes on the source address alone, modulo
      *     srs_oth_ring_count.
      *   - A header that cannot be examined in place (shared dblk, misaligned
      *     or incomplete) gets ring 0 and bumps fanout_unalligned.
      *
      * Returns 0 on success and -1 when the packet cannot be parsed: the first
      * mblk holds only the MAC header and has no continuation, pullupmsg()
      * fails, or ip6_plen is smaller than MIN_EHDR_LEN. mac_rx_srs_fanout()
      * drops the packet when it sees -1.
 742  */
 743 static int
 744 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
 745     uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
 746 {
 747         ip6_t           *ip6h;
 748         uint8_t         *whereptr;
 749         uint_t          hash;
 750         uint16_t        remlen;
 751         uint8_t         nexthdr;
 752         uint16_t        hdr_len;
 753
 754         if (sap == ETHERTYPE_IPV6) {
 755                 boolean_t       modifiable = B_TRUE;
 756
 757                 ASSERT(MBLKL(mp) >= hdrsize);
 758
 759                 ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
 760                 if ((unsigned char *)ip6h == mp->b_wptr) {
 761                         /*
 762                          * The first mblk_t only includes the mac header.
 763                          * Note that it is safe to change the mp pointer here,
 764                          * as the subsequent operation does not assume mp
 765                          * points to the start of the mac header.
 766                          */
 767                         mp = mp->b_cont;
 768
 769                         /*
 770                          * Make sure ip6h holds the full ip6_t structure.
 771                          */
 772                         if (mp == NULL)
 773                                 return (-1);
 774
 775                         if (MBLKL(mp) < IPV6_HDR_LEN) {
                                     /*
                                      * Only pull up when we own the dblk. A
                                      * short header in a shared dblk leaves
                                      * modifiable B_FALSE and is sent to the
                                      * default ring below.
                                      */
 776                                 modifiable = (DB_REF(mp) == 1);
 777
 778                                 if (modifiable &&
 779                                     !pullupmsg(mp, IPV6_HDR_LEN)) {
 780                                         return (-1);
 781                                 }
 782                         }
 783
 784                         ip6h = (ip6_t *)mp->b_rptr;
 785                 }
 786
 787                 if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
 788                     ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
 789                         /*
 790                          * If either ip6h is not aligned, or ip6h does not
 791                          * hold the complete ip6_t structure (a pullupmsg()
 792                          * is not an option since it would result in an
 793                          * unaligned ip6h), fanout to the default ring. Note
 794                          * that this may cause packet reordering.
 795                          */
 796                         *indx = 0;
 797                         *type = OTH;
 798                         fanout_unalligned++;
 799                         return (0);
 800                 }
 801
 802                 remlen = ntohs(ip6h->ip6_plen);
 803                 nexthdr = ip6h->ip6_nxt;
 804
 805                 if (remlen < MIN_EHDR_LEN)
 806                         return (-1);
 807                 /*
 808                  * Do src based fanout if below tunable is set to B_TRUE or
 809                  * when mac_ip_hdr_length_v6() fails because of malformed
 810                  * packets or because mblk's need to be concatenated using
 811                  * pullupmsg().
 812                  */
 813                 if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
 814                     mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
 815                         goto src_based_fanout;
 816                 }
 817                 whereptr = (uint8_t *)ip6h + hdr_len;
 818
 819                 /* If the transport is one of below, we do port based fanout */
 820                 switch (nexthdr) {
 821                 case IPPROTO_TCP:
 822                 case IPPROTO_UDP:
 823                 case IPPROTO_SCTP:
 824                 case IPPROTO_ESP:
 825                         /*
 826                          * If the ports in the transport header is not part of
 827                          * the mblk, do src_based_fanout, instead of calling
 828                          * pullupmsg().
                              *
                              * The check is made whether or not the message has
                              * a continuation: for a truncated single-mblk
                              * packet the hash below would otherwise read past
                              * b_wptr.
 829                          */
 830                         if (whereptr + PORTS_SIZE > mp->b_wptr) {
 832                                 goto src_based_fanout;
 833                         }
 834                         break;
 835                 default:
 836                         break;
 837                 }
 838
 839                 switch (nexthdr) {
 840                 case IPPROTO_TCP:
 841                         hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
 842                             *(uint32_t *)whereptr);
 843                         *indx = COMPUTE_INDEX(hash,
 844                             mac_srs->srs_tcp_ring_count);
 845                         *type = OTH;
 846                         break;
 847
 848                 case IPPROTO_UDP:
 849                 case IPPROTO_SCTP:
 850                 case IPPROTO_ESP:
 851                         if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
 852                                 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
 853                                     *(uint32_t *)whereptr);
 854                                 *indx = COMPUTE_INDEX(hash,
 855                                     mac_srs->srs_udp_ring_count);
 856                         } else {
                                     /* Round-robin across the UDP rings. */
 857                                 *indx = mac_srs->srs_ind %
 858                                     mac_srs->srs_udp_ring_count;
 859                                 mac_srs->srs_ind++;
 860                         }
 861                         *type = OTH;
 862                         break;
 863
 864                         /* For all other protocol, do source based fanout */
 865                 default:
 866                         goto src_based_fanout;
 867                 }
 868         } else {
                     /* Not IPv6: no hashing, use the first OTH soft ring. */
 869                 *indx = 0;
 870                 *type = OTH;
 871         }
 872         return (0);
 873
 874 src_based_fanout:
             /* Only reached from the IPv6 branch, so ip6h is valid here. */
 875         hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
 876         *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
 877         *type = OTH;
 878         return (0);
 879 }
 880 
 881 /*
 882  * mac_rx_srs_fanout
 883  *
 884  * This routine delivers packets destined to an SRS into a soft ring member
 885  * of the set.
 886  *
 887  * Given a chain of packets we need to split it up into multiple sub chains
 888  * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
 889  * the soft ring one packet at a time, we want to enter it in the form of a
 890  * chain otherwise we get this start/stop behaviour where the worker thread
 891  * goes to sleep and then next packets comes in forcing it to wake up etc.
 892  *
 893  * Note:
 894  * Since we know what is the maximum fanout possible, we create a 2D array
 895  * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
 896  * variables so that we can enter the softrings with chain. We need the
 897  * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
 898  * for each packet would be expensive). If we ever want to have the
 899  * ability to have unlimited fanout, we should probably declare a head,
 900  * tail, cnt, sz with each soft ring (a data struct which contains a softring
 901  * along with these members) and create an array of this uber struct so we
 902  * don't have to do kmem_alloc.
      *
      * Per-packet flow, as implemented below:
      *   1. Parse the MAC header (inline for Ethernet/VLAN, otherwise through
      *      mac_header_info()) to obtain sap, hdrsize and the destination
      *      address. Packets that fail this step, or whose VID does not
      *      belong to the client, are dropped.
      *   2. Without DLS bypass, or when the packet does not qualify for the
      *      IPv4 fast path, mac_rx_srs_long_fanout() picks the ring and the
      *      packet is queued on an OTH sub-chain with its MAC header intact.
      *   3. Otherwise unfragmented IPv4 TCP is queued as V4_TCP and IPv4
      *      UDP/SCTP/ESP as V4_UDP, with b_rptr advanced past the MAC header.
      * Once the input chain is exhausted every non-empty sub-chain is handed
      * to its soft ring with mac_rx_soft_ring_process().
      *
      *   mac_srs  soft ring set the chain was received on
      *   head     b_next-linked chain of packets; every packet is either
      *            dropped or queued on a soft ring
 903  */
     /*
      * Counters bumped by mac_rx_srs_fanout() when a fast path candidate is
      * diverted to the OTH path:
      *   fanout_oth1  MBLK_RX_FANOUT_SLOWPATH() was true for the packet
      *   fanout_oth3  the IPv4 packet is a fragment (IPH_MF or IPH_OFFSET set)
      *   fanout_oth4  the IPv4 protocol is not TCP, UDP, SCTP or ESP
      * fanout_oth2 and fanout_oth5 are not updated by mac_rx_srs_fanout().
      */
 904 int     fanout_oth1 = 0;
 905 int     fanout_oth2 = 0;
 906 int     fanout_oth3 = 0;
 907 int     fanout_oth4 = 0;
 908 int     fanout_oth5 = 0;
 909
 910 static void
 911 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
 912 {
 913         struct ether_header             *ehp;
 914         struct ether_vlan_header        *evhp;
 915         uint32_t                        sap;
 916         ipha_t                          *ipha;
 917         uint8_t                         *dstaddr;
 918         uint_t                          indx;
 919         size_t                          ports_offset;
 920         size_t                          ipha_len;
 921         size_t                          hdrsize;
 922         uint_t                          hash;
 923         mblk_t                          *mp;
 924         mblk_t                          *headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
 925         mblk_t                          *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
 926         int                             cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
 927         size_t                          sz[MAX_SR_TYPES][MAX_SR_FANOUT];
 928         size_t                          sz1;
 929         boolean_t                       bw_ctl;
 930         boolean_t                       hw_classified;
 931         boolean_t                       dls_bypass;
 932         boolean_t                       is_ether;
 933         boolean_t                       is_unicast;
 934         int                             fanout_cnt;
 935         enum pkt_type                   type;
 936         mac_client_impl_t               *mcip = mac_srs->srs_mcip;
 937
             /*
              * Ethernet media get the inline header parse in the loop below;
              * any other media type goes through mac_header_info().
              */
 938         is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
 939         bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
 940
 941         /*
 942          * If we don't have a Rx ring, S/W classification would have done
 943          * its job and its a packet meant for us. If we were polling on
 944          * the default ring (i.e. there was a ring assigned to this SRS),
 945          * then we need to make sure that the mac address really belongs
 946          * to us.
 947          */
 948         hw_classified = mac_srs->srs_ring != NULL &&
 949             mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
 950
 951         /*
 952          * Special clients (eg. VLAN, non ether, etc) need DLS
 953          * processing in the Rx path. SRST_DLS_BYPASS will be clear for
 954          * such SRSs. Another way of disabling bypass is to set the
 955          * MCIS_RX_BYPASS_DISABLE flag.
 956          */
 957         dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
 958             ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
 959
 960         /*
 961          * Since the softrings are never destroyed and we always
 962          * create equal number of softrings for TCP, UDP and rest,
 963          * its OK to check one of them for count and use it without
 964          * any lock. In future, if soft rings get destroyed because
 965          * of reduction in fanout, we will need to ensure that happens
 966          * behind the SRS_PROC.
 967          */
 968         fanout_cnt = mac_srs->srs_tcp_ring_count;
 969
             /*
              * Start with every (type, ring) sub-chain empty.
              *
              * NOTE(review): the arrays are indexed by ring indices derived
              * from the srs_*_ring_count fields; this assumes those counts
              * never exceed MAX_SR_FANOUT - TODO confirm where the rings are
              * created.
              */
 970         bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
 971         bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
 972         bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
 973         bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
 974
 975         /*
 976          * We got a chain from SRS that we need to send to the soft rings.
 977          * Since squeues for TCP & IPv4 sap poll their soft rings (for
 978          * performance reasons), we need to separate out v4_tcp, v4_udp
 979          * and the rest goes in other.
 980          */
 981         while (head != NULL) {
                     /* Detach the next packet from the incoming chain. */
 982                 mp = head;
 983                 head = head->b_next;
 984                 mp->b_next = NULL;
 985
                     /*
                      * Every packet starts out as OTH and is promoted to UNDEF
                      * only once it looks like a fast path candidate. sz1 is
                      * the data length of the whole message, not just of the
                      * first mblk.
                      */
 986                 type = OTH;
 987                 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
 988
 989                 if (is_ether) {
 990                         /*
 991                          * At this point we can be sure the packet at least
 992                          * has an ether header.
                              *
                              * NOTE(review): for a multi-mblk message sz1 is the
                              * msgdsize() total, so the test below does not by
                              * itself show that the first mblk, which ehp is
                              * read from, holds the whole header - confirm
                              * against the driver contract.
 993                          */
 994                         if (sz1 < sizeof (struct ether_header)) {
 995                                 mac_rx_drop_pkt(mac_srs, mp);
 996                                 continue;
 997                         }
 998                         ehp = (struct ether_header *)mp->b_rptr;
 999
1000                         /*
1001                          * Determine if this is a VLAN or non-VLAN packet.
1002                          */
1003                         if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
                                     /*
                                      * NOTE(review): only sizeof (struct
                                      * ether_header) bytes were validated
                                      * above, yet the VLAN header fields read
                                      * here lie beyond that - verify a short
                                      * tagged frame cannot reach this point.
                                      */
1004                                 evhp = (struct ether_vlan_header *)mp->b_rptr;
1005                                 sap = ntohs(evhp->ether_type);
1006                                 hdrsize = sizeof (struct ether_vlan_header);
1007                                 /*
1008                                  * Check if the VID of the packet, if any,
1009                                  * belongs to this client.
1010                                  */
1011                                 if (!mac_client_check_flow_vid(mcip,
1012                                     VLAN_ID(ntohs(evhp->ether_tci)))) {
1013                                         mac_rx_drop_pkt(mac_srs, mp);
1014                                         continue;
1015                                 }
1016                         } else {
1017                                 hdrsize = sizeof (struct ether_header);
1018                         }
                             /* The low bit of the first address byte is clear. */
1019                         is_unicast =
1020                             ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
1021                         dstaddr = (uint8_t *)&ehp->ether_dhost;
1022                 } else {
1023                         mac_header_info_t               mhi;
1024
1025                         if (mac_header_info((mac_handle_t)mcip->mci_mip,
1026                             mp, &mhi) != 0) {
1027                                 mac_rx_drop_pkt(mac_srs, mp);
1028                                 continue;
1029                         }
1030                         hdrsize = mhi.mhi_hdrsize;
1031                         sap = mhi.mhi_bindsap;
1032                         is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
1033                         dstaddr = (uint8_t *)mhi.mhi_daddr;
1034                 }
1035
                     /*
                      * Without DLS bypass nothing takes the V4_TCP/V4_UDP fast
                      * path: the ring comes from mac_rx_srs_long_fanout() and
                      * the packet is queued with its MAC header in place.
                      */
1036                 if (!dls_bypass) {
1037                         if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1038                             hdrsize, &type, &indx) == -1) {
1039                                 mac_rx_drop_pkt(mac_srs, mp);
1040                                 continue;
1041                         }
1042
1043                         FANOUT_ENQUEUE_MP(headmp[type][indx],
1044                             tailmp[type][indx], cnt[type][indx], bw_ctl,
1045                             sz[type][indx], sz1, mp);
1046                         continue;
1047                 }
1048
1049
1050                 /*
1051                  * If we are using the default Rx ring where H/W or S/W
1052                  * classification has not happened, we need to verify if
1053                  * this unicast packet really belongs to us.
1054                  */
1055                 if (sap == ETHERTYPE_IP) {
1056                         /*
1057                          * If we are H/W classified, but we have promisc
1058                          * on, then we need to check for the unicast address.
1059                          */
1060                         if (hw_classified && mcip->mci_promisc_list != NULL) {
1061                                 mac_address_t           *map;
1062
1063                                 rw_enter(&mcip->mci_rw_lock, RW_READER);
1064                                 map = mcip->mci_unicast;
1065                                 if (bcmp(dstaddr, map->ma_addr,
1066                                     map->ma_len) == 0)
1067                                         type = UNDEF;
1068                                 rw_exit(&mcip->mci_rw_lock);
1069                         } else if (is_unicast) {
1070                                 type = UNDEF;
1071                         }
1072                 }
1073
1074                 /*
1075                  * This needs to become a contract with the driver for
1076                  * the fast path.
1077                  */
1078
                     /*
                      * ipha is computed for every packet but is examined only
                      * while type is UNDEF, i.e. for the IPv4 candidates
                      * selected above.
                      */
1079                 ipha = (ipha_t *)(mp->b_rptr + hdrsize);
1080                 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
1081                         type = OTH;
1082                         fanout_oth1++;
1083                 }
1084
1085                 if (type != OTH) {
1086                         uint16_t        frag_offset_flags;
1087
1088                         switch (ipha->ipha_protocol) {
1089                         case IPPROTO_TCP:
1090                         case IPPROTO_UDP:
1091                         case IPPROTO_SCTP:
1092                         case IPPROTO_ESP:
1093                                 ipha_len = IPH_HDR_LENGTH(ipha);
                                     /*
                                      * The 32 bits hashed later must lie
                                      * inside this mblk.
                                      */
1094                                 if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
1095                                     mp->b_wptr) {
1096                                         type = OTH;
1097                                         break;
1098                                 }
1099                                 frag_offset_flags =
1100                                     ntohs(ipha->ipha_fragment_offset_and_flags);
1101                                 if ((frag_offset_flags &
1102                                     (IPH_MF | IPH_OFFSET)) != 0) {
1103                                         type = OTH;
1104                                         fanout_oth3++;
1105                                         break;
1106                                 }
                                     /*
                                      * Offset is relative to mp->b_rptr, which
                                      * still points at the MAC header.
                                      */
1107                                 ports_offset = hdrsize + ipha_len;
1108                                 break;
1109                         default:
1110                                 type = OTH;
1111                                 fanout_oth4++;
1112                                 break;
1113                         }
1114                 }
1115
                     /*
                      * Everything that did not qualify for the fast path is
                      * spread over the OTH rings by mac_rx_srs_long_fanout().
                      */
1116                 if (type == OTH) {
1117                         if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1118                             hdrsize, &type, &indx) == -1) {
1119                                 mac_rx_drop_pkt(mac_srs, mp);
1120                                 continue;
1121                         }
1122
1123                         FANOUT_ENQUEUE_MP(headmp[type][indx],
1124                             tailmp[type][indx], cnt[type][indx], bw_ctl,
1125                             sz[type][indx], sz1, mp);
1126                         continue;
1127                 }
1128
1129                 ASSERT(type == UNDEF);
1130
1131                 /*
1132                  * XXX-Sunay: We should hold srs_lock since ring_count
1133                  * below can change. But if we are always called from
1134                  * mac_rx_srs_drain and SRS_PROC is set, then we can
1135                  * enforce that ring_count can't be changed i.e.
1136                  * to change fanout type or ring count, the calling
1137                  * thread needs to be behind SRS_PROC.
1138                  */
1139                 switch (ipha->ipha_protocol) {
1140                 case IPPROTO_TCP:
1141                         /*
1142                          * Note that for ESP, we fanout on SPI and it is at the
1143                          * same offset as the 2x16-bit ports. So it is clumped
1144                          * along with TCP, UDP and SCTP.
1145                          */
1146                         hash = HASH_ADDR(ipha->ipha_src,
1147                             *(uint32_t *)(mp->b_rptr + ports_offset));
1148                         indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
1149                         type = V4_TCP;
                             /* Skip the MAC header; done only for V4_TCP/V4_UDP. */
1150                         mp->b_rptr += hdrsize;
1151                         break;
1152                 case IPPROTO_UDP:
1153                 case IPPROTO_SCTP:
1154                 case IPPROTO_ESP:
1155                         if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
1156                                 hash = HASH_ADDR(ipha->ipha_src,
1157                                     *(uint32_t *)(mp->b_rptr + ports_offset));
1158                                 indx = COMPUTE_INDEX(hash,
1159                                     mac_srs->srs_udp_ring_count);
1160                         } else {
                                     /* Round-robin across the UDP rings. */
1161                                 indx = mac_srs->srs_ind %
1162                                     mac_srs->srs_udp_ring_count;
1163                                 mac_srs->srs_ind++;
1164                         }
1165                         type = V4_UDP;
1166                         mp->b_rptr += hdrsize;
1167                         break;
1168                 default:
                             /*
                              * Other protocols were already turned into OTH by
                              * the earlier switch, so this is only a fallback.
                              */
1169                         indx = 0;
1170                         type = OTH;
1171                 }
1172
1173                 FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
1174                     cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
1175         }
1176
             /*
              * Hand every non-empty sub-chain to its soft ring. Only indices
              * below srs_tcp_ring_count are visited for all three types, which
              * relies on the ring counts being equal (see the comment where
              * fanout_cnt is set).
              */
1177         for (type = V4_TCP; type < UNDEF; type++) {
1178                 int     i;
1179
1180                 for (i = 0; i < fanout_cnt; i++) {
1181                         if (headmp[type][i] != NULL) {
1182                                 mac_soft_ring_t *softring;
1183
1184                                 ASSERT(tailmp[type][i]->b_next == NULL);
1185                                 switch (type) {
1186                                 case V4_TCP:
1187                                         softring =
1188                                             mac_srs->srs_tcp_soft_rings[i];
1189                                         break;
1190                                 case V4_UDP:
1191                                         softring =
1192                                             mac_srs->srs_udp_soft_rings[i];
1193                                         break;
1194                                 case OTH:
1195                                         softring =
1196                                             mac_srs->srs_oth_soft_rings[i];
1197                                         break;
1198                                 }
1199                                 mac_rx_soft_ring_process(mcip,
1200                                     softring, headmp[type][i], tailmp[type][i],
1201                                     cnt[type][i], sz[type][i]);
1202                         }
1203                 }
1204         }
1205 }
1206 
1207 #define SRS_BYTES_TO_PICKUP     150000
     /*
      * Byte count that mac_rx_srs_poll_ring() passes to MAC_HWRING_POLL() when
      * the SRS is not under bandwidth control (with bandwidth control the
      * count is derived from the SRS bandwidth state instead). Initialised
      * from SRS_BYTES_TO_PICKUP.
      */
1208 ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;
1209 
1210 /*
1211  * mac_rx_srs_poll_ring
1212  *
1213  * The SRS poll thread uses this routine to poll the underlying hardware
1214  * Rx ring to get a chain of packets. It can inline process that chain
1215  * if mac_latency_optimize is set (default) or signal the SRS worker thread
1216  * to do the remaining processing.
1217  *
1218  * Since packets come in the system via interrupt or poll path, we also
1219  * update the stats and deal with promiscuous clients here.
1220  */
1221 void
1222 mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
1223 {
1224         kmutex_t                *lock = &mac_srs->srs_lock;
1225         kcondvar_t              *async = &mac_srs->srs_cv;
1226         mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
1227         mblk_t                  *head, *tail, *mp;
1228         callb_cpr_t             cprinfo;
1229         ssize_t                 bytes_to_pickup;
1230         size_t                  sz;
1231         int                     count;
1232         mac_client_impl_t       *smcip;
1233 
1234         CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
1235         mutex_enter(lock);
1236 
1237 start:
1238         for (;;) {
1239                 if (mac_srs->srs_state & SRS_PAUSE)
1240                         goto done;
1241 
1242                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1243                 cv_wait(async, lock);
1244                 CALLB_CPR_SAFE_END(&cprinfo, lock);
1245 
1246                 if (mac_srs->srs_state & SRS_PAUSE)
1247                         goto done;
1248 
1249 check_again:
1250                 if (mac_srs->srs_type & SRST_BW_CONTROL) {
1251                         /*
1252                          * We pick as many bytes as we are allowed to queue.
1253                          * Its possible that we will exceed the total
1254                          * packets queued in case this SRS is part of the
1255                          * Rx ring group since > 1 poll thread can be pulling
1256                          * upto the max allowed packets at the same time
1257                          * but that should be OK.
1258                          */
1259                         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1260                         bytes_to_pickup =
1261                             mac_srs->srs_bw->mac_bw_drop_threshold -
1262                             mac_srs->srs_bw->mac_bw_sz;
1263                         /*
1264                          * We shouldn't have been signalled if we
1265                          * have 0 or less bytes to pick but since
1266                          * some of the bytes accounting is driver
1267                          * dependant, we do the safety check.
1268                          */
1269                         if (bytes_to_pickup < 0)
1270                                 bytes_to_pickup = 0;
1271                         mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1272                 } else {
1273                         /*
1274                          * ToDO: Need to change the polling API
1275                          * to add a packet count and a flag which
1276                          * tells the driver whether we want packets
1277                          * based on a count, or bytes, or all the
1278                          * packets queued in the driver/HW. This
1279                          * way, we never have to check the limits
1280                          * on poll path. We truly let only as many
1281                          * packets enter the system as we are willing
1282                          * to process or queue.
1283                          *
1284                          * Something along the lines of
1285                          * pkts_to_pickup = mac_soft_ring_max_q_cnt -
1286                          *      mac_srs->srs_poll_pkt_cnt
1287                          */
1288 
1289                         /*
1290                          * Since we are not doing B/W control, pick
1291                          * as many packets as allowed.
1292                          */
1293                         bytes_to_pickup = max_bytes_to_pickup;
1294                 }
1295 
1296                 /* Poll the underlying Hardware */
1297                 mutex_exit(lock);
1298                 head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
1299                 mutex_enter(lock);
1300 
1301                 ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
1302                     SRS_POLL_THR_OWNER);
1303 
1304                 mp = tail = head;
1305                 count = 0;
1306                 sz = 0;
1307                 while (mp != NULL) {
1308                         tail = mp;
1309                         sz += msgdsize(mp);
1310                         mp = mp->b_next;
1311                         count++;
1312                 }
1313 
1314                 if (head != NULL) {
1315                         tail->b_next = NULL;
1316                         smcip = mac_srs->srs_mcip;
1317 
1318                         SRS_RX_STAT_UPDATE(mac_srs, pollbytes, sz);
1319                         SRS_RX_STAT_UPDATE(mac_srs, pollcnt, count);
1320 
1321                         /*
1322                          * If there are any promiscuous mode callbacks
1323                          * defined for this MAC client, pass them a copy
1324                          * if appropriate and also update the counters.
1325                          */
1326                         if (smcip != NULL) {
1327                                 if (smcip->mci_mip->mi_promisc_list != NULL) {
1328                                         mutex_exit(lock);
1329                                         mac_promisc_dispatch(smcip->mci_mip,
1330                                             head, NULL);
1331                                         mutex_enter(lock);
1332                                 }
1333                         }
1334                         if (mac_srs->srs_type & SRST_BW_CONTROL) {
1335                                 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1336                                 mac_srs->srs_bw->mac_bw_polled += sz;
1337                                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1338                         }
1339                         MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
1340                             count, sz);
1341                         if (count <= 10)
1342                                 srs_rx->sr_stat.mrs_chaincntundr10++;
1343                         else if (count > 10 && count <= 50)
1344                                 srs_rx->sr_stat.mrs_chaincnt10to50++;
1345                         else
1346                                 srs_rx->sr_stat.mrs_chaincntover50++;
1347                 }
1348 
1349                 /*
1350                  * We are guaranteed that SRS_PROC will be set if we
1351                  * are here. Also, poll thread gets to run only if
1352                  * the drain was being done by a worker thread although
1353                  * its possible that worker thread is still running
1354                  * and poll thread was sent down to keep the pipeline
1355                  * going instead of doing a complete drain and then
1356                  * trying to poll the NIC.
1357                  *
1358                  * So we need to check SRS_WORKER flag to make sure
1359                  * that the worker thread is not processing the queue
1360                  * in parallel to us. The flags and conditions are
1361                  * protected by the srs_lock to prevent any race. We
1362                  * ensure that we don't drop the srs_lock from now
1363                  * till the end and similarly we don't drop the srs_lock
1364                  * in mac_rx_srs_drain() till similar condition check
1365                  * are complete. The mac_rx_srs_drain() needs to ensure
1366                  * that SRS_WORKER flag remains set as long as its
1367                  * processing the queue.
1368                  */
1369                 if (!(mac_srs->srs_state & SRS_WORKER) &&
1370                     (mac_srs->srs_first != NULL)) {
1371                         /*
1372                          * We have packets to process and worker thread
1373                          * is not running. Check to see if poll thread is
1374                          * allowed to process.
1375                          */
1376                         if (mac_srs->srs_state & SRS_LATENCY_OPT) {
1377                                 mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
1378                                 if (!(mac_srs->srs_state & SRS_PAUSE) &&
1379                                     srs_rx->sr_poll_pkt_cnt <=
1380                                     srs_rx->sr_lowat) {
1381                                         srs_rx->sr_poll_again++;
1382                                         goto check_again;
1383                                 }
1384                                 /*
1385                                  * We are already above low water mark
1386                                  * so stay in the polling mode but no
1387                                  * need to poll. Once we dip below
1388                                  * the polling threshold, the processing
1389                                  * thread (soft ring) will signal us
1390                                  * to poll again (MAC_UPDATE_SRS_COUNT)
1391                                  */
1392                                 srs_rx->sr_poll_drain_no_poll++;
1393                                 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1394                                 /*
1395                                  * In B/W control case, its possible
1396                                  * that the backlog built up due to
1397                                  * B/W limit being reached and packets
1398                                  * are queued only in SRS. In this case,
1399                                  * we should schedule worker thread
1400                                  * since no one else will wake us up.
1401                                  */
1402                                 if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
1403                                     (mac_srs->srs_tid == NULL)) {
1404                                         mac_srs->srs_tid =
1405                                             timeout(mac_srs_fire, mac_srs, 1);
1406                                         srs_rx->sr_poll_worker_wakeup++;
1407                                 }
1408                         } else {
1409                                 /*
1410                                  * Wakeup the worker thread for more processing.
1411                                  * We optimize for throughput in this case.
1412                                  */
1413                                 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1414                                 MAC_SRS_WORKER_WAKEUP(mac_srs);
1415                                 srs_rx->sr_poll_sig_worker++;
1416                         }
1417                 } else if ((mac_srs->srs_first == NULL) &&
1418                     !(mac_srs->srs_state & SRS_WORKER)) {
1419                         /*
1420                          * There is nothing queued in SRS and
1421                          * no worker thread running. Plus we
1422                          * didn't get anything from the H/W
1423                          * as well (head == NULL);
1424                          */
1425                         ASSERT(head == NULL);
1426                         mac_srs->srs_state &=
1427                             ~(SRS_PROC|SRS_GET_PKTS);
1428 
1429                         /*
1430                          * If we have a packets in soft ring, don't allow
1431                          * more packets to come into this SRS by keeping the
1432                          * interrupts off but not polling the H/W. The
1433                          * poll thread will get signaled as soon as
1434                          * srs_poll_pkt_cnt dips below poll threshold.
1435                          */
1436                         if (srs_rx->sr_poll_pkt_cnt == 0) {
1437                                 srs_rx->sr_poll_intr_enable++;
1438                                 MAC_SRS_POLLING_OFF(mac_srs);
1439                         } else {
1440                                 /*
1441                                  * We know nothing is queued in SRS
1442                                  * since we are here after checking
1443                                  * srs_first is NULL. The backlog
1444                                  * is entirely due to packets queued
1445                                  * in Soft ring which will wake us up
1446                                  * and get the interface out of polling
1447                                  * mode once the backlog dips below
1448                                  * sr_poll_thres.
1449                                  */
1450                                 srs_rx->sr_poll_no_poll++;
1451                         }
1452                 } else {
1453                         /*
1454                          * Worker thread is already running.
1455                          * Nothing much to do. If the polling
1456                          * was enabled, worker thread will deal
1457                          * with that.
1458                          */
1459                         mac_srs->srs_state &= ~SRS_GET_PKTS;
1460                         srs_rx->sr_poll_goto_sleep++;
1461                 }
1462         }
1463 done:
1464         mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
1465         cv_signal(&mac_srs->srs_async);
1466         /*
1467          * If this is a temporary quiesce then wait for the restart signal
1468          * from the srs worker. Then clear the flags and signal the srs worker
1469          * to ensure a positive handshake and go back to start.
1470          */
1471         while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
1472                 cv_wait(async, lock);
1473         if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
1474                 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
1475                 mac_srs->srs_state &=
1476                     ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
1477                 cv_signal(&mac_srs->srs_async);
1478                 goto start;
1479         } else {
1480                 mac_srs->srs_state |= SRS_POLL_THR_EXITED;
1481                 cv_signal(&mac_srs->srs_async);
1482                 CALLB_CPR_EXIT(&cprinfo);
1483                 thread_exit();
1484         }
1485 }
1486 
1487 /*
1488  * mac_srs_pick_chain
1489  *
1490  * In Bandwidth control case, checks how many packets can be processed
1491  * and return them in a sub chain.
1492  */
1493 static mblk_t *
1494 mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
1495     size_t *chain_sz, int *chain_cnt)
1496 {
1497         mblk_t                  *head = NULL;
1498         mblk_t                  *tail = NULL;
1499         size_t                  sz;
1500         size_t                  tsz = 0;
1501         int                     cnt = 0;
1502         mblk_t                  *mp;
1503 
1504         ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1505         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1506         if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
1507             mac_srs->srs_bw->mac_bw_limit) ||
1508             (mac_srs->srs_bw->mac_bw_limit == 0)) {
1509                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1510                 head = mac_srs->srs_first;
1511                 mac_srs->srs_first = NULL;
1512                 *chain_tail = mac_srs->srs_last;
1513                 mac_srs->srs_last = NULL;
1514                 *chain_sz = mac_srs->srs_size;
1515                 *chain_cnt = mac_srs->srs_count;
1516                 mac_srs->srs_count = 0;
1517                 mac_srs->srs_size = 0;
1518                 return (head);
1519         }
1520 
1521         /*
1522          * Can't clear the entire backlog.
1523          * Need to find how many packets to pick
1524          */
1525         ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
1526         while ((mp = mac_srs->srs_first) != NULL) {
1527                 sz = msgdsize(mp);
1528                 if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
1529                     mac_srs->srs_bw->mac_bw_limit) {
1530                         if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
1531                                 mac_srs->srs_bw->mac_bw_state |=
1532                                     SRS_BW_ENFORCED;
1533                         break;
1534                 }
1535 
1536                 /*
1537                  * The _size & cnt is  decremented from the softrings
1538                  * when they send up the packet for polling to work
1539                  * properly.
1540                  */
1541                 tsz += sz;
1542                 cnt++;
1543                 mac_srs->srs_count--;
1544                 mac_srs->srs_size -= sz;
1545                 if (tail != NULL)
1546                         tail->b_next = mp;
1547                 else
1548                         head = mp;
1549                 tail = mp;
1550                 mac_srs->srs_first = mac_srs->srs_first->b_next;
1551         }
1552         mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1553         if (mac_srs->srs_first == NULL)
1554                 mac_srs->srs_last = NULL;
1555 
1556         if (tail != NULL)
1557                 tail->b_next = NULL;
1558         *chain_tail = tail;
1559         *chain_cnt = cnt;
1560         *chain_sz = tsz;
1561 
1562         return (head);
1563 }
1564 
/*
 * mac_rx_srs_drain
 *
 * The SRS drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in bandwidth control mode,
 * mac_rx_srs_drain_bw. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * apply to mac_rx_srs_drain_bw as well.
 *
 * Arguments:
 *	mac_srs		The soft ring set to drain. The caller must hold
 *			srs_lock and the SRS must not be a bandwidth
 *			controlled one (both are asserted). The lock is
 *			dropped around the upcalls made from here and
 *			reacquired afterwards; it is held again on return.
 *	proc_type	State flag identifying the calling context. It is
 *			set in srs_state together with SRS_PROC while the
 *			queue is processed and cleared again on the way
 *			out. Only a SRS_WORKER caller loops to pick up
 *			packets that were queued in the meantime; any other
 *			caller signals srs_async instead.
 */
void
mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
        mblk_t                  *head;
        mblk_t                  *tail;
        timeout_id_t            tid;
        int                     cnt = 0;
        mac_client_impl_t       *mcip = mac_srs->srs_mcip;
        mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;

        ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
        ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));

        /* If we are blanked i.e. can't do upcalls, then we are done */
        if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
                ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
                    (mac_srs->srs_state & SRS_PAUSE));
                goto out;
        }

        /* Nothing queued: skip straight to the polling mode decisions. */
        if (mac_srs->srs_first == NULL)
                goto out;

        if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
            (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
                /*
                 * In the normal case, the SRS worker thread does no
                 * work and we wait for a backlog to build up before
                 * we switch into polling mode. In case we are
                 * optimizing for throughput, we use the worker thread
                 * as well. The goal is to let worker thread process
                 * the queue and poll thread to feed packets into
                 * the queue. As such, we should signal the poll
                 * thread to try and get more packets.
                 *
                 * We could have pulled this check in the POLL_RING
                 * macro itself but keeping it explicit here makes
                 * the architecture more human understandable.
                 */
                MAC_SRS_POLL_RING(mac_srs);
        }

again:
        /*
         * Detach the complete queued chain from the SRS so that it can
         * be processed after the srs_lock has been dropped.
         */
        head = mac_srs->srs_first;
        mac_srs->srs_first = NULL;
        tail = mac_srs->srs_last;
        mac_srs->srs_last = NULL;
        cnt = mac_srs->srs_count;
        mac_srs->srs_count = 0;

        ASSERT(head != NULL);
        ASSERT(tail != NULL);

        /*
         * Take over any pending timeout id while we still hold the
         * srs_lock. It is cancelled further down, once the lock has
         * been dropped.
         */
        if ((tid = mac_srs->srs_tid) != 0)
                mac_srs->srs_tid = 0;

        mac_srs->srs_state |= (SRS_PROC|proc_type);


        /*
         * mcip is NULL for broadcast and multicast flows. The promisc
         * callbacks for broadcast and multicast packets are delivered from
         * mac_rx() and we don't need to worry about that case in this path
         */
        if (mcip != NULL) {
                if (mcip->mci_promisc_list != NULL) {
                        mutex_exit(&mac_srs->srs_lock);
                        mac_promisc_client_dispatch(mcip, head);
                        mutex_enter(&mac_srs->srs_lock);
                }
                if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
                        mutex_exit(&mac_srs->srs_lock);
                        mac_protect_intercept_dhcp(mcip, head);
                        mutex_enter(&mac_srs->srs_lock);
                }
        }

        /*
         * Check if SRS itself is doing the processing
         * This direct path does not apply when subflows are present. In this
         * case, packets need to be dispatched to a soft ring according to the
         * flow's bandwidth and other resources constraints.
         */
        if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
                mac_direct_rx_t         proc;
                void                    *arg1;
                mac_resource_handle_t   arg2;

                /*
                 * This is the case when a Rx is directly
                 * assigned and we have a fully classified
                 * protocol chain. We can deal with it in
                 * one shot.
                 *
                 * The callback and its arguments are read while the
                 * srs_lock is still held; SRS_CLIENT_PROC stays set
                 * for the duration of the upcall.
                 */
                proc = srs_rx->sr_func;
                arg1 = srs_rx->sr_arg1;
                arg2 = srs_rx->sr_arg2;

                mac_srs->srs_state |= SRS_CLIENT_PROC;
                mutex_exit(&mac_srs->srs_lock);
                if (tid != 0) {
                        (void) untimeout(tid);
                        tid = 0;
                }

                proc(arg1, arg2, head, NULL);
                /*
                 * Decrement the size and count here itself
                 * since the packet has been processed.
                 */
                mutex_enter(&mac_srs->srs_lock);
                MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
                /* Wake up whoever is waiting for the upcall to finish. */
                if (mac_srs->srs_state & SRS_CLIENT_WAIT)
                        cv_signal(&mac_srs->srs_client_cv);
                mac_srs->srs_state &= ~SRS_CLIENT_PROC;
        } else {
                /* Some kind of softrings based fanout is required */
                mutex_exit(&mac_srs->srs_lock);
                if (tid != 0) {
                        (void) untimeout(tid);
                        tid = 0;
                }

                /*
                 * Since the fanout routines can deal with chains,
                 * shoot the entire chain up.
                 */
                if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
                        mac_rx_srs_fanout(mac_srs, head);
                else
                        mac_rx_srs_proto_fanout(mac_srs, head);
                mutex_enter(&mac_srs->srs_lock);
        }

        if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
            (mac_srs->srs_first != NULL)) {
                /*
                 * More packets arrived while we were clearing the
                 * SRS. This can be possible because of one of
                 * three conditions below:
                 * 1) The driver is using multiple worker threads
                 *    to send the packets to us.
                 * 2) The driver has a race in switching
                 *    between interrupt and polling mode or
                 * 3) Packets are arriving in this SRS via the
                 *    S/W classification as well.
                 *
                 * We should switch to polling mode and see if we
                 * need to send the poll thread down. Also, signal
                 * the worker thread to process what has just arrived.
                 */
                MAC_SRS_POLLING_ON(mac_srs);
                if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
                        srs_rx->sr_drain_poll_sig++;
                        MAC_SRS_POLL_RING(mac_srs);
                }

                /*
                 * The pending packets still have to be dealt with:
                 * the worker thread goes around again itself, any
                 * other caller signals srs_async so that the worker
                 * thread picks them up.
                 */
                if (proc_type == SRS_WORKER) {
                        srs_rx->sr_drain_again++;
                        goto again;
                } else {
                        srs_rx->sr_drain_worker_sig++;
                        cv_signal(&mac_srs->srs_async);
                }
        }

out:
        if (mac_srs->srs_state & SRS_GET_PKTS) {
                /*
                 * Poll thread is already running. Leave the
                 * SRS_PROC set and hand over the control to
                 * poll thread.
                 */
                mac_srs->srs_state &= ~proc_type;
                srs_rx->sr_drain_poll_running++;
                return;
        }

        /*
         * Even if there are no packets queued in SRS, we
         * need to make sure that the shared counter is
         * clear and any associated softrings have cleared
         * all the backlog. Otherwise, leave the interface
         * in polling mode and the poll thread will get
         * signalled once the count goes down to zero.
         *
         * If someone is already draining the queue (SRS_PROC is
         * set) when the srs_poll_pkt_cnt goes down to zero,
         * then it means that drain is already running and we
         * will turn off polling at that time if there is
         * no backlog.
         *
         * As long as there are packets queued either
         * in soft ring set or its soft rings, we will leave
         * the interface in polling mode (even if the drain
         * was done by the interrupt thread). We signal
         * the poll thread as well if we have dipped below
         * low water mark.
         *
         * NOTE: We can't use the MAC_SRS_POLLING_ON macro
         * since that turns polling on only for worker thread.
         * It's not worth turning polling on for interrupt
         * thread (since NIC will not issue another interrupt)
         * unless a backlog builds up.
         */
        if ((srs_rx->sr_poll_pkt_cnt > 0) &&
            (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
                mac_srs->srs_state &= ~(SRS_PROC|proc_type);
                srs_rx->sr_drain_keep_polling++;
                MAC_SRS_POLLING_ON(mac_srs);
                if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
                        MAC_SRS_POLL_RING(mac_srs);
                return;
        }

        /* Nothing else to do. Get out of poll mode */
        MAC_SRS_POLLING_OFF(mac_srs);
        mac_srs->srs_state &= ~(SRS_PROC|proc_type);
        srs_rx->sr_drain_finish_intr++;
}
1804 
1805 /*
1806  * mac_rx_srs_drain_bw
1807  *
1808  * The SRS BW drain routine. Gets to run to clear the queue. Any thread
1809  * (worker, interrupt, poll) can call this based on processing model.
1810  * The first thing we do is disable interrupts if possible and then
1811  * drain the queue. we also try to poll the underlying hardware if
1812  * there is a dedicated hardware Rx ring assigned to this SRS.
1813  *
 * There is an equivalent drain routine in non-bandwidth-control mode,
 * mac_rx_srs_drain. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * apply to mac_rx_srs_drain as well.
1819  */
1820 void
1821 mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1822 {
1823         mblk_t                  *head;
1824         mblk_t                  *tail;
1825         timeout_id_t            tid;
1826         size_t                  sz = 0;
1827         int                     cnt = 0;
1828         mac_client_impl_t       *mcip = mac_srs->srs_mcip;
1829         mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
1830         clock_t                 now;
1831 
1832         ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1833         ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
1834 again:
1835         /* Check if we are doing B/W control */
1836         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1837         now = ddi_get_lbolt();
1838         if (mac_srs->srs_bw->mac_bw_curr_time != now) {
1839                 mac_srs->srs_bw->mac_bw_curr_time = now;
1840                 mac_srs->srs_bw->mac_bw_used = 0;
1841                 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
1842                         mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
1843         } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
1844                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1845                 goto done;
1846         } else if (mac_srs->srs_bw->mac_bw_used >
1847             mac_srs->srs_bw->mac_bw_limit) {
1848                 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
1849                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1850                 goto done;
1851         }
1852         mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1853 
1854         /* If we are blanked i.e. can't do upcalls, then we are done */
1855         if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1856                 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1857                     (mac_srs->srs_state & SRS_PAUSE));
1858                 goto done;
1859         }
1860 
1861         sz = 0;
1862         cnt = 0;
1863         if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
1864                 /*
1865                  * We couldn't pick up a single packet.
1866                  */
1867                 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1868                 if ((mac_srs->srs_bw->mac_bw_used == 0) &&
1869                     (mac_srs->srs_size != 0) &&
1870                     !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
1871                         /*
1872                          * Seems like configured B/W doesn't
1873                          * even allow processing of 1 packet
1874                          * per tick.
1875                          *
1876                          * XXX: raise the limit to processing
1877                          * at least 1 packet per tick.
1878                          */
1879                         mac_srs->srs_bw->mac_bw_limit +=
1880                             mac_srs->srs_bw->mac_bw_limit;
1881                         mac_srs->srs_bw->mac_bw_drop_threshold +=
1882                             mac_srs->srs_bw->mac_bw_drop_threshold;
1883                         cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
1884                             "raised B/W limit to %d since not even a "
1885                             "single packet can be processed per "
1886                             "tick %d\n", (void *)mac_srs,
1887                             (int)mac_srs->srs_bw->mac_bw_limit,
1888                             (int)msgdsize(mac_srs->srs_first));
1889                 }
1890                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1891                 goto done;
1892         }
1893 
1894         ASSERT(head != NULL);
1895         ASSERT(tail != NULL);
1896 
1897         /* zero bandwidth: drop all and return to interrupt mode */
1898         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1899         if (mac_srs->srs_bw->mac_bw_limit == 0) {
1900                 srs_rx->sr_stat.mrs_sdrops += cnt;
1901                 ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
1902                 mac_srs->srs_bw->mac_bw_sz -= sz;
1903                 mac_srs->srs_bw->mac_bw_drop_bytes += sz;
1904                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1905                 mac_pkt_drop(NULL, NULL, head, B_FALSE);
1906                 goto leave_poll;
1907         } else {
1908                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1909         }
1910 
1911         if ((tid = mac_srs->srs_tid) != 0)
1912                 mac_srs->srs_tid = 0;
1913 
1914         mac_srs->srs_state |= (SRS_PROC|proc_type);
1915         MAC_SRS_WORKER_POLLING_ON(mac_srs);
1916 
1917         /*
1918          * mcip is NULL for broadcast and multicast flows. The promisc
1919          * callbacks for broadcast and multicast packets are delivered from
1920          * mac_rx() and we don't need to worry about that case in this path
1921          */
1922         if (mcip != NULL) {
1923                 if (mcip->mci_promisc_list != NULL) {
1924                         mutex_exit(&mac_srs->srs_lock);
1925                         mac_promisc_client_dispatch(mcip, head);
1926                         mutex_enter(&mac_srs->srs_lock);
1927                 }
1928                 if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
1929                         mutex_exit(&mac_srs->srs_lock);
1930                         mac_protect_intercept_dhcp(mcip, head);
1931                         mutex_enter(&mac_srs->srs_lock);
1932                 }
1933         }
1934 
1935         /*
1936          * Check if SRS itself is doing the processing
1937          * This direct path does not apply when subflows are present. In this
1938          * case, packets need to be dispatched to a soft ring according to the
1939          * flow's bandwidth and other resources contraints.
1940          */
1941         if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1942                 mac_direct_rx_t         proc;
1943                 void                    *arg1;
1944                 mac_resource_handle_t   arg2;
1945 
1946                 /*
1947                  * This is the case when a Rx is directly
1948                  * assigned and we have a fully classified
1949                  * protocol chain. We can deal with it in
1950                  * one shot.
1951                  */
1952                 proc = srs_rx->sr_func;
1953                 arg1 = srs_rx->sr_arg1;
1954                 arg2 = srs_rx->sr_arg2;
1955 
1956                 mac_srs->srs_state |= SRS_CLIENT_PROC;
1957                 mutex_exit(&mac_srs->srs_lock);
1958                 if (tid != 0) {
1959                         (void) untimeout(tid);
1960                         tid = 0;
1961                 }
1962 
1963                 proc(arg1, arg2, head, NULL);
1964                 /*
1965                  * Decrement the size and count here itelf
1966                  * since the packet has been processed.
1967                  */
1968                 mutex_enter(&mac_srs->srs_lock);
1969                 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
1970                 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
1971 
1972                 if (mac_srs->srs_state & SRS_CLIENT_WAIT)
1973                         cv_signal(&mac_srs->srs_client_cv);
1974                 mac_srs->srs_state &= ~SRS_CLIENT_PROC;
1975         } else {
1976                 /* Some kind of softrings based fanout is required */
1977                 mutex_exit(&mac_srs->srs_lock);
1978                 if (tid != 0) {
1979                         (void) untimeout(tid);
1980                         tid = 0;
1981                 }
1982 
1983                 /*
1984                  * Since the fanout routines can deal with chains,
1985                  * shoot the entire chain up.
1986                  */
1987                 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
1988                         mac_rx_srs_fanout(mac_srs, head);
1989                 else
1990                         mac_rx_srs_proto_fanout(mac_srs, head);
1991                 mutex_enter(&mac_srs->srs_lock);
1992         }
1993 
1994         /*
1995          * Send the poll thread to pick up any packets arrived
1996          * so far. This also serves as the last check in case
1997          * nothing else is queued in the SRS. The poll thread
1998          * is signalled only in the case the drain was done
1999          * by the worker thread and SRS_WORKER is set. The
2000          * worker thread can run in parallel as long as the
2001          * SRS_WORKER flag is set. When we have nothing else to
2002          * process, we can exit while leaving SRS_PROC set
2003          * which gives the poll thread control to process and
2004          * cleanup once it returns from the NIC.
2005          *
2006          * If we have nothing else to process, we need to
2007          * ensure that we keep holding the srs_lock till
2008          * all the checks below are done and control is
2009          * handed to the poll thread if it was running.
2010          */
2011         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2012         if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2013                 if (mac_srs->srs_first != NULL) {
2014                         if (proc_type == SRS_WORKER) {
2015                                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2016                                 if (srs_rx->sr_poll_pkt_cnt <=
2017                                     srs_rx->sr_lowat)
2018                                         MAC_SRS_POLL_RING(mac_srs);
2019                                 goto again;
2020                         } else {
2021                                 cv_signal(&mac_srs->srs_async);
2022                         }
2023                 }
2024         }
2025         mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2026 
2027 done:
2028 
2029         if (mac_srs->srs_state & SRS_GET_PKTS) {
2030                 /*
2031                  * Poll thread is already running. Leave the
2032                  * SRS_PROC set and hand over the control to
2033                  * poll thread.
2034                  */
2035                 mac_srs->srs_state &= ~proc_type;
2036                 return;
2037         }
2038 
2039         /*
2040          * If we can't process packets because we have exceeded
2041          * B/W limit for this tick, just set the timeout
2042          * and leave.
2043          *
2044          * Even if there are no packets queued in SRS, we
2045          * need to make sure that the shared counter is
2046          * clear and any associated softrings have cleared
2047          * all the backlog. Otherwise, leave the interface
2048          * in polling mode and the poll thread will get
2049          * signalled once the count goes down to zero.
2050          *
2051          * If someone is already draining the queue (SRS_PROC is
2052          * set) when the srs_poll_pkt_cnt goes down to zero,
2053          * then it means that drain is already running and we
2054          * will turn off polling at that time if there is
2055          * no backlog. As long as there are packets queued either
2056          * is soft ring set or its soft rings, we will leave
2057          * the interface in polling mode.
2058          */
2059         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2060         if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
2061             ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
2062             (srs_rx->sr_poll_pkt_cnt > 0))) {
2063                 MAC_SRS_POLLING_ON(mac_srs);
2064                 mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2065                 if ((mac_srs->srs_first != NULL) &&
2066                     (mac_srs->srs_tid == NULL))
2067                         mac_srs->srs_tid = timeout(mac_srs_fire,
2068                             mac_srs, 1);
2069                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2070                 return;
2071         }
2072         mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2073 
2074 leave_poll:
2075 
2076         /* Nothing else to do. Get out of poll mode */
2077         MAC_SRS_POLLING_OFF(mac_srs);
2078         mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2079 }
2080 
2081 /*
2082  * mac_srs_worker: body of the SRS worker thread. With srs_lock held it
2083  * sleeps on srs_async until the SRS has queued packets, is not owned
2084  * (SRS_PROC), not bandwidth-limited and not SRS_TX_BLOCKED, then calls
2085  * srs_drain_func(SRS_WORKER). SRS_PAUSE diverts it to the quiesce path.
2086  */
2087 void
2088 mac_srs_worker(mac_soft_ring_set_t *mac_srs)
2089 {
2090         kmutex_t                *lock = &mac_srs->srs_lock;
2091         kcondvar_t              *async = &mac_srs->srs_async;
2092         callb_cpr_t             cprinfo;
2093         boolean_t               bw_ctl_flag;    /* SRS_BW_ENFORCED seen */
2094 
2095         CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
2096         mutex_enter(lock);
2097 
2098 start:
2099         for (;;) {
2100                 bw_ctl_flag = B_FALSE;
2101                 if (mac_srs->srs_type & SRST_BW_CONTROL) {
2102                         MAC_SRS_BW_LOCK(mac_srs);
2103                         MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2104                         if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
2105                                 bw_ctl_flag = B_TRUE;
2106                         MAC_SRS_BW_UNLOCK(mac_srs);
2107                 }
2108                 /*
2109                  * The SRS_BW_ENFORCED flag may change since we have dropped
2110                  * the mac_bw_lock. However the drain function can handle both
2111                  * a drainable SRS or a bandwidth controlled SRS, and the
2112                  * effect of scheduling a timeout is to wakeup the worker
2113                  * thread which in turn will call the drain function. Since
2114                  * we release the srs_lock atomically only in the cv_wait there
2115                  * isn't a fear of waiting forever.
2116                  */
2117                 while (((mac_srs->srs_state & SRS_PROC) ||
2118                     (mac_srs->srs_first == NULL) || bw_ctl_flag ||
2119                     (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
2120                     !(mac_srs->srs_state & SRS_PAUSE)) {
2121                         /*
2122                          * If we have packets queued and we are here
2123                          * because B/W control is in place, we better
2124                          * schedule the worker wakeup after 1 tick
2125                          * to see if bandwidth control can be relaxed.
2126                          */
2127                         if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
2128                                 /*
2129                                  * We need to ensure that a timer is already
2130                                  * scheduled or we force schedule one for
2131                                  * later so that we can continue processing
2132                                  * after this quanta is over.
2133                                  */
2134                                 mac_srs->srs_tid = timeout(mac_srs_fire,
2135                                     mac_srs, 1);
2136                         }
2137 wait:
2138                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
2139                         cv_wait(async, lock);
2140                         CALLB_CPR_SAFE_END(&cprinfo, lock);
2141 
2142                         if (mac_srs->srs_state & SRS_PAUSE)
2143                                 goto done;
2144                         if (mac_srs->srs_state & SRS_PROC)
2145                                 goto wait;      /* still owned; sleep again */
2146 
2147                         if (mac_srs->srs_first != NULL &&
2148                             mac_srs->srs_type & SRST_BW_CONTROL) {
2149                                 MAC_SRS_BW_LOCK(mac_srs);
2150                                 if (mac_srs->srs_bw->mac_bw_state &
2151                                     SRS_BW_ENFORCED) {
2152                                         MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2153                                 }
2154                                 bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
2155                                     SRS_BW_ENFORCED; /* nonzero if enforced */
2156                                 MAC_SRS_BW_UNLOCK(mac_srs);
2157                         }
2158                 }
2159 
2160                 if (mac_srs->srs_state & SRS_PAUSE)
2161                         goto done;
2162                 mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
2163         }
2164 done:
2165         /*
2166          * The Rx SRS quiesce logic first cuts off packet supply to the SRS
2167          * from both hard and soft classifications and waits for such threads
2168          * to finish before signaling the worker. So at this point the only
2169          * thread left that could be competing with the worker is the poll
2170          * thread. In the case of Tx, there shouldn't be any thread holding
2171          * SRS_PROC at this point.
2172          */
2173         if (!(mac_srs->srs_state & SRS_PROC)) {
2174                 mac_srs->srs_state |= SRS_PROC;
2175         } else {
2176                 ASSERT((mac_srs->srs_type & SRST_TX) == 0);
2177                 /*
2178                  * Poll thread still owns the SRS and is still running
2179                  */
2180                 ASSERT((mac_srs->srs_poll_thr == NULL) ||
2181                     ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
2182                     SRS_POLL_THR_OWNER));
2183         }
2184         mac_srs_worker_quiesce(mac_srs);
2185         /*
2186          * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
2187          * of the quiesce operation.
2188          */
2189         while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
2190                 cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
2191 
2192         if (mac_srs->srs_state & SRS_RESTART) {
2193                 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
2194                 mac_srs_worker_restart(mac_srs);
2195                 mac_srs->srs_state &= ~SRS_PROC;
2196                 goto start;     /* back to the drain loop, lock held */
2197         }
2198 
2199         if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
2200                 mac_srs_worker_quiesce(mac_srs);
2201 
2202         mac_srs->srs_state &= ~SRS_PROC;
2203         /* The macro drops the srs_lock */
2204         CALLB_CPR_EXIT(&cprinfo);
2205         thread_exit();
2206 }
2207 
2208 /*
2209  * mac_rx_srs_subflow_process: Rx routine called from the interrupt path
2210  * when sub flows are present on this SRS. Each run of consecutive packets
2211  * that classify to the same flow is delivered as one chain, and every
2212  * flow reference obtained from mac_flow_lookup() is released here.
2213  */
2214 /* ARGSUSED */
2215 void
2216 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
2217     mblk_t *mp_chain, boolean_t loopback)
2218 {
2219         flow_entry_t            *flent = NULL;
2220         flow_entry_t            *prev_flent = NULL;
2221         mblk_t                  *mp = NULL;
2222         mblk_t                  *tail = NULL;
2223         mac_soft_ring_set_t     *mac_srs = (mac_soft_ring_set_t *)srs;
2224         mac_client_impl_t       *mcip;
2225 
2226         mcip = mac_srs->srs_mcip;
2227         ASSERT(mcip != NULL);
2228 
2229         /*
2230          * We need to determine the SRS for every packet
2231          * by walking the flow table, if we don't get any,
2232          * then we proceed using the SRS we came with.
2233          */
2234         mp = tail = mp_chain;
2235         while (mp != NULL) {
2236                 /*
2237                  * Stats for the matching subflow are incremented later, in
2238                  * mac_rx_srs_process, from the bytes/pkt count of the chain.
2239                  */
2240                 (void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
2241                     FLOW_INBOUND, &flent);
2242 
2243                 if (mp == mp_chain || flent == prev_flent) {
2244                         if (prev_flent != NULL)
2245                                 FLOW_REFRELE(prev_flent); /* keep one hold */
2246                         prev_flent = flent;
2247                         flent = NULL;
2248                         tail = mp;
2249                         mp = mp->b_next;
2250                         continue;
2251                 }
2252                 tail->b_next = NULL;
2253                 /*
2254                  * A NULL flow, or one whose fe_rx_srs_cnt is 0, is handled by
2255                  * the mac_srs itself. Drop the lookup's hold in both cases.
2256                  */
2257                 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2258                         mac_rx_srs_process(arg,
2259                             (mac_resource_handle_t)mac_srs, mp_chain,
2260                             loopback);
2261                 } else {
2262                         (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2263                             prev_flent->fe_cb_arg2, mp_chain, loopback);
2264                 }
2265                 if (prev_flent != NULL)
2266                         FLOW_REFRELE(prev_flent);
2267                 prev_flent = flent;
2268                 flent = NULL;
2269                 mp_chain = mp;
2270                 tail = mp;
2271                 mp = mp->b_next;
2272         }
2273         /* Last chain; same delivery and release rules as in the loop */
2274         ASSERT(mp_chain != NULL);
2275         if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2276                 mac_rx_srs_process(arg,
2277                     (mac_resource_handle_t)mac_srs, mp_chain, loopback);
2278         } else {
2279                 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2280                     prev_flent->fe_cb_arg2, mp_chain, loopback);
2281         }
2282         if (prev_flent != NULL)
2283                 FLOW_REFRELE(prev_flent);
2284 }
2285 
2286 /*
2287  * mac_rx_srs_process
2288  *
2289  * Receive side routine called from the interrupt path. Queues mp_chain on
2290  * the SRS (dropping what exceeds the B/W threshold or sr_hiwat) and then
2291  * wakes the worker or drains inline. loopback is set to force a context
2292  * switch on the loopback path between MAC clients.
2293  */
2294 /* ARGSUSED */
2295 void
2296 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
2297     boolean_t loopback)
2298 {
2299         mac_soft_ring_set_t     *mac_srs = (mac_soft_ring_set_t *)srs;
2300         mblk_t                  *mp, *tail, *head;
2301         int                     count = 0;      /* packets in mp_chain */
2302         int                     count1;         /* packets under threshold */
2303         size_t                  sz = 0;         /* msgdsize() sum of chain */
2304         size_t                  chain_sz, sz1;
2305         mac_bw_ctl_t            *mac_bw;
2306         mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
2307 
2308         /*
2309          * Set the tail, count and sz. We set the sz irrespective
2310          * of whether we are doing B/W control or not for the
2311          * purpose of updating the stats.
2312          */
2313         mp = tail = mp_chain;
2314         while (mp != NULL) {
2315                 tail = mp;
2316                 count++;
2317                 sz += msgdsize(mp);
2318                 mp = mp->b_next;
2319         }
2320 
2321         mutex_enter(&mac_srs->srs_lock);
2322 
2323         if (loopback) {
2324                 SRS_RX_STAT_UPDATE(mac_srs, lclbytes, sz);
2325                 SRS_RX_STAT_UPDATE(mac_srs, lclcnt, count);
2326 
2327         } else {
2328                 SRS_RX_STAT_UPDATE(mac_srs, intrbytes, sz);
2329                 SRS_RX_STAT_UPDATE(mac_srs, intrcnt, count);
2330         }
2331 
2332         /*
2333          * If the SRS is already being processed; has been blanked;
2334          * can be processed by worker thread only; or the B/W limit
2335          * has been reached, then queue the chain and check if
2336          * worker thread needs to be awakened.
2337          */
2338         if (mac_srs->srs_type & SRST_BW_CONTROL) {
2339                 mac_bw = mac_srs->srs_bw;
2340                 ASSERT(mac_bw != NULL);
2341                 mutex_enter(&mac_bw->mac_bw_lock);
2342                 mac_bw->mac_bw_intr += sz;
2343                 if (mac_bw->mac_bw_limit == 0) {
2344                         /* zero bandwidth: drop all */
2345                         srs_rx->sr_stat.mrs_sdrops += count;
2346                         mac_bw->mac_bw_drop_bytes += sz;
2347                         mutex_exit(&mac_bw->mac_bw_lock);
2348                         mutex_exit(&mac_srs->srs_lock);
2349                         mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
2350                         return;
2351                 } else {
2352                         if ((mac_bw->mac_bw_sz + sz) <=
2353                             mac_bw->mac_bw_drop_threshold) {
2354                                 mutex_exit(&mac_bw->mac_bw_lock);
2355                                 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
2356                                     tail, count, sz);
2357                         } else {
2358                                 mp = mp_chain;
2359                                 chain_sz = 0;
2360                                 count1 = 0;
2361                                 tail = NULL;    /* last packet that fits */
2362                                 head = NULL;    /* first packet to drop */
2363                                 while (mp != NULL) {
2364                                         sz1 = msgdsize(mp);
2365                                         if (mac_bw->mac_bw_sz + chain_sz + sz1 >
2366                                             mac_bw->mac_bw_drop_threshold)
2367                                                 break;
2368                                         chain_sz += sz1;
2369                                         count1++;
2370                                         tail = mp;
2371                                         mp = mp->b_next;
2372                                 }
2373                                 mutex_exit(&mac_bw->mac_bw_lock);
2374                                 if (tail != NULL) {
2375                                         head = tail->b_next;
2376                                         tail->b_next = NULL;
2377                                         MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
2378                                             mp_chain, tail, count1, chain_sz);
2379                                         sz -= chain_sz; /* bytes left over */
2380                                         count -= count1; /* pkts left over */
2381                                 } else {
2382                                         /* Can't pick up any */
2383                                         head = mp_chain;
2384                                 }
2385                                 if (head != NULL) {
2386                                         /* Drop any packet over the threshold */
2387                                         srs_rx->sr_stat.mrs_sdrops += count;
2388                                         mutex_enter(&mac_bw->mac_bw_lock);
2389                                         mac_bw->mac_bw_drop_bytes += sz;
2390                                         mutex_exit(&mac_bw->mac_bw_lock);
2391                                         freemsgchain(head);
2392                                 }
2393                         }
2394                         MAC_SRS_WORKER_WAKEUP(mac_srs);
2395                         mutex_exit(&mac_srs->srs_lock);
2396                         return;
2397                 }
2398         }
2399 
2400         /*
2401          * If the total number of packets queued in the SRS and its
2402          * associated soft rings exceeds the max allowed, drop the chain.
2403          * If we are polling capable, this shouldn't be happening. (B/W
2404          * controlled SRSes have already returned above.)
2405          */
2406         if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
2407             (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
2408                 mac_bw = mac_srs->srs_bw;
2409                 srs_rx->sr_stat.mrs_sdrops += count;
2410                 mutex_enter(&mac_bw->mac_bw_lock);
2411                 mac_bw->mac_bw_drop_bytes += sz;
2412                 mutex_exit(&mac_bw->mac_bw_lock);
2413                 freemsgchain(mp_chain);
2414                 mutex_exit(&mac_srs->srs_lock);
2415                 return;
2416         }
2417 
2418         MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
2419 
2420         if (!(mac_srs->srs_state & SRS_PROC)) {
2421                 /*
2422                  * If we are coming via loopback or if we are not
2423                  * optimizing for latency, we should signal the
2424                  * worker thread.
2425                  */
2426                 if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) {
2427                         /*
2428                          * For loopback, We need to let the worker take
2429                          * over as we don't want to continue in the same
2430                          * thread even if we can. This could lead to stack
2431                          * overflows and may also end up using
2432                          * resources (cpu) incorrectly.
2433                          */
2434                         cv_signal(&mac_srs->srs_async);
2435                 } else {
2436                         /*
2437                          * No one is processing the SRS, this is not a
2438                          * loopback chain and SRS_LATENCY_OPT is set, so
2439                          * drain inline from this thread as SRS_PROC_FAST
2440                          * (in the latency optimized case we inline
2441                          * process chains of any size).
2442                          */
2443                         mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
2444                 }
2445         }
2446         mutex_exit(&mac_srs->srs_lock);
2447 }
2448 
2449 /* TX SIDE ROUTINES (RUNTIME) */
2450 
2451 /*
2452  * mac_tx_srs_no_desc
2453  *
2454  * Handles a chain that could not go out because the Tx ring has no
2455  * descs; called by the single ring default Tx mode. Returns the cookie.
2456  */
2457 mac_tx_cookie_t
2458 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2459     uint16_t flag, mblk_t **ret_mp)
2460 {
2461         mac_tx_cookie_t cookie = NULL;
2462         boolean_t wakeup_worker;
2463         uint32_t tx_mode = mac_srs->srs_tx.st_mode;
2464         mblk_t *tail;
2465         int cnt, sz;
2466 
2467         ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
2468 
2469         /* The caller asked for drops instead of queueing. */
2470         if (flag & MAC_DROP_ON_NO_DESC) {
2471                 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2472                 return (cookie);
2473         }
2474 
2475         /* The worker is only signalled if the SRS queue was empty. */
2476         wakeup_worker = (mac_srs->srs_first == NULL) ? B_TRUE : B_FALSE;
2477         MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2478 
2479         if (!(flag & MAC_TX_NO_ENQUEUE)) {
2480                 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2481                     tail, cnt, sz, cookie);
2482         } else if (wakeup_worker) {
2483                 /*
2484                  * Nothing was queued yet: queue the chain and let
2485                  * mac_tx_srs_drain() set the TX_BLOCKED bit.
2486                  */
2487                 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2488                     mp_chain, tail, cnt, sz);
2489         } else {
2490                 /* Something is already queued: return the mblks. */
2491                 MAC_TX_SET_NO_ENQUEUE(mac_srs,
2492                     mp_chain, ret_mp, cookie);
2493         }
2494 
2495         if (wakeup_worker)
2496                 cv_signal(&mac_srs->srs_async);
2497         return (cookie);
2498 }
2499 
2500 /*
2501  * mac_tx_srs_enqueue
2502  *
2503  * Called when the Tx SRS operates in serializer or bandwidth mode.
2504  * In serializer mode a packet is enqueued when a thread cannot enter
2505  * the SRS exclusively; in bandwidth mode packets are queued once the
2506  * byte-count allowed for a tick is exceeded. MAC_DROP_ON_NO_DESC and
2507  * MAC_TX_NO_ENQUEUE are treated differently here than in the default
2508  * or fanout modes: packets are dropped, or returned to the caller,
2509  * only after a hi-watermark worth of data has been queued. The
2510  * worker is signalled when the SRS queue was empty on entry, and the
2511  * resulting cookie is returned.
2512  */
2513 static mac_tx_cookie_t
2514 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2515     uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
2516 {
2517         mac_tx_cookie_t cookie = NULL;
2518         boolean_t wakeup_worker;
2519         boolean_t over_hiwat;
2520         mblk_t *tail;
2521         int cnt, sz;
2522 
2523         /* A fanout hint is only of use with multiple tx rings. */
2524         if (!MAC_TX_SOFT_RINGS(mac_srs))
2525                 fanout_hint = 0;
2526 
2527         /* The worker needs a signal only if nothing was queued before. */
2528         wakeup_worker = (mac_srs->srs_first == NULL) ? B_TRUE : B_FALSE;
2529         MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2530         over_hiwat = (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ?
2531             B_TRUE : B_FALSE;
2532 
2533         if (flag & MAC_DROP_ON_NO_DESC) {
2534                 /* Queue up to the hi-watermark, drop beyond it. */
2535                 if (over_hiwat) {
2536                         MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2537                 } else {
2538                         MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2539                             mp_chain, tail, cnt, sz);
2540                 }
2541         } else if (!(flag & MAC_TX_NO_ENQUEUE)) {
2542                 /*
2543                  * If you are BW_ENFORCED, just enqueue the packet;
2544                  * srs_worker will drain it at the prescribed rate.
2545                  * The fanout hint is saved in b_prev beforehand.
2546                  */
2547                 mp_chain->b_prev = (mblk_t *)fanout_hint;
2548                 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2549                     tail, cnt, sz, cookie);
2550         } else if (over_hiwat ||
2551             (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
2552                 MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
2553                     ret_mp, cookie);
2554         } else {
2555                 mp_chain->b_prev = (mblk_t *)fanout_hint;
2556                 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2557                     mp_chain, tail, cnt, sz);
2558         }
2559         if (wakeup_worker)
2560                 cv_signal(&mac_srs->srs_async);
2561         return (cookie);
2562 }
2563 
2564 /*
2565  * There are seven tx modes:
2566  *
2567  * 1) Default mode (SRS_TX_DEFAULT)
2568  * 2) Serialization mode (SRS_TX_SERIALIZE)
2569  * 3) Fanout mode (SRS_TX_FANOUT)
2570  * 4) Bandwidth mode (SRS_TX_BW)
2571  * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
2572  * 6) aggr Tx mode (SRS_TX_AGGR)
2573  * 7) aggr Tx bw mode (SRS_TX_BW_AGGR)
2574  *
2575  * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
2576  * based on the number of Tx rings requested for an SRS and whether
2577  * bandwidth control is requested or not.
2578  *
2579  * The default mode (i.e., no fanout/no bandwidth) is used when the
2580  * underlying NIC does not have Tx rings or just one Tx ring. In this mode,
2581  * the SRS acts as a pass-thru. Packets will go directly to mac_tx_send().
2582  * When the underlying Tx ring runs out of Tx descs, it starts queueing up
2583  * packets in SRS. When flow-control is relieved, the srs_worker drains
2584  * the queued packets and informs blocked clients to restart sending
2585  * packets.
2586  *
2587  * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. This
2588  * mode is used when the link has no Tx rings or only one Tx ring.
2589  *
2590  * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
2591  * Tx rings. Each Tx ring will have a soft ring associated with it.
2592  * These soft rings will be hung off the Tx SRS. Queueing if it happens
2593  * due to lack of Tx desc will be in individual soft ring (and not srs)
2594  * associated with Tx ring.
2595  *
2596  * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
2597  * only if bw is available. Otherwise the packets will be queued in
2598  * SRS. If fanout to multiple Tx rings is configured, the packets will
2599  * be fanned out among the soft rings associated with the Tx rings.
2600  *
2601  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
2602  * invokes an aggr function, aggr_find_tx_ring(), to find a pseudo Tx ring
2603  * belonging to a port on which the packet has to be sent. Aggr will
2604  * always have a pseudo Tx ring associated with it even when it is an
2605  * aggregation over a single NIC that has no Tx rings. Even in such a
2606  * case, the single pseudo Tx ring will have a soft ring associated with
2607  * it and the soft ring will hang off the SRS.
2608  *
2609  * If a bandwidth is specified for an aggr, SRS_TX_BW_AGGR mode is used.
2610  * In this mode, the bandwidth is first applied on the outgoing packets
2611  * and later mac_tx_aggr_mode() function is called to send the packet out
2612  * of one of the pseudo Tx rings.
2613  *
2614  * Three flags are used in srs_state for indicating flow control
2615  * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
2616  * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
2617  * driver below.
2618  * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
2619  * and flow-control pressure is applied back to clients. The clients expect
2620  * wakeup when flow-control is relieved.
2621  * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk
2622  * got returned back to client either due to lack of Tx descs or due to bw
2623  * control reasons. The clients expect a wakeup when condition is relieved.
2624  *
2625  * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
2626  * some clients set the following values too: MAC_DROP_ON_NO_DESC,
2627  * MAC_TX_NO_ENQUEUE
2628  * Mac clients that do not want packets to be enqueued in the mac layer set
2629  * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
2630  * Tx soft rings but instead get dropped when the NIC runs out of desc. The
2631  * behaviour of this flag is different when the Tx is running in serializer
2632  * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packet
2633  * get dropped when Tx high watermark is reached.
2634  * There are some mac clients like vsw, aggr that want the mblks to be
2635  * returned back to clients instead of being queued in Tx SRS (or Tx soft
2636  * rings) under flow-control (i.e., out of desc or exceeding bw limits)
2637  * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
2638  * In the default and Tx fanout mode, the un-transmitted mblks will be
2639  * returned back to the clients when the driver runs out of Tx descs.
2640  * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
2641  * soft ring) so that the clients can be woken up when Tx desc become
2642  * available. When running in serializer or bandwidth mode,
2643  * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
2644  */
2645 
2646 mac_tx_func_t
2647 mac_tx_get_func(uint32_t mode)
2648 {
2649         return (mac_tx_mode_list[mode].mac_tx_func);
2650 }
2651 
2652 /* ARGSUSED */
2653 static mac_tx_cookie_t
2654 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2655     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2656 {
2657         mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
2658         mac_tx_stats_t          stats;
2659         mac_tx_cookie_t         cookie = NULL;
2660 
2661         ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
2662 
2663         /* Regular case with a single Tx ring */
2664         /*
2665          * SRS_TX_BLOCKED is set when underlying NIC runs
2666          * out of Tx descs and messages start getting
2667          * queued. It won't get reset until
2668          * tx_srs_drain() completely drains out the
2669          * messages.
2670          */
2671         if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2672                 /* Tx descs/resources not available */
2673                 mutex_enter(&mac_srs->srs_lock);
2674                 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2675                         cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
2676                             flag, ret_mp);
2677                         mutex_exit(&mac_srs->srs_lock);
2678                         return (cookie);
2679                 }
2680                 /*
2681                  * While we were computing mblk count, the
2682                  * flow control condition got relieved.
2683                  * Continue with the transmission.
2684                  */
2685                 mutex_exit(&mac_srs->srs_lock);
2686         }
2687 
2688         mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2689             mp_chain, &stats);
2690 
2691         /*
2692          * Multiple threads could be here sending packets.
2693          * Under such conditions, it is not possible to
2694          * automically set SRS_TX_BLOCKED bit to indicate
2695          * out of tx desc condition. To atomically set
2696          * this, we queue the returned packet and do
2697          * the setting of SRS_TX_BLOCKED in
2698          * mac_tx_srs_drain().
2699          */
2700         if (mp_chain != NULL) {
2701                 mutex_enter(&mac_srs->srs_lock);
2702                 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
2703                 mutex_exit(&mac_srs->srs_lock);
2704                 return (cookie);
2705         }
2706         SRS_TX_STATS_UPDATE(mac_srs, &stats);
2707 
2708         return (NULL);
2709 }
2710 
2711 /*
2712  * mac_tx_serialize_mode
2713  *
2714  * This is an experimental mode implemented as per the request of PAE.
2715  * In this mode, all callers attempting to send a packet to the NIC
2716  * will get serialized. Only one thread at any time will access the
2717  * NIC to send the packet out.
2718  */
2719 /* ARGSUSED */
2720 static mac_tx_cookie_t
2721 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2722     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2723 {
2724         mac_tx_stats_t          stats;
2725         mac_tx_cookie_t         cookie = NULL;
2726         mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
2727 
2728         /* Single ring, serialize below */
2729         ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
2730         mutex_enter(&mac_srs->srs_lock);
2731         if ((mac_srs->srs_first != NULL) ||
2732             (mac_srs->srs_state & SRS_PROC)) {
2733                 /*
2734                  * In serialization mode, queue all packets until
2735                  * TX_HIWAT is set.
2736                  * If drop bit is set, drop if TX_HIWAT is set.
2737                  * If no_enqueue is set, still enqueue until hiwat
2738                  * is set and return mblks after TX_HIWAT is set.
2739                  */
2740                 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
2741                     flag, NULL, ret_mp);
2742                 mutex_exit(&mac_srs->srs_lock);
2743                 return (cookie);
2744         }
2745         /*
2746          * No packets queued, nothing on proc and no flow
2747          * control condition. Fast-path, ok. Do inline
2748          * processing.
2749          */
2750         mac_srs->srs_state |= SRS_PROC;
2751         mutex_exit(&mac_srs->srs_lock);
2752 
2753         mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2754             mp_chain, &stats);
2755 
2756         mutex_enter(&mac_srs->srs_lock);
2757         mac_srs->srs_state &= ~SRS_PROC;
2758         if (mp_chain != NULL) {
2759                 cookie = mac_tx_srs_enqueue(mac_srs,
2760                     mp_chain, flag, NULL, ret_mp);
2761         }
2762         if (mac_srs->srs_first != NULL) {
2763                 /*
2764                  * We processed inline our packet and a new
2765                  * packet/s got queued while we were
2766                  * processing. Wakeup srs worker
2767                  */
2768                 cv_signal(&mac_srs->srs_async);
2769         }
2770         mutex_exit(&mac_srs->srs_lock);
2771 
2772         if (cookie == NULL)
2773                 SRS_TX_STATS_UPDATE(mac_srs, &stats);
2774 
2775         return (cookie);
2776 }
2777 
2778 /*
2779  * mac_tx_fanout_mode
2780  *
2781  * In this mode, the SRS will have access to multiple Tx rings to send
2782  * the packet out. The fanout hint that is passed as an argument is
2783  * used to find an appropriate ring to fanout the traffic. Each Tx
2784  * ring, in turn,  will have a soft ring associated with it. If a Tx
2785  * ring runs out of Tx desc's the returned packet will be queued in
2786  * the soft ring associated with that Tx ring. The srs itself will not
2787  * queue any packets.
2788  */
2789 
/*
 * Hand `chain' to the Tx soft ring selected by the caller's `hash'.
 *
 * The macro relies on names in the expanding function's scope: it reads
 * mac_srs, hash, flag and ret_mp, and assigns index, softring and
 * cookie. The body is wrapped in do { } while (0) so the expansion
 * behaves as a single statement (e.g. in an unbraced if/else), each
 * statement is terminated with a semicolon, and the argument is
 * parenthesized.
 */
#define MAC_TX_SOFT_RING_PROCESS(chain) do {                            \
        index = COMPUTE_INDEX(hash, mac_srs->srs_tx_ring_count);        \
        softring = mac_srs->srs_tx_soft_rings[index];                   \
        cookie = mac_tx_soft_ring_process(softring, (chain), flag,      \
            ret_mp);                                                    \
        DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index);       \
} while (0)
2796 
2797 static mac_tx_cookie_t
2798 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2799     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2800 {
2801         mac_soft_ring_t         *softring;
2802         uint64_t                hash;
2803         uint_t                  index;
2804         mac_tx_cookie_t         cookie = NULL;
2805 
2806         ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
2807             mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
2808         if (fanout_hint != 0) {
2809                 /*
2810                  * The hint is specified by the caller, simply pass the
2811                  * whole chain to the soft ring.
2812                  */
2813                 hash = HASH_HINT(fanout_hint);
2814                 MAC_TX_SOFT_RING_PROCESS(mp_chain);
2815         } else {
2816                 mblk_t *last_mp, *cur_mp, *sub_chain;
2817                 uint64_t last_hash = 0;
2818                 uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media;
2819 
2820                 /*
2821                  * Compute the hash from the contents (headers) of the
2822                  * packets of the mblk chain. Split the chains into
2823                  * subchains of the same conversation.
2824                  *
2825                  * Since there may be more than one ring used for
2826                  * sub-chains of the same call, and since the caller
2827                  * does not maintain per conversation state since it
2828                  * passed a zero hint, unsent subchains will be
2829                  * dropped.
2830                  */
2831 
2832                 flag |= MAC_DROP_ON_NO_DESC;
2833                 ret_mp = NULL;
2834 
2835                 ASSERT(ret_mp == NULL);
2836 
2837                 sub_chain = NULL;
2838                 last_mp = NULL;
2839 
2840                 for (cur_mp = mp_chain; cur_mp != NULL;
2841                     cur_mp = cur_mp->b_next) {
2842                         hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4,
2843                             B_TRUE);
2844                         if (last_hash != 0 && hash != last_hash) {
2845                                 /*
2846                                  * Starting a different subchain, send current
2847                                  * chain out.
2848                                  */
2849                                 ASSERT(last_mp != NULL);
2850                                 last_mp->b_next = NULL;
2851                                 MAC_TX_SOFT_RING_PROCESS(sub_chain);
2852                                 sub_chain = NULL;
2853                         }
2854 
2855                         /* add packet to subchain */
2856                         if (sub_chain == NULL)
2857                                 sub_chain = cur_mp;
2858                         last_mp = cur_mp;
2859                         last_hash = hash;
2860                 }
2861 
2862                 if (sub_chain != NULL) {
2863                         /* send last subchain */
2864                         ASSERT(last_mp != NULL);
2865                         last_mp->b_next = NULL;
2866                         MAC_TX_SOFT_RING_PROCESS(sub_chain);
2867                 }
2868 
2869                 cookie = NULL;
2870         }
2871 
2872         return (cookie);
2873 }
2874 
2875 /*
2876  * mac_tx_bw_mode
2877  *
2878  * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
2879  * only if bw is available. Otherwise the packets will be queued in
2880  * SRS. If the SRS has multiple Tx rings, then packets will get fanned
2881  * out to a Tx rings.
2882  */
2883 static mac_tx_cookie_t
2884 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2885     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2886 {
2887         int                     cnt, sz;
2888         mblk_t                  *tail;
2889         mac_tx_cookie_t         cookie = NULL;
2890         mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
2891         clock_t                 now;
2892 
2893         ASSERT(TX_BANDWIDTH_MODE(mac_srs));
2894         ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
2895         mutex_enter(&mac_srs->srs_lock);
2896         if (mac_srs->srs_bw->mac_bw_limit == 0) {
2897                 /*
2898                  * zero bandwidth, no traffic is sent: drop the packets,
2899                  * or return the whole chain if the caller requests all
2900                  * unsent packets back.
2901                  */
2902                 if (flag & MAC_TX_NO_ENQUEUE) {
2903                         cookie = (mac_tx_cookie_t)mac_srs;
2904                         *ret_mp = mp_chain;
2905                 } else {
2906                         MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2907                 }
2908                 mutex_exit(&mac_srs->srs_lock);
2909                 return (cookie);
2910         } else if ((mac_srs->srs_first != NULL) ||
2911             (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2912                 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2913                     fanout_hint, ret_mp);
2914                 mutex_exit(&mac_srs->srs_lock);
2915                 return (cookie);
2916         }
2917         MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2918         now = ddi_get_lbolt();
2919         if (mac_srs->srs_bw->mac_bw_curr_time != now) {
2920                 mac_srs->srs_bw->mac_bw_curr_time = now;
2921                 mac_srs->srs_bw->mac_bw_used = 0;
2922         } else if (mac_srs->srs_bw->mac_bw_used >
2923             mac_srs->srs_bw->mac_bw_limit) {
2924                 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
2925                 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2926                     mp_chain, tail, cnt, sz);
2927                 /*
2928                  * Wakeup worker thread. Note that worker
2929                  * thread has to be woken up so that it
2930                  * can fire up the timer to be woken up
2931                  * on the next tick. Also once
2932                  * BW_ENFORCED is set, it can only be
2933                  * reset by srs_worker thread. Until then
2934                  * all packets will get queued up in SRS
2935                  * and hence this this code path won't be
2936                  * entered until BW_ENFORCED is reset.
2937                  */
2938                 cv_signal(&mac_srs->srs_async);
2939                 mutex_exit(&mac_srs->srs_lock);
2940                 return (cookie);
2941         }
2942 
2943         mac_srs->srs_bw->mac_bw_used += sz;
2944         mutex_exit(&mac_srs->srs_lock);
2945 
2946         if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
2947                 mac_soft_ring_t *softring;
2948                 uint_t indx, hash;
2949 
2950                 hash = HASH_HINT(fanout_hint);
2951                 indx = COMPUTE_INDEX(hash,
2952                     mac_srs->srs_tx_ring_count);
2953                 softring = mac_srs->srs_tx_soft_rings[indx];
2954                 return (mac_tx_soft_ring_process(softring, mp_chain, flag,
2955                     ret_mp));
2956         } else if (srs_tx->st_mode == SRS_TX_BW_AGGR) {
2957                 return (mac_tx_aggr_mode(mac_srs, mp_chain,
2958                     fanout_hint, flag, ret_mp));
2959         } else {
2960                 mac_tx_stats_t          stats;
2961 
2962                 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2963                     mp_chain, &stats);
2964 
2965                 if (mp_chain != NULL) {
2966                         mutex_enter(&mac_srs->srs_lock);
2967                         MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2968                         if (mac_srs->srs_bw->mac_bw_used > sz)
2969                                 mac_srs->srs_bw->mac_bw_used -= sz;
2970                         else
2971                                 mac_srs->srs_bw->mac_bw_used = 0;
2972                         cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2973                             fanout_hint, ret_mp);
2974                         mutex_exit(&mac_srs->srs_lock);
2975                         return (cookie);
2976                 }
2977                 SRS_TX_STATS_UPDATE(mac_srs, &stats);
2978 
2979                 return (NULL);
2980         }
2981 }
2982 
2983 /*
2984  * mac_tx_aggr_mode
2985  *
2986  * This routine invokes an aggr function, aggr_find_tx_ring(), to find
2987  * a (pseudo) Tx ring belonging to a port on which the packet has to
2988  * be sent. aggr_find_tx_ring() first finds the outgoing port based on
2989  * L2/L3/L4 policy and then uses the fanout_hint passed to it to pick
2990  * a Tx ring from the selected port.
2991  *
2992  * Note that a port can be deleted from the aggregation. In such a case,
2993  * the aggregation layer first separates the port from the rest of the
2994  * ports making sure that port (and thus any Tx rings associated with
2995  * it) won't get selected in the call to aggr_find_tx_ring() function.
2996  * Later calls are made to mac_group_rem_ring() passing pseudo Tx ring
2997  * handles one by one which in turn will quiesce the Tx SRS and remove
2998  * the soft ring associated with the pseudo Tx ring. Unlike Rx side
2999  * where a cookie is used to protect against mac_rx_ring() calls on
3000  * rings that have been removed, no such cookie is needed on the Tx
3001  * side as the pseudo Tx ring won't be available anymore to
3002  * aggr_find_tx_ring() once the port has been removed.
3003  */
3004 static mac_tx_cookie_t
3005 mac_tx_aggr_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
3006     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
3007 {
3008         mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
3009         mac_tx_ring_fn_t        find_tx_ring_fn;
3010         mac_ring_handle_t       ring = NULL;
3011         void                    *arg;
3012         mac_soft_ring_t         *sringp;
3013 
3014         find_tx_ring_fn = srs_tx->st_capab_aggr.mca_find_tx_ring_fn;
3015         arg = srs_tx->st_capab_aggr.mca_arg;
3016         if (find_tx_ring_fn(arg, mp_chain, fanout_hint, &ring) == NULL)
3017                 return (NULL);
3018         sringp = srs_tx->st_soft_rings[((mac_ring_t *)ring)->mr_index];
3019         return (mac_tx_soft_ring_process(sringp, mp_chain, flag, ret_mp));
3020 }
3021 
3022 void
3023 mac_tx_invoke_callbacks(mac_client_impl_t *mcip, mac_tx_cookie_t cookie)
3024 {
3025         mac_cb_t *mcb;
3026         mac_tx_notify_cb_t *mtnfp;
3027 
3028         /* Wakeup callback registered clients */
3029         MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
3030         for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
3031             mcb = mcb->mcb_nextp) {
3032                 mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
3033                 mtnfp->mtnf_fn(mtnfp->mtnf_arg, cookie);
3034         }
3035         MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
3036             &mcip->mci_tx_notify_cb_list);
3037 }
3038 
3039 /* ARGSUSED */
3040 void
3041 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
3042 {
3043         mblk_t                  *head, *tail;
3044         size_t                  sz;
3045         uint32_t                tx_mode;
3046         uint_t                  saved_pkt_count;
3047         mac_tx_stats_t          stats;
3048         mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
3049         clock_t                 now;
3050 
3051         saved_pkt_count = 0;
3052         ASSERT(mutex_owned(&mac_srs->srs_lock));
3053         ASSERT(!(mac_srs->srs_state & SRS_PROC));
3054 
3055         mac_srs->srs_state |= SRS_PROC;
3056 
3057         tx_mode = srs_tx->st_mode;
3058         if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
3059                 if (mac_srs->srs_first != NULL) {
3060                         head = mac_srs->srs_first;
3061                         tail = mac_srs->srs_last;
3062                         saved_pkt_count = mac_srs->srs_count;
3063                         mac_srs->srs_first = NULL;
3064                         mac_srs->srs_last = NULL;
3065                         mac_srs->srs_count = 0;
3066                         mutex_exit(&mac_srs->srs_lock);
3067 
3068                         head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3069                             head, &stats);
3070 
3071                         mutex_enter(&mac_srs->srs_lock);
3072                         if (head != NULL) {
3073                                 /* Device out of tx desc, set block */
3074                                 if (head->b_next == NULL)
3075                                         VERIFY(head == tail);
3076                                 tail->b_next = mac_srs->srs_first;
3077                                 mac_srs->srs_first = head;
3078                                 mac_srs->srs_count +=
3079                                     (saved_pkt_count - stats.mts_opackets);
3080                                 if (mac_srs->srs_last == NULL)
3081                                         mac_srs->srs_last = tail;
3082                                 MAC_TX_SRS_BLOCK(mac_srs, head);
3083                         } else {
3084                                 srs_tx->st_woken_up = B_FALSE;
3085                                 SRS_TX_STATS_UPDATE(mac_srs, &stats);
3086                         }
3087                 }
3088         } else if (tx_mode == SRS_TX_BW) {
3089                 /*
3090                  * We are here because the timer fired and we have some data
3091                  * to tranmit. Also mac_tx_srs_worker should have reset
3092                  * SRS_BW_ENFORCED flag
3093                  */
3094                 ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
3095                 head = tail = mac_srs->srs_first;
3096                 while (mac_srs->srs_first != NULL) {
3097                         tail = mac_srs->srs_first;
3098                         tail->b_prev = NULL;
3099                         mac_srs->srs_first = tail->b_next;
3100                         if (mac_srs->srs_first == NULL)
3101                                 mac_srs->srs_last = NULL;
3102                         mac_srs->srs_count--;
3103                         sz = msgdsize(tail);
3104                         mac_srs->srs_size -= sz;
3105                         saved_pkt_count++;
3106                         MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
3107 
3108                         if (mac_srs->srs_bw->mac_bw_used <
3109                             mac_srs->srs_bw->mac_bw_limit)
3110                                 continue;
3111 
3112                         now = ddi_get_lbolt();
3113                         if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3114                                 mac_srs->srs_bw->mac_bw_curr_time = now;
3115                                 mac_srs->srs_bw->mac_bw_used = sz;
3116                                 continue;
3117                         }
3118                         mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3119                         break;
3120                 }
3121 
3122                 ASSERT((head == NULL && tail == NULL) ||
3123                     (head != NULL && tail != NULL));
3124                 if (tail != NULL) {
3125                         tail->b_next = NULL;
3126                         mutex_exit(&mac_srs->srs_lock);
3127 
3128                         head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3129                             head, &stats);
3130 
3131                         mutex_enter(&mac_srs->srs_lock);
3132                         if (head != NULL) {
3133                                 uint_t size_sent;
3134 
3135                                 /* Device out of tx desc, set block */
3136                                 if (head->b_next == NULL)
3137                                         VERIFY(head == tail);
3138                                 tail->b_next = mac_srs->srs_first;
3139                                 mac_srs->srs_first = head;
3140                                 mac_srs->srs_count +=
3141                                     (saved_pkt_count - stats.mts_opackets);
3142                                 if (mac_srs->srs_last == NULL)
3143                                         mac_srs->srs_last = tail;
3144                                 size_sent = sz - stats.mts_obytes;
3145                                 mac_srs->srs_size += size_sent;
3146                                 mac_srs->srs_bw->mac_bw_sz += size_sent;
3147                                 if (mac_srs->srs_bw->mac_bw_used > size_sent) {
3148                                         mac_srs->srs_bw->mac_bw_used -=
3149                                             size_sent;
3150                                 } else {
3151                                         mac_srs->srs_bw->mac_bw_used = 0;
3152                                 }
3153                                 MAC_TX_SRS_BLOCK(mac_srs, head);
3154                         } else {
3155                                 srs_tx->st_woken_up = B_FALSE;
3156                                 SRS_TX_STATS_UPDATE(mac_srs, &stats);
3157                         }
3158                 }
3159         } else if (tx_mode == SRS_TX_BW_FANOUT || tx_mode == SRS_TX_BW_AGGR) {
3160                 mblk_t *prev;
3161                 uint64_t hint;
3162 
3163                 /*
3164                  * We are here because the timer fired and we
3165                  * have some quota to tranmit.
3166                  */
3167                 prev = NULL;
3168                 head = tail = mac_srs->srs_first;
3169                 while (mac_srs->srs_first != NULL) {
3170                         tail = mac_srs->srs_first;
3171                         mac_srs->srs_first = tail->b_next;
3172                         if (mac_srs->srs_first == NULL)
3173                                 mac_srs->srs_last = NULL;
3174                         mac_srs->srs_count--;
3175                         sz = msgdsize(tail);
3176                         mac_srs->srs_size -= sz;
3177                         mac_srs->srs_bw->mac_bw_used += sz;
3178                         if (prev == NULL)
3179                                 hint = (ulong_t)tail->b_prev;
3180                         if (hint != (ulong_t)tail->b_prev) {
3181                                 prev->b_next = NULL;
3182                                 mutex_exit(&mac_srs->srs_lock);
3183                                 TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3184                                 head = tail;
3185                                 hint = (ulong_t)tail->b_prev;
3186                                 mutex_enter(&mac_srs->srs_lock);
3187                         }
3188 
3189                         prev = tail;
3190                         tail->b_prev = NULL;
3191                         if (mac_srs->srs_bw->mac_bw_used <
3192                             mac_srs->srs_bw->mac_bw_limit)
3193                                 continue;
3194 
3195                         now = ddi_get_lbolt();
3196                         if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3197                                 mac_srs->srs_bw->mac_bw_curr_time = now;
3198                                 mac_srs->srs_bw->mac_bw_used = 0;
3199                                 continue;
3200                         }
3201                         mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3202                         break;
3203                 }
3204                 ASSERT((head == NULL && tail == NULL) ||
3205                     (head != NULL && tail != NULL));
3206                 if (tail != NULL) {
3207                         tail->b_next = NULL;
3208                         mutex_exit(&mac_srs->srs_lock);
3209                         TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3210                         mutex_enter(&mac_srs->srs_lock);
3211                 }
3212         }
3213         /*
3214          * SRS_TX_FANOUT case not considered here because packets
3215          * won't be queued in the SRS for this case. Packets will
3216          * be sent directly to soft rings underneath and if there
3217          * is any queueing at all, it would be in Tx side soft
3218          * rings.
3219          */
3220 
3221         /*
3222          * When srs_count becomes 0, reset SRS_TX_HIWAT and
3223          * SRS_TX_WAKEUP_CLIENT and wakeup registered clients.
3224          */
3225         if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
3226             (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
3227                 mac_client_impl_t *mcip = mac_srs->srs_mcip;
3228                 boolean_t wakeup_required = B_FALSE;
3229 
3230                 if (mac_srs->srs_state &
3231                     (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
3232                         wakeup_required = B_TRUE;
3233                 }
3234                 mac_srs->srs_state &= ~(SRS_TX_HIWAT |
3235                     SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
3236                 mutex_exit(&mac_srs->srs_lock);
3237                 if (wakeup_required) {
3238                         mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)mac_srs);
3239                         /*
3240                          * If the client is not the primary MAC client, then we
3241                          * need to send the notification to the clients upper
3242                          * MAC, i.e. mci_upper_mip.
3243                          */
3244                         mac_tx_notify(mcip->mci_upper_mip != NULL ?
3245                             mcip->mci_upper_mip : mcip->mci_mip);
3246                 }
3247                 mutex_enter(&mac_srs->srs_lock);
3248         }
3249         mac_srs->srs_state &= ~SRS_PROC;
3250 }
3251 
3252 /*
3253  * Given a packet, get the flow_entry that identifies the flow
3254  * to which that packet belongs. The flow_entry will contain
3255  * the transmit function to be used to send the packet. If the
3256  * function returns NULL, the packet should be sent using the
3257  * underlying NIC.
3258  */
3259 static flow_entry_t *
3260 mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
3261 {
3262         flow_entry_t            *flent = NULL;
3263         mac_client_impl_t       *mcip;
3264         int     err;
3265 
3266         /*
3267          * Do classification on the packet.
3268          */
3269         err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
3270         if (err != 0)
3271                 return (NULL);
3272 
3273         /*
3274          * This flent might just be an additional one on the MAC client,
3275          * i.e. for classification purposes (different fdesc), however
3276          * the resources, SRS et. al., are in the mci_flent, so if
3277          * this isn't the mci_flent, we need to get it.
3278          */
3279         if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
3280                 FLOW_REFRELE(flent);
3281                 flent = mcip->mci_flent;
3282                 FLOW_TRY_REFHOLD(flent, err);
3283                 if (err != 0)
3284                         return (NULL);
3285         }
3286 
3287         return (flent);
3288 }
3289 
/*
 * This macro is only meant to be used by mac_tx_send().
 *
 * It has to be expanded inside a loop body: on failure it sets (mp) to
 * `next', increments `oerrors' and executes `continue', which acts on
 * the caller's loop. For that reason it must not be wrapped in a
 * do { } while (0) block. Besides its argument it uses the caller's
 * vid_check, add_tag, src_mcip, vid, next and oerrors.
 *
 * - When vid_check is set, MAC_VID_CHECK() is applied to the message;
 *   if it reports an error the message is freed and skipped.
 * - When add_tag is set, (mp) is replaced by what mac_add_vlan_tag()
 *   returns; a NULL result is counted as an error and skipped. Nothing
 *   is freed here in that case.
 *   NOTE(review): presumably mac_add_vlan_tag() has already consumed
 *   the message when it returns NULL - confirm.
 */
#define CHECK_VID_AND_ADD_TAG(mp) {                     \
        if (vid_check) {                                \
                int err = 0;                            \
                                                        \
                MAC_VID_CHECK(src_mcip, (mp), err);     \
                if (err != 0) {                         \
                        freemsg((mp));                  \
                        (mp) = next;                    \
                        oerrors++;                      \
                        continue;                       \
                }                                       \
        }                                               \
        if (add_tag) {                                  \
                (mp) = mac_add_vlan_tag((mp), 0, vid);  \
                if ((mp) == NULL) {                     \
                        (mp) = next;                    \
                        oerrors++;                      \
                        continue;                       \
                }                                       \
        }                                               \
}
3314 
3315 mblk_t *
3316 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
3317     mac_tx_stats_t *stats)
3318 {
3319         mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
3320         mac_impl_t *mip = src_mcip->mci_mip;
3321         uint_t obytes = 0, opackets = 0, oerrors = 0;
3322         mblk_t *mp = NULL, *next;
3323         boolean_t vid_check, add_tag;
3324         uint16_t vid = 0;
3325 
3326         if (mip->mi_nclients > 1) {
3327                 vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
3328                 add_tag = MAC_TAG_NEEDED(src_mcip);
3329                 if (add_tag)
3330                         vid = mac_client_vid(mch);
3331         } else {
3332                 ASSERT(mip->mi_nclients == 1);
3333                 vid_check = add_tag = B_FALSE;
3334         }
3335 
3336         /*
3337          * Fastpath: if there's only one client, we simply send
3338          * the packet down to the underlying NIC.
3339          */
3340         if (mip->mi_nactiveclients == 1) {
3341                 DTRACE_PROBE2(fastpath,
3342                     mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);
3343 
3344                 mp = mp_chain;
3345                 while (mp != NULL) {
3346                         next = mp->b_next;
3347                         mp->b_next = NULL;
3348                         opackets++;
3349                         obytes += (mp->b_cont == NULL ? MBLKL(mp) :
3350                             msgdsize(mp));
3351 
3352                         CHECK_VID_AND_ADD_TAG(mp);
3353                         MAC_TX(mip, ring, mp, src_mcip);
3354 
3355                         /*
3356                          * If the driver is out of descriptors and does a
3357                          * partial send it will return a chain of unsent
3358                          * mblks. Adjust the accounting stats.
3359                          */
3360                         if (mp != NULL) {
3361                                 opackets--;
3362                                 obytes -= msgdsize(mp);
3363                                 mp->b_next = next;
3364                                 break;
3365                         }
3366                         mp = next;
3367                 }
3368                 goto done;
3369         }
3370 
3371         /*
3372          * No fastpath, we either have more than one MAC client
3373          * defined on top of the same MAC, or one or more MAC
3374          * client promiscuous callbacks.
3375          */
3376         DTRACE_PROBE3(slowpath, mac_client_impl_t *,
3377             src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);
3378 
3379         mp = mp_chain;
3380         while (mp != NULL) {
3381                 flow_entry_t *dst_flow_ent;
3382                 void *flow_cookie;
3383                 size_t  pkt_size;
3384                 mblk_t *mp1;
3385 
3386                 next = mp->b_next;
3387                 mp->b_next = NULL;
3388                 opackets++;
3389                 pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
3390                 obytes += pkt_size;
3391                 CHECK_VID_AND_ADD_TAG(mp);
3392 
3393                 /*
3394                  * Find the destination.
3395                  */
3396                 dst_flow_ent = mac_tx_classify(mip, mp);
3397 
3398                 if (dst_flow_ent != NULL) {
3399                         size_t  hdrsize;
3400                         int     err = 0;
3401 
3402                         if (mip->mi_info.mi_nativemedia == DL_ETHER) {
3403                                 struct ether_vlan_header *evhp =
3404                                     (struct ether_vlan_header *)mp->b_rptr;
3405 
3406                                 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
3407                                         hdrsize = sizeof (*evhp);
3408                                 else
3409                                         hdrsize = sizeof (struct ether_header);
3410                         } else {
3411                                 mac_header_info_t       mhi;
3412 
3413                                 err = mac_header_info((mac_handle_t)mip,
3414                                     mp, &mhi);
3415                                 if (err == 0)
3416                                         hdrsize = mhi.mhi_hdrsize;
3417                         }
3418 
3419                         /*
3420                          * Got a matching flow. It's either another
3421                          * MAC client, or a broadcast/multicast flow.
3422                          * Make sure the packet size is within the
3423                          * allowed size. If not drop the packet and
3424                          * move to next packet.
3425                          */
3426                         if (err != 0 ||
3427                             (pkt_size - hdrsize) > mip->mi_sdu_max) {
3428                                 oerrors++;
3429                                 DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
3430                                     mblk_t *, mp);
3431                                 freemsg(mp);
3432                                 mp = next;
3433                                 FLOW_REFRELE(dst_flow_ent);
3434                                 continue;
3435                         }
3436                         flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
3437                         if (flow_cookie != NULL) {
3438                                 /*
3439                                  * The vnic_bcast_send function expects
3440                                  * to receive the sender MAC client
3441                                  * as value for arg2.
3442                                  */
3443                                 mac_bcast_send(flow_cookie, src_mcip, mp,
3444                                     B_TRUE);
3445                         } else {
3446                                 /*
3447                                  * loopback the packet to a local MAC
3448                                  * client. We force a context switch
3449                                  * if both source and destination MAC
3450                                  * clients are used by IP, i.e.
3451                                  * bypass is set.
3452                                  */
3453                                 boolean_t do_switch;
3454                                 mac_client_impl_t *dst_mcip =
3455                                     dst_flow_ent->fe_mcip;
3456 
3457                                 /*
3458                                  * Check if there are promiscuous mode
3459                                  * callbacks defined. This check is
3460                                  * done here in the 'else' case and
3461                                  * not in other cases because this
3462                                  * path is for local loopback
3463                                  * communication which does not go
3464                                  * through MAC_TX(). For paths that go
3465                                  * through MAC_TX(), the promisc_list
3466                                  * check is done inside the MAC_TX()
3467                                  * macro.
3468                                  */
3469                                 if (mip->mi_promisc_list != NULL)
3470                                         mac_promisc_dispatch(mip, mp, src_mcip);
3471 
3472                                 do_switch = ((src_mcip->mci_state_flags &
3473                                     dst_mcip->mci_state_flags &
3474                                     MCIS_CLIENT_POLL_CAPABLE) != 0);
3475 
3476                                 if ((mp1 = mac_fix_cksum(mp)) != NULL) {
3477                                         (dst_flow_ent->fe_cb_fn)(
3478                                             dst_flow_ent->fe_cb_arg1,
3479                                             dst_flow_ent->fe_cb_arg2,
3480                                             mp1, do_switch);
3481                                 }
3482                         }
3483                         FLOW_REFRELE(dst_flow_ent);
3484                 } else {
3485                         /*
3486                          * Unknown destination, send via the underlying
3487                          * NIC.
3488                          */
3489                         MAC_TX(mip, ring, mp, src_mcip);
3490                         if (mp != NULL) {
3491                                 /*
3492                                  * Adjust for the last packet that
3493                                  * could not be transmitted
3494                                  */
3495                                 opackets--;
3496                                 obytes -= pkt_size;
3497                                 mp->b_next = next;
3498                                 break;
3499                         }
3500                 }
3501                 mp = next;
3502         }
3503 
3504 done:
3505         stats->mts_obytes = obytes;
3506         stats->mts_opackets = opackets;
3507         stats->mts_oerrors = oerrors;
3508         return (mp);
3509 }
3510 
3511 /*
3512  * mac_tx_srs_ring_present
3513  *
3514  * Returns whether the specified ring is part of the specified SRS.
3515  */
3516 boolean_t
3517 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3518 {
3519         int i;
3520         mac_soft_ring_t *soft_ring;
3521 
3522         if (srs->srs_tx.st_arg2 == tx_ring)
3523                 return (B_TRUE);
3524 
3525         for (i = 0; i < srs->srs_tx_ring_count; i++) {
3526                 soft_ring =  srs->srs_tx_soft_rings[i];
3527                 if (soft_ring->s_ring_tx_arg2 == tx_ring)
3528                         return (B_TRUE);
3529         }
3530 
3531         return (B_FALSE);
3532 }
3533 
3534 /*
3535  * mac_tx_srs_get_soft_ring
3536  *
3537  * Returns the TX soft ring associated with the given ring, if present.
3538  */
3539 mac_soft_ring_t *
3540 mac_tx_srs_get_soft_ring(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3541 {
3542         int             i;
3543         mac_soft_ring_t *soft_ring;
3544 
3545         if (srs->srs_tx.st_arg2 == tx_ring)
3546                 return (NULL);
3547 
3548         for (i = 0; i < srs->srs_tx_ring_count; i++) {
3549                 soft_ring =  srs->srs_tx_soft_rings[i];
3550                 if (soft_ring->s_ring_tx_arg2 == tx_ring)
3551                         return (soft_ring);
3552         }
3553 
3554         return (NULL);
3555 }
3556 
3557 /*
3558  * mac_tx_srs_wakeup
3559  *
3560  * Called when Tx desc become available. Wakeup the appropriate worker
3561  * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
3562  * state field.
3563  */
3564 void
3565 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
3566 {
3567         int i;
3568         mac_soft_ring_t *sringp;
3569         mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3570 
3571         mutex_enter(&mac_srs->srs_lock);
3572         /*
3573          * srs_tx_ring_count == 0 is the single ring mode case. In
3574          * this mode, there will not be Tx soft rings associated
3575          * with the SRS.
3576          */
3577         if (!MAC_TX_SOFT_RINGS(mac_srs)) {
3578                 if (srs_tx->st_arg2 == ring &&
3579                     mac_srs->srs_state & SRS_TX_BLOCKED) {
3580                         mac_srs->srs_state &= ~SRS_TX_BLOCKED;
3581                         srs_tx->st_stat.mts_unblockcnt++;
3582                         cv_signal(&mac_srs->srs_async);
3583                 }
3584                 /*
3585                  * A wakeup can come before tx_srs_drain() could
3586                  * grab srs lock and set SRS_TX_BLOCKED. So
3587                  * always set woken_up flag when we come here.
3588                  */
3589                 srs_tx->st_woken_up = B_TRUE;
3590                 mutex_exit(&mac_srs->srs_lock);
3591                 return;
3592         }
3593 
3594         /*
3595          * If you are here, it is for FANOUT, BW_FANOUT,
3596          * AGGR_MODE or AGGR_BW_MODE case
3597          */
3598         for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
3599                 sringp = mac_srs->srs_tx_soft_rings[i];
3600                 mutex_enter(&sringp->s_ring_lock);
3601                 if (sringp->s_ring_tx_arg2 == ring) {
3602                         if (sringp->s_ring_state & S_RING_BLOCK) {
3603                                 sringp->s_ring_state &= ~S_RING_BLOCK;
3604                                 sringp->s_st_stat.mts_unblockcnt++;
3605                                 cv_signal(&sringp->s_ring_async);
3606                         }
3607                         sringp->s_ring_tx_woken_up = B_TRUE;
3608                 }
3609                 mutex_exit(&sringp->s_ring_lock);
3610         }
3611         mutex_exit(&mac_srs->srs_lock);
3612 }
3613 
3614 /*
3615  * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
3616  * the blocked clients again.
3617  */
3618 void
3619 mac_tx_notify(mac_impl_t *mip)
3620 {
3621         i_mac_notify(mip, MAC_NOTE_TX);
3622 }
3623 
/*
 * RX SOFTRING RELATED FUNCTIONS
 *
 * These functions really belong in mac_soft_ring.c and are kept here
 * only for a short period.
 */
3630 
/*
 * Append the b_next-linked chain [mp .. tail] of cnt packets to the
 * queue of soft ring ringp. The caller must hold ringp->s_ring_lock.
 * sz is added to s_ring_size only when the ring has ST_RING_BW_CTL set.
 */
#define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {             \
        ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));                      \
                                                                        \
        /* Start the queue if it is empty, else link after its tail. */\
        if ((ringp)->s_ring_last == NULL)                               \
                (ringp)->s_ring_first = (mp);                           \
        else                                                            \
                (ringp)->s_ring_last->b_next = (mp);                    \
        (ringp)->s_ring_last = (tail);                                  \
        (ringp)->s_ring_count += (cnt);                                 \
        ASSERT((ringp)->s_ring_count > 0);                              \
        if (((ringp)->s_ring_type & ST_RING_BW_CTL) != 0)               \
                (ringp)->s_ring_size += (sz);                           \
}
3648 
3649 /*
3650  * Default entry point to deliver a packet chain to a MAC client.
3651  * If the MAC client has flows, do the classification with these
3652  * flows as well.
3653  */
3654 /* ARGSUSED */
3655 void
3656 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
3657     mac_header_info_t *arg3)
3658 {
3659         mac_client_impl_t *mcip = arg1;
3660 
3661         if (mcip->mci_nvids == 1 &&
3662             !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
3663                 /*
3664                  * If the client has exactly one VID associated with it
3665                  * and striping of VLAN header is not disabled,
3666                  * remove the VLAN tag from the packet before
3667                  * passing it on to the client's receive callback.
3668                  * Note that this needs to be done after we dispatch
3669                  * the packet to the promiscuous listeners of the
3670                  * client, since they expect to see the whole
3671                  * frame including the VLAN headers.
3672                  */
3673                 mp_chain = mac_strip_vlan_tag_chain(mp_chain);
3674         }
3675 
3676         mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
3677 }
3678 
3679 /*
3680  * mac_rx_soft_ring_process
3681  *
3682  * process a chain for a given soft ring. The number of packets queued
3683  * in the SRS and its associated soft rings (including this one) is
3684  * very small (tracked by srs_poll_pkt_cnt), then allow the entering
3685  * thread (interrupt or poll thread) to do inline processing. This
3686  * helps keep the latency down under low load.
3687  *
3688  * The proc and arg for each mblk is already stored in the mblk in
3689  * appropriate places.
3690  */
3691 /* ARGSUSED */
3692 void
3693 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
3694     mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
3695 {
3696         mac_direct_rx_t         proc;
3697         void                    *arg1;
3698         mac_resource_handle_t   arg2;
3699         mac_soft_ring_set_t     *mac_srs = ringp->s_ring_set;
3700 
3701         ASSERT(ringp != NULL);
3702         ASSERT(mp_chain != NULL);
3703         ASSERT(tail != NULL);
3704         ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3705 
3706         mutex_enter(&ringp->s_ring_lock);
3707         ringp->s_ring_total_inpkt += cnt;
3708         ringp->s_ring_total_rbytes += sz;
3709         if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
3710             !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
3711                 /* If on processor or blanking on, then enqueue and return */
3712                 if (ringp->s_ring_state & S_RING_BLANK ||
3713                     ringp->s_ring_state & S_RING_PROC) {
3714                         SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3715                         mutex_exit(&ringp->s_ring_lock);
3716                         return;
3717                 }
3718                 proc = ringp->s_ring_rx_func;
3719                 arg1 = ringp->s_ring_rx_arg1;
3720                 arg2 = ringp->s_ring_rx_arg2;
3721                 /*
3722                  * See if anything is already queued. If we are the
3723                  * first packet, do inline processing else queue the
3724                  * packet and do the drain.
3725                  */
3726                 if (ringp->s_ring_first == NULL) {
3727                         /*
3728                          * Fast-path, ok to process and nothing queued.
3729                          */
3730                         ringp->s_ring_run = curthread;
3731                         ringp->s_ring_state |= (S_RING_PROC);
3732 
3733                         mutex_exit(&ringp->s_ring_lock);
3734 
3735                         /*
3736                          * We are the chain of 1 packet so
3737                          * go through this fast path.
3738                          */
3739                         ASSERT(mp_chain->b_next == NULL);
3740 
3741                         (*proc)(arg1, arg2, mp_chain, NULL);
3742 
3743                         ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3744                         /*
3745                          * If we have a soft ring set which is doing
3746                          * bandwidth control, we need to decrement
3747                          * srs_size and count so it the SRS can have a
3748                          * accurate idea of what is the real data
3749                          * queued between SRS and its soft rings. We
3750                          * decrement the counters only when the packet
3751                          * gets processed by both SRS and the soft ring.
3752                          */
3753                         mutex_enter(&mac_srs->srs_lock);
3754                         MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
3755                         MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
3756                         mutex_exit(&mac_srs->srs_lock);
3757 
3758                         mutex_enter(&ringp->s_ring_lock);
3759                         ringp->s_ring_run = NULL;
3760                         ringp->s_ring_state &= ~S_RING_PROC;
3761                         if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
3762                                 cv_signal(&ringp->s_ring_client_cv);
3763 
3764                         if ((ringp->s_ring_first == NULL) ||
3765                             (ringp->s_ring_state & S_RING_BLANK)) {
3766                                 /*
3767                                  * We processed inline our packet and
3768                                  * nothing new has arrived or our
3769                                  * receiver doesn't want to receive
3770                                  * any packets. We are done.
3771                                  */
3772                                 mutex_exit(&ringp->s_ring_lock);
3773                                 return;
3774                         }
3775                 } else {
3776                         SOFT_RING_ENQUEUE_CHAIN(ringp,
3777                             mp_chain, tail, cnt, sz);
3778                 }
3779 
3780                 /*
3781                  * We are here because either we couldn't do inline
3782                  * processing (because something was already
3783                  * queued), or we had a chain of more than one
3784                  * packet, or something else arrived after we were
3785                  * done with inline processing.
3786                  */
3787                 ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3788                 ASSERT(ringp->s_ring_first != NULL);
3789 
3790                 ringp->s_ring_drain_func(ringp);
3791                 mutex_exit(&ringp->s_ring_lock);
3792                 return;
3793         } else {
3794                 /* ST_RING_WORKER_ONLY case */
3795                 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3796                 mac_soft_ring_worker_wakeup(ringp);
3797                 mutex_exit(&ringp->s_ring_lock);
3798         }
3799 }
3800 
/*
 * TX SOFTRING RELATED FUNCTIONS
 *
 * These functions really belong in mac_soft_ring.c and are kept here
 * only for a short period.
 */
3807 
/*
 * Tx flavour of SOFT_RING_ENQUEUE_CHAIN: mark the soft ring as having
 * packets enqueued (S_RING_ENQUEUED) and append the chain [mp .. tail]
 * of cnt packets / sz bytes to it. The caller must hold
 * ringp->s_ring_lock.
 *
 * The chain is taken from the mp argument; the macro does not depend on
 * the caller having a variable of any particular name.
 */
#define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {          \
        ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));                      \
        (ringp)->s_ring_state |= S_RING_ENQUEUED;                       \
        SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz);              \
}
3813 
3814 /*
3815  * mac_tx_sring_queued
3816  *
3817  * When we are out of transmit descriptors and we already have a
3818  * queue that exceeds hiwat (or the client called us with
3819  * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
3820  * soft ring pointer as the opaque cookie for the client enable
3821  * flow control.
3822  */
3823 static mac_tx_cookie_t
3824 mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
3825     mblk_t **ret_mp)
3826 {
3827         int cnt;
3828         size_t sz;
3829         mblk_t *tail;
3830         mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3831         mac_tx_cookie_t cookie = NULL;
3832         boolean_t wakeup_worker = B_TRUE;
3833 
3834         ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3835         MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3836         if (flag & MAC_DROP_ON_NO_DESC) {
3837                 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
3838                 /* increment freed stats */
3839                 ringp->s_ring_drops += cnt;
3840                 cookie = (mac_tx_cookie_t)ringp;
3841         } else {
3842                 if (ringp->s_ring_first != NULL)
3843                         wakeup_worker = B_FALSE;
3844 
3845                 if (flag & MAC_TX_NO_ENQUEUE) {
3846                         /*
3847                          * If QUEUED is not set, queue the packet
3848                          * and let mac_tx_soft_ring_drain() set
3849                          * the TX_BLOCKED bit for the reasons
3850                          * explained above. Otherwise, return the
3851                          * mblks.
3852                          */
3853                         if (wakeup_worker) {
3854                                 TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
3855                                     mp_chain, tail, cnt, sz);
3856                         } else {
3857                                 ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
3858                                 cookie = (mac_tx_cookie_t)ringp;
3859                                 *ret_mp = mp_chain;
3860                         }
3861                 } else {
3862                         boolean_t enqueue = B_TRUE;
3863 
3864                         if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3865                                 /*
3866                                  * flow-controlled. Store ringp in cookie
3867                                  * so that it can be returned as
3868                                  * mac_tx_cookie_t to client
3869                                  */
3870                                 ringp->s_ring_state |= S_RING_TX_HIWAT;
3871                                 cookie = (mac_tx_cookie_t)ringp;
3872                                 ringp->s_ring_hiwat_cnt++;
3873                                 if (ringp->s_ring_count >
3874                                     ringp->s_ring_tx_max_q_cnt) {
3875                                         /* increment freed stats */
3876                                         ringp->s_ring_drops += cnt;
3877                                         /*
3878                                          * b_prev may be set to the fanout hint
3879                                          * hence can't use freemsg directly
3880                                          */
3881                                         mac_pkt_drop(NULL, NULL,
3882                                             mp_chain, B_FALSE);
3883                                         DTRACE_PROBE1(tx_queued_hiwat,
3884                                             mac_soft_ring_t *, ringp);
3885                                         enqueue = B_FALSE;
3886                                 }
3887                         }
3888                         if (enqueue) {
3889                                 TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
3890                                     tail, cnt, sz);
3891                         }
3892                 }
3893                 if (wakeup_worker)
3894                         cv_signal(&ringp->s_ring_async);
3895         }
3896         return (cookie);
3897 }
3898 
3899 
3900 /*
3901  * mac_tx_soft_ring_process
3902  *
3903  * This routine is called when fanning out outgoing traffic among
3904  * multipe Tx rings.
3905  * Note that a soft ring is associated with a h/w Tx ring.
3906  */
3907 mac_tx_cookie_t
3908 mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
3909     uint16_t flag, mblk_t **ret_mp)
3910 {
3911         mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3912         int     cnt;
3913         size_t  sz;
3914         mblk_t  *tail;
3915         mac_tx_cookie_t cookie = NULL;
3916 
3917         ASSERT(ringp != NULL);
3918         ASSERT(mp_chain != NULL);
3919         ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3920         /*
3921          * The following modes can come here: SRS_TX_BW_FANOUT,
3922          * SRS_TX_FANOUT, SRS_TX_AGGR, SRS_TX_BW_AGGR.
3923          */
3924         ASSERT(MAC_TX_SOFT_RINGS(mac_srs));
3925         ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
3926             mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT ||
3927             mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
3928             mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
3929 
3930         if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
3931                 /* Serialization mode */
3932 
3933                 mutex_enter(&ringp->s_ring_lock);
3934                 if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3935                         cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3936                             flag, ret_mp);
3937                         mutex_exit(&ringp->s_ring_lock);
3938                         return (cookie);
3939                 }
3940                 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3941                 TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3942                 if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
3943                         /*
3944                          * If ring is blocked due to lack of Tx
3945                          * descs, just return. Worker thread
3946                          * will get scheduled when Tx desc's
3947                          * become available.
3948                          */
3949                         mutex_exit(&ringp->s_ring_lock);
3950                         return (cookie);
3951                 }
3952                 mac_soft_ring_worker_wakeup(ringp);
3953                 mutex_exit(&ringp->s_ring_lock);
3954                 return (cookie);
3955         } else {
3956                 /* Default fanout mode */
3957                 /*
3958                  * S_RING_BLOCKED is set when underlying NIC runs
3959                  * out of Tx descs and messages start getting
3960                  * queued. It won't get reset until
3961                  * tx_srs_drain() completely drains out the
3962                  * messages.
3963                  */
3964                 mac_tx_stats_t          stats;
3965 
3966                 if (ringp->s_ring_state & S_RING_ENQUEUED) {
3967                         /* Tx descs/resources not available */
3968                         mutex_enter(&ringp->s_ring_lock);
3969                         if (ringp->s_ring_state & S_RING_ENQUEUED) {
3970                                 cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3971                                     flag, ret_mp);
3972                                 mutex_exit(&ringp->s_ring_lock);
3973                                 return (cookie);
3974                         }
3975                         /*
3976                          * While we were computing mblk count, the
3977                          * flow control condition got relieved.
3978                          * Continue with the transmission.
3979                          */
3980                         mutex_exit(&ringp->s_ring_lock);
3981                 }
3982 
3983                 mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
3984                     ringp->s_ring_tx_arg2, mp_chain, &stats);
3985 
3986                 /*
3987                  * Multiple threads could be here sending packets.
3988                  * Under such conditions, it is not possible to
3989                  * automically set S_RING_BLOCKED bit to indicate
3990                  * out of tx desc condition. To atomically set
3991                  * this, we queue the returned packet and do
3992                  * the setting of S_RING_BLOCKED in
3993                  * mac_tx_soft_ring_drain().
3994                  */
3995                 if (mp_chain != NULL) {
3996                         mutex_enter(&ringp->s_ring_lock);
3997                         cookie =
3998                             mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
3999                         mutex_exit(&ringp->s_ring_lock);
4000                         return (cookie);
4001                 }
4002                 SRS_TX_STATS_UPDATE(mac_srs, &stats);
4003                 SOFTRING_TX_STATS_UPDATE(ringp, &stats);
4004 
4005                 return (NULL);
4006         }
4007 }