1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2011 Joyent, Inc. All rights reserved.
25 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
26 */
27
28 #include <sys/types.h>
29 #include <sys/callb.h>
30 #include <sys/sdt.h>
31 #include <sys/strsubr.h>
32 #include <sys/strsun.h>
33 #include <sys/vlan.h>
34 #include <sys/stack.h>
35 #include <sys/archsystm.h>
36 #include <inet/ipsec_impl.h>
37 #include <inet/ip_impl.h>
38 #include <inet/sadb.h>
39 #include <inet/ipsecesp.h>
40 #include <inet/ipsecah.h>
41 #include <inet/ip6.h>
42
43 #include <sys/mac_impl.h>
44 #include <sys/mac_client_impl.h>
45 #include <sys/mac_client_priv.h>
46 #include <sys/mac_soft_ring.h>
47 #include <sys/mac_flow_impl.h>
48
49 static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
50 uintptr_t, uint16_t, mblk_t **);
51 static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
52 uintptr_t, uint16_t, mblk_t **);
53 static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
54 uintptr_t, uint16_t, mblk_t **);
55 static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
56 uintptr_t, uint16_t, mblk_t **);
57 static mac_tx_cookie_t mac_tx_aggr_mode(mac_soft_ring_set_t *, mblk_t *,
58 uintptr_t, uint16_t, mblk_t **);
59
60 typedef struct mac_tx_mode_s {
61 mac_tx_srs_mode_t mac_tx_mode;
62 mac_tx_func_t mac_tx_func;
63 } mac_tx_mode_t;
64
65 /*
66 * There are seven modes of operation on the Tx side. These modes get set
67 * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
68 * none of the other modes are user configurable. They get selected by
69 * the system depending upon whether the link (or flow) has multiple Tx
70 * rings or a bandwidth configured, or if the link is an aggr, etc.
71 *
72 * When the Tx SRS is operating in aggr mode (st_mode) or if there are
73 * multiple Tx rings owned by Tx SRS, then each Tx ring (pseudo or
74 * otherwise) will have a soft ring associated with it. These soft rings
75 * are stored in srs_tx_soft_rings[] array.
76 *
77 * Additionally in the case of aggr, there is the st_soft_rings[] array
78 * in the mac_srs_tx_t structure. This array is used to store the same
79 * set of soft rings that are present in srs_tx_soft_rings[] array but
80 * in a different manner. The soft ring associated with the pseudo Tx
81 * ring is saved at mr_index (of the pseudo ring) in st_soft_rings[]
82 * array. This helps in quickly getting the soft ring associated with the
83 * Tx ring when aggr_find_tx_ring() returns the pseudo Tx ring that is to
84 * be used for transmit.
85 */
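/* Table mapping each Tx SRS mode to the transmit routine that implements it. */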
86 mac_tx_mode_t mac_tx_mode_list[] = {
87 {SRS_TX_DEFAULT, mac_tx_single_ring_mode},
88 {SRS_TX_SERIALIZE, mac_tx_serializer_mode},
89 {SRS_TX_FANOUT, mac_tx_fanout_mode},
90 {SRS_TX_BW, mac_tx_bw_mode},
91 {SRS_TX_BW_FANOUT, mac_tx_bw_mode},
92 {SRS_TX_AGGR, mac_tx_aggr_mode},
93 {SRS_TX_BW_AGGR, mac_tx_bw_mode}
94 };
95
96 /*
97 * Soft Ring Set (SRS) - The Run time code that deals with
98 * dynamic polling from the hardware, bandwidth enforcement,
99 * fanout etc.
100 *
101 * We try to use H/W classification on the NIC and assign traffic for
102 * a MAC address to a particular Rx ring or ring group. There is a
103 * 1-1 mapping between an SRS and an Rx ring. The SRS dynamically
104 * switches the underlying Rx ring between interrupt and
105 * polling mode and enforces any specified B/W control.
106 *
107 * There is always an SRS created and tied to each H/W and S/W rule.
108 * Whenever we create a H/W rule, we always add the same rule to the
109 * S/W classifier and tie an SRS to it.
110 *
111 * In case a B/W control is specified, it is broken into bytes
112 * per tick and as soon as the quota for a tick is exhausted,
113 * the underlying Rx ring is forced into poll mode for the remainder
114 * of the tick. The SRS poll thread only polls for bytes that are
115 * allowed to come into the SRS. We typically let 4x the configured
116 * B/W worth of packets come into the SRS (to prevent unnecessary
117 * drops due to bursts) but only process the specified amount.
118 *
119 * A MAC client (e.g. a VNIC or aggr) can have 1 or more
120 * Rx rings (and corresponding SRSs) assigned to it. The SRS
121 * in turn can have softrings to do protocol level fanout or
122 * softrings to do S/W based fanout or both. In case the NIC
123 * has no Rx rings, we do S/W classification to the respective SRS.
124 * The S/W classification rule is always set up and ready. This
125 * allows the MAC layer to reassign Rx rings whenever needed
126 * while packets still continue to flow via the default path and
127 * get S/W classified to the correct SRS.
128 *
129 * The SRSs are used on both the Tx and Rx sides. They use the same
130 * data structure but the processing routines have slightly different
131 * semantics due to the fact that the Rx side needs to do dynamic
132 * polling etc.
133 *
134 * Dynamic Polling Notes
135 * =====================
136 *
137 * Each Soft ring set is capable of switching its Rx ring between
138 * interrupt and poll mode and actively 'polls' for packets in
139 * poll mode. If the SRS is implementing a B/W limit, it makes
140 * sure that only the max allowed packets are pulled in poll mode
141 * and goes to poll mode as soon as the B/W limit is exceeded. As
142 * such, there are no overheads to implement B/W limits.
143 *
144 * In poll mode, it's better to keep the pipeline going where the
145 * SRS worker thread keeps processing packets and the poll thread
146 * keeps bringing in more packets (especially if they get to run
147 * on different CPUs). This also prevents the overheads associated
148 * with excessive signalling (on NUMA machines, this can be
149 * pretty devastating). The exception is the latency optimized case
150 * where the worker thread does no work and the interrupt and poll
151 * threads are allowed to do their own drain.
152 *
153 * We use the following policy to control Dynamic Polling:
154 * 1) We switch to poll mode anytime the processing
155 * thread causes a backlog to build up in SRS and
156 * its associated Soft Rings (sr_poll_pkt_cnt > 0).
157 * 2) As long as the backlog stays under the low water
158 * mark (sr_lowat), we poll the H/W for more packets.
159 * 3) If the backlog (sr_poll_pkt_cnt) exceeds low
160 * water mark, we stay in poll mode but don't poll
161 * the H/W for more packets.
162 * 4) Anytime in polling mode, if we poll the H/W for
163 * packets and find nothing plus we have an existing
164 * backlog (sr_poll_pkt_cnt > 0), we stay in polling
165 * mode but don't poll the H/W for packets anymore
166 * (let the polling thread go to sleep).
167 * 5) Once the backlog is relieved (packets are processed)
168 * we reenable polling (by signalling the poll thread)
169 * only when the backlog dips below sr_poll_thres.
170 * 6) sr_hiwat is used exclusively when we are not
171 * polling capable and is used to decide when to
172 * drop packets so the SRS queue length doesn't grow
173 * infinitely.
174 *
175 * NOTE: Also see the block level comment on top of mac_soft_ring.c
176 */
177
178 /*
179 * mac_latency_optimize
180 *
181 * Controls whether the poll thread can process the packets inline
182 * or let the SRS worker thread do the processing. This applies if
183 * the SRS was not being processed. For latency sensitive traffic,
184 * this needs to be true to allow inline processing. For throughput
185 * under load, this should be false.
186 *
187 * This tunable (and other similar ones) should be rolled into a link
188 * or flow specific workload hint that can be set using dladm
189 * linkprop (instead of multiple such tunables).
190 */
191 boolean_t mac_latency_optimize = B_TRUE;
192
193 /*
194 * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
195 *
196 * Queue an mblk or mblk chain on the soft ring set and increment the
197 * local count (srs_count) for the SRS and the shared counter
198 * (srs_poll_pkt_cnt - shared between SRS and its soft rings
199 * to track the total unprocessed packets for polling to work
200 * correctly).
201 *
202 * The size (total bytes queued) counters are incremented only
203 * if we are doing B/W control.
204 */
205 #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
206 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
207 if ((mac_srs)->srs_last != NULL) \
208 (mac_srs)->srs_last->b_next = (head); \
209 else \
210 (mac_srs)->srs_first = (head); \
211 (mac_srs)->srs_last = (tail); \
212 (mac_srs)->srs_count += count; \
213 }
214
215 #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
216 mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \
217 \
218 MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
219 srs_rx->sr_poll_pkt_cnt += count; \
220 ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \
221 if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
222 (mac_srs)->srs_size += (sz); \
223 mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \
224 (mac_srs)->srs_bw->mac_bw_sz += (sz); \
225 mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \
226 } \
227 }
228
229 #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
230 mac_srs->srs_state |= SRS_ENQUEUED; \
231 MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
232 if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
233 (mac_srs)->srs_size += (sz); \
234 (mac_srs)->srs_bw->mac_bw_sz += (sz); \
235 } \
236 }
237
238 /*
239 * Macros to turn polling on
240 */
241 #define MAC_SRS_POLLING_ON(mac_srs) { \
242 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
243 if (((mac_srs)->srs_state & \
244 (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \
245 (mac_srs)->srs_state |= SRS_POLLING; \
246 (void) mac_hwring_disable_intr((mac_ring_handle_t) \
247 (mac_srs)->srs_ring); \
248 (mac_srs)->srs_rx.sr_poll_on++; \
249 } \
250 }
251
252 #define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \
253 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
254 if (((mac_srs)->srs_state & \
255 (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \
256 (SRS_POLLING_CAPAB|SRS_WORKER)) { \
257 (mac_srs)->srs_state |= SRS_POLLING; \
258 (void) mac_hwring_disable_intr((mac_ring_handle_t) \
259 (mac_srs)->srs_ring); \
260 (mac_srs)->srs_rx.sr_worker_poll_on++; \
261 } \
262 }
263
264 /*
265 * MAC_SRS_POLL_RING
266 *
267 * Signal the SRS poll thread to poll the underlying H/W ring
268 * provided it isn't already polling (i.e. SRS_GET_PKTS is not yet set).
269 *
270 * Poll thread gets to run only from mac_rx_srs_drain() and only
271 * if the drain was being done by the worker thread.
272 */
273 #define MAC_SRS_POLL_RING(mac_srs) { \
274 mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \
275 \
276 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
277 srs_rx->sr_poll_thr_sig++; \
278 if (((mac_srs)->srs_state & \
279 (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \
280 (SRS_WORKER|SRS_POLLING_CAPAB)) { \
281 (mac_srs)->srs_state |= SRS_GET_PKTS; \
282 cv_signal(&(mac_srs)->srs_cv); \
283 } else { \
284 srs_rx->sr_poll_thr_busy++; \
285 } \
286 }
287
288 /*
289 * MAC_SRS_CHECK_BW_CONTROL
290 *
291 * Check to see if the next tick has started so we can reset the
292 * SRS_BW_ENFORCED flag and allow more packets to come into the
293 * system.
294 */
295 #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \
296 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
297 ASSERT(((mac_srs)->srs_type & SRST_TX) || \
298 MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \
299 clock_t now = ddi_get_lbolt(); \
300 if ((mac_srs)->srs_bw->mac_bw_curr_time != now) { \
301 (mac_srs)->srs_bw->mac_bw_curr_time = now; \
302 (mac_srs)->srs_bw->mac_bw_used = 0; \
303 if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \
304 (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
305 } \
306 }
307
308 /*
309 * MAC_SRS_WORKER_WAKEUP
310 *
311 * Wake up the SRS worker thread to process the queue as long as
312 * no one else is processing the queue. If we are optimizing for
313 * latency, we wake up the worker thread immediately; otherwise we
314 * wait mac_srs_worker_wakeup_ticks before the worker thread gets
315 * woken up.
316 */
317 int mac_srs_worker_wakeup_ticks = 0;
318 #define MAC_SRS_WORKER_WAKEUP(mac_srs) { \
319 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
320 if (!((mac_srs)->srs_state & SRS_PROC) && \
321 (mac_srs)->srs_tid == NULL) { \
322 if (((mac_srs)->srs_state & SRS_LATENCY_OPT) || \
323 (mac_srs_worker_wakeup_ticks == 0)) \
324 cv_signal(&(mac_srs)->srs_async); \
325 else \
326 (mac_srs)->srs_tid = \
327 timeout(mac_srs_fire, (mac_srs), \
328 mac_srs_worker_wakeup_ticks); \
329 } \
330 }
331
332 #define TX_BANDWIDTH_MODE(mac_srs) \
333 ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \
334 (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT || \
335 (mac_srs)->srs_tx.st_mode == SRS_TX_BW_AGGR)
336
337 #define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \
338 if (tx_mode == SRS_TX_BW_FANOUT) \
339 (void) mac_tx_fanout_mode(mac_srs, head, hint, 0, NULL);\
340 else \
341 (void) mac_tx_aggr_mode(mac_srs, head, hint, 0, NULL); \
342 }
343
344 /*
345 * MAC_TX_SRS_BLOCK
346 *
347 * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
348 * will be set only if srs_tx_woken_up is FALSE. If
349 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
350 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
351 * attempt the transmit again, and leaving SRS_TX_BLOCKED clear
352 * allows that.
353 */
354 #define MAC_TX_SRS_BLOCK(srs, mp) { \
355 ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \
356 if ((srs)->srs_tx.st_woken_up) { \
357 (srs)->srs_tx.st_woken_up = B_FALSE; \
358 } else { \
359 ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \
360 (srs)->srs_state |= SRS_TX_BLOCKED; \
361 (srs)->srs_tx.st_stat.mts_blockcnt++; \
362 } \
363 }
364
365 /*
366 * MAC_TX_SRS_TEST_HIWAT
367 *
368 * Called before queueing a packet onto the Tx SRS to test and set
369 * SRS_TX_HIWAT if srs_count exceeds st_hiwat.
370 */
371 #define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \
372 boolean_t enqueue = 1; \
373 \
374 if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \
375 /* \
376 * flow-controlled. Store srs in cookie so that it \
377 * can be returned as mac_tx_cookie_t to client \
378 */ \
379 (srs)->srs_state |= SRS_TX_HIWAT; \
380 cookie = (mac_tx_cookie_t)srs; \
381 (srs)->srs_tx.st_hiwat_cnt++; \
382 if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \
383 /* increment freed stats */ \
384 (srs)->srs_tx.st_stat.mts_sdrops += cnt; \
385 /* \
386 * b_prev may be set to the fanout hint \
387 * hence can't use freemsg directly \
388 */ \
389 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \
390 DTRACE_PROBE1(tx_queued_hiwat, \
391 mac_soft_ring_set_t *, srs); \
392 enqueue = 0; \
393 } \
394 } \
395 if (enqueue) \
396 MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \
397 }
398
399 /* Some utility macros */
400 #define MAC_SRS_BW_LOCK(srs) \
401 if (!(srs->srs_type & SRST_TX)) \
402 mutex_enter(&srs->srs_bw->mac_bw_lock);
403
404 #define MAC_SRS_BW_UNLOCK(srs) \
405 if (!(srs->srs_type & SRST_TX)) \
406 mutex_exit(&srs->srs_bw->mac_bw_lock);
407
408 #define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \
409 mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
410 /* increment freed stats */ \
411 mac_srs->srs_tx.st_stat.mts_sdrops++; \
412 cookie = (mac_tx_cookie_t)srs; \
413 }
414
415 #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \
416 mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \
417 cookie = (mac_tx_cookie_t)srs; \
418 *ret_mp = mp_chain; \
419 }
420
421 /*
422 * MAC_RX_SRS_TOODEEP
423 *
424 * Macro called as part of receive-side processing to determine if handling
425 * can occur in situ (in the interrupt thread) or if it should be left to a
426 * worker thread. Note that the constant used to make this determination is
427 * not entirely made-up, and is a result of some empirical validation. That
428 * said, the constant is left as a static variable to allow it to be
429 * dynamically tuned in the field if and as needed.
430 */
431 static uintptr_t mac_rx_srs_stack_needed = 10240;
432 static uint_t mac_rx_srs_stack_toodeep;
433
434 #ifndef STACK_GROWTH_DOWN
435 #error Downward stack growth assumed.
436 #endif
437
438 #define MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
439 (uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \
440 ++mac_rx_srs_stack_toodeep)
441
442
443 /*
444 * Drop the rx packet and advance to the next one in the chain.
445 */
446 static void
447 mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
448 {
449 mac_srs_rx_t *srs_rx = &srs->srs_rx;
450
451 ASSERT(mp->b_next == NULL);
452 mutex_enter(&srs->srs_lock);
453 MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
454 MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
455 mutex_exit(&srs->srs_lock);
456
457 srs_rx->sr_stat.mrs_sdrops++;
458 freemsg(mp);
459 }
460
461 /* DATAPATH RUNTIME ROUTINES */
462
463 /*
464 * mac_srs_fire
465 *
466 * Timer callback routine for waking up the SRS worker thread.
467 */
468 static void
469 mac_srs_fire(void *arg)
470 {
471 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
472
473 mutex_enter(&mac_srs->srs_lock);
474 if (mac_srs->srs_tid == 0) {
475 mutex_exit(&mac_srs->srs_lock);
476 return;
477 }
478
479 mac_srs->srs_tid = 0;
480 if (!(mac_srs->srs_state & SRS_PROC))
481 cv_signal(&mac_srs->srs_async);
482
483 mutex_exit(&mac_srs->srs_lock);
484 }
485
486 /*
487 * 'hint' is the fanout_hint (of type uint64_t) which is given by the TCP/IP
488 * stack, and it is used on the TX path.
489 */
490 #define HASH_HINT(hint) \
491 ((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))
492
493
494 /*
495 * hash based on the src address and the port information.
496 */
497 #define HASH_ADDR(src, ports) \
498 (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
499 ((ports) >> 8) ^ (ports))
500
501 #define COMPUTE_INDEX(key, sz) (key % sz)
502
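/*
 * FANOUT_ENQUEUE_MP
 *
 * Append 'mp' to the (head, tail) sub-chain being built for a soft ring,
 * bump the packet count and, when bandwidth control is in effect, add the
 * packet size to the accumulated byte count.
 */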
503 #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \
504 if ((tail) != NULL) { \
505 ASSERT((tail)->b_next == NULL); \
506 (tail)->b_next = (mp); \
507 } else { \
508 ASSERT((head) == NULL); \
509 (head) = (mp); \
510 } \
511 (tail) = (mp); \
512 (cnt)++; \
513 if ((bw_ctl)) \
514 (sz) += (sz0); \
515 }
516
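/*
 * mac_fanout_type selects how UDP/SCTP/ESP traffic is spread across the
 * soft rings: MAC_FANOUT_DEFAULT hashes on the source address and the
 * ports/SPI, while MAC_FANOUT_RND_ROBIN distributes packets round-robin
 * using srs_ind.
 */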
517 #define MAC_FANOUT_DEFAULT 0
518 #define MAC_FANOUT_RND_ROBIN 1
519 int mac_fanout_type = MAC_FANOUT_DEFAULT;
520
521 #define MAX_SR_TYPES 3
522 /* fanout types for port based hashing */
523 enum pkt_type {
524 V4_TCP = 0,
525 V4_UDP,
526 OTH,
527 UNDEF
528 };
529
530 /*
531 * In general we do port based hashing to spread traffic over different
532 * softrings. The tunables below allow that behavior to be overridden:
533 * setting one of them (for IPv6 or IPv4 respectively) to B_TRUE causes
534 * a fanout based on the src IPv6 or IPv4 address. This also applies to
535 * IPv6 packets carrying multiple optional headers and other uncommon packet types.
536 */
537 boolean_t mac_src_ipv6_fanout = B_FALSE;
538 boolean_t mac_src_ipv4_fanout = B_FALSE;
539
540 /*
541 * Pair of local and remote ports in the transport header
542 */
543 #define PORTS_SIZE 4
544
545 /*
546 * mac_rx_srs_proto_fanout
547 *
548 * This routine delivers packets destined to an SRS into one of the
549 * protocol soft rings.
550 *
551 * Given a chain of packets we need to split it up into multiple sub chains
552 * destined for the TCP, UDP or OTH soft ring. Instead of entering
553 * the soft ring one packet at a time, we want to enter it in the form of a
554 * chain; otherwise we get a start/stop behaviour where the worker thread
555 * goes to sleep and then the next packet comes in forcing it to wake up, etc.
556 */
557 static void
558 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
559 {
560 struct ether_header *ehp;
561 struct ether_vlan_header *evhp;
562 uint32_t sap;
563 ipha_t *ipha;
564 uint8_t *dstaddr;
565 size_t hdrsize;
566 mblk_t *mp;
567 mblk_t *headmp[MAX_SR_TYPES];
568 mblk_t *tailmp[MAX_SR_TYPES];
569 int cnt[MAX_SR_TYPES];
570 size_t sz[MAX_SR_TYPES];
571 size_t sz1;
572 boolean_t bw_ctl;
573 boolean_t hw_classified;
574 boolean_t dls_bypass;
575 boolean_t is_ether;
576 boolean_t is_unicast;
577 enum pkt_type type;
578 mac_client_impl_t *mcip = mac_srs->srs_mcip;
579
580 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
581 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
582
583 /*
584 * If we don't have a Rx ring, S/W classification would have done
585 * its job and it's a packet meant for us. If we were polling on
586 * the default ring (i.e. there was a ring assigned to this SRS),
587 * then we need to make sure that the mac address really belongs
588 * to us.
589 */
590 hw_classified = mac_srs->srs_ring != NULL &&
591 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
592
593 /*
594 * Special clients (e.g. VLAN, non-Ethernet, etc.) need DLS
595 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
596 * such SRSs. Another way of disabling bypass is to set the
597 * MCIS_RX_BYPASS_DISABLE flag.
598 */
599 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
600 ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
601
602 bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
603 bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
604 bzero(cnt, MAX_SR_TYPES * sizeof (int));
605 bzero(sz, MAX_SR_TYPES * sizeof (size_t));
606
607 /*
608 * We got a chain from SRS that we need to send to the soft rings.
609 * Since squeues for TCP & IPv4 sap poll their soft rings (for
610 * performance reasons), we need to separate out v4_tcp and v4_udp;
611 * the rest goes into OTH.
612 */
613 while (head != NULL) {
614 mp = head;
615 head = head->b_next;
616 mp->b_next = NULL;
617
618 type = OTH;
619 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
620
621 if (is_ether) {
622 /*
623 * At this point we can be sure the packet at least
624 * has an ether header.
625 */
626 if (sz1 < sizeof (struct ether_header)) {
627 mac_rx_drop_pkt(mac_srs, mp);
628 continue;
629 }
630 ehp = (struct ether_header *)mp->b_rptr;
631
632 /*
633 * Determine if this is a VLAN or non-VLAN packet.
634 */
635 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
636 evhp = (struct ether_vlan_header *)mp->b_rptr;
637 sap = ntohs(evhp->ether_type);
638 hdrsize = sizeof (struct ether_vlan_header);
639 /*
640 * Check if the VID of the packet, if any,
641 * belongs to this client.
642 */
643 if (!mac_client_check_flow_vid(mcip,
644 VLAN_ID(ntohs(evhp->ether_tci)))) {
645 mac_rx_drop_pkt(mac_srs, mp);
646 continue;
647 }
648 } else {
649 hdrsize = sizeof (struct ether_header);
650 }
651 is_unicast =
652 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
653 dstaddr = (uint8_t *)&ehp->ether_dhost;
654 } else {
655 mac_header_info_t mhi;
656
657 if (mac_header_info((mac_handle_t)mcip->mci_mip,
658 mp, &mhi) != 0) {
659 mac_rx_drop_pkt(mac_srs, mp);
660 continue;
661 }
662 hdrsize = mhi.mhi_hdrsize;
663 sap = mhi.mhi_bindsap;
664 is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
665 dstaddr = (uint8_t *)mhi.mhi_daddr;
666 }
667
668 if (!dls_bypass) {
669 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
670 cnt[type], bw_ctl, sz[type], sz1, mp);
671 continue;
672 }
673
674 if (sap == ETHERTYPE_IP) {
675 /*
676 * If we are H/W classified, but we have promisc
677 * on, then we need to check for the unicast address.
678 */
679 if (hw_classified && mcip->mci_promisc_list != NULL) {
680 mac_address_t *map;
681
682 rw_enter(&mcip->mci_rw_lock, RW_READER);
683 map = mcip->mci_unicast;
684 if (bcmp(dstaddr, map->ma_addr,
685 map->ma_len) == 0)
686 type = UNDEF;
687 rw_exit(&mcip->mci_rw_lock);
688 } else if (is_unicast) {
689 type = UNDEF;
690 }
691 }
692
693 /*
694 * This needs to become a contract with the driver for
695 * the fast path.
696 *
697 * In the normal case the packet will have at least the L2
698 * header and the IP + Transport header in the same mblk.
699 * This is usually the case when the NIC driver sends up
700 * the packet. This is also true when the stack generates
701 * a packet that is looped back and when the stack uses the
702 * fastpath mechanism. The normal case is optimized for
703 * performance and may bypass DLS. All other cases go through
704 * the 'OTH' type path without DLS bypass.
705 */
706
707 ipha = (ipha_t *)(mp->b_rptr + hdrsize);
708 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
709 type = OTH;
710
711 if (type == OTH) {
712 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
713 cnt[type], bw_ctl, sz[type], sz1, mp);
714 continue;
715 }
716
717 ASSERT(type == UNDEF);
718 /*
719 * We look for at least 4 bytes past the IP header to get
720 * the port information. If we get an IP fragment, we don't
721 * have the port information, and we use just the protocol
722 * information.
723 */
724 switch (ipha->ipha_protocol) {
725 case IPPROTO_TCP:
726 type = V4_TCP;
727 mp->b_rptr += hdrsize;
728 break;
729 case IPPROTO_UDP:
730 type = V4_UDP;
731 mp->b_rptr += hdrsize;
732 break;
733 default:
734 type = OTH;
735 break;
736 }
737
738 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
739 bw_ctl, sz[type], sz1, mp);
740 }
741
742 for (type = V4_TCP; type < UNDEF; type++) {
743 if (headmp[type] != NULL) {
744 mac_soft_ring_t *softring;
745
746 ASSERT(tailmp[type]->b_next == NULL);
747 switch (type) {
748 case V4_TCP:
749 softring = mac_srs->srs_tcp_soft_rings[0];
750 break;
751 case V4_UDP:
752 softring = mac_srs->srs_udp_soft_rings[0];
753 break;
754 case OTH:
755 softring = mac_srs->srs_oth_soft_rings[0];
756 }
757 mac_rx_soft_ring_process(mcip, softring,
758 headmp[type], tailmp[type], cnt[type], sz[type]);
759 }
760 }
761 }
762
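/*
 * Count of packets that could not be fanned out on the port hash because
 * their IP header was unaligned or incomplete; such packets fall back to
 * the default (index 0) OTH soft ring.
 */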
763 int fanout_unaligned = 0;
764
765 /*
766 * mac_rx_srs_long_fanout
767 *
768 * The fanout routine for VLANs, and for anything else that isn't performing
769 * explicit dls bypass. Returns -1 on an error (drop the packet due to a
770 * malformed packet), 0 on success, with values written in *indx and *type.
771 */
772 static int
773 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
774 uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
775 {
776 ip6_t *ip6h;
777 ipha_t *ipha;
778 uint8_t *whereptr;
779 uint_t hash;
780 uint16_t remlen;
781 uint8_t nexthdr;
782 uint16_t hdr_len;
783 uint32_t src_val;
784 boolean_t modifiable = B_TRUE;
785 boolean_t v6;
786
787 ASSERT(MBLKL(mp) >= hdrsize);
788
789 if (sap == ETHERTYPE_IPV6) {
790 v6 = B_TRUE;
791 hdr_len = IPV6_HDR_LEN;
792 } else if (sap == ETHERTYPE_IP) {
793 v6 = B_FALSE;
794 hdr_len = IP_SIMPLE_HDR_LENGTH;
795 } else {
796 *indx = 0;
797 *type = OTH;
798 return (0);
799 }
800
801 ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
802 ipha = (ipha_t *)ip6h;
803
804 if ((uint8_t *)ip6h == mp->b_wptr) {
805 /*
806 * The first mblk_t only includes the mac header.
807 * Note that it is safe to change the mp pointer here,
808 * as the subsequent operation does not assume mp
809 * points to the start of the mac header.
810 */
811 mp = mp->b_cont;
812
813 /*
814 * Make sure there is a following mblk holding a complete IP header.
815 */
816 if (mp == NULL)
817 return (-1);
818
819 if (MBLKL(mp) < hdr_len) {
820 modifiable = (DB_REF(mp) == 1);
821
822 if (modifiable && !pullupmsg(mp, hdr_len))
823 return (-1);
824 }
825
826 ip6h = (ip6_t *)mp->b_rptr;
827 ipha = (ipha_t *)ip6h;
828 }
829
830 if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
831 ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
832 /*
833 * If either the IP header is not aligned, or it does not hold
834 * the complete simple structure (a pullupmsg() is not an
835 * option since it would result in an unaligned IP header),
836 * fanout to the default ring.
837 *
838 * Note that this may cause packet reordering.
839 */
840 *indx = 0;
841 *type = OTH;
842 fanout_unaligned++;
843 return (0);
844 }
845
846 /*
847 * Extract next-header, full header length, and source-hash value
848 * using v4/v6 specific fields.
849 */
850 if (v6) {
851 remlen = ntohs(ip6h->ip6_plen);
852 nexthdr = ip6h->ip6_nxt;
853 src_val = V4_PART_OF_V6(ip6h->ip6_src);
854 /*
855 * Do src based fanout if the mac_src_ipv6_fanout tunable is set
856 * to B_TRUE or when mac_ip_hdr_length_v6() fails because of
857 * malformed packets or because mblks need to be concatenated
858 * using pullupmsg().
859 */
860 if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
861 mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
862 goto src_based_fanout;
863 }
864 } else {
865 hdr_len = IPH_HDR_LENGTH(ipha);
866 remlen = ntohs(ipha->ipha_length) - hdr_len;
867 nexthdr = ipha->ipha_protocol;
868 src_val = (uint32_t)ipha->ipha_src;
869 /*
870 * Catch IPv4 fragment case here. IPv6 has nexthdr == FRAG
871 * for its equivalent case.
872 */
873 if (mac_src_ipv4_fanout ||
874 (ntohs(ipha->ipha_fragment_offset_and_flags) &
875 (IPH_MF | IPH_OFFSET)) != 0) {
876 goto src_based_fanout;
877 }
878 }
879 if (remlen < MIN_EHDR_LEN)
880 return (-1);
881 whereptr = (uint8_t *)ip6h + hdr_len;
882
883 /* If the transport is one of the following, we do port/SPI based fanout */
884 switch (nexthdr) {
885 case IPPROTO_TCP:
886 case IPPROTO_UDP:
887 case IPPROTO_SCTP:
888 case IPPROTO_ESP:
889 /*
890 * If the ports or SPI in the transport header is not part of
891 * the mblk, do src_based_fanout, instead of calling
892 * pullupmsg().
893 */
894 if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr)
895 break; /* out of switch... */
896 /* FALLTHRU */
897 default:
898 goto src_based_fanout;
899 }
900
901 switch (nexthdr) {
902 case IPPROTO_TCP:
903 hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
904 *indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
905 *type = OTH;
906 break;
907 case IPPROTO_UDP:
908 case IPPROTO_SCTP:
909 case IPPROTO_ESP:
910 if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
911 hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
912 *indx = COMPUTE_INDEX(hash,
913 mac_srs->srs_udp_ring_count);
914 } else {
915 *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
916 mac_srs->srs_ind++;
917 }
918 *type = OTH;
919 break;
920 }
921 return (0);
922
923 src_based_fanout:
924 hash = HASH_ADDR(src_val, (uint32_t)0);
925 *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
926 *type = OTH;
927 return (0);
928 }
929
930 /*
931 * mac_rx_srs_fanout
932 *
933 * This routine delivers packets destined to an SRS into a soft ring member
934 * of the set.
935 *
936 * Given a chain of packets we need to split it up into multiple sub chains
937 * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
938 * the soft ring one packet at a time, we want to enter it in the form of a
939 * chain; otherwise we get a start/stop behaviour where the worker thread
940 * goes to sleep and then the next packet comes in forcing it to wake up, etc.
941 *
942 * Note:
943 * Since we know the maximum fanout possible, we create a 2D array
944 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
945 * variables so that we can enter the softrings with a chain. We need the
946 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
947 * for each packet would be expensive). If we ever want to have the
948 * ability to have unlimited fanout, we should probably declare a head,
949 * tail, cnt, sz with each soft ring (a data struct which contains a softring
950 * along with these members) and create an array of this uber struct so we
951 * don't have to do kmem_alloc.
952 */
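/*
 * Debug counters recording the reasons why packets get diverted to the
 * OTH soft rings in mac_rx_srs_fanout().
 */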
953 int fanout_oth1 = 0;
954 int fanout_oth2 = 0;
955 int fanout_oth3 = 0;
956 int fanout_oth4 = 0;
957 int fanout_oth5 = 0;
958
959 static void
960 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
961 {
962 struct ether_header *ehp;
963 struct ether_vlan_header *evhp;
964 uint32_t sap;
965 ipha_t *ipha;
966 uint8_t *dstaddr;
967 uint_t indx;
968 size_t ports_offset;
969 size_t ipha_len;
970 size_t hdrsize;
971 uint_t hash;
972 mblk_t *mp;
973 mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
974 mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
975 int cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
976 size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT];
977 size_t sz1;
978 boolean_t bw_ctl;
979 boolean_t hw_classified;
980 boolean_t dls_bypass;
981 boolean_t is_ether;
982 boolean_t is_unicast;
983 int fanout_cnt;
984 enum pkt_type type;
985 mac_client_impl_t *mcip = mac_srs->srs_mcip;
986
987 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
988 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
989
990 /*
991 * If we don't have a Rx ring, S/W classification would have done
992 * its job and it's a packet meant for us. If we were polling on
993 * the default ring (i.e. there was a ring assigned to this SRS),
994 * then we need to make sure that the mac address really belongs
995 * to us.
996 */
997 hw_classified = mac_srs->srs_ring != NULL &&
998 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
999
1000 /*
1001 * Special clients (e.g. VLAN, non-Ethernet, etc.) need DLS
1002 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
1003 * such SRSs. Another way of disabling bypass is to set the
1004 * MCIS_RX_BYPASS_DISABLE flag.
1005 */
1006 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
1007 ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
1008
1009 /*
1010 * Since the softrings are never destroyed and we always
1011 * create an equal number of softrings for TCP, UDP and the rest,
1012 * it's OK to check one of them for the count and use it without
1013 * any lock. In the future, if soft rings get destroyed because
1014 * of a reduction in fanout, we will need to ensure that happens
1015 * behind SRS_PROC.
1016 */
1017 fanout_cnt = mac_srs->srs_tcp_ring_count;
1018
1019 bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1020 bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1021 bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
1022 bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
1023
1024 /*
1025 * We got a chain from SRS that we need to send to the soft rings.
1026 * Since squeues for TCP & IPv4 sap poll their soft rings (for
1027 * performance reasons), we need to separate out v4_tcp and v4_udp;
1028 * the rest goes into OTH.
1029 */
1030 while (head != NULL) {
1031 mp = head;
1032 head = head->b_next;
1033 mp->b_next = NULL;
1034
1035 type = OTH;
1036 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1037
1038 if (is_ether) {
1039 /*
1040 * At this point we can be sure the packet at least
1041 * has an ether header.
1042 */
1043 if (sz1 < sizeof (struct ether_header)) {
1044 mac_rx_drop_pkt(mac_srs, mp);
1045 continue;
1046 }
1047 ehp = (struct ether_header *)mp->b_rptr;
1048
1049 /*
1050 * Determine if this is a VLAN or non-VLAN packet.
1051 */
1052 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1053 evhp = (struct ether_vlan_header *)mp->b_rptr;
1054 sap = ntohs(evhp->ether_type);
1055 hdrsize = sizeof (struct ether_vlan_header);
1056 /*
1057 * Check if the VID of the packet, if any,
1058 * belongs to this client.
1059 */
1060 if (!mac_client_check_flow_vid(mcip,
1061 VLAN_ID(ntohs(evhp->ether_tci)))) {
1062 mac_rx_drop_pkt(mac_srs, mp);
1063 continue;
1064 }
1065 } else {
1066 hdrsize = sizeof (struct ether_header);
1067 }
1068 is_unicast =
1069 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
1070 dstaddr = (uint8_t *)&ehp->ether_dhost;
1071 } else {
1072 mac_header_info_t mhi;
1073
1074 if (mac_header_info((mac_handle_t)mcip->mci_mip,
1075 mp, &mhi) != 0) {
1076 mac_rx_drop_pkt(mac_srs, mp);
1077 continue;
1078 }
1079 hdrsize = mhi.mhi_hdrsize;
1080 sap = mhi.mhi_bindsap;
1081 is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
1082 dstaddr = (uint8_t *)mhi.mhi_daddr;
1083 }
1084
1085 if (!dls_bypass) {
1086 if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1087 hdrsize, &type, &indx) == -1) {
1088 mac_rx_drop_pkt(mac_srs, mp);
1089 continue;
1090 }
1091
1092 FANOUT_ENQUEUE_MP(headmp[type][indx],
1093 tailmp[type][indx], cnt[type][indx], bw_ctl,
1094 sz[type][indx], sz1, mp);
1095 continue;
1096 }
1097
1098
1099 /*
1100 * If we are using the default Rx ring where H/W or S/W
1101 * classification has not happened, we need to verify if
1102 * this unicast packet really belongs to us.
1103 */
1104 if (sap == ETHERTYPE_IP) {
1105 /*
1106 * If we are H/W classified, but we have promisc
1107 * on, then we need to check for the unicast address.
1108 */
1109 if (hw_classified && mcip->mci_promisc_list != NULL) {
1110 mac_address_t *map;
1111
1112 rw_enter(&mcip->mci_rw_lock, RW_READER);
1113 map = mcip->mci_unicast;
1114 if (bcmp(dstaddr, map->ma_addr,
1115 map->ma_len) == 0)
1116 type = UNDEF;
1117 rw_exit(&mcip->mci_rw_lock);
1118 } else if (is_unicast) {
1119 type = UNDEF;
1120 }
1121 }
1122
1123 /*
1124 * This needs to become a contract with the driver for
1125 * the fast path.
1126 */
1127
1128 ipha = (ipha_t *)(mp->b_rptr + hdrsize);
1129 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
1130 type = OTH;
1131 fanout_oth1++;
1132 }
1133
1134 if (type != OTH) {
1135 uint16_t frag_offset_flags;
1136
1137 switch (ipha->ipha_protocol) {
1138 case IPPROTO_TCP:
1139 case IPPROTO_UDP:
1140 case IPPROTO_SCTP:
1141 case IPPROTO_ESP:
1142 ipha_len = IPH_HDR_LENGTH(ipha);
1143 if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
1144 mp->b_wptr) {
1145 type = OTH;
1146 break;
1147 }
1148 frag_offset_flags =
1149 ntohs(ipha->ipha_fragment_offset_and_flags);
1150 if ((frag_offset_flags &
1151 (IPH_MF | IPH_OFFSET)) != 0) {
1152 type = OTH;
1153 fanout_oth3++;
1154 break;
1155 }
1156 ports_offset = hdrsize + ipha_len;
1157 break;
1158 default:
1159 type = OTH;
1160 fanout_oth4++;
1161 break;
1162 }
1163 }
1164
1165 if (type == OTH) {
1166 if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1167 hdrsize, &type, &indx) == -1) {
1168 mac_rx_drop_pkt(mac_srs, mp);
1169 continue;
1170 }
1171
1172 FANOUT_ENQUEUE_MP(headmp[type][indx],
1173 tailmp[type][indx], cnt[type][indx], bw_ctl,
1174 sz[type][indx], sz1, mp);
1175 continue;
1176 }
1177
1178 ASSERT(type == UNDEF);
1179
1180 /*
1181 * XXX-Sunay: We should hold srs_lock since ring_count
1182 * below can change. But if we are always called from
1183 * mac_rx_srs_drain and SRS_PROC is set, then we can
1184 * enforce that ring_count can't be changed i.e.
1185 * to change fanout type or ring count, the calling
1186 * thread needs to be behind SRS_PROC.
1187 */
1188 switch (ipha->ipha_protocol) {
1189 case IPPROTO_TCP:
1190 /*
1191 * Note that for ESP, we fanout on SPI and it is at the
1192 * same offset as the 2x16-bit ports. So it is clumped
1193 * along with TCP, UDP and SCTP.
1194 */
1195 hash = HASH_ADDR(ipha->ipha_src,
1196 *(uint32_t *)(mp->b_rptr + ports_offset));
1197 indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
1198 type = V4_TCP;
1199 mp->b_rptr += hdrsize;
1200 break;
1201 case IPPROTO_UDP:
1202 case IPPROTO_SCTP:
1203 case IPPROTO_ESP:
1204 if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
1205 hash = HASH_ADDR(ipha->ipha_src,
1206 *(uint32_t *)(mp->b_rptr + ports_offset));
1207 indx = COMPUTE_INDEX(hash,
1208 mac_srs->srs_udp_ring_count);
1209 } else {
1210 indx = mac_srs->srs_ind %
1211 mac_srs->srs_udp_ring_count;
1212 mac_srs->srs_ind++;
1213 }
1214 type = V4_UDP;
1215 mp->b_rptr += hdrsize;
1216 break;
1217 default:
1218 indx = 0;
1219 type = OTH;
1220 }
1221
1222 FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
1223 cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
1224 }
1225
1226 for (type = V4_TCP; type < UNDEF; type++) {
1227 int i;
1228
1229 for (i = 0; i < fanout_cnt; i++) {
1230 if (headmp[type][i] != NULL) {
1231 mac_soft_ring_t *softring;
1232
1233 ASSERT(tailmp[type][i]->b_next == NULL);
1234 switch (type) {
1235 case V4_TCP:
1236 softring =
1237 mac_srs->srs_tcp_soft_rings[i];
1238 break;
1239 case V4_UDP:
1240 softring =
1241 mac_srs->srs_udp_soft_rings[i];
1242 break;
1243 case OTH:
1244 softring =
1245 mac_srs->srs_oth_soft_rings[i];
1246 break;
1247 }
1248 mac_rx_soft_ring_process(mcip,
1249 softring, headmp[type][i], tailmp[type][i],
1250 cnt[type][i], sz[type][i]);
1251 }
1252 }
1253 }
1254 }
1255
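/*
 * Upper bound on the number of bytes pulled from the H/W ring in a single
 * poll when the SRS is not under bandwidth control.
 */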
1256 #define SRS_BYTES_TO_PICKUP 150000
1257 ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;
1258
1259 /*
1260 * mac_rx_srs_poll_ring
1261 *
1262 * This SRS Poll thread uses this routine to poll the underlying hardware
1263 * Rx ring to get a chain of packets. It can inline process that chain
1264 * if mac_latency_optimize is set (default) or signal the SRS worker thread
1265 * to do the remaining processing.
1266 *
1267 * Since packets come into the system via the interrupt or poll path, we
1268 * also update the stats and deal with promiscuous clients here.
1269 */
1270 void
1271 mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
1272 {
1273 kmutex_t *lock = &mac_srs->srs_lock;
1274 kcondvar_t *async = &mac_srs->srs_cv;
1275 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
1276 mblk_t *head, *tail, *mp;
1277 callb_cpr_t cprinfo;
1278 ssize_t bytes_to_pickup;
1279 size_t sz;
1280 int count;
1281 mac_client_impl_t *smcip;
1282
1283 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
1284 mutex_enter(lock);
1285
1286 start:
1287 for (;;) {
1288 if (mac_srs->srs_state & SRS_PAUSE)
1289 goto done;
1290
1291 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1292 cv_wait(async, lock);
1293 CALLB_CPR_SAFE_END(&cprinfo, lock);
1294
1295 if (mac_srs->srs_state & SRS_PAUSE)
1296 goto done;
1297
1298 check_again:
1299 if (mac_srs->srs_type & SRST_BW_CONTROL) {
1300 /*
1301 * We pick as many bytes as we are allowed to queue.
1302 * It's possible that we will exceed the total
1303 * packets queued in case this SRS is part of an
1304 * Rx ring group, since > 1 poll thread can be pulling
1305 * up to the max allowed packets at the same time,
1306 * but that should be OK.
1307 */
1308 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1309 bytes_to_pickup =
1310 mac_srs->srs_bw->mac_bw_drop_threshold -
1311 mac_srs->srs_bw->mac_bw_sz;
1312 /*
1313 * We shouldn't have been signalled if we
1314 * have 0 or fewer bytes to pick, but since
1315 * some of the byte accounting is driver
1316 * dependent, we do the safety check.
1317 */
1318 if (bytes_to_pickup < 0)
1319 bytes_to_pickup = 0;
1320 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1321 } else {
1322 /*
1323 * TODO: Need to change the polling API
1324 * to add a packet count and a flag which
1325 * tells the driver whether we want packets
1326 * based on a count, or bytes, or all the
1327 * packets queued in the driver/HW. This
1328 * way, we never have to check the limits
1329 * on poll path. We truly let only as many
1330 * packets enter the system as we are willing
1331 * to process or queue.
1332 *
1333 * Something along the lines of
1334 * pkts_to_pickup = mac_soft_ring_max_q_cnt -
1335 * mac_srs->srs_poll_pkt_cnt
1336 */
1337
1338 /*
1339 * Since we are not doing B/W control, pick
1340 * as many packets as allowed.
1341 */
1342 bytes_to_pickup = max_bytes_to_pickup;
1343 }
1344
1345 /* Poll the underlying Hardware */
1346 mutex_exit(lock);
1347 head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
1348 mutex_enter(lock);
1349
1350 ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
1351 SRS_POLL_THR_OWNER);
1352
1353 mp = tail = head;
1354 count = 0;
1355 sz = 0;
1356 while (mp != NULL) {
1357 tail = mp;
1358 sz += msgdsize(mp);
1359 mp = mp->b_next;
1360 count++;
1361 }
1362
1363 if (head != NULL) {
1364 tail->b_next = NULL;
1365 smcip = mac_srs->srs_mcip;
1366
1367 SRS_RX_STAT_UPDATE(mac_srs, pollbytes, sz);
1368 SRS_RX_STAT_UPDATE(mac_srs, pollcnt, count);
1369
1370 /*
1371 * If there are any promiscuous mode callbacks
1372 * defined for this MAC client, pass them a copy
1373 * if appropriate and also update the counters.
1374 */
1375 if (smcip != NULL) {
1376 if (smcip->mci_mip->mi_promisc_list != NULL) {
1377 mutex_exit(lock);
1378 mac_promisc_dispatch(smcip->mci_mip,
1379 head, NULL);
1380 mutex_enter(lock);
1381 }
1382 }
1383 if (mac_srs->srs_type & SRST_BW_CONTROL) {
1384 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1385 mac_srs->srs_bw->mac_bw_polled += sz;
1386 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1387 }
1388 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
1389 count, sz);
1390 if (count <= 10)
1391 srs_rx->sr_stat.mrs_chaincntundr10++;
1392 else if (count > 10 && count <= 50)
1393 srs_rx->sr_stat.mrs_chaincnt10to50++;
1394 else
1395 srs_rx->sr_stat.mrs_chaincntover50++;
1396 }
1397
1398 /*
1399 * We are guaranteed that SRS_PROC will be set if we
1400 * are here. Also, the poll thread gets to run only if
1401 * the drain was being done by a worker thread, although
1402 * it's possible that the worker thread is still running
1403 * and the poll thread was sent down to keep the pipeline
1404 * going instead of doing a complete drain and then
1405 * trying to poll the NIC.
1406 *
1407 * So we need to check the SRS_WORKER flag to make sure
1408 * that the worker thread is not processing the queue
1409 * in parallel to us. The flags and conditions are
1410 * protected by the srs_lock to prevent any race. We
1411 * ensure that we don't drop the srs_lock from now
1412 * till the end and similarly we don't drop the srs_lock
1413 * in mac_rx_srs_drain() till similar condition checks
1414 * are complete. The mac_rx_srs_drain() needs to ensure
1415 * that the SRS_WORKER flag remains set as long as it's
1416 * processing the queue.
1417 */
1418 if (!(mac_srs->srs_state & SRS_WORKER) &&
1419 (mac_srs->srs_first != NULL)) {
1420 /*
1421 * We have packets to process and the worker thread
1422 * is not running. Check to see if the poll thread is
1423 * allowed to process.
1424 */
1425 if (mac_srs->srs_state & SRS_LATENCY_OPT) {
1426 mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
1427 if (!(mac_srs->srs_state & SRS_PAUSE) &&
1428 srs_rx->sr_poll_pkt_cnt <=
1429 srs_rx->sr_lowat) {
1430 srs_rx->sr_poll_again++;
1431 goto check_again;
1432 }
1433 /*
1434 * We are already above the low water mark
1435 * so stay in polling mode but there is no
1436 * need to poll. Once we dip below
1437 * the polling threshold, the processing
1438 * thread (soft ring) will signal us
1439 * to poll again (MAC_UPDATE_SRS_COUNT).
1440 */
1441 srs_rx->sr_poll_drain_no_poll++;
1442 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1443 /*
1444 * In the B/W control case, it's possible
1445 * that the backlog built up due to the
1446 * B/W limit being reached and packets
1447 * are queued only in the SRS. In this case,
1448 * we should schedule the worker thread
1449 * since no one else will wake us up.
1450 */
1451 if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
1452 (mac_srs->srs_tid == NULL)) {
1453 mac_srs->srs_tid =
1454 timeout(mac_srs_fire, mac_srs, 1);
1455 srs_rx->sr_poll_worker_wakeup++;
1456 }
1457 } else {
1458 /*
1459 * Wake up the worker thread for more processing.
1460 * We optimize for throughput in this case.
1461 */
1462 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1463 MAC_SRS_WORKER_WAKEUP(mac_srs);
1464 srs_rx->sr_poll_sig_worker++;
1465 }
1466 } else if ((mac_srs->srs_first == NULL) &&
1467 !(mac_srs->srs_state & SRS_WORKER)) {
1468 /*
1469 * There is nothing queued in the SRS and
1470 * no worker thread running. Plus we
1471 * didn't get anything from the H/W
1472 * either (head == NULL).
1473 */
1474 ASSERT(head == NULL);
1475 mac_srs->srs_state &=
1476 ~(SRS_PROC|SRS_GET_PKTS);
1477
1478 /*
1479 * If we have packets in the soft rings, don't allow
1480 * more packets to come into this SRS by keeping the
1481 * interrupts off but not polling the H/W. The
1482 * poll thread will get signaled as soon as
1483 * srs_poll_pkt_cnt dips below the poll threshold.
1484 */
1485 if (srs_rx->sr_poll_pkt_cnt == 0) {
1486 srs_rx->sr_poll_intr_enable++;
1487 MAC_SRS_POLLING_OFF(mac_srs);
1488 } else {
1489 /*
1490 * We know nothing is queued in SRS
1491 * since we are here after checking
1492 * srs_first is NULL. The backlog
1493 * is entirely due to packets queued
1494 * in Soft ring which will wake us up
1495 * and get the interface out of polling
1496 * mode once the backlog dips below
1497 * sr_poll_thres.
1498 */
1499 srs_rx->sr_poll_no_poll++;
1500 }
1501 } else {
1502 /*
1503 * Worker thread is already running.
1504 * Nothing much to do. If the polling
1505 * was enabled, worker thread will deal
1506 * with that.
1507 */
1508 mac_srs->srs_state &= ~SRS_GET_PKTS;
1509 srs_rx->sr_poll_goto_sleep++;
1510 }
1511 }
1512 done:
1513 mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
1514 cv_signal(&mac_srs->srs_async);
1515 /*
1516 * If this is a temporary quiesce then wait for the restart signal
1517 * from the srs worker. Then clear the flags and signal the srs worker
1518 * to ensure a positive handshake and go back to start.
1519 */
1520 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
1521 cv_wait(async, lock);
1522 if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
1523 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
1524 mac_srs->srs_state &=
1525 ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
1526 cv_signal(&mac_srs->srs_async);
1527 goto start;
1528 } else {
1529 mac_srs->srs_state |= SRS_POLL_THR_EXITED;
1530 cv_signal(&mac_srs->srs_async);
1531 CALLB_CPR_EXIT(&cprinfo);
1532 thread_exit();
1533 }
1534 }
1535
1536 /*
1537 * mac_srs_pick_chain
1538 *
1539 * In the bandwidth control case, check how many packets can be processed
1540 * and return them in a sub chain.
1541 */
1542 static mblk_t *
1543 mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
1544 size_t *chain_sz, int *chain_cnt)
1545 {
1546 mblk_t *head = NULL;
1547 mblk_t *tail = NULL;
1548 size_t sz;
1549 size_t tsz = 0;
1550 int cnt = 0;
1551 mblk_t *mp;
1552
1553 ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1554 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1555 if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
1556 mac_srs->srs_bw->mac_bw_limit) ||
1557 (mac_srs->srs_bw->mac_bw_limit == 0)) {
1558 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1559 head = mac_srs->srs_first;
1560 mac_srs->srs_first = NULL;
1561 *chain_tail = mac_srs->srs_last;
1562 mac_srs->srs_last = NULL;
1563 *chain_sz = mac_srs->srs_size;
1564 *chain_cnt = mac_srs->srs_count;
1565 mac_srs->srs_count = 0;
1566 mac_srs->srs_size = 0;
1567 return (head);
1568 }
1569
1570 /*
1571 * Can't clear the entire backlog.
1572 * Need to find how many packets to pick
1573 */
1574 ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
1575 while ((mp = mac_srs->srs_first) != NULL) {
1576 sz = msgdsize(mp);
1577 if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
1578 mac_srs->srs_bw->mac_bw_limit) {
1579 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
1580 mac_srs->srs_bw->mac_bw_state |=
1581 SRS_BW_ENFORCED;
1582 break;
1583 }
1584
1585 /*
1586 * The size & count are decremented by the softrings
1587 * when they send the packet up, so that polling works
1588 * properly.
1589 */
1590 tsz += sz;
1591 cnt++;
1592 mac_srs->srs_count--;
1593 mac_srs->srs_size -= sz;
1594 if (tail != NULL)
1595 tail->b_next = mp;
1596 else
1597 head = mp;
1598 tail = mp;
1599 mac_srs->srs_first = mac_srs->srs_first->b_next;
1600 }
1601 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1602 if (mac_srs->srs_first == NULL)
1603 mac_srs->srs_last = NULL;
1604
1605 if (tail != NULL)
1606 tail->b_next = NULL;
1607 *chain_tail = tail;
1608 *chain_cnt = cnt;
1609 *chain_sz = tsz;
1610
1611 return (head);
1612 }
1613
1614 /*
1615 * mac_rx_srs_drain
1616 *
1617 * The SRS drain routine. Gets to run to clear the queue. Any thread
1618 * (worker, interrupt, poll) can call this based on processing model.
1619 * The first thing we do is disable interrupts if possible and then
1620 * drain the queue. We also try to poll the underlying hardware if
1621 * there is a dedicated hardware Rx ring assigned to this SRS.
1622 *
1623 * There is an equivalent drain routine for bandwidth control mode,
1624 * mac_rx_srs_drain_bw. There is some code duplication between the two
1625 * routines but they are highly performance sensitive and are easier
1626 * to read/debug if they stay separate. Any code changes here might
1627 * also apply to mac_rx_srs_drain_bw as well.
1628 */
1629 void
1630 mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1631 {
1632 mblk_t *head;
1633 mblk_t *tail;
1634 timeout_id_t tid;
1635 int cnt = 0;
1636 mac_client_impl_t *mcip = mac_srs->srs_mcip;
1637 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
1638
1639 ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1640 ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));
1641
1642 /* If we are blanked i.e. can't do upcalls, then we are done */
1643 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1644 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1645 (mac_srs->srs_state & SRS_PAUSE));
1646 goto out;
1647 }
1648
1649 if (mac_srs->srs_first == NULL)
1650 goto out;
1651
1652 if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
1653 (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
1654 /*
1655 * In the normal case, the SRS worker thread does no
1656 * work and we wait for a backlog to build up before
1657 * we switch into polling mode. In case we are
1658 * optimizing for throughput, we use the worker thread
1659 * as well. The goal is to let worker thread process
1660 * the queue and poll thread to feed packets into
1661 * the queue. As such, we should signal the poll
1662 * thread to try and get more packets.
1663 *
1664 * We could have pulled this check in the POLL_RING
1665 * macro itself but keeping it explicit here makes
1666 * the architecture more human understandable.
1667 */
1668 MAC_SRS_POLL_RING(mac_srs);
1669 }
1670
1671 again:
1672 head = mac_srs->srs_first;
1673 mac_srs->srs_first = NULL;
1674 tail = mac_srs->srs_last;
1675 mac_srs->srs_last = NULL;
1676 cnt = mac_srs->srs_count;
1677 mac_srs->srs_count = 0;
1678
1679 ASSERT(head != NULL);
1680 ASSERT(tail != NULL);
1681
1682 if ((tid = mac_srs->srs_tid) != 0)
1683 mac_srs->srs_tid = 0;
1684
1685 mac_srs->srs_state |= (SRS_PROC|proc_type);
1686
1687
1688 /*
1689 * mcip is NULL for broadcast and multicast flows. The promisc
1690 * callbacks for broadcast and multicast packets are delivered from
1691 * mac_rx() and we don't need to worry about that case in this path
1692 */
1693 if (mcip != NULL) {
1694 if (mcip->mci_promisc_list != NULL) {
1695 mutex_exit(&mac_srs->srs_lock);
1696 mac_promisc_client_dispatch(mcip, head);
1697 mutex_enter(&mac_srs->srs_lock);
1698 }
1699 if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
1700 mutex_exit(&mac_srs->srs_lock);
1701 mac_protect_intercept_dhcp(mcip, head);
1702 mutex_enter(&mac_srs->srs_lock);
1703 }
1704 }
1705
1706 /*
1707 * Check if the SRS itself is doing the processing.
1708 * This direct path does not apply when subflows are present. In this
1709 * case, packets need to be dispatched to a soft ring according to the
1710 * flow's bandwidth and other resource constraints.
1711 */
1712 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1713 mac_direct_rx_t proc;
1714 void *arg1;
1715 mac_resource_handle_t arg2;
1716
1717 /*
1718 * This is the case when an Rx ring is directly
1719 * assigned and we have a fully classified
1720 * protocol chain. We can deal with it in
1721 * one shot.
1722 */
1723 proc = srs_rx->sr_func;
1724 arg1 = srs_rx->sr_arg1;
1725 arg2 = srs_rx->sr_arg2;
1726
1727 mac_srs->srs_state |= SRS_CLIENT_PROC;
1728 mutex_exit(&mac_srs->srs_lock);
1729 if (tid != 0) {
1730 (void) untimeout(tid);
1731 tid = 0;
1732 }
1733
1734 proc(arg1, arg2, head, NULL);
1735 /*
1736 * Decrement the size and count here itself
1737 * since the packet has been processed.
1738 */
1739 mutex_enter(&mac_srs->srs_lock);
1740 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
1741 if (mac_srs->srs_state & SRS_CLIENT_WAIT)
1742 cv_signal(&mac_srs->srs_client_cv);
1743 mac_srs->srs_state &= ~SRS_CLIENT_PROC;
1744 } else {
1745 /* Some kind of softrings based fanout is required */
1746 mutex_exit(&mac_srs->srs_lock);
1747 if (tid != 0) {
1748 (void) untimeout(tid);
1749 tid = 0;
1750 }
1751
1752 /*
1753 * Since the fanout routines can deal with chains,
1754 * shoot the entire chain up.
1755 */
1756 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
1757 mac_rx_srs_fanout(mac_srs, head);
1758 else
1759 mac_rx_srs_proto_fanout(mac_srs, head);
1760 mutex_enter(&mac_srs->srs_lock);
1761 }
1762
1763 if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
1764 (mac_srs->srs_first != NULL)) {
1765 /*
1766 * More packets arrived while we were clearing the
1767 * SRS. This can happen because of one of the
1768 * three conditions below:
1769 * 1) The driver is using multiple worker threads
1770 * to send the packets to us.
1771 * 2) The driver has a race in switching
1772 * between interrupt and polling mode or
1773 * 3) Packets are arriving in this SRS via the
1774 * S/W classification as well.
1775 *
1776 * We should switch to polling mode and see if we
1777 * need to send the poll thread down. Also, signal
1778 * the worker thread to process what's just arrived.
1779 */
1780 MAC_SRS_POLLING_ON(mac_srs);
1781 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
1782 srs_rx->sr_drain_poll_sig++;
1783 MAC_SRS_POLL_RING(mac_srs);
1784 }
1785
1786 /*
1787 * If we didn't signal the poll thread, we need
1788 * to deal with the pending packets ourselves.
1789 */
1790 if (proc_type == SRS_WORKER) {
1791 srs_rx->sr_drain_again++;
1792 goto again;
1793 } else {
1794 srs_rx->sr_drain_worker_sig++;
1795 cv_signal(&mac_srs->srs_async);
1796 }
1797 }
1798
1799 out:
1800 if (mac_srs->srs_state & SRS_GET_PKTS) {
1801 /*
1802 * Poll thread is already running. Leave the
1803 * SRS_PROC set and hand over the control to
1804 * poll thread.
1805 */
1806 mac_srs->srs_state &= ~proc_type;
1807 srs_rx->sr_drain_poll_running++;
1808 return;
1809 }
1810
1811 /*
1812 * Even if there are no packets queued in SRS, we
1813 * need to make sure that the shared counter is
1814 * clear and any associated softrings have cleared
1815 * all the backlog. Otherwise, leave the interface
1816 * in polling mode and the poll thread will get
1817 * signalled once the count goes down to zero.
1818 *
1819 * If someone is already draining the queue (SRS_PROC is
1820 * set) when the srs_poll_pkt_cnt goes down to zero,
1821 * then it means that drain is already running and we
1822 * will turn off polling at that time if there is
1823 * no backlog.
1824 *
1825 * As long as there are packets queued either
1826 * in the soft ring set or its soft rings, we will leave
1827 * the interface in polling mode (even if the drain
1828 * was done by the interrupt thread). We signal
1829 * the poll thread as well if we have dipped below
1830 * low water mark.
1831 *
1832 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
1833 * since that turns polling on only for the worker thread.
1834 * It's not worth turning polling on for the interrupt
1835 * thread (since the NIC will not issue another interrupt)
1836 * unless a backlog builds up.
1837 */
1838 if ((srs_rx->sr_poll_pkt_cnt > 0) &&
1839 (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
1840 mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1841 srs_rx->sr_drain_keep_polling++;
1842 MAC_SRS_POLLING_ON(mac_srs);
1843 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
1844 MAC_SRS_POLL_RING(mac_srs);
1845 return;
1846 }
1847
1848 /* Nothing else to do. Get out of poll mode */
1849 MAC_SRS_POLLING_OFF(mac_srs);
1850 mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1851 srs_rx->sr_drain_finish_intr++;
1852 }
1853
1854 /*
1855 * mac_rx_srs_drain_bw
1856 *
1857 * The SRS BW drain routine. Gets to run to clear the queue. Any thread
1858 * (worker, interrupt, poll) can call this based on processing model.
1859 * The first thing we do is disable interrupts if possible and then
1860 * drain the queue. We also try to poll the underlying hardware if
1861 * there is a dedicated hardware Rx ring assigned to this SRS.
1862 *
1863 * There is an equivalent drain routine for the non-bandwidth-control case,
1864 * mac_rx_srs_drain. There is some code duplication between the two
1865 * routines but they are highly performance sensitive and are easier
1866 * to read/debug if they stay separate. Any code changes here might
1867 * also apply to mac_rx_srs_drain as well.
1868 */
1869 void
1870 mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1871 {
1872 mblk_t *head;
1873 mblk_t *tail;
1874 timeout_id_t tid;
1875 size_t sz = 0;
1876 int cnt = 0;
1877 mac_client_impl_t *mcip = mac_srs->srs_mcip;
1878 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
1879 clock_t now;
1880
1881 ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1882 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
1883 again:
1884 /* Check if we are doing B/W control */
1885 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1886 now = ddi_get_lbolt();
1887 if (mac_srs->srs_bw->mac_bw_curr_time != now) {
1888 mac_srs->srs_bw->mac_bw_curr_time = now;
1889 mac_srs->srs_bw->mac_bw_used = 0;
1890 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
1891 mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
1892 } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
1893 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1894 goto done;
1895 } else if (mac_srs->srs_bw->mac_bw_used >
1896 mac_srs->srs_bw->mac_bw_limit) {
1897 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
1898 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1899 goto done;
1900 }
1901 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1902
1903 /* If we are blanked i.e. can't do upcalls, then we are done */
1904 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1905 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1906 (mac_srs->srs_state & SRS_PAUSE));
1907 goto done;
1908 }
1909
1910 sz = 0;
1911 cnt = 0;
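/*
 * Try to pick up a chain to process. In this bandwidth-controlled
 * drain, mac_srs_pick_chain() is expected to bound how much it
 * dequeues by the bandwidth still available for this tick.
 */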
1912 if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
1913 /*
1914 * We couldn't pick up a single packet.
1915 */
1916 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1917 if ((mac_srs->srs_bw->mac_bw_used == 0) &&
1918 (mac_srs->srs_size != 0) &&
1919 !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
1920 /*
1921 * Seems like configured B/W doesn't
1922 * even allow processing of 1 packet
1923 * per tick.
1924 *
1925 * XXX: raise the limit to processing
1926 * at least 1 packet per tick.
1927 */
1928 mac_srs->srs_bw->mac_bw_limit +=
1929 mac_srs->srs_bw->mac_bw_limit;
1930 mac_srs->srs_bw->mac_bw_drop_threshold +=
1931 mac_srs->srs_bw->mac_bw_drop_threshold;
1932 cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
1933 "raised B/W limit to %d since not even a "
1934 "single packet can be processed per "
1935 "tick %d\n", (void *)mac_srs,
1936 (int)mac_srs->srs_bw->mac_bw_limit,
1937 (int)msgdsize(mac_srs->srs_first));
1938 }
1939 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1940 goto done;
1941 }
1942
1943 ASSERT(head != NULL);
1944 ASSERT(tail != NULL);
1945
1946 /* zero bandwidth: drop all and return to interrupt mode */
1947 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1948 if (mac_srs->srs_bw->mac_bw_limit == 0) {
1949 srs_rx->sr_stat.mrs_sdrops += cnt;
1950 ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
1951 mac_srs->srs_bw->mac_bw_sz -= sz;
1952 mac_srs->srs_bw->mac_bw_drop_bytes += sz;
1953 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1954 mac_pkt_drop(NULL, NULL, head, B_FALSE);
1955 goto leave_poll;
1956 } else {
1957 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1958 }
1959
1960 if ((tid = mac_srs->srs_tid) != 0)
1961 mac_srs->srs_tid = 0;
1962
1963 mac_srs->srs_state |= (SRS_PROC|proc_type);
1964 MAC_SRS_WORKER_POLLING_ON(mac_srs);
1965
1966 /*
1967 * mcip is NULL for broadcast and multicast flows. The promisc
1968 * callbacks for broadcast and multicast packets are delivered from
1969 * mac_rx() and we don't need to worry about that case in this path
1970 */
1971 if (mcip != NULL) {
1972 if (mcip->mci_promisc_list != NULL) {
1973 mutex_exit(&mac_srs->srs_lock);
1974 mac_promisc_client_dispatch(mcip, head);
1975 mutex_enter(&mac_srs->srs_lock);
1976 }
1977 if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
1978 mutex_exit(&mac_srs->srs_lock);
1979 mac_protect_intercept_dhcp(mcip, head);
1980 mutex_enter(&mac_srs->srs_lock);
1981 }
1982 }
1983
1984 /*
1985 * Check if the SRS itself is doing the processing.
1986 * This direct path does not apply when subflows are present. In that
1987 * case, packets need to be dispatched to a soft ring according to the
1988 * flow's bandwidth and other resource constraints.
1989 */
1990 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1991 mac_direct_rx_t proc;
1992 void *arg1;
1993 mac_resource_handle_t arg2;
1994
1995 /*
1996 * This is the case when a Rx is directly
1997 * assigned and we have a fully classified
1998 * protocol chain. We can deal with it in
1999 * one shot.
2000 */
2001 proc = srs_rx->sr_func;
2002 arg1 = srs_rx->sr_arg1;
2003 arg2 = srs_rx->sr_arg2;
2004
2005 mac_srs->srs_state |= SRS_CLIENT_PROC;
2006 mutex_exit(&mac_srs->srs_lock);
2007 if (tid != 0) {
2008 (void) untimeout(tid);
2009 tid = 0;
2010 }
2011
2012 proc(arg1, arg2, head, NULL);
2013 /*
2014 * Decrement the size and count here itself
2015 * since the packet has been processed.
2016 */
2017 mutex_enter(&mac_srs->srs_lock);
2018 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
2019 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
2020
2021 if (mac_srs->srs_state & SRS_CLIENT_WAIT)
2022 cv_signal(&mac_srs->srs_client_cv);
2023 mac_srs->srs_state &= ~SRS_CLIENT_PROC;
2024 } else {
2025 /* Some kind of softrings based fanout is required */
2026 mutex_exit(&mac_srs->srs_lock);
2027 if (tid != 0) {
2028 (void) untimeout(tid);
2029 tid = 0;
2030 }
2031
2032 /*
2033 * Since the fanout routines can deal with chains,
2034 * shoot the entire chain up.
2035 */
2036 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
2037 mac_rx_srs_fanout(mac_srs, head);
2038 else
2039 mac_rx_srs_proto_fanout(mac_srs, head);
2040 mutex_enter(&mac_srs->srs_lock);
2041 }
2042
2043 /*
2044 * Send the poll thread to pick up any packets that arrived
2045 * so far. This also serves as the last check in case
2046 * nothing else is queued in the SRS. The poll thread
2047 * is signalled only in the case the drain was done
2048 * by the worker thread and SRS_WORKER is set. The
2049 * worker thread can run in parallel as long as the
2050 * SRS_WORKER flag is set. If we have nothing else to
2051 * process, we can exit while leaving SRS_PROC set
2052 * which gives the poll thread control to process and
2053 * cleanup once it returns from the NIC.
2054 *
2055 * If we have nothing else to process, we need to
2056 * ensure that we keep holding the srs_lock till
2057 * all the checks below are done and control is
2058 * handed to the poll thread if it was running.
2059 */
2060 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2061 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2062 if (mac_srs->srs_first != NULL) {
2063 if (proc_type == SRS_WORKER) {
2064 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2065 if (srs_rx->sr_poll_pkt_cnt <=
2066 srs_rx->sr_lowat)
2067 MAC_SRS_POLL_RING(mac_srs);
2068 goto again;
2069 } else {
2070 cv_signal(&mac_srs->srs_async);
2071 }
2072 }
2073 }
2074 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2075
2076 done:
2077
2078 if (mac_srs->srs_state & SRS_GET_PKTS) {
2079 /*
2080 * Poll thread is already running. Leave the
2081 * SRS_PROC set and hand over the control to
2082 * poll thread.
2083 */
2084 mac_srs->srs_state &= ~proc_type;
2085 return;
2086 }
2087
2088 /*
2089 * If we can't process packets because we have exceeded
2090 * B/W limit for this tick, just set the timeout
2091 * and leave.
2092 *
2093 * Even if there are no packets queued in SRS, we
2094 * need to make sure that the shared counter is
2095 * clear and any associated softrings have cleared
2096 * all the backlog. Otherwise, leave the interface
2097 * in polling mode and the poll thread will get
2098 * signalled once the count goes down to zero.
2099 *
2100 * If someone is already draining the queue (SRS_PROC is
2101 * set) when the srs_poll_pkt_cnt goes down to zero,
2102 * then it means that drain is already running and we
2103 * will turn off polling at that time if there is
2104 * no backlog. As long as there are packets queued either
2105 * in the soft ring set or its soft rings, we will leave
2106 * the interface in polling mode.
2107 */
2108 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2109 if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
2110 ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
2111 (srs_rx->sr_poll_pkt_cnt > 0))) {
2112 MAC_SRS_POLLING_ON(mac_srs);
2113 mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2114 if ((mac_srs->srs_first != NULL) &&
2115 (mac_srs->srs_tid == NULL))
2116 mac_srs->srs_tid = timeout(mac_srs_fire,
2117 mac_srs, 1);
2118 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2119 return;
2120 }
2121 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2122
2123 leave_poll:
2124
2125 /* Nothing else to do. Get out of poll mode */
2126 MAC_SRS_POLLING_OFF(mac_srs);
2127 mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2128 }
2129
2130 /*
2131 * mac_srs_worker
2132 *
2133 * The SRS worker routine. Drains the queue when no one else is
2134 * processing it.
2135 */
2136 void
2137 mac_srs_worker(mac_soft_ring_set_t *mac_srs)
2138 {
2139 kmutex_t *lock = &mac_srs->srs_lock;
2140 kcondvar_t *async = &mac_srs->srs_async;
2141 callb_cpr_t cprinfo;
2142 boolean_t bw_ctl_flag;
2143
2144 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
2145 mutex_enter(lock);
2146
2147 start:
2148 for (;;) {
2149 bw_ctl_flag = B_FALSE;
2150 if (mac_srs->srs_type & SRST_BW_CONTROL) {
2151 MAC_SRS_BW_LOCK(mac_srs);
2152 MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2153 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
2154 bw_ctl_flag = B_TRUE;
2155 MAC_SRS_BW_UNLOCK(mac_srs);
2156 }
2157 /*
2158 * The SRS_BW_ENFORCED flag may change since we have dropped
2159 * the mac_bw_lock. However the drain function can handle both
2160 * a drainable SRS and a bandwidth-controlled SRS, and the
2161 * effect of scheduling a timeout is to wakeup the worker
2162 * thread which in turn will call the drain function. Since
2163 * we release the srs_lock atomically only in the cv_wait there
2164 * isn't a fear of waiting forever.
2165 */
2166 while (((mac_srs->srs_state & SRS_PROC) ||
2167 (mac_srs->srs_first == NULL) || bw_ctl_flag ||
2168 (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
2169 !(mac_srs->srs_state & SRS_PAUSE)) {
2170 /*
2171 * If we have packets queued and we are here
2172 * because B/W control is in place, we better
2173 * schedule the worker wakeup after 1 tick
2174 * to see if bandwidth control can be relaxed.
2175 */
2176 if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
2177 /*
2178 * We need to ensure that a timer is already
2179 * scheduled or we force schedule one for
2180 * later so that we can continue processing
2181 * after this quanta is over.
2182 */
2183 mac_srs->srs_tid = timeout(mac_srs_fire,
2184 mac_srs, 1);
2185 }
2186 wait:
2187 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2188 cv_wait(async, lock);
2189 CALLB_CPR_SAFE_END(&cprinfo, lock);
2190
2191 if (mac_srs->srs_state & SRS_PAUSE)
2192 goto done;
2193 if (mac_srs->srs_state & SRS_PROC)
2194 goto wait;
2195
2196 if (mac_srs->srs_first != NULL &&
2197 mac_srs->srs_type & SRST_BW_CONTROL) {
2198 MAC_SRS_BW_LOCK(mac_srs);
2199 if (mac_srs->srs_bw->mac_bw_state &
2200 SRS_BW_ENFORCED) {
2201 MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2202 }
2203 bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
2204 SRS_BW_ENFORCED;
2205 MAC_SRS_BW_UNLOCK(mac_srs);
2206 }
2207 }
2208
2209 if (mac_srs->srs_state & SRS_PAUSE)
2210 goto done;
2211 mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
2212 }
2213 done:
2214 /*
2215 * The Rx SRS quiesce logic first cuts off packet supply to the SRS
2216 * from both hard and soft classifications and waits for such threads
2217 * to finish before signaling the worker. So at this point the only
2218 * thread left that could be competing with the worker is the poll
2219 * thread. In the case of Tx, there shouldn't be any thread holding
2220 * SRS_PROC at this point.
2221 */
2222 if (!(mac_srs->srs_state & SRS_PROC)) {
2223 mac_srs->srs_state |= SRS_PROC;
2224 } else {
2225 ASSERT((mac_srs->srs_type & SRST_TX) == 0);
2226 /*
2227 * Poll thread still owns the SRS and is still running
2228 */
2229 ASSERT((mac_srs->srs_poll_thr == NULL) ||
2230 ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
2231 SRS_POLL_THR_OWNER));
2232 }
2233 mac_srs_worker_quiesce(mac_srs);
2234 /*
2235 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
2236 * of the quiesce operation
2237 */
2238 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
2239 cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
2240
2241 if (mac_srs->srs_state & SRS_RESTART) {
2242 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
2243 mac_srs_worker_restart(mac_srs);
2244 mac_srs->srs_state &= ~SRS_PROC;
2245 goto start;
2246 }
2247
2248 if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
2249 mac_srs_worker_quiesce(mac_srs);
2250
2251 mac_srs->srs_state &= ~SRS_PROC;
2252 /* The macro drops the srs_lock */
2253 CALLB_CPR_EXIT(&cprinfo);
2254 thread_exit();
2255 }
2256
2257 /*
2258 * mac_rx_srs_subflow_process
2259 *
2260 * Receive side routine called from interrupt path when there are
2261 * sub flows present on this SRS.
2262 */
2263 /* ARGSUSED */
2264 void
2265 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
2266 mblk_t *mp_chain, boolean_t loopback)
2267 {
2268 flow_entry_t *flent = NULL;
2269 flow_entry_t *prev_flent = NULL;
2270 mblk_t *mp = NULL;
2271 mblk_t *tail = NULL;
2272 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs;
2273 mac_client_impl_t *mcip;
2274
2275 mcip = mac_srs->srs_mcip;
2276 ASSERT(mcip != NULL);
2277
2278 /*
2279 * We need to determine the SRS for every packet
2280 * by walking the flow table. If we don't get any,
2281 * we proceed using the SRS we came with.
2282 */
2283 mp = tail = mp_chain;
2284 while (mp != NULL) {
2285
2286 /*
2287 * We will increment the stats for the matching subflow
2288 * when we get the bytes/pkt count for the classified packets
2289 * later in mac_rx_srs_process().
2290 */
2291 (void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
2292 FLOW_INBOUND, &flent);
2293
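/*
 * Keep accumulating packets into the current sub-chain as long as
 * they classify to the same flow as the previous packet (the first
 * packet always starts a sub-chain). When the flow changes, hand
 * the accumulated sub-chain to the previous flow's callback, or to
 * this SRS if no subflow matched, and start a new sub-chain here.
 */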
2294 if (mp == mp_chain || flent == prev_flent) {
2295 if (prev_flent != NULL)
2296 FLOW_REFRELE(prev_flent);
2297 prev_flent = flent;
2298 flent = NULL;
2299 tail = mp;
2300 mp = mp->b_next;
2301 continue;
2302 }
2303 tail->b_next = NULL;
2304 /*
2305 * A NULL flent indicates this is for the mac_srs itself.
2306 * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
2307 */
2308 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2309 mac_rx_srs_process(arg,
2310 (mac_resource_handle_t)mac_srs, mp_chain,
2311 loopback);
2312 } else {
2313 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2314 prev_flent->fe_cb_arg2, mp_chain, loopback);
2315 FLOW_REFRELE(prev_flent);
2316 }
2317 prev_flent = flent;
2318 flent = NULL;
2319 mp_chain = mp;
2320 tail = mp;
2321 mp = mp->b_next;
2322 }
2323 /* Last chain */
2324 ASSERT(mp_chain != NULL);
2325 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2326 mac_rx_srs_process(arg,
2327 (mac_resource_handle_t)mac_srs, mp_chain, loopback);
2328 } else {
2329 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2330 prev_flent->fe_cb_arg2, mp_chain, loopback);
2331 FLOW_REFRELE(prev_flent);
2332 }
2333 }
2334
2335 /*
2336 * mac_rx_srs_process
2337 *
2338 * Receive side routine called from the interrupt path.
2339 *
2340 * loopback is set to force a context switch on the loopback
2341 * path between MAC clients.
2342 */
2343 /* ARGSUSED */
2344 void
2345 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
2346 boolean_t loopback)
2347 {
2348 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs;
2349 mblk_t *mp, *tail, *head;
2350 int count = 0;
2351 int count1;
2352 size_t sz = 0;
2353 size_t chain_sz, sz1;
2354 mac_bw_ctl_t *mac_bw;
2355 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
2356
2357 /*
2358 * Set the tail, count and sz. We set the sz irrespective
2359 * of whether we are doing B/W control or not for the
2360 * purpose of updating the stats.
2361 */
2362 mp = tail = mp_chain;
2363 while (mp != NULL) {
2364 tail = mp;
2365 count++;
2366 sz += msgdsize(mp);
2367 mp = mp->b_next;
2368 }
2369
2370 mutex_enter(&mac_srs->srs_lock);
2371
2372 if (loopback) {
2373 SRS_RX_STAT_UPDATE(mac_srs, lclbytes, sz);
2374 SRS_RX_STAT_UPDATE(mac_srs, lclcnt, count);
2375
2376 } else {
2377 SRS_RX_STAT_UPDATE(mac_srs, intrbytes, sz);
2378 SRS_RX_STAT_UPDATE(mac_srs, intrcnt, count);
2379 }
2380
2381 /*
2382 * If the SRS is already being processed; has been blanked;
2383 * can be processed by the worker thread only; or the B/W limit
2384 * has been reached, then queue the chain and check if the
2385 * worker thread needs to be awakened.
2386 */
2387 if (mac_srs->srs_type & SRST_BW_CONTROL) {
2388 mac_bw = mac_srs->srs_bw;
2389 ASSERT(mac_bw != NULL);
2390 mutex_enter(&mac_bw->mac_bw_lock);
2391 mac_bw->mac_bw_intr += sz;
2392 if (mac_bw->mac_bw_limit == 0) {
2393 /* zero bandwidth: drop all */
2394 srs_rx->sr_stat.mrs_sdrops += count;
2395 mac_bw->mac_bw_drop_bytes += sz;
2396 mutex_exit(&mac_bw->mac_bw_lock);
2397 mutex_exit(&mac_srs->srs_lock);
2398 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
2399 return;
2400 } else {
2401 if ((mac_bw->mac_bw_sz + sz) <=
2402 mac_bw->mac_bw_drop_threshold) {
2403 mutex_exit(&mac_bw->mac_bw_lock);
2404 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
2405 tail, count, sz);
2406 } else {
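/*
 * Walk the chain and accept only as many leading packets as
 * fit under the drop threshold; whatever is left over gets
 * dropped below.
 */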
2407 mp = mp_chain;
2408 chain_sz = 0;
2409 count1 = 0;
2410 tail = NULL;
2411 head = NULL;
2412 while (mp != NULL) {
2413 sz1 = msgdsize(mp);
2414 if (mac_bw->mac_bw_sz + chain_sz + sz1 >
2415 mac_bw->mac_bw_drop_threshold)
2416 break;
2417 chain_sz += sz1;
2418 count1++;
2419 tail = mp;
2420 mp = mp->b_next;
2421 }
2422 mutex_exit(&mac_bw->mac_bw_lock);
2423 if (tail != NULL) {
2424 head = tail->b_next;
2425 tail->b_next = NULL;
2426 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
2427 mp_chain, tail, count1, chain_sz);
2428 sz -= chain_sz;
2429 count -= count1;
2430 } else {
2431 /* Can't pick up any */
2432 head = mp_chain;
2433 }
2434 if (head != NULL) {
2435 /* Drop any packet over the threshold */
2436 srs_rx->sr_stat.mrs_sdrops += count;
2437 mutex_enter(&mac_bw->mac_bw_lock);
2438 mac_bw->mac_bw_drop_bytes += sz;
2439 mutex_exit(&mac_bw->mac_bw_lock);
2440 freemsgchain(head);
2441 }
2442 }
2443 MAC_SRS_WORKER_WAKEUP(mac_srs);
2444 mutex_exit(&mac_srs->srs_lock);
2445 return;
2446 }
2447 }
2448
2449 /*
2450 * If the total number of packets queued in the SRS and
2451 * its associated soft rings exceeds the max allowed,
2452 * then drop the chain. If we are polling capable, this
2453 * shouldn't be happening.
2454 */
2455 if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
2456 (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
2457 mac_bw = mac_srs->srs_bw;
2458 srs_rx->sr_stat.mrs_sdrops += count;
2459 mutex_enter(&mac_bw->mac_bw_lock);
2460 mac_bw->mac_bw_drop_bytes += sz;
2461 mutex_exit(&mac_bw->mac_bw_lock);
2462 freemsgchain(mp_chain);
2463 mutex_exit(&mac_srs->srs_lock);
2464 return;
2465 }
2466
2467 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
2468
2469 if (!(mac_srs->srs_state & SRS_PROC)) {
2470 /*
2471 * If we are coming via loopback, if we are not optimizing for
2472 * latency, or if our stack is running deep, we should signal
2473 * the worker thread.
2474 */
2475 if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT) ||
2476 MAC_RX_SRS_TOODEEP()) {
2477 /*
2478 * For loopback, we need to let the worker take
2479 * over as we don't want to continue in the same
2480 * thread even if we can. This could lead to stack
2481 * overflows and may also end up using
2482 * resources (cpu) incorrectly.
2483 */
2484 cv_signal(&mac_srs->srs_async);
2485 } else {
2486 /*
2487 * Seems like no one is processing the SRS and
2488 * there is no backlog. We also inline process
2489 * our packet if it's a single packet in the non
2490 * latency optimized case (in the latency optimized
2491 * case, we inline process chains of any size).
2492 */
2493 mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
2494 }
2495 }
2496 mutex_exit(&mac_srs->srs_lock);
2497 }
2498
2499 /* TX SIDE ROUTINES (RUNTIME) */
2500
2501 /*
2502 * mac_tx_srs_no_desc
2503 *
2504 * This routine is called by Tx single ring default mode
2505 * when Tx ring runs out of descs.
2506 */
2507 mac_tx_cookie_t
2508 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2509 uint16_t flag, mblk_t **ret_mp)
2510 {
2511 mac_tx_cookie_t cookie = NULL;
2512 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2513 boolean_t wakeup_worker = B_TRUE;
2514 uint32_t tx_mode = srs_tx->st_mode;
2515 int cnt, sz;
2516 mblk_t *tail;
2517
2518 ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
2519 if (flag & MAC_DROP_ON_NO_DESC) {
2520 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2521 } else {
2522 if (mac_srs->srs_first != NULL)
2523 wakeup_worker = B_FALSE;
2524 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2525 if (flag & MAC_TX_NO_ENQUEUE) {
2526 /*
2527 * If TX_QUEUED is not set, queue the
2528 * packet and let mac_tx_srs_drain()
2529 * set the TX_BLOCKED bit for the
2530 * reasons explained above. Otherwise,
2531 * return the mblks.
2532 */
2533 if (wakeup_worker) {
2534 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2535 mp_chain, tail, cnt, sz);
2536 } else {
2537 MAC_TX_SET_NO_ENQUEUE(mac_srs,
2538 mp_chain, ret_mp, cookie);
2539 }
2540 } else {
2541 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2542 tail, cnt, sz, cookie);
2543 }
2544 if (wakeup_worker)
2545 cv_signal(&mac_srs->srs_async);
2546 }
2547 return (cookie);
2548 }
2549
2550 /*
2551 * mac_tx_srs_enqueue
2552 *
2553 * This routine is called when Tx SRS is operating in either serializer
2554 * or bandwidth mode. In serializer mode, a packet will get enqueued
2555 * when a thread cannot enter SRS exclusively. In bandwidth mode,
2556 * packets gets queued if allowed byte-count limit for a tick is
2557 * packets get queued if the allowed byte-count limit for a tick is
2558 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
2559 * MAC_TX_NO_ENQUEUE is set is different from when operating in either
2560 * returned back to the caller only after hi-watermark worth of data
2561 * is queued.
2562 */
2563 static mac_tx_cookie_t
2564 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2565 uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
2566 {
2567 mac_tx_cookie_t cookie = NULL;
2568 int cnt, sz;
2569 mblk_t *tail;
2570 boolean_t wakeup_worker = B_TRUE;
2571
2572 /*
2573 * Ignore fanout hint if we don't have multiple tx rings.
2574 */
2575 if (!MAC_TX_SOFT_RINGS(mac_srs))
2576 fanout_hint = 0;
2577
2578 if (mac_srs->srs_first != NULL)
2579 wakeup_worker = B_FALSE;
2580 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2581 if (flag & MAC_DROP_ON_NO_DESC) {
2582 if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
2583 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2584 } else {
2585 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2586 mp_chain, tail, cnt, sz);
2587 }
2588 } else if (flag & MAC_TX_NO_ENQUEUE) {
2589 if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
2590 (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
2591 MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
2592 ret_mp, cookie);
2593 } else {
2594 mp_chain->b_prev = (mblk_t *)fanout_hint;
2595 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2596 mp_chain, tail, cnt, sz);
2597 }
2598 } else {
2599 /*
2600 * If you are BW_ENFORCED, just enqueue the
2601 * packet. srs_worker will drain it at the
2602 * prescribed rate. Before enqueueing, save
2603 * the fanout hint.
2604 */
2605 mp_chain->b_prev = (mblk_t *)fanout_hint;
2606 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2607 tail, cnt, sz, cookie);
2608 }
2609 if (wakeup_worker)
2610 cv_signal(&mac_srs->srs_async);
2611 return (cookie);
2612 }
2613
2614 /*
2615 * There are seven tx modes:
2616 *
2617 * 1) Default mode (SRS_TX_DEFAULT)
2618 * 2) Serialization mode (SRS_TX_SERIALIZE)
2619 * 3) Fanout mode (SRS_TX_FANOUT)
2620 * 4) Bandwidth mode (SRS_TX_BW)
2621 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
2622 * 6) aggr Tx mode (SRS_TX_AGGR)
2623 * 7) aggr Tx bw mode (SRS_TX_BW_AGGR)
2624 *
2625 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
2626 * based on the number of Tx rings requested for an SRS and whether
2627 * bandwidth control is requested or not.
2628 *
2629 * The default mode (i.e., no fanout/no bandwidth) is used when the
2630 * underlying NIC does not have Tx rings or just one Tx ring. In this mode,
2631 * the SRS acts as a pass-thru. Packets will go directly to mac_tx_send().
2632 * When the underlying Tx ring runs out of Tx descs, it starts queueing up
2633 * packets in SRS. When flow-control is relieved, the srs_worker drains
2634 * the queued packets and informs blocked clients to restart sending
2635 * packets.
2636 *
2637 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. This
2638 * mode is used when the link has no Tx rings or only one Tx ring.
2639 *
2640 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
2641 * Tx rings. Each Tx ring will have a soft ring associated with it.
2642 * These soft rings will be hung off the Tx SRS. Queueing if it happens
2643 * due to lack of Tx desc will be in individual soft ring (and not srs)
2644 * associated with Tx ring.
2645 *
2646 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
2647 * only if bw is available. Otherwise the packets will be queued in
2648 * SRS. If fanout to multiple Tx rings is configured, the packets will
2649 * be fanned out among the soft rings associated with the Tx rings.
2650 *
2651 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
2652 * invokes an aggr function, aggr_find_tx_ring(), to find a pseudo Tx ring
2653 * belonging to a port on which the packet has to be sent. Aggr will
2654 * always have a pseudo Tx ring associated with it even when it is an
2655 * aggregation over a single NIC that has no Tx rings. Even in such a
2656 * case, the single pseudo Tx ring will have a soft ring associated with
2657 * it and the soft ring will hang off the SRS.
2658 *
2659 * If a bandwidth is specified for an aggr, SRS_TX_BW_AGGR mode is used.
2660 * In this mode, the bandwidth is first applied on the outgoing packets
2661 * and later the mac_tx_aggr_mode() function is called to send the packet out
2662 * of one of the pseudo Tx rings.
2663 *
2664 * Three flags are used in srs_state for indicating flow control
2665 * conditions: SRS_TX_BLOCKED, SRS_TX_HIWAT and SRS_TX_WAKEUP_CLIENT.
2666 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
2667 * driver below.
2668 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
2669 * and flow-control pressure is applied back to clients. The clients expect
2670 * wakeup when flow-control is relieved.
2671 * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and the mblk
2672 * got returned back to the client either due to lack of Tx descs or due to bw
2673 * control reasons. The clients expect a wakeup when condition is relieved.
2674 *
2675 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
2676 * some clients set the following values too: MAC_DROP_ON_NO_DESC,
2677 * MAC_TX_NO_ENQUEUE
2678 * Mac clients that do not want packets to be enqueued in the mac layer set
2679 * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
2680 * Tx soft rings but instead get dropped when the NIC runs out of desc. The
2681 * behaviour of this flag is different when the Tx is running in serializer
2682 * or bandwidth mode. Under these (serializer, bandwidth) modes, the packets
2683 * get dropped when the Tx high watermark is reached.
2684 * There are some mac clients like vsw, aggr that want the mblks to be
2685 * returned back to clients instead of being queued in Tx SRS (or Tx soft
2686 * rings) under flow-control (i.e., out of desc or exceeding bw limits)
2687 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
2688 * In the default and Tx fanout mode, the un-transmitted mblks will be
2689 * returned back to the clients when the driver runs out of Tx descs.
2690 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
2691 * soft ring) so that the clients can be woken up when Tx desc become
2692 * available. When running in serializer or bandwidth mode,
2693 * SRS_TX_WAKEUP_CLIENT will be set when the tx hi-watermark is reached.
2694 */
2695
2696 mac_tx_func_t
2697 mac_tx_get_func(uint32_t mode)
2698 {
2699 return (mac_tx_mode_list[mode].mac_tx_func);
2700 }
2701
2702 /* ARGSUSED */
2703 static mac_tx_cookie_t
2704 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2705 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2706 {
2707 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2708 mac_tx_stats_t stats;
2709 mac_tx_cookie_t cookie = NULL;
2710
2711 ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
2712
2713 /* Regular case with a single Tx ring */
2714 /*
2715 * SRS_TX_BLOCKED is set when underlying NIC runs
2716 * out of Tx descs and messages start getting
2717 * queued. It won't get reset until
2718 * tx_srs_drain() completely drains out the
2719 * messages.
2720 */
2721 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2722 /* Tx descs/resources not available */
2723 mutex_enter(&mac_srs->srs_lock);
2724 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2725 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
2726 flag, ret_mp);
2727 mutex_exit(&mac_srs->srs_lock);
2728 return (cookie);
2729 }
2730 /*
2731 * While we were computing mblk count, the
2732 * flow control condition got relieved.
2733 * Continue with the transmission.
2734 */
2735 mutex_exit(&mac_srs->srs_lock);
2736 }
2737
2738 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2739 mp_chain, &stats);
2740
2741 /*
2742 * Multiple threads could be here sending packets.
2743 * Under such conditions, it is not possible to
2744 * atomically set the SRS_TX_BLOCKED bit to indicate
2745 * out of tx desc condition. To atomically set
2746 * this, we queue the returned packet and do
2747 * the setting of SRS_TX_BLOCKED in
2748 * mac_tx_srs_drain().
2749 */
2750 if (mp_chain != NULL) {
2751 mutex_enter(&mac_srs->srs_lock);
2752 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
2753 mutex_exit(&mac_srs->srs_lock);
2754 return (cookie);
2755 }
2756 SRS_TX_STATS_UPDATE(mac_srs, &stats);
2757
2758 return (NULL);
2759 }
2760
2761 /*
2762 * mac_tx_serialize_mode
2763 *
2764 * This is an experimental mode implemented as per the request of PAE.
2765 * In this mode, all callers attempting to send a packet to the NIC
2766 * will get serialized. Only one thread at any time will access the
2767 * NIC to send the packet out.
2768 */
2769 /* ARGSUSED */
2770 static mac_tx_cookie_t
2771 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2772 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2773 {
2774 mac_tx_stats_t stats;
2775 mac_tx_cookie_t cookie = NULL;
2776 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2777
2778 /* Single ring, serialize below */
2779 ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
2780 mutex_enter(&mac_srs->srs_lock);
2781 if ((mac_srs->srs_first != NULL) ||
2782 (mac_srs->srs_state & SRS_PROC)) {
2783 /*
2784 * In serialization mode, queue all packets until
2785 * TX_HIWAT is set.
2786 * If drop bit is set, drop if TX_HIWAT is set.
2787 * If no_enqueue is set, still enqueue until hiwat
2788 * is set and return mblks after TX_HIWAT is set.
2789 */
2790 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
2791 flag, NULL, ret_mp);
2792 mutex_exit(&mac_srs->srs_lock);
2793 return (cookie);
2794 }
2795 /*
2796 * No packets queued, nothing on proc and no flow
2797 * control condition. Fast-path, ok. Do inline
2798 * processing.
2799 */
2800 mac_srs->srs_state |= SRS_PROC;
2801 mutex_exit(&mac_srs->srs_lock);
2802
2803 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2804 mp_chain, &stats);
2805
2806 mutex_enter(&mac_srs->srs_lock);
2807 mac_srs->srs_state &= ~SRS_PROC;
2808 if (mp_chain != NULL) {
2809 cookie = mac_tx_srs_enqueue(mac_srs,
2810 mp_chain, flag, NULL, ret_mp);
2811 }
2812 if (mac_srs->srs_first != NULL) {
2813 /*
2814 * We processed inline our packet and a new
2815 * packet/s got queued while we were
2816 * processing. Wakeup srs worker
2817 */
2818 cv_signal(&mac_srs->srs_async);
2819 }
2820 mutex_exit(&mac_srs->srs_lock);
2821
2822 if (cookie == NULL)
2823 SRS_TX_STATS_UPDATE(mac_srs, &stats);
2824
2825 return (cookie);
2826 }
2827
2828 /*
2829 * mac_tx_fanout_mode
2830 *
2831 * In this mode, the SRS will have access to multiple Tx rings to send
2832 * the packet out. The fanout hint that is passed as an argument is
2833 * used to find an appropriate ring to fanout the traffic. Each Tx
2834 * ring, in turn, will have a soft ring associated with it. If a Tx
2835 * ring runs out of Tx descs, the returned packet will be queued in
2836 * the soft ring associated with that Tx ring. The srs itself will not
2837 * queue any packets.
2838 */
2839
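/*
 * Map the computed hash to one of the Tx soft rings and hand the
 * given chain to it via mac_tx_soft_ring_process().
 */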
2840 #define MAC_TX_SOFT_RING_PROCESS(chain) { \
2841 index = COMPUTE_INDEX(hash, mac_srs->srs_tx_ring_count), \
2842 softring = mac_srs->srs_tx_soft_rings[index]; \
2843 cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \
2844 DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \
2845 }
2846
2847 static mac_tx_cookie_t
2848 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2849 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2850 {
2851 mac_soft_ring_t *softring;
2852 uint64_t hash;
2853 uint_t index;
2854 mac_tx_cookie_t cookie = NULL;
2855
2856 ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
2857 mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
2858 if (fanout_hint != 0) {
2859 /*
2860 * The hint is specified by the caller, simply pass the
2861 * whole chain to the soft ring.
2862 */
2863 hash = HASH_HINT(fanout_hint);
2864 MAC_TX_SOFT_RING_PROCESS(mp_chain);
2865 } else {
2866 mblk_t *last_mp, *cur_mp, *sub_chain;
2867 uint64_t last_hash = 0;
2868 uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media;
2869
2870 /*
2871 * Compute the hash from the contents (headers) of the
2872 * packets of the mblk chain. Split the chains into
2873 * subchains of the same conversation.
2874 *
2875 * Since more than one ring may be used for
2876 * sub-chains of the same conversation, and since
2877 * the caller does not maintain per-conversation
2878 * state (it passed a zero hint), unsent subchains
2879 * will be dropped.
2880 */
2881
2882 flag |= MAC_DROP_ON_NO_DESC;
2883 ret_mp = NULL;
2884
2885 ASSERT(ret_mp == NULL);
2886
2887 sub_chain = NULL;
2888 last_mp = NULL;
2889
2890 for (cur_mp = mp_chain; cur_mp != NULL;
2891 cur_mp = cur_mp->b_next) {
2892 hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4,
2893 B_TRUE);
2894 if (last_hash != 0 && hash != last_hash) {
2895 /*
2896 * Starting a different subchain, send current
2897 * chain out.
2898 */
2899 ASSERT(last_mp != NULL);
2900 last_mp->b_next = NULL;
2901 MAC_TX_SOFT_RING_PROCESS(sub_chain);
2902 sub_chain = NULL;
2903 }
2904
2905 /* add packet to subchain */
2906 if (sub_chain == NULL)
2907 sub_chain = cur_mp;
2908 last_mp = cur_mp;
2909 last_hash = hash;
2910 }
2911
2912 if (sub_chain != NULL) {
2913 /* send last subchain */
2914 ASSERT(last_mp != NULL);
2915 last_mp->b_next = NULL;
2916 MAC_TX_SOFT_RING_PROCESS(sub_chain);
2917 }
2918
2919 cookie = NULL;
2920 }
2921
2922 return (cookie);
2923 }
2924
2925 /*
2926 * mac_tx_bw_mode
2927 *
2928 * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
2929 * only if bw is available. Otherwise the packets will be queued in
2930 * SRS. If the SRS has multiple Tx rings, then packets will get fanned
2931 * out to the Tx rings.
2932 */
2933 static mac_tx_cookie_t
2934 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2935 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2936 {
2937 int cnt, sz;
2938 mblk_t *tail;
2939 mac_tx_cookie_t cookie = NULL;
2940 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2941 clock_t now;
2942
2943 ASSERT(TX_BANDWIDTH_MODE(mac_srs));
2944 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
2945 mutex_enter(&mac_srs->srs_lock);
2946 if (mac_srs->srs_bw->mac_bw_limit == 0) {
2947 /*
2948 * zero bandwidth, no traffic is sent: drop the packets,
2949 * or return the whole chain if the caller requests all
2950 * unsent packets back.
2951 */
2952 if (flag & MAC_TX_NO_ENQUEUE) {
2953 cookie = (mac_tx_cookie_t)mac_srs;
2954 *ret_mp = mp_chain;
2955 } else {
2956 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2957 }
2958 mutex_exit(&mac_srs->srs_lock);
2959 return (cookie);
2960 } else if ((mac_srs->srs_first != NULL) ||
2961 (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2962 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2963 fanout_hint, ret_mp);
2964 mutex_exit(&mac_srs->srs_lock);
2965 return (cookie);
2966 }
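/*
 * Account for this chain against the bandwidth allowed for the
 * current tick. The usage counter is reset when a new tick starts;
 * if the limit has already been exceeded, mark the SRS BW_ENFORCED,
 * queue the chain and let the worker drain it on a later tick.
 */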
2967 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2968 now = ddi_get_lbolt();
2969 if (mac_srs->srs_bw->mac_bw_curr_time != now) {
2970 mac_srs->srs_bw->mac_bw_curr_time = now;
2971 mac_srs->srs_bw->mac_bw_used = 0;
2972 } else if (mac_srs->srs_bw->mac_bw_used >
2973 mac_srs->srs_bw->mac_bw_limit) {
2974 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
2975 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2976 mp_chain, tail, cnt, sz);
2977 /*
2978 * Wakeup worker thread. Note that worker
2979 * thread has to be woken up so that it
2980 * can fire up the timer to be woken up
2981 * on the next tick. Also once
2982 * BW_ENFORCED is set, it can only be
2983 * reset by srs_worker thread. Until then
2984 * all packets will get queued up in SRS
2985 * and hence this code path won't be
2986 * entered until BW_ENFORCED is reset.
2987 */
2988 cv_signal(&mac_srs->srs_async);
2989 mutex_exit(&mac_srs->srs_lock);
2990 return (cookie);
2991 }
2992
2993 mac_srs->srs_bw->mac_bw_used += sz;
2994 mutex_exit(&mac_srs->srs_lock);
2995
2996 if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
2997 mac_soft_ring_t *softring;
2998 uint_t indx, hash;
2999
3000 hash = HASH_HINT(fanout_hint);
3001 indx = COMPUTE_INDEX(hash,
3002 mac_srs->srs_tx_ring_count);
3003 softring = mac_srs->srs_tx_soft_rings[indx];
3004 return (mac_tx_soft_ring_process(softring, mp_chain, flag,
3005 ret_mp));
3006 } else if (srs_tx->st_mode == SRS_TX_BW_AGGR) {
3007 return (mac_tx_aggr_mode(mac_srs, mp_chain,
3008 fanout_hint, flag, ret_mp));
3009 } else {
3010 mac_tx_stats_t stats;
3011
3012 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3013 mp_chain, &stats);
3014
3015 if (mp_chain != NULL) {
3016 mutex_enter(&mac_srs->srs_lock);
3017 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3018 if (mac_srs->srs_bw->mac_bw_used > sz)
3019 mac_srs->srs_bw->mac_bw_used -= sz;
3020 else
3021 mac_srs->srs_bw->mac_bw_used = 0;
3022 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
3023 fanout_hint, ret_mp);
3024 mutex_exit(&mac_srs->srs_lock);
3025 return (cookie);
3026 }
3027 SRS_TX_STATS_UPDATE(mac_srs, &stats);
3028
3029 return (NULL);
3030 }
3031 }
3032
3033 /*
3034 * mac_tx_aggr_mode
3035 *
3036 * This routine invokes an aggr function, aggr_find_tx_ring(), to find
3037 * a (pseudo) Tx ring belonging to a port on which the packet has to
3038 * be sent. aggr_find_tx_ring() first finds the outgoing port based on
3039 * L2/L3/L4 policy and then uses the fanout_hint passed to it to pick
3040 * a Tx ring from the selected port.
3041 *
3042 * Note that a port can be deleted from the aggregation. In such a case,
3043 * the aggregation layer first separates the port from the rest of the
3044 * ports making sure that port (and thus any Tx rings associated with
3045 * it) won't get selected in the call to aggr_find_tx_ring() function.
3046 * Later calls are made to mac_group_rem_ring() passing pseudo Tx ring
3047 * handles one by one which in turn will quiesce the Tx SRS and remove
3048 * the soft ring associated with the pseudo Tx ring. Unlike Rx side
3049 * where a cookie is used to protect against mac_rx_ring() calls on
3050 * rings that have been removed, no such cookie is needed on the Tx
3051 * side as the pseudo Tx ring won't be available anymore to
3052 * aggr_find_tx_ring() once the port has been removed.
3053 */
3054 static mac_tx_cookie_t
3055 mac_tx_aggr_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
3056 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
3057 {
3058 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3059 mac_tx_ring_fn_t find_tx_ring_fn;
3060 mac_ring_handle_t ring = NULL;
3061 void *arg;
3062 mac_soft_ring_t *sringp;
3063
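/*
 * Ask the aggr to pick the outgoing (pseudo) Tx ring for this
 * chain, then hand the chain to the soft ring saved at that
 * ring's mr_index in st_soft_rings[].
 */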
3064 find_tx_ring_fn = srs_tx->st_capab_aggr.mca_find_tx_ring_fn;
3065 arg = srs_tx->st_capab_aggr.mca_arg;
3066 if (find_tx_ring_fn(arg, mp_chain, fanout_hint, &ring) == NULL)
3067 return (NULL);
3068 sringp = srs_tx->st_soft_rings[((mac_ring_t *)ring)->mr_index];
3069 return (mac_tx_soft_ring_process(sringp, mp_chain, flag, ret_mp));
3070 }
3071
3072 void
3073 mac_tx_invoke_callbacks(mac_client_impl_t *mcip, mac_tx_cookie_t cookie)
3074 {
3075 mac_cb_t *mcb;
3076 mac_tx_notify_cb_t *mtnfp;
3077
3078 /* Wakeup callback registered clients */
3079 MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
3080 for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
3081 mcb = mcb->mcb_nextp) {
3082 mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
3083 mtnfp->mtnf_fn(mtnfp->mtnf_arg, cookie);
3084 }
3085 MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
3086 &mcip->mci_tx_notify_cb_list);
3087 }
3088
3089 /* ARGSUSED */
3090 void
3091 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
3092 {
3093 mblk_t *head, *tail;
3094 size_t sz;
3095 uint32_t tx_mode;
3096 uint_t saved_pkt_count;
3097 mac_tx_stats_t stats;
3098 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3099 clock_t now;
3100
3101 saved_pkt_count = 0;
3102 ASSERT(mutex_owned(&mac_srs->srs_lock));
3103 ASSERT(!(mac_srs->srs_state & SRS_PROC));
3104
3105 mac_srs->srs_state |= SRS_PROC;
3106
3107 tx_mode = srs_tx->st_mode;
3108 if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
3109 if (mac_srs->srs_first != NULL) {
3110 head = mac_srs->srs_first;
3111 tail = mac_srs->srs_last;
3112 saved_pkt_count = mac_srs->srs_count;
3113 mac_srs->srs_first = NULL;
3114 mac_srs->srs_last = NULL;
3115 mac_srs->srs_count = 0;
3116 mutex_exit(&mac_srs->srs_lock);
3117
3118 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3119 head, &stats);
3120
3121 mutex_enter(&mac_srs->srs_lock);
3122 if (head != NULL) {
3123 /* Device out of tx desc, set block */
3124 if (head->b_next == NULL)
3125 VERIFY(head == tail);
3126 tail->b_next = mac_srs->srs_first;
3127 mac_srs->srs_first = head;
3128 mac_srs->srs_count +=
3129 (saved_pkt_count - stats.mts_opackets);
3130 if (mac_srs->srs_last == NULL)
3131 mac_srs->srs_last = tail;
3132 MAC_TX_SRS_BLOCK(mac_srs, head);
3133 } else {
3134 srs_tx->st_woken_up = B_FALSE;
3135 SRS_TX_STATS_UPDATE(mac_srs, &stats);
3136 }
3137 }
3138 } else if (tx_mode == SRS_TX_BW) {
3139 /*
3140 * We are here because the timer fired and we have some data
3141 * to transmit. Also mac_tx_srs_worker should have reset the
3142 * SRS_BW_ENFORCED flag.
3143 */
3144 ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
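/*
 * Dequeue packets one at a time, charging each against this
 * tick's bandwidth, until the queue is empty or the limit is
 * hit (in which case SRS_BW_ENFORCED is set and we stop).
 */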
3145 head = tail = mac_srs->srs_first;
3146 while (mac_srs->srs_first != NULL) {
3147 tail = mac_srs->srs_first;
3148 tail->b_prev = NULL;
3149 mac_srs->srs_first = tail->b_next;
3150 if (mac_srs->srs_first == NULL)
3151 mac_srs->srs_last = NULL;
3152 mac_srs->srs_count--;
3153 sz = msgdsize(tail);
3154 mac_srs->srs_size -= sz;
3155 saved_pkt_count++;
3156 MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
3157
3158 if (mac_srs->srs_bw->mac_bw_used <
3159 mac_srs->srs_bw->mac_bw_limit)
3160 continue;
3161
3162 now = ddi_get_lbolt();
3163 if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3164 mac_srs->srs_bw->mac_bw_curr_time = now;
3165 mac_srs->srs_bw->mac_bw_used = sz;
3166 continue;
3167 }
3168 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3169 break;
3170 }
3171
3172 ASSERT((head == NULL && tail == NULL) ||
3173 (head != NULL && tail != NULL));
3174 if (tail != NULL) {
3175 tail->b_next = NULL;
3176 mutex_exit(&mac_srs->srs_lock);
3177
3178 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3179 head, &stats);
3180
3181 mutex_enter(&mac_srs->srs_lock);
3182 if (head != NULL) {
3183 uint_t size_sent;
3184
3185 /* Device out of tx desc, set block */
3186 if (head->b_next == NULL)
3187 VERIFY(head == tail);
3188 tail->b_next = mac_srs->srs_first;
3189 mac_srs->srs_first = head;
3190 mac_srs->srs_count +=
3191 (saved_pkt_count - stats.mts_opackets);
3192 if (mac_srs->srs_last == NULL)
3193 mac_srs->srs_last = tail;
3194 size_sent = sz - stats.mts_obytes;
3195 mac_srs->srs_size += size_sent;
3196 mac_srs->srs_bw->mac_bw_sz += size_sent;
3197 if (mac_srs->srs_bw->mac_bw_used > size_sent) {
3198 mac_srs->srs_bw->mac_bw_used -=
3199 size_sent;
3200 } else {
3201 mac_srs->srs_bw->mac_bw_used = 0;
3202 }
3203 MAC_TX_SRS_BLOCK(mac_srs, head);
3204 } else {
3205 srs_tx->st_woken_up = B_FALSE;
3206 SRS_TX_STATS_UPDATE(mac_srs, &stats);
3207 }
3208 }
3209 } else if (tx_mode == SRS_TX_BW_FANOUT || tx_mode == SRS_TX_BW_AGGR) {
3210 mblk_t *prev;
3211 uint64_t hint;
3212
3213 /*
3214 * We are here because the timer fired and we
3215 * have some quota to transmit.
3216 */
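/*
 * Dequeue packets and group consecutive packets carrying the same
 * fanout hint (stashed in b_prev at enqueue time) into sub-chains;
 * each sub-chain is handed to a soft ring via TX_SRS_TO_SOFT_RING(),
 * subject to the bandwidth available for this tick.
 */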
3217 prev = NULL;
3218 head = tail = mac_srs->srs_first;
3219 while (mac_srs->srs_first != NULL) {
3220 tail = mac_srs->srs_first;
3221 mac_srs->srs_first = tail->b_next;
3222 if (mac_srs->srs_first == NULL)
3223 mac_srs->srs_last = NULL;
3224 mac_srs->srs_count--;
3225 sz = msgdsize(tail);
3226 mac_srs->srs_size -= sz;
3227 mac_srs->srs_bw->mac_bw_used += sz;
3228 if (prev == NULL)
3229 hint = (ulong_t)tail->b_prev;
3230 if (hint != (ulong_t)tail->b_prev) {
3231 prev->b_next = NULL;
3232 mutex_exit(&mac_srs->srs_lock);
3233 TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3234 head = tail;
3235 hint = (ulong_t)tail->b_prev;
3236 mutex_enter(&mac_srs->srs_lock);
3237 }
3238
3239 prev = tail;
3240 tail->b_prev = NULL;
3241 if (mac_srs->srs_bw->mac_bw_used <
3242 mac_srs->srs_bw->mac_bw_limit)
3243 continue;
3244
3245 now = ddi_get_lbolt();
3246 if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3247 mac_srs->srs_bw->mac_bw_curr_time = now;
3248 mac_srs->srs_bw->mac_bw_used = 0;
3249 continue;
3250 }
3251 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3252 break;
3253 }
3254 ASSERT((head == NULL && tail == NULL) ||
3255 (head != NULL && tail != NULL));
3256 if (tail != NULL) {
3257 tail->b_next = NULL;
3258 mutex_exit(&mac_srs->srs_lock);
3259 TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3260 mutex_enter(&mac_srs->srs_lock);
3261 }
3262 }
3263 /*
3264 * SRS_TX_FANOUT case not considered here because packets
3265 * won't be queued in the SRS for this case. Packets will
3266 * be sent directly to soft rings underneath and if there
3267 * is any queueing at all, it would be in Tx side soft
3268 * rings.
3269 */
3270
3271 /*
3272 * When srs_count becomes 0, reset SRS_TX_HIWAT and
3273 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients.
3274 */
3275 if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
3276 (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
3277 mac_client_impl_t *mcip = mac_srs->srs_mcip;
3278 boolean_t wakeup_required = B_FALSE;
3279
3280 if (mac_srs->srs_state &
3281 (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
3282 wakeup_required = B_TRUE;
3283 }
3284 mac_srs->srs_state &= ~(SRS_TX_HIWAT |
3285 SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
3286 mutex_exit(&mac_srs->srs_lock);
3287 if (wakeup_required) {
3288 mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)mac_srs);
3289 /*
3290 * If the client is not the primary MAC client, then we
3291 * need to send the notification to the clients upper
3292 * need to send the notification to the client's upper
3293 */
3294 mac_tx_notify(mcip->mci_upper_mip != NULL ?
3295 mcip->mci_upper_mip : mcip->mci_mip);
3296 }
3297 mutex_enter(&mac_srs->srs_lock);
3298 }
3299 mac_srs->srs_state &= ~SRS_PROC;
3300 }
3301
3302 /*
3303 * Given a packet, get the flow_entry that identifies the flow
3304 * to which that packet belongs. The flow_entry will contain
3305 * the transmit function to be used to send the packet. If the
3306 * function returns NULL, the packet should be sent using the
3307 * underlying NIC.
3308 */
3309 static flow_entry_t *
3310 mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
3311 {
3312 flow_entry_t *flent = NULL;
3313 mac_client_impl_t *mcip;
3314 int err;
3315
3316 /*
3317 * Do classification on the packet.
3318 */
3319 err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
3320 if (err != 0)
3321 return (NULL);
3322
3323 /*
3324 * This flent might just be an additional one on the MAC client,
3325 * i.e. for classification purposes (different fdesc), however
3326 * the resources, SRS et. al., are in the mci_flent, so if
3327 * this isn't the mci_flent, we need to get it.
3328 */
3329 if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
3330 FLOW_REFRELE(flent);
3331 flent = mcip->mci_flent;
3332 FLOW_TRY_REFHOLD(flent, err);
3333 if (err != 0)
3334 return (NULL);
3335 }
3336
3337 return (flent);
3338 }
3339
3340 /*
3341 * This macro is only meant to be used by mac_tx_send().
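 * It performs the per-client VLAN id check and, when needed, inserts
 * a VLAN tag; on any failure the packet is dropped, oerrors is bumped
 * and the loop moves on to the next packet.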
3342 */
3343 #define CHECK_VID_AND_ADD_TAG(mp) { \
3344 if (vid_check) { \
3345 int err = 0; \
3346 \
3347 MAC_VID_CHECK(src_mcip, (mp), err); \
3348 if (err != 0) { \
3349 freemsg((mp)); \
3350 (mp) = next; \
3351 oerrors++; \
3352 continue; \
3353 } \
3354 } \
3355 if (add_tag) { \
3356 (mp) = mac_add_vlan_tag((mp), 0, vid); \
3357 if ((mp) == NULL) { \
3358 (mp) = next; \
3359 oerrors++; \
3360 continue; \
3361 } \
3362 } \
3363 }
3364
3365 mblk_t *
3366 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
3367 mac_tx_stats_t *stats)
3368 {
3369 mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
3370 mac_impl_t *mip = src_mcip->mci_mip;
3371 uint_t obytes = 0, opackets = 0, oerrors = 0;
3372 mblk_t *mp = NULL, *next;
3373 boolean_t vid_check, add_tag;
3374 uint16_t vid = 0;
3375
3376 if (mip->mi_nclients > 1) {
3377 vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
3378 add_tag = MAC_TAG_NEEDED(src_mcip);
3379 if (add_tag)
3380 vid = mac_client_vid(mch);
3381 } else {
3382 ASSERT(mip->mi_nclients == 1);
3383 vid_check = add_tag = B_FALSE;
3384 }
3385
3386 /*
3387 * Fastpath: if there's only one client, we simply send
3388 * the packet down to the underlying NIC.
3389 */
3390 if (mip->mi_nactiveclients == 1) {
3391 DTRACE_PROBE2(fastpath,
3392 mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);
3393
3394 mp = mp_chain;
3395 while (mp != NULL) {
3396 next = mp->b_next;
3397 mp->b_next = NULL;
3398 opackets++;
3399 obytes += (mp->b_cont == NULL ? MBLKL(mp) :
3400 msgdsize(mp));
3401
3402 CHECK_VID_AND_ADD_TAG(mp);
3403 MAC_TX(mip, ring, mp, src_mcip);
3404
3405 /*
3406 * If the driver is out of descriptors and does a
3407 * partial send it will return a chain of unsent
3408 * mblks. Adjust the accounting stats.
3409 */
3410 if (mp != NULL) {
3411 opackets--;
3412 obytes -= msgdsize(mp);
3413 mp->b_next = next;
3414 break;
3415 }
3416 mp = next;
3417 }
3418 goto done;
3419 }
3420
3421 /*
3422 * No fastpath, we either have more than one MAC client
3423 * defined on top of the same MAC, or one or more MAC
3424 * client promiscuous callbacks.
3425 */
3426 DTRACE_PROBE3(slowpath, mac_client_impl_t *,
3427 src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);
3428
3429 mp = mp_chain;
3430 while (mp != NULL) {
3431 flow_entry_t *dst_flow_ent;
3432 void *flow_cookie;
3433 size_t pkt_size;
3434 mblk_t *mp1;
3435
3436 next = mp->b_next;
3437 mp->b_next = NULL;
3438 opackets++;
3439 pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
3440 obytes += pkt_size;
3441 CHECK_VID_AND_ADD_TAG(mp);
3442
3443 /*
3444 * Find the destination.
3445 */
3446 dst_flow_ent = mac_tx_classify(mip, mp);
3447
3448 if (dst_flow_ent != NULL) {
3449 size_t hdrsize;
3450 int err = 0;
3451
3452 if (mip->mi_info.mi_nativemedia == DL_ETHER) {
3453 struct ether_vlan_header *evhp =
3454 (struct ether_vlan_header *)mp->b_rptr;
3455
3456 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
3457 hdrsize = sizeof (*evhp);
3458 else
3459 hdrsize = sizeof (struct ether_header);
3460 } else {
3461 mac_header_info_t mhi;
3462
3463 err = mac_header_info((mac_handle_t)mip,
3464 mp, &mhi);
3465 if (err == 0)
3466 hdrsize = mhi.mhi_hdrsize;
3467 }
3468
3469 /*
3470 * Got a matching flow. It's either another
3471 * MAC client, or a broadcast/multicast flow.
3472 * Make sure the packet size is within the
3473 * allowed size. If not drop the packet and
3474 * move to next packet.
3475 */
3476 if (err != 0 ||
3477 (pkt_size - hdrsize) > mip->mi_sdu_max) {
3478 oerrors++;
3479 DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
3480 mblk_t *, mp);
3481 freemsg(mp);
3482 mp = next;
3483 FLOW_REFRELE(dst_flow_ent);
3484 continue;
3485 }
3486 flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
3487 if (flow_cookie != NULL) {
3488 /*
3489 * The vnic_bcast_send function expects
3490 * to receive the sender MAC client
3491 * as value for arg2.
3492 */
3493 mac_bcast_send(flow_cookie, src_mcip, mp,
3494 B_TRUE);
3495 } else {
3496 /*
3497 * loopback the packet to a local MAC
3498 * client. We force a context switch
3499 * if both source and destination MAC
3500 * clients are used by IP, i.e.
3501 * bypass is set.
3502 */
3503 boolean_t do_switch;
3504 mac_client_impl_t *dst_mcip =
3505 dst_flow_ent->fe_mcip;
3506
3507 /*
3508 * Check if there are promiscuous mode
3509 * callbacks defined. This check is
3510 * done here in the 'else' case and
3511 * not in other cases because this
3512 * path is for local loopback
3513 * communication which does not go
3514 * through MAC_TX(). For paths that go
3515 * through MAC_TX(), the promisc_list
3516 * check is done inside the MAC_TX()
3517 * macro.
3518 */
3519 if (mip->mi_promisc_list != NULL)
3520 mac_promisc_dispatch(mip, mp, src_mcip);
3521
3522 do_switch = ((src_mcip->mci_state_flags &
3523 dst_mcip->mci_state_flags &
3524 MCIS_CLIENT_POLL_CAPABLE) != 0);
3525
3526 if ((mp1 = mac_fix_cksum(mp)) != NULL) {
3527 (dst_flow_ent->fe_cb_fn)(
3528 dst_flow_ent->fe_cb_arg1,
3529 dst_flow_ent->fe_cb_arg2,
3530 mp1, do_switch);
3531 }
3532 }
3533 FLOW_REFRELE(dst_flow_ent);
3534 } else {
3535 /*
3536 * Unknown destination, send via the underlying
3537 * NIC.
3538 */
3539 MAC_TX(mip, ring, mp, src_mcip);
3540 if (mp != NULL) {
3541 /*
3542 * Adjust for the last packet that
3543 * could not be transmitted
3544 */
3545 opackets--;
3546 obytes -= pkt_size;
3547 mp->b_next = next;
3548 break;
3549 }
3550 }
3551 mp = next;
3552 }
3553
3554 done:
3555 stats->mts_obytes = obytes;
3556 stats->mts_opackets = opackets;
3557 stats->mts_oerrors = oerrors;
3558 return (mp);
3559 }
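/*
 * Illustrative sketch (not part of this file): how callers such as
 * mac_tx_soft_ring_process() below consume the return value of
 * mac_tx_send(). Any unsent remainder of the chain is handed back so
 * that the caller can re-queue it and enter flow control; the stats
 * only reflect what was actually sent. The my_requeue() helper and
 * my_srs variable are hypothetical.
 *
 *	mac_tx_stats_t	stats;
 *	mblk_t		*unsent;
 *
 *	unsent = mac_tx_send(arg1, arg2, mp_chain, &stats);
 *	if (unsent != NULL) {
 *		cookie = my_requeue(my_srs, unsent);	(re-queue, cookie)
 *	} else {
 *		SRS_TX_STATS_UPDATE(my_srs, &stats);
 *	}
 */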
3560
3561 /*
3562 * mac_tx_srs_ring_present
3563 *
3564 * Returns whether the specified ring is part of the specified SRS.
3565 */
3566 boolean_t
3567 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3568 {
3569 int i;
3570 mac_soft_ring_t *soft_ring;
3571
3572 if (srs->srs_tx.st_arg2 == tx_ring)
3573 return (B_TRUE);
3574
3575 for (i = 0; i < srs->srs_tx_ring_count; i++) {
3576 soft_ring = srs->srs_tx_soft_rings[i];
3577 if (soft_ring->s_ring_tx_arg2 == tx_ring)
3578 return (B_TRUE);
3579 }
3580
3581 return (B_FALSE);
3582 }
3583
3584 /*
3585 * mac_tx_srs_get_soft_ring
3586 *
3587 * Returns the TX soft ring associated with the given ring, if present.
3588 */
3589 mac_soft_ring_t *
3590 mac_tx_srs_get_soft_ring(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3591 {
3592 int i;
3593 mac_soft_ring_t *soft_ring;
3594
3595 if (srs->srs_tx.st_arg2 == tx_ring)
3596 return (NULL);
3597
3598 for (i = 0; i < srs->srs_tx_ring_count; i++) {
3599 soft_ring = srs->srs_tx_soft_rings[i];
3600 if (soft_ring->s_ring_tx_arg2 == tx_ring)
3601 return (soft_ring);
3602 }
3603
3604 return (NULL);
3605 }
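/*
 * Illustrative sketch (not part of this file): the two lookup helpers
 * above are typically used together. A caller that has resolved a Tx
 * ring first checks that the ring belongs to the SRS and then fetches
 * the matching soft ring; a NULL result corresponds to the single-ring
 * case (st_arg2 is the ring itself and no soft rings exist), in which
 * the SRS is used directly. The my_srs_tx_func() name is hypothetical.
 *
 *	if (mac_tx_srs_ring_present(srs, tx_ring)) {
 *		mac_soft_ring_t *sring;
 *
 *		sring = mac_tx_srs_get_soft_ring(srs, tx_ring);
 *		if (sring != NULL)
 *			cookie = mac_tx_soft_ring_process(sring, mp,
 *			    flag, &ret_mp);
 *		else
 *			cookie = my_srs_tx_func(srs, mp, flag, &ret_mp);
 *	}
 */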
3606
3607 /*
3608 * mac_tx_srs_wakeup
3609 *
3610 * Called when Tx descriptors become available. Wake up the appropriate
3611 * worker thread after clearing the SRS_TX_BLOCKED/S_RING_BLOCK bit in
3612 * the state field.
3613 */
3614 void
3615 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
3616 {
3617 int i;
3618 mac_soft_ring_t *sringp;
3619 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3620
3621 mutex_enter(&mac_srs->srs_lock);
3622 /*
3623 * srs_tx_ring_count == 0 is the single ring mode case. In
3624 * this mode, there are no Tx soft rings associated with
3625 * the SRS.
3626 */
3627 if (!MAC_TX_SOFT_RINGS(mac_srs)) {
3628 if (srs_tx->st_arg2 == ring &&
3629 mac_srs->srs_state & SRS_TX_BLOCKED) {
3630 mac_srs->srs_state &= ~SRS_TX_BLOCKED;
3631 srs_tx->st_stat.mts_unblockcnt++;
3632 cv_signal(&mac_srs->srs_async);
3633 }
3634 /*
3635 * A wakeup can arrive before tx_srs_drain() has grabbed
3636 * the srs lock and set SRS_TX_BLOCKED, so always set the
3637 * woken_up flag when we get here.
3638 */
3639 srs_tx->st_woken_up = B_TRUE;
3640 mutex_exit(&mac_srs->srs_lock);
3641 return;
3642 }
3643
3644 /*
3645 * We only get here in the SRS_TX_FANOUT, SRS_TX_BW_FANOUT,
3646 * SRS_TX_AGGR or SRS_TX_BW_AGGR cases.
3647 */
3648 for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
3649 sringp = mac_srs->srs_tx_soft_rings[i];
3650 mutex_enter(&sringp->s_ring_lock);
3651 if (sringp->s_ring_tx_arg2 == ring) {
3652 if (sringp->s_ring_state & S_RING_BLOCK) {
3653 sringp->s_ring_state &= ~S_RING_BLOCK;
3654 sringp->s_st_stat.mts_unblockcnt++;
3655 cv_signal(&sringp->s_ring_async);
3656 }
3657 sringp->s_ring_tx_woken_up = B_TRUE;
3658 }
3659 mutex_exit(&sringp->s_ring_lock);
3660 }
3661 mutex_exit(&mac_srs->srs_lock);
3662 }
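/*
 * Illustrative sketch (assumption, not part of this file): the wakeup
 * above is normally driven from a driver's Tx-completion path once
 * descriptors have been reclaimed for a ring. mac_tx_ring_update() is
 * the usual MAC provider entry point for ring-aware drivers, but its
 * use here is stated as an assumption, and the mydrv_* names are
 * hypothetical.
 *
 *	static uint_t
 *	mydrv_tx_reclaim_intr(caddr_t arg)
 *	{
 *		mydrv_ring_t *rp = (mydrv_ring_t *)arg;
 *
 *		mydrv_reclaim_tx_descs(rp);
 *		mac_tx_ring_update(rp->r_mac_handle, rp->r_ring_handle);
 *		return (DDI_INTR_CLAIMED);
 *	}
 */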
3663
3664 /*
3665 * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
3666 * the blocked clients again.
3667 */
3668 void
3669 mac_tx_notify(mac_impl_t *mip)
3670 {
3671 i_mac_notify(mip, MAC_NOTE_TX);
3672 }
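/*
 * Illustrative sketch (assumption, not part of this file): a client
 * that has been flow controlled typically registers a Tx notification
 * callback and uses it to resume sending once the notification above
 * fires. The registration call shown reflects the usual MAC client
 * interface but is an assumption here; my_resume_tx() and my_arg are
 * hypothetical.
 *
 *	static void
 *	my_tx_notify_cb(void *arg, mac_tx_cookie_t cookie)
 *	{
 *		my_resume_tx(arg, cookie);
 *	}
 *
 *	...
 *	(void) mac_client_tx_notify(mch, my_tx_notify_cb, my_arg);
 */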
3673
3674 /*
3675 * RX SOFTRING RELATED FUNCTIONS
3676 *
3677 * These functions really belong in mac_soft_ring.c and are only
3678 * here for a short period.
3679 */
3680
3681 #define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \
3682 /* \
3683 * Enqueue our mblk chain. \
3684 */ \
3685 ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \
3686 \
3687 if ((ringp)->s_ring_last != NULL) \
3688 (ringp)->s_ring_last->b_next = (mp); \
3689 else \
3690 (ringp)->s_ring_first = (mp); \
3691 (ringp)->s_ring_last = (tail); \
3692 (ringp)->s_ring_count += (cnt); \
3693 ASSERT((ringp)->s_ring_count > 0); \
3694 if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \
3695 (ringp)->s_ring_size += sz; \
3696 } \
3697 }
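/*
 * For clarity, the macro above has the same effect as the following
 * illustrative sketch (not part of this file): append the pre-counted
 * chain to the ring's singly linked list (s_ring_first/s_ring_last,
 * linked through b_next), bump the packet count, and track queued
 * bytes only when the ring is under bandwidth control.
 *
 *	static void
 *	soft_ring_enqueue_chain(mac_soft_ring_t *ringp, mblk_t *mp,
 *	    mblk_t *tail, int cnt, size_t sz)
 *	{
 *		ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
 *		if (ringp->s_ring_last != NULL)
 *			ringp->s_ring_last->b_next = mp;
 *		else
 *			ringp->s_ring_first = mp;
 *		ringp->s_ring_last = tail;
 *		ringp->s_ring_count += cnt;
 *		if (ringp->s_ring_type & ST_RING_BW_CTL)
 *			ringp->s_ring_size += sz;
 *	}
 */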
3698
3699 /*
3700 * Default entry point to deliver a packet chain to a MAC client.
3701 * If the MAC client has flows, do the classification with these
3702 * flows as well.
3703 */
3704 /* ARGSUSED */
3705 void
3706 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
3707 mac_header_info_t *arg3)
3708 {
3709 mac_client_impl_t *mcip = arg1;
3710
3711 if (mcip->mci_nvids == 1 &&
3712 !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
3713 /*
3714 * If the client has exactly one VID associated with it
3715 * and stripping of the VLAN header is not disabled,
3716 * remove the VLAN tag from the packet before
3717 * passing it on to the client's receive callback.
3718 * Note that this needs to be done after we dispatch
3719 * the packet to the promiscuous listeners of the
3720 * client, since they expect to see the whole
3721 * frame including the VLAN headers.
3722 */
3723 mp_chain = mac_strip_vlan_tag_chain(mp_chain);
3724 }
3725
3726 mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
3727 }
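/*
 * Illustrative sketch (assumption, not part of this file): mci_rx_fn
 * above is whatever receive callback the client registered (for
 * example via mac_rx_set()). Given the stripping logic above, a
 * client with exactly one VID and stripping enabled may assume its
 * callback sees untagged frames. The my_* names are hypothetical.
 *
 *	static void
 *	my_client_rx(void *arg, mac_resource_handle_t mrh, mblk_t *chain,
 *	    boolean_t is_loopback)
 *	{
 *		mblk_t *mp, *next;
 *
 *		for (mp = chain; mp != NULL; mp = next) {
 *			next = mp->b_next;
 *			mp->b_next = NULL;
 *			my_process_frame(arg, mp);	(untagged here)
 *		}
 *	}
 */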
3728
3729 /*
3730 * mac_rx_soft_ring_process
3731 *
3732 * Process a chain for a given soft ring. If the number of packets
3733 * queued in the SRS and its associated soft rings (including this
3734 * one) is very small (tracked by srs_poll_pkt_cnt), allow the
3735 * entering thread (interrupt or poll thread) to do the processing
3736 * inline. This helps keep the latency down under low load.
3737 *
3738 * The proc and arg for each mblk are already stored in the mblk in
3739 * the appropriate places.
3740 */
3741 /* ARGSUSED */
3742 void
3743 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
3744 mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
3745 {
3746 mac_direct_rx_t proc;
3747 void *arg1;
3748 mac_resource_handle_t arg2;
3749 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3750
3751 ASSERT(ringp != NULL);
3752 ASSERT(mp_chain != NULL);
3753 ASSERT(tail != NULL);
3754 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3755
3756 mutex_enter(&ringp->s_ring_lock);
3757 ringp->s_ring_total_inpkt += cnt;
3758 ringp->s_ring_total_rbytes += sz;
3759 if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
3760 !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
3761 /* If already being processed or blanking is on, enqueue and return */
3762 if (ringp->s_ring_state & S_RING_BLANK ||
3763 ringp->s_ring_state & S_RING_PROC) {
3764 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3765 mutex_exit(&ringp->s_ring_lock);
3766 return;
3767 }
3768 proc = ringp->s_ring_rx_func;
3769 arg1 = ringp->s_ring_rx_arg1;
3770 arg2 = ringp->s_ring_rx_arg2;
3771 /*
3772 * See if anything is already queued. If we are the
3773 * first packet, do inline processing; otherwise queue
3774 * the packet and do the drain.
3775 */
3776 if (ringp->s_ring_first == NULL) {
3777 /*
3778 * Fast-path, ok to process and nothing queued.
3779 */
3780 ringp->s_ring_run = curthread;
3781 ringp->s_ring_state |= (S_RING_PROC);
3782
3783 mutex_exit(&ringp->s_ring_lock);
3784
3785 /*
3786 * We have a chain of exactly one packet,
3787 * so take this fast path.
3788 */
3789 ASSERT(mp_chain->b_next == NULL);
3790
3791 (*proc)(arg1, arg2, mp_chain, NULL);
3792
3793 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3794 /*
3795 * If we have a soft ring set which is doing
3796 * bandwidth control, decrement srs_size and
3797 * count so that the SRS has an accurate idea
3798 * of how much data is really queued between
3799 * the SRS and its soft rings. We decrement
3800 * the counters only once the packet has been
3801 * processed by both the SRS and the soft ring.
3802 */
3803 mutex_enter(&mac_srs->srs_lock);
3804 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
3805 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
3806 mutex_exit(&mac_srs->srs_lock);
3807
3808 mutex_enter(&ringp->s_ring_lock);
3809 ringp->s_ring_run = NULL;
3810 ringp->s_ring_state &= ~S_RING_PROC;
3811 if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
3812 cv_signal(&ringp->s_ring_client_cv);
3813
3814 if ((ringp->s_ring_first == NULL) ||
3815 (ringp->s_ring_state & S_RING_BLANK)) {
3816 /*
3817 * We processed our packet inline and
3818 * either nothing new has arrived or our
3819 * receiver doesn't want to receive any
3820 * more packets. We are done.
3821 */
3822 mutex_exit(&ringp->s_ring_lock);
3823 return;
3824 }
3825 } else {
3826 SOFT_RING_ENQUEUE_CHAIN(ringp,
3827 mp_chain, tail, cnt, sz);
3828 }
3829
3830 /*
3831 * We are here because either we couldn't do inline
3832 * processing (because something was already
3833 * queued), or we had a chain of more than one
3834 * packet, or something else arrived after we were
3835 * done with inline processing.
3836 */
3837 ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3838 ASSERT(ringp->s_ring_first != NULL);
3839
3840 ringp->s_ring_drain_func(ringp);
3841 mutex_exit(&ringp->s_ring_lock);
3842 return;
3843 } else {
3844 /* ST_RING_WORKER_ONLY case */
3845 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3846 mac_soft_ring_worker_wakeup(ringp);
3847 mutex_exit(&ringp->s_ring_lock);
3848 }
3849 }
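/*
 * Illustrative sketch (not part of this file): the decision made above,
 * reduced to its essentials. Inline processing is attempted only when
 * the SRS backlog is tiny, the ring is not worker-only, no other thread
 * is processing or blanking the ring, and nothing is already queued;
 * every other case is enqueued for the worker/drain path.
 *
 *	inline_ok = (mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
 *	    !(ringp->s_ring_type & ST_RING_WORKER_ONLY) &&
 *	    !(ringp->s_ring_state & (S_RING_BLANK | S_RING_PROC)) &&
 *	    (ringp->s_ring_first == NULL);
 */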
3850
3851 /*
3852 * TX SOFTRING RELATED FUNCTIONS
3853 *
3854 * These functions really belong in mac_soft_ring.c and are only
3855 * here for a short period.
3856 */
3857
3858 #define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \
3859 ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \
3860 (ringp)->s_ring_state |= S_RING_ENQUEUED; \
3861 SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz); \
3862 }
3863
3864 /*
3865 * mac_tx_sring_enqueue
3866 *
3867 * When we are out of transmit descriptors and we already have a
3868 * queue that exceeds hiwat (or the client called us with the
3869 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
3870 * soft ring pointer as the opaque cookie so that the client can
3871 * enable flow control.
3872 */
3873 static mac_tx_cookie_t
3874 mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
3875 mblk_t **ret_mp)
3876 {
3877 int cnt;
3878 size_t sz;
3879 mblk_t *tail;
3880 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3881 mac_tx_cookie_t cookie = NULL;
3882 boolean_t wakeup_worker = B_TRUE;
3883
3884 ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3885 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3886 if (flag & MAC_DROP_ON_NO_DESC) {
3887 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
3888 /* increment freed stats */
3889 ringp->s_ring_drops += cnt;
3890 cookie = (mac_tx_cookie_t)ringp;
3891 } else {
3892 if (ringp->s_ring_first != NULL)
3893 wakeup_worker = B_FALSE;
3894
3895 if (flag & MAC_TX_NO_ENQUEUE) {
3896 /*
3897 * If nothing is queued yet, queue the
3898 * packet and let mac_tx_soft_ring_drain()
3899 * set the blocked bit for the reasons
3900 * explained above. Otherwise, return the
3901 * mblks to the caller.
3902 */
3903 if (wakeup_worker) {
3904 TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
3905 mp_chain, tail, cnt, sz);
3906 } else {
3907 ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
3908 cookie = (mac_tx_cookie_t)ringp;
3909 *ret_mp = mp_chain;
3910 }
3911 } else {
3912 boolean_t enqueue = B_TRUE;
3913
3914 if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3915 /*
3916 * Flow controlled. Store ringp in the
3917 * cookie so that it can be returned to
3918 * the client as a mac_tx_cookie_t.
3919 */
3920 ringp->s_ring_state |= S_RING_TX_HIWAT;
3921 cookie = (mac_tx_cookie_t)ringp;
3922 ringp->s_ring_hiwat_cnt++;
3923 if (ringp->s_ring_count >
3924 ringp->s_ring_tx_max_q_cnt) {
3925 /* increment freed stats */
3926 ringp->s_ring_drops += cnt;
3927 /*
3928 * b_prev may be set to the fanout hint,
3929 * hence we can't use freemsg() directly.
3930 */
3931 mac_pkt_drop(NULL, NULL,
3932 mp_chain, B_FALSE);
3933 DTRACE_PROBE1(tx_queued_hiwat,
3934 mac_soft_ring_t *, ringp);
3935 enqueue = B_FALSE;
3936 }
3937 }
3938 if (enqueue) {
3939 TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
3940 tail, cnt, sz);
3941 }
3942 }
3943 if (wakeup_worker)
3944 cv_signal(&ringp->s_ring_async);
3945 }
3946 return (cookie);
3947 }
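/*
 * Illustrative sketch (assumption, not part of this file): how a MAC
 * client typically interprets the cookie produced above. A non-NULL
 * cookie means the chain was dropped (MAC_DROP_ON_NO_DESC), handed
 * back (MAC_TX_NO_ENQUEUE with packets already queued), or queued past
 * the high-water mark, and the client should back off until the Tx
 * notification fires. The mac_tx()/mac_tx_is_flow_blocked() calls
 * reflect the usual client interface but are stated as an assumption.
 *
 *	cookie = mac_tx(mch, mp, fanout_hint, MAC_TX_NO_ENQUEUE, &ret_mp);
 *	if (cookie != NULL) {
 *		remember cookie and ret_mp, stop sending;
 *		resume once mac_tx_is_flow_blocked(mch, cookie) is
 *		B_FALSE (for example from a Tx notify callback).
 *	}
 */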
3948
3949
3950 /*
3951 * mac_tx_soft_ring_process
3952 *
3953 * This routine is called when fanning out outgoing traffic among
3954 * multiple Tx rings.
3955 * Note that each soft ring is associated with a h/w Tx ring.
3956 */
3957 mac_tx_cookie_t
3958 mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
3959 uint16_t flag, mblk_t **ret_mp)
3960 {
3961 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3962 int cnt;
3963 size_t sz;
3964 mblk_t *tail;
3965 mac_tx_cookie_t cookie = NULL;
3966
3967 ASSERT(ringp != NULL);
3968 ASSERT(mp_chain != NULL);
3969 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3970 /*
3971 * The following modes can come here: SRS_TX_BW_FANOUT,
3972 * SRS_TX_FANOUT, SRS_TX_AGGR, SRS_TX_BW_AGGR.
3973 */
3974 ASSERT(MAC_TX_SOFT_RINGS(mac_srs));
3975 ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
3976 mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT ||
3977 mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
3978 mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
3979
3980 if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
3981 /* Serialization mode */
3982
3983 mutex_enter(&ringp->s_ring_lock);
3984 if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3985 cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3986 flag, ret_mp);
3987 mutex_exit(&ringp->s_ring_lock);
3988 return (cookie);
3989 }
3990 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3991 TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3992 if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
3993 /*
3994 * If the ring is blocked due to lack of Tx
3995 * descriptors, just return. The worker
3996 * thread will get scheduled when Tx
3997 * descriptors become available.
3998 */
3999 mutex_exit(&ringp->s_ring_lock);
4000 return (cookie);
4001 }
4002 mac_soft_ring_worker_wakeup(ringp);
4003 mutex_exit(&ringp->s_ring_lock);
4004 return (cookie);
4005 } else {
4006 /* Default fanout mode */
4007 /*
4008 * S_RING_BLOCK is set when the underlying NIC runs
4009 * out of Tx descriptors and messages start getting
4010 * queued. It won't be cleared until
4011 * mac_tx_soft_ring_drain() completely drains out
4012 * the messages.
4013 */
4014 mac_tx_stats_t stats;
4015
4016 if (ringp->s_ring_state & S_RING_ENQUEUED) {
4017 /* Tx descs/resources not available */
4018 mutex_enter(&ringp->s_ring_lock);
4019 if (ringp->s_ring_state & S_RING_ENQUEUED) {
4020 cookie = mac_tx_sring_enqueue(ringp, mp_chain,
4021 flag, ret_mp);
4022 mutex_exit(&ringp->s_ring_lock);
4023 return (cookie);
4024 }
4025 /*
4026 * The flow control condition got relieved
4027 * while we were waiting for the lock.
4028 * Continue with the transmission.
4029 */
4030 mutex_exit(&ringp->s_ring_lock);
4031 }
4032
4033 mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
4034 ringp->s_ring_tx_arg2, mp_chain, &stats);
4035
4036 /*
4037 * Multiple threads could be here sending packets.
4038 * Under such conditions, it is not possible to
4039 * atomically set the S_RING_BLOCK bit to indicate
4040 * an out-of-Tx-descriptor condition. To set it
4041 * atomically, we queue the returned packets and
4042 * set S_RING_BLOCK in
4043 * mac_tx_soft_ring_drain().
4044 */
4045 if (mp_chain != NULL) {
4046 mutex_enter(&ringp->s_ring_lock);
4047 cookie =
4048 mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
4049 mutex_exit(&ringp->s_ring_lock);
4050 return (cookie);
4051 }
4052 SRS_TX_STATS_UPDATE(mac_srs, &stats);
4053 SOFTRING_TX_STATS_UPDATE(ringp, &stats);
4054
4055 return (NULL);
4056 }
4057 }
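/*
 * Illustrative sketch (assumption, not part of this file): in the
 * fanout modes the caller of mac_tx_soft_ring_process() first picks
 * one of the SRS's Tx soft rings, typically by hashing the fanout
 * hint. The exact selection policy lives in the Tx mode functions
 * earlier in this file; the modulo hash below is only a plausible
 * example.
 *
 *	idx = (uint_t)(hint % mac_srs->srs_tx_ring_count);
 *	ringp = mac_srs->srs_tx_soft_rings[idx];
 *	cookie = mac_tx_soft_ring_process(ringp, mp_chain, flag, ret_mp);
 */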