Print this page
918 Need better IP fanout (esp. with VLANs present)
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/io/mac/mac_sched.c
+++ new/usr/src/uts/common/io/mac/mac_sched.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 - */
25 -/*
26 24 * Copyright 2011 Joyent, Inc. All rights reserved.
25 + * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27 26 */
28 27
29 28 #include <sys/types.h>
30 29 #include <sys/callb.h>
31 30 #include <sys/sdt.h>
32 31 #include <sys/strsubr.h>
33 32 #include <sys/strsun.h>
34 33 #include <sys/vlan.h>
35 34 #include <sys/stack.h>
36 35 #include <sys/archsystm.h>
37 36 #include <inet/ipsec_impl.h>
38 37 #include <inet/ip_impl.h>
39 38 #include <inet/sadb.h>
40 39 #include <inet/ipsecesp.h>
41 40 #include <inet/ipsecah.h>
42 41 #include <inet/ip6.h>
43 42
44 43 #include <sys/mac_impl.h>
45 44 #include <sys/mac_client_impl.h>
46 45 #include <sys/mac_client_priv.h>
47 46 #include <sys/mac_soft_ring.h>
48 47 #include <sys/mac_flow_impl.h>
49 48
50 49 static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
51 50 uintptr_t, uint16_t, mblk_t **);
52 51 static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
53 52 uintptr_t, uint16_t, mblk_t **);
54 53 static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
55 54 uintptr_t, uint16_t, mblk_t **);
56 55 static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
57 56 uintptr_t, uint16_t, mblk_t **);
58 57 static mac_tx_cookie_t mac_tx_aggr_mode(mac_soft_ring_set_t *, mblk_t *,
59 58 uintptr_t, uint16_t, mblk_t **);
60 59
61 60 typedef struct mac_tx_mode_s {
62 61 mac_tx_srs_mode_t mac_tx_mode;
63 62 mac_tx_func_t mac_tx_func;
64 63 } mac_tx_mode_t;
65 64
66 65 /*
67 66 * There are seven modes of operation on the Tx side. These modes get set
68 67 * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
69 68 * none of the other modes are user configurable. They get selected by
70 69 * the system depending upon whether the link (or flow) has multiple Tx
71 70 * rings or a bandwidth configured, or if the link is an aggr, etc.
72 71 *
73 72 * When the Tx SRS is operating in aggr mode (st_mode) or if there are
74 73 * multiple Tx rings owned by Tx SRS, then each Tx ring (pseudo or
75 74 * otherwise) will have a soft ring associated with it. These soft rings
76 75 * are stored in srs_tx_soft_rings[] array.
77 76 *
78 77 * Additionally in the case of aggr, there is the st_soft_rings[] array
79 78 * in the mac_srs_tx_t structure. This array is used to store the same
80 79 * set of soft rings that are present in srs_tx_soft_rings[] array but
81 80 * in a different manner. The soft ring associated with the pseudo Tx
82 81 * ring is saved at mr_index (of the pseudo ring) in st_soft_rings[]
83 82 * array. This helps in quickly getting the soft ring associated with the
84 83 * Tx ring when aggr_find_tx_ring() returns the pseudo Tx ring that is to
85 84 * be used for transmit.
86 85 */
87 86 mac_tx_mode_t mac_tx_mode_list[] = {
88 87 {SRS_TX_DEFAULT, mac_tx_single_ring_mode},
89 88 {SRS_TX_SERIALIZE, mac_tx_serializer_mode},
90 89 {SRS_TX_FANOUT, mac_tx_fanout_mode},
91 90 {SRS_TX_BW, mac_tx_bw_mode},
92 91 {SRS_TX_BW_FANOUT, mac_tx_bw_mode},
93 92 {SRS_TX_AGGR, mac_tx_aggr_mode},
94 93 {SRS_TX_BW_AGGR, mac_tx_bw_mode}
95 94 };
96 95
97 96 /*
98 97 * Soft Ring Set (SRS) - The Run time code that deals with
99 98 * dynamic polling from the hardware, bandwidth enforcement,
100 99 * fanout etc.
101 100 *
102 101 * We try to use H/W classification on NIC and assign traffic for
103 102 * a MAC address to a particular Rx ring or ring group. There is a
104 103 * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
105 104 * switches the underlying Rx ring between interrupt and
106 105 * polling mode and enforces any specified B/W control.
107 106 *
108 107 * There is always a SRS created and tied to each H/W and S/W rule.
109 108 * Whenever we create a H/W rule, we always add the same rule to
110 109 * S/W classifier and tie a SRS to it.
111 110 *
112 111 * In case a B/W control is specified, it is broken into bytes
113 112 * per ticks and as soon as the quota for a tick is exhausted,
114 113 * the underlying Rx ring is forced into poll mode for remainder of
115 114 * the tick. The SRS poll thread only polls for bytes that are
116 115 * allowed to come in the SRS. We typically let 4x the configured
117 116 * B/W worth of packets come into the SRS (to prevent unnecessary
118 117 * drops due to bursts) but only process the specified amount.
119 118 *
120 119 * A MAC client (e.g. a VNIC or aggr) can have 1 or more
121 120 * Rx rings (and corresponding SRSs) assigned to it. The SRS
122 121 * in turn can have softrings to do protocol level fanout or
123 122 * softrings to do S/W based fanout or both. In case the NIC
124 123 * has no Rx rings, we do S/W classification to respective SRS.
125 124 * The S/W classification rule is always setup and ready. This
126 125 * allows the MAC layer to reassign Rx rings whenever needed
127 126 * but packets still continue to flow via the default path and
128 127 * getting S/W classified to correct SRS.
129 128 *
130 129 * The SRS's are used on both Tx and Rx side. They use the same
131 130 * data structure but the processing routines have slightly different
132 131 * semantics due to the fact that Rx side needs to do dynamic
133 132 * polling etc.
134 133 *
135 134 * Dynamic Polling Notes
136 135 * =====================
137 136 *
138 137 * Each Soft ring set is capable of switching its Rx ring between
139 138 * interrupt and poll mode and actively 'polls' for packets in
140 139 * poll mode. If the SRS is implementing a B/W limit, it makes
141 140 * sure that only Max allowed packets are pulled in poll mode
142 141 * and goes to poll mode as soon as B/W limit is exceeded. As
143 142 * such, there are no overheads to implement B/W limits.
144 143 *
145 144 * In poll mode, it's better to keep the pipeline going where the
146 145 * SRS worker thread keeps processing packets and poll thread
147 146 * keeps bringing more packets (especially if they get to run
148 147 * on different CPUs). This also prevents the overheads associated
149 148 * with excessive signalling (on NUMA machines, this can be
150 149 * pretty devastating). The exception is the latency optimized case
151 150 * where worker thread does no work and interrupt and poll thread
152 151 * are allowed to do their own drain.
153 152 *
154 153 * We use the following policy to control Dynamic Polling:
155 154 * 1) We switch to poll mode anytime the processing
156 155 * thread causes a backlog to build up in SRS and
157 156 * its associated Soft Rings (sr_poll_pkt_cnt > 0).
158 157 * 2) As long as the backlog stays under the low water
159 158 * mark (sr_lowat), we poll the H/W for more packets.
160 159 * 3) If the backlog (sr_poll_pkt_cnt) exceeds low
161 160 * water mark, we stay in poll mode but don't poll
162 161 * the H/W for more packets.
163 162 * 4) Anytime in polling mode, if we poll the H/W for
164 163 * packets and find nothing plus we have an existing
165 164 * backlog (sr_poll_pkt_cnt > 0), we stay in polling
166 165 * mode but don't poll the H/W for packets anymore
167 166 * (let the polling thread go to sleep).
168 167 * 5) Once the backlog is relieved (packets are processed)
169 168 * we reenable polling (by signalling the poll thread)
170 169 * only when the backlog dips below sr_poll_thres.
171 170 * 6) sr_hiwat is used exclusively when we are not
172 171 * polling capable and is used to decide when to
173 172 * drop packets so the SRS queue length doesn't grow
174 173 * infinitely.
175 174 *
176 175 * NOTE: Also see the block level comment on top of mac_soft_ring.c
177 176 */
178 177
179 178 /*
180 179 * mac_latency_optimize
181 180 *
182 181 * Controls whether the poll thread can process the packets inline
183 182 * or let the SRS worker thread do the processing. This applies if
184 183 * the SRS was not being processed. For latency sensitive traffic,
185 184 * this needs to be true to allow inline processing. For throughput
186 185 * under load, this should be false.
187 186 *
188 187 * This (and other similar) tunable should be rolled into a link
189 188 * or flow specific workload hint that can be set using dladm
190 189 * linkprop (instead of multiple such tunables).
191 190 */
192 191 boolean_t mac_latency_optimize = B_TRUE;
193 192
194 193 /*
195 194 * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
196 195 *
197 196 * queue a mp or chain in soft ring set and increment the
198 197 * local count (srs_count) for the SRS and the shared counter
199 198 * (srs_poll_pkt_cnt - shared between SRS and its soft rings
200 199 * to track the total unprocessed packets for polling to work
201 200 * correctly).
202 201 *
203 202 * The size (total bytes queued) counters are incremented only
204 203 * if we are doing B/W control.
205 204 */
206 205 #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
207 206 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
208 207 if ((mac_srs)->srs_last != NULL) \
209 208 (mac_srs)->srs_last->b_next = (head); \
210 209 else \
211 210 (mac_srs)->srs_first = (head); \
212 211 (mac_srs)->srs_last = (tail); \
213 212 (mac_srs)->srs_count += count; \
214 213 }
215 214
216 215 #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
217 216 mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \
218 217 \
219 218 MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
220 219 srs_rx->sr_poll_pkt_cnt += count; \
221 220 ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \
222 221 if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
223 222 (mac_srs)->srs_size += (sz); \
224 223 mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \
225 224 (mac_srs)->srs_bw->mac_bw_sz += (sz); \
226 225 mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \
227 226 } \
228 227 }
229 228
230 229 #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
231 230 mac_srs->srs_state |= SRS_ENQUEUED; \
232 231 MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
233 232 if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
234 233 (mac_srs)->srs_size += (sz); \
235 234 (mac_srs)->srs_bw->mac_bw_sz += (sz); \
236 235 } \
237 236 }
238 237
239 238 /*
240 239 * Turn polling on routines
241 240 */
242 241 #define MAC_SRS_POLLING_ON(mac_srs) { \
243 242 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
244 243 if (((mac_srs)->srs_state & \
245 244 (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \
246 245 (mac_srs)->srs_state |= SRS_POLLING; \
247 246 (void) mac_hwring_disable_intr((mac_ring_handle_t) \
248 247 (mac_srs)->srs_ring); \
249 248 (mac_srs)->srs_rx.sr_poll_on++; \
250 249 } \
251 250 }
252 251
253 252 #define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \
254 253 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
255 254 if (((mac_srs)->srs_state & \
256 255 (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \
257 256 (SRS_POLLING_CAPAB|SRS_WORKER)) { \
258 257 (mac_srs)->srs_state |= SRS_POLLING; \
259 258 (void) mac_hwring_disable_intr((mac_ring_handle_t) \
260 259 (mac_srs)->srs_ring); \
261 260 (mac_srs)->srs_rx.sr_worker_poll_on++; \
262 261 } \
263 262 }
264 263
265 264 /*
266 265 * MAC_SRS_POLL_RING
267 266 *
268 267 * Signal the SRS poll thread to poll the underlying H/W ring
269 268 * provided it wasn't already polling (SRS_GET_PKTS was set).
270 269 *
271 270 * Poll thread gets to run only from mac_rx_srs_drain() and only
272 271 * if the drain was being done by the worker thread.
273 272 */
274 273 #define MAC_SRS_POLL_RING(mac_srs) { \
275 274 mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \
276 275 \
277 276 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
278 277 srs_rx->sr_poll_thr_sig++; \
279 278 if (((mac_srs)->srs_state & \
280 279 (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \
281 280 (SRS_WORKER|SRS_POLLING_CAPAB)) { \
282 281 (mac_srs)->srs_state |= SRS_GET_PKTS; \
283 282 cv_signal(&(mac_srs)->srs_cv); \
284 283 } else { \
285 284 srs_rx->sr_poll_thr_busy++; \
286 285 } \
287 286 }
288 287
289 288 /*
290 289 * MAC_SRS_CHECK_BW_CONTROL
291 290 *
292 291 * Check to see if next tick has started so we can reset the
293 292 * SRS_BW_ENFORCED flag and allow more packets to come in the
294 293 * system.
295 294 */
296 295 #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \
297 296 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
298 297 ASSERT(((mac_srs)->srs_type & SRST_TX) || \
299 298 MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \
300 299 clock_t now = ddi_get_lbolt(); \
301 300 if ((mac_srs)->srs_bw->mac_bw_curr_time != now) { \
302 301 (mac_srs)->srs_bw->mac_bw_curr_time = now; \
303 302 (mac_srs)->srs_bw->mac_bw_used = 0; \
304 303 if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \
305 304 (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
306 305 } \
307 306 }
308 307
309 308 /*
310 309 * MAC_SRS_WORKER_WAKEUP
311 310 *
312 311 * Wake up the SRS worker thread to process the queue as long as
313 312 * no one else is processing the queue. If we are optimizing for
314 313 * latency, we wake up the worker thread immediately or else we
315 314 * wait mac_srs_worker_wakeup_ticks before worker thread gets
316 315 * woken up.
317 316 */
318 317 int mac_srs_worker_wakeup_ticks = 0;
319 318 #define MAC_SRS_WORKER_WAKEUP(mac_srs) { \
320 319 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
321 320 if (!((mac_srs)->srs_state & SRS_PROC) && \
322 321 (mac_srs)->srs_tid == NULL) { \
323 322 if (((mac_srs)->srs_state & SRS_LATENCY_OPT) || \
324 323 (mac_srs_worker_wakeup_ticks == 0)) \
325 324 cv_signal(&(mac_srs)->srs_async); \
326 325 else \
327 326 (mac_srs)->srs_tid = \
328 327 timeout(mac_srs_fire, (mac_srs), \
329 328 mac_srs_worker_wakeup_ticks); \
330 329 } \
331 330 }
332 331
333 332 #define TX_BANDWIDTH_MODE(mac_srs) \
334 333 ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \
335 334 (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT || \
336 335 (mac_srs)->srs_tx.st_mode == SRS_TX_BW_AGGR)
337 336
338 337 #define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \
339 338 if (tx_mode == SRS_TX_BW_FANOUT) \
340 339 (void) mac_tx_fanout_mode(mac_srs, head, hint, 0, NULL);\
341 340 else \
342 341 (void) mac_tx_aggr_mode(mac_srs, head, hint, 0, NULL); \
343 342 }
344 343
345 344 /*
346 345 * MAC_TX_SRS_BLOCK
347 346 *
348 347 * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
349 348 * will be set only if srs_tx_woken_up is FALSE. If
350 349 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
351 350 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
352 351 * attempt to transmit again and not setting SRS_TX_BLOCKED does
353 352 * that.
354 353 */
355 354 #define MAC_TX_SRS_BLOCK(srs, mp) { \
356 355 ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \
357 356 if ((srs)->srs_tx.st_woken_up) { \
358 357 (srs)->srs_tx.st_woken_up = B_FALSE; \
359 358 } else { \
360 359 ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \
361 360 (srs)->srs_state |= SRS_TX_BLOCKED; \
362 361 (srs)->srs_tx.st_stat.mts_blockcnt++; \
363 362 } \
364 363 }
365 364
366 365 /*
367 366 * MAC_TX_SRS_TEST_HIWAT
368 367 *
369 368 * Called before queueing a packet onto Tx SRS to test and set
370 369 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
371 370 */
372 371 #define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \
373 372 boolean_t enqueue = 1; \
374 373 \
375 374 if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \
376 375 /* \
377 376 * flow-controlled. Store srs in cookie so that it \
378 377 * can be returned as mac_tx_cookie_t to client \
379 378 */ \
380 379 (srs)->srs_state |= SRS_TX_HIWAT; \
381 380 cookie = (mac_tx_cookie_t)srs; \
382 381 (srs)->srs_tx.st_hiwat_cnt++; \
383 382 if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \
384 383 /* increment freed stats */ \
385 384 (srs)->srs_tx.st_stat.mts_sdrops += cnt; \
386 385 /* \
387 386 * b_prev may be set to the fanout hint \
388 387 * hence can't use freemsg directly \
389 388 */ \
390 389 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \
391 390 DTRACE_PROBE1(tx_queued_hiwat, \
392 391 mac_soft_ring_set_t *, srs); \
393 392 enqueue = 0; \
394 393 } \
395 394 } \
396 395 if (enqueue) \
397 396 MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \
398 397 }
399 398
400 399 /* Some utility macros */
401 400 #define MAC_SRS_BW_LOCK(srs) \
402 401 if (!(srs->srs_type & SRST_TX)) \
403 402 mutex_enter(&srs->srs_bw->mac_bw_lock);
404 403
405 404 #define MAC_SRS_BW_UNLOCK(srs) \
406 405 if (!(srs->srs_type & SRST_TX)) \
407 406 mutex_exit(&srs->srs_bw->mac_bw_lock);
408 407
409 408 #define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \
410 409 mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
411 410 /* increment freed stats */ \
412 411 mac_srs->srs_tx.st_stat.mts_sdrops++; \
413 412 cookie = (mac_tx_cookie_t)srs; \
414 413 }
415 414
416 415 #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \
417 416 mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \
418 417 cookie = (mac_tx_cookie_t)srs; \
419 418 *ret_mp = mp_chain; \
420 419 }
421 420
422 421 /*
423 422 * MAC_RX_SRS_TOODEEP
424 423 *
425 424 * Macro called as part of receive-side processing to determine if handling
426 425 * can occur in situ (in the interrupt thread) or if it should be left to a
427 426 * worker thread. Note that the constant used to make this determination is
428 427 * not entirely made-up, and is a result of some empirical validation. That
429 428 * said, the constant is left as a static variable to allow it to be
430 429 * dynamically tuned in the field if and as needed.
431 430 */
432 431 static uintptr_t mac_rx_srs_stack_needed = 10240;
433 432 static uint_t mac_rx_srs_stack_toodeep;
434 433
435 434 #ifndef STACK_GROWTH_DOWN
436 435 #error Downward stack growth assumed.
437 436 #endif
438 437
439 438 #define MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
440 439 (uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \
441 440 ++mac_rx_srs_stack_toodeep)
442 441
443 442
444 443 /*
445 444 * Drop the rx packet and advance to the next one in the chain.
446 445 */
447 446 static void
448 447 mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
449 448 {
450 449 mac_srs_rx_t *srs_rx = &srs->srs_rx;
451 450
452 451 ASSERT(mp->b_next == NULL);
453 452 mutex_enter(&srs->srs_lock);
454 453 MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
455 454 MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
456 455 mutex_exit(&srs->srs_lock);
457 456
458 457 srs_rx->sr_stat.mrs_sdrops++;
459 458 freemsg(mp);
460 459 }
461 460
462 461 /* DATAPATH RUNTIME ROUTINES */
463 462
464 463 /*
465 464 * mac_srs_fire
466 465 *
467 466 * Timer callback routine for waking up the SRS worker thread.
468 467 */
469 468 static void
470 469 mac_srs_fire(void *arg)
471 470 {
472 471 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
473 472
474 473 mutex_enter(&mac_srs->srs_lock);
475 474 if (mac_srs->srs_tid == 0) {
476 475 mutex_exit(&mac_srs->srs_lock);
477 476 return;
478 477 }
479 478
480 479 mac_srs->srs_tid = 0;
481 480 if (!(mac_srs->srs_state & SRS_PROC))
482 481 cv_signal(&mac_srs->srs_async);
483 482
484 483 mutex_exit(&mac_srs->srs_lock);
485 484 }
486 485
487 486 /*
488 487 * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack,
489 488 * and it is used on the TX path.
490 489 */
491 490 #define HASH_HINT(hint) \
492 491 ((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))
493 492
494 493
495 494 /*
496 495 * hash based on the src address and the port information.
497 496 */
498 497 #define HASH_ADDR(src, ports) \
499 498 (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
500 499 ((ports) >> 8) ^ (ports))
501 500
502 501 #define COMPUTE_INDEX(key, sz) (key % sz)
503 502
504 503 #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \
505 504 if ((tail) != NULL) { \
506 505 ASSERT((tail)->b_next == NULL); \
507 506 (tail)->b_next = (mp); \
508 507 } else { \
509 508 ASSERT((head) == NULL); \
510 509 (head) = (mp); \
511 510 } \
512 511 (tail) = (mp); \
513 512 (cnt)++; \
514 513 if ((bw_ctl)) \
515 514 (sz) += (sz0); \
516 515 }
517 516
518 517 #define MAC_FANOUT_DEFAULT 0
519 518 #define MAC_FANOUT_RND_ROBIN 1
520 519 int mac_fanout_type = MAC_FANOUT_DEFAULT;
521 520
522 521 #define MAX_SR_TYPES 3
|
↓ open down ↓ |
486 lines elided |
↑ open up ↑ |
523 522 /* fanout types for port based hashing */
524 523 enum pkt_type {
525 524 V4_TCP = 0,
526 525 V4_UDP,
527 526 OTH,
528 527 UNDEF
529 528 };
530 529
531 530 /*
532 531 * In general we do port based hashing to spread traffic over different
533 - * softrings. The below tunable allows to override that behavior. Setting it
534 - * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior
535 - * is also the applicable to ipv6 packets carrying multiple optional headers
536 - * and other uncommon packet types.
532 + * softrings. The below tunables allow overriding that behavior. Setting one
533 + * (depending on IPv6 or IPv4) to B_TRUE allows a fanout based on src
534 + * IPv6 or IPv4 address. This behavior is also applicable to IPv6 packets
535 + * carrying multiple optional headers and other uncommon packet types.
537 536 */
538 537 boolean_t mac_src_ipv6_fanout = B_FALSE;
538 +boolean_t mac_src_ipv4_fanout = B_FALSE;
539 539
540 540 /*
541 541 * Pair of local and remote ports in the transport header
542 542 */
543 543 #define PORTS_SIZE 4
544 544
545 545 /*
546 546 * mac_rx_srs_proto_fanout
547 547 *
548 548 * This routine delivers packets destined to an SRS into one of the
549 549 * protocol soft rings.
550 550 *
551 551 * Given a chain of packets we need to split it up into multiple sub chains
552 552 * destined into TCP, UDP or OTH soft ring. Instead of entering
553 553 * the soft ring one packet at a time, we want to enter it in the form of a
554 554 * chain otherwise we get this start/stop behaviour where the worker thread
555 555 * goes to sleep and then next packets comes in forcing it to wake up etc.
556 556 */
557 557 static void
558 558 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
559 559 {
560 560 struct ether_header *ehp;
561 561 struct ether_vlan_header *evhp;
562 562 uint32_t sap;
563 563 ipha_t *ipha;
564 564 uint8_t *dstaddr;
565 565 size_t hdrsize;
566 566 mblk_t *mp;
567 567 mblk_t *headmp[MAX_SR_TYPES];
568 568 mblk_t *tailmp[MAX_SR_TYPES];
569 569 int cnt[MAX_SR_TYPES];
570 570 size_t sz[MAX_SR_TYPES];
571 571 size_t sz1;
572 572 boolean_t bw_ctl;
573 573 boolean_t hw_classified;
574 574 boolean_t dls_bypass;
575 575 boolean_t is_ether;
576 576 boolean_t is_unicast;
577 577 enum pkt_type type;
578 578 mac_client_impl_t *mcip = mac_srs->srs_mcip;
579 579
580 580 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
581 581 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
582 582
583 583 /*
584 584 * If we don't have a Rx ring, S/W classification would have done
585 585 * its job and its a packet meant for us. If we were polling on
586 586 * the default ring (i.e. there was a ring assigned to this SRS),
587 587 * then we need to make sure that the mac address really belongs
588 588 * to us.
589 589 */
590 590 hw_classified = mac_srs->srs_ring != NULL &&
591 591 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
592 592
593 593 /*
594 594 * Special clients (eg. VLAN, non ether, etc) need DLS
595 595 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
596 596 * such SRSs. Another way of disabling bypass is to set the
597 597 * MCIS_RX_BYPASS_DISABLE flag.
598 598 */
599 599 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
600 600 ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
601 601
602 602 bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
603 603 bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
604 604 bzero(cnt, MAX_SR_TYPES * sizeof (int));
605 605 bzero(sz, MAX_SR_TYPES * sizeof (size_t));
606 606
607 607 /*
608 608 * We got a chain from SRS that we need to send to the soft rings.
609 609 * Since squeues for TCP & IPv4 sap poll their soft rings (for
610 610 * performance reasons), we need to separate out v4_tcp, v4_udp
611 611 * and the rest goes in other.
612 612 */
613 613 while (head != NULL) {
614 614 mp = head;
615 615 head = head->b_next;
616 616 mp->b_next = NULL;
617 617
618 618 type = OTH;
619 619 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
620 620
621 621 if (is_ether) {
622 622 /*
623 623 * At this point we can be sure the packet at least
624 624 * has an ether header.
625 625 */
626 626 if (sz1 < sizeof (struct ether_header)) {
627 627 mac_rx_drop_pkt(mac_srs, mp);
628 628 continue;
629 629 }
630 630 ehp = (struct ether_header *)mp->b_rptr;
631 631
632 632 /*
633 633 * Determine if this is a VLAN or non-VLAN packet.
634 634 */
635 635 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
636 636 evhp = (struct ether_vlan_header *)mp->b_rptr;
637 637 sap = ntohs(evhp->ether_type);
638 638 hdrsize = sizeof (struct ether_vlan_header);
639 639 /*
640 640 * Check if the VID of the packet, if any,
641 641 * belongs to this client.
642 642 */
643 643 if (!mac_client_check_flow_vid(mcip,
644 644 VLAN_ID(ntohs(evhp->ether_tci)))) {
645 645 mac_rx_drop_pkt(mac_srs, mp);
646 646 continue;
647 647 }
648 648 } else {
649 649 hdrsize = sizeof (struct ether_header);
650 650 }
651 651 is_unicast =
652 652 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
653 653 dstaddr = (uint8_t *)&ehp->ether_dhost;
654 654 } else {
655 655 mac_header_info_t mhi;
656 656
657 657 if (mac_header_info((mac_handle_t)mcip->mci_mip,
658 658 mp, &mhi) != 0) {
659 659 mac_rx_drop_pkt(mac_srs, mp);
660 660 continue;
661 661 }
662 662 hdrsize = mhi.mhi_hdrsize;
663 663 sap = mhi.mhi_bindsap;
664 664 is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
665 665 dstaddr = (uint8_t *)mhi.mhi_daddr;
666 666 }
667 667
668 668 if (!dls_bypass) {
669 669 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
670 670 cnt[type], bw_ctl, sz[type], sz1, mp);
671 671 continue;
672 672 }
673 673
674 674 if (sap == ETHERTYPE_IP) {
675 675 /*
676 676 * If we are H/W classified, but we have promisc
677 677 * on, then we need to check for the unicast address.
678 678 */
679 679 if (hw_classified && mcip->mci_promisc_list != NULL) {
680 680 mac_address_t *map;
681 681
682 682 rw_enter(&mcip->mci_rw_lock, RW_READER);
683 683 map = mcip->mci_unicast;
684 684 if (bcmp(dstaddr, map->ma_addr,
685 685 map->ma_len) == 0)
686 686 type = UNDEF;
687 687 rw_exit(&mcip->mci_rw_lock);
688 688 } else if (is_unicast) {
689 689 type = UNDEF;
690 690 }
691 691 }
692 692
693 693 /*
694 694 * This needs to become a contract with the driver for
695 695 * the fast path.
696 696 *
697 697 * In the normal case the packet will have at least the L2
698 698 * header and the IP + Transport header in the same mblk.
699 699 * This is usually the case when the NIC driver sends up
700 700 * the packet. This is also true when the stack generates
701 701 * a packet that is looped back and when the stack uses the
702 702 * fastpath mechanism. The normal case is optimized for
703 703 * performance and may bypass DLS. All other cases go through
704 704 * the 'OTH' type path without DLS bypass.
705 705 */
706 706
707 707 ipha = (ipha_t *)(mp->b_rptr + hdrsize);
708 708 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
709 709 type = OTH;
710 710
711 711 if (type == OTH) {
712 712 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
713 713 cnt[type], bw_ctl, sz[type], sz1, mp);
714 714 continue;
715 715 }
716 716
717 717 ASSERT(type == UNDEF);
718 718 /*
719 719 * We look for at least 4 bytes past the IP header to get
720 720 * the port information. If we get an IP fragment, we don't
721 721 * have the port information, and we use just the protocol
722 722 * information.
723 723 */
724 724 switch (ipha->ipha_protocol) {
725 725 case IPPROTO_TCP:
726 726 type = V4_TCP;
727 727 mp->b_rptr += hdrsize;
728 728 break;
729 729 case IPPROTO_UDP:
730 730 type = V4_UDP;
731 731 mp->b_rptr += hdrsize;
732 732 break;
733 733 default:
734 734 type = OTH;
735 735 break;
736 736 }
737 737
738 738 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
739 739 bw_ctl, sz[type], sz1, mp);
740 740 }
741 741
742 742 for (type = V4_TCP; type < UNDEF; type++) {
743 743 if (headmp[type] != NULL) {
744 744 mac_soft_ring_t *softring;
745 745
746 746 ASSERT(tailmp[type]->b_next == NULL);
747 747 switch (type) {
748 748 case V4_TCP:
749 749 softring = mac_srs->srs_tcp_soft_rings[0];
750 750 break;
751 751 case V4_UDP:
752 752 softring = mac_srs->srs_udp_soft_rings[0];
|
↓ open down ↓ |
204 lines elided |
↑ open up ↑ |
753 753 break;
754 754 case OTH:
755 755 softring = mac_srs->srs_oth_soft_rings[0];
756 756 }
757 757 mac_rx_soft_ring_process(mcip, softring,
758 758 headmp[type], tailmp[type], cnt[type], sz[type]);
759 759 }
760 760 }
761 761 }
762 762
763 -int fanout_unalligned = 0;
763 +int fanout_unaligned = 0;
764 764
765 765 /*
766 766 * mac_rx_srs_long_fanout
767 767 *
768 - * The fanout routine for IPv6
768 + * The fanout routine for VLANs, and for anything else that isn't performing
769 + * explicit dls bypass. Returns -1 on an error (drop the packet due to a
770 + * malformed packet), 0 on success, with values written in *indx and *type.
769 771 */
770 772 static int
771 773 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
772 774 uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
773 775 {
774 776 ip6_t *ip6h;
777 + ipha_t *ipha;
775 778 uint8_t *whereptr;
776 779 uint_t hash;
777 780 uint16_t remlen;
778 781 uint8_t nexthdr;
779 782 uint16_t hdr_len;
783 + uint32_t src_val;
784 + boolean_t modifiable = B_TRUE;
785 + boolean_t v6;
780 786
787 + ASSERT(MBLKL(mp) >= hdrsize);
788 +
781 789 if (sap == ETHERTYPE_IPV6) {
782 - boolean_t modifiable = B_TRUE;
790 + v6 = B_TRUE;
791 + hdr_len = IPV6_HDR_LEN;
792 + } else if (sap == ETHERTYPE_IP) {
793 + v6 = B_FALSE;
794 + hdr_len = IP_SIMPLE_HDR_LENGTH;
795 + } else {
796 + *indx = 0;
797 + *type = OTH;
798 + return (0);
799 + }
783 800
784 - ASSERT(MBLKL(mp) >= hdrsize);
801 + ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
802 + ipha = (ipha_t *)ip6h;
785 803
786 - ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
787 - if ((unsigned char *)ip6h == mp->b_wptr) {
788 - /*
789 - * The first mblk_t only includes the mac header.
790 - * Note that it is safe to change the mp pointer here,
791 - * as the subsequent operation does not assume mp
792 - * points to the start of the mac header.
793 - */
794 - mp = mp->b_cont;
804 + if ((uint8_t *)ip6h == mp->b_wptr) {
805 + /*
806 + * The first mblk_t only includes the mac header.
807 + * Note that it is safe to change the mp pointer here,
808 + * as the subsequent operation does not assume mp
809 + * points to the start of the mac header.
810 + */
811 + mp = mp->b_cont;
795 812
796 - /*
797 - * Make sure ip6h holds the full ip6_t structure.
798 - */
799 - if (mp == NULL)
800 - return (-1);
813 + /*
814 + * Make sure the mblk holds an entire (simple) IP header.
815 + */
816 + if (mp == NULL)
817 + return (-1);
801 818
802 - if (MBLKL(mp) < IPV6_HDR_LEN) {
803 - modifiable = (DB_REF(mp) == 1);
819 + if (MBLKL(mp) < hdr_len) {
820 + modifiable = (DB_REF(mp) == 1);
804 821
805 - if (modifiable &&
806 - !pullupmsg(mp, IPV6_HDR_LEN)) {
807 - return (-1);
808 - }
809 - }
810 -
811 - ip6h = (ip6_t *)mp->b_rptr;
822 + if (modifiable && !pullupmsg(mp, hdr_len))
823 + return (-1);
812 824 }
813 825
814 - if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
815 - ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
816 - /*
817 - * If either ip6h is not alligned, or ip6h does not
818 - * hold the complete ip6_t structure (a pullupmsg()
819 - * is not an option since it would result in an
820 - * unalligned ip6h), fanout to the default ring. Note
821 - * that this may cause packets reordering.
822 - */
823 - *indx = 0;
824 - *type = OTH;
825 - fanout_unalligned++;
826 - return (0);
827 - }
826 + ip6h = (ip6_t *)mp->b_rptr;
827 + ipha = (ipha_t *)ip6h;
828 + }
828 829
830 + if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
831 + ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
832 + /*
833 + * If either the IP header is not aligned, or it does not hold
834 + * the complete simple structure (a pullupmsg() is not an
835 + * option since it would result in an unaligned IP header),
836 + * fanout to the default ring.
837 + *
838 + * Note that this may cause packet reordering.
839 + */
840 + *indx = 0;
841 + *type = OTH;
842 + fanout_unaligned++;
843 + return (0);
844 + }
845 +
846 + /*
847 + * Extract next-header, full header length, and source-hash value
848 + * using v4/v6 specific fields.
849 + */
850 + if (v6) {
829 851 remlen = ntohs(ip6h->ip6_plen);
830 852 nexthdr = ip6h->ip6_nxt;
831 -
832 - if (remlen < MIN_EHDR_LEN)
833 - return (-1);
853 + src_val = V4_PART_OF_V6(ip6h->ip6_src);
834 854 /*
835 855 * Do src based fanout if below tunable is set to B_TRUE or
836 856 * when mac_ip_hdr_length_v6() fails because of malformed
837 - * packets or because mblk's need to be concatenated using
857 + * packets or because mblks need to be concatenated using
838 858 * pullupmsg().
839 859 */
840 860 if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
841 861 mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
842 862 goto src_based_fanout;
843 863 }
844 - whereptr = (uint8_t *)ip6h + hdr_len;
845 -
846 - /* If the transport is one of below, we do port based fanout */
847 - switch (nexthdr) {
848 - case IPPROTO_TCP:
849 - case IPPROTO_UDP:
850 - case IPPROTO_SCTP:
851 - case IPPROTO_ESP:
852 - /*
853 - * If the ports in the transport header is not part of
854 - * the mblk, do src_based_fanout, instead of calling
855 - * pullupmsg().
856 - */
857 - if (mp->b_cont != NULL &&
858 - whereptr + PORTS_SIZE > mp->b_wptr) {
859 - goto src_based_fanout;
860 - }
861 - break;
862 - default:
863 - break;
864 + } else {
865 + hdr_len = IPH_HDR_LENGTH(ipha);
866 + remlen = ntohs(ipha->ipha_length) - hdr_len;
867 + nexthdr = ipha->ipha_protocol;
868 + src_val = (uint32_t)ipha->ipha_src;
869 + /*
870 + * Catch IPv4 fragment case here. IPv6 has nexthdr == FRAG
871 + * for its equivalent case.
872 + */
873 + if (mac_src_ipv4_fanout ||
874 + (ntohs(ipha->ipha_fragment_offset_and_flags) &
875 + (IPH_MF | IPH_OFFSET)) != 0) {
876 + goto src_based_fanout;
864 877 }
878 + }
879 + if (remlen < MIN_EHDR_LEN)
880 + return (-1);
881 + whereptr = (uint8_t *)ip6h + hdr_len;
865 882
866 - switch (nexthdr) {
867 - case IPPROTO_TCP:
868 - hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
869 - *(uint32_t *)whereptr);
870 - *indx = COMPUTE_INDEX(hash,
871 - mac_srs->srs_tcp_ring_count);
872 - *type = OTH;
873 - break;
883 + /* If the transport is one of below, we do port/SPI based fanout */
884 + switch (nexthdr) {
885 + case IPPROTO_TCP:
886 + case IPPROTO_UDP:
887 + case IPPROTO_SCTP:
888 + case IPPROTO_ESP:
889 + /*
890 + * If the ports or SPI in the transport header is not part of
891 + * the mblk, do src_based_fanout, instead of calling
892 + * pullupmsg().
893 + */
894 + if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr)
895 + break; /* out of switch... */
896 + /* FALLTHRU */
897 + default:
898 + goto src_based_fanout;
899 + }
874 900
875 - case IPPROTO_UDP:
876 - case IPPROTO_SCTP:
877 - case IPPROTO_ESP:
878 - if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
879 - hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
880 - *(uint32_t *)whereptr);
881 - *indx = COMPUTE_INDEX(hash,
882 - mac_srs->srs_udp_ring_count);
883 - } else {
884 - *indx = mac_srs->srs_ind %
885 - mac_srs->srs_udp_ring_count;
886 - mac_srs->srs_ind++;
887 - }
888 - *type = OTH;
889 - break;
890 -
891 - /* For all other protocol, do source based fanout */
892 - default:
893 - goto src_based_fanout;
901 + switch (nexthdr) {
902 + case IPPROTO_TCP:
903 + hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
904 + *indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
905 + *type = OTH;
906 + break;
907 + case IPPROTO_UDP:
908 + case IPPROTO_SCTP:
909 + case IPPROTO_ESP:
910 + if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
911 + hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
912 + *indx = COMPUTE_INDEX(hash,
913 + mac_srs->srs_udp_ring_count);
914 + } else {
915 + *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
916 + mac_srs->srs_ind++;
894 917 }
895 - } else {
896 - *indx = 0;
897 918 *type = OTH;
919 + break;
898 920 }
899 921 return (0);
900 922
901 923 src_based_fanout:
902 - hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
924 + hash = HASH_ADDR(src_val, (uint32_t)0);
903 925 *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
904 926 *type = OTH;
905 927 return (0);
906 928 }
907 929
908 930 /*
909 931 * mac_rx_srs_fanout
910 932 *
911 933 * This routine delivers packets destined to an SRS into a soft ring member
912 934 * of the set.
913 935 *
914 936 * Given a chain of packets we need to split it up into multiple sub chains
915 937 * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
916 938 * the soft ring one packet at a time, we want to enter it in the form of a
917 939 * chain otherwise we get this start/stop behaviour where the worker thread
918 940 * goes to sleep and then next packets comes in forcing it to wake up etc.
919 941 *
920 942 * Note:
921 943 * Since we know what is the maximum fanout possible, we create a 2D array
922 944 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
923 945 * variables so that we can enter the softrings with chain. We need the
924 946 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
925 947 * for each packet would be expensive). If we ever want to have the
926 948 * ability to have unlimited fanout, we should probably declare a head,
927 949 * tail, cnt, sz with each soft ring (a data struct which contains a softring
928 950 * along with these members) and create an array of this uber struct so we
929 951 * don't have to do kmem_alloc.
930 952 */
931 953 int fanout_oth1 = 0;
932 954 int fanout_oth2 = 0;
933 955 int fanout_oth3 = 0;
934 956 int fanout_oth4 = 0;
935 957 int fanout_oth5 = 0;
936 958
937 959 static void
938 960 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
939 961 {
940 962 struct ether_header *ehp;
941 963 struct ether_vlan_header *evhp;
942 964 uint32_t sap;
943 965 ipha_t *ipha;
944 966 uint8_t *dstaddr;
945 967 uint_t indx;
946 968 size_t ports_offset;
947 969 size_t ipha_len;
948 970 size_t hdrsize;
949 971 uint_t hash;
950 972 mblk_t *mp;
951 973 mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
952 974 mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
953 975 int cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
954 976 size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT];
955 977 size_t sz1;
956 978 boolean_t bw_ctl;
957 979 boolean_t hw_classified;
958 980 boolean_t dls_bypass;
959 981 boolean_t is_ether;
960 982 boolean_t is_unicast;
961 983 int fanout_cnt;
962 984 enum pkt_type type;
963 985 mac_client_impl_t *mcip = mac_srs->srs_mcip;
964 986
965 987 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
966 988 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
967 989
968 990 /*
969 991 * If we don't have a Rx ring, S/W classification would have done
970 992 * its job and its a packet meant for us. If we were polling on
971 993 * the default ring (i.e. there was a ring assigned to this SRS),
972 994 * then we need to make sure that the mac address really belongs
973 995 * to us.
974 996 */
975 997 hw_classified = mac_srs->srs_ring != NULL &&
976 998 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
977 999
978 1000 /*
979 1001 * Special clients (eg. VLAN, non ether, etc) need DLS
980 1002 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
981 1003 * such SRSs. Another way of disabling bypass is to set the
982 1004 * MCIS_RX_BYPASS_DISABLE flag.
983 1005 */
984 1006 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
985 1007 ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
986 1008
987 1009 /*
988 1010 * Since the softrings are never destroyed and we always
989 1011 * create equal number of softrings for TCP, UDP and rest,
990 1012 * its OK to check one of them for count and use it without
991 1013 * any lock. In future, if soft rings get destroyed because
992 1014 * of reduction in fanout, we will need to ensure that happens
993 1015 * behind the SRS_PROC.
994 1016 */
995 1017 fanout_cnt = mac_srs->srs_tcp_ring_count;
996 1018
997 1019 bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
998 1020 bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
999 1021 bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
1000 1022 bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
1001 1023
1002 1024 /*
1003 1025 * We got a chain from SRS that we need to send to the soft rings.
1004 1026 * Since squeues for TCP & IPv4 sap poll their soft rings (for
1005 1027 * performance reasons), we need to separate out v4_tcp, v4_udp
1006 1028 * and the rest goes in other.
1007 1029 */
1008 1030 while (head != NULL) {
1009 1031 mp = head;
1010 1032 head = head->b_next;
1011 1033 mp->b_next = NULL;
1012 1034
1013 1035 type = OTH;
1014 1036 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1015 1037
1016 1038 if (is_ether) {
1017 1039 /*
1018 1040 * At this point we can be sure the packet at least
1019 1041 * has an ether header.
1020 1042 */
1021 1043 if (sz1 < sizeof (struct ether_header)) {
1022 1044 mac_rx_drop_pkt(mac_srs, mp);
1023 1045 continue;
1024 1046 }
1025 1047 ehp = (struct ether_header *)mp->b_rptr;
1026 1048
1027 1049 /*
1028 1050 * Determine if this is a VLAN or non-VLAN packet.
1029 1051 */
1030 1052 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1031 1053 evhp = (struct ether_vlan_header *)mp->b_rptr;
1032 1054 sap = ntohs(evhp->ether_type);
1033 1055 hdrsize = sizeof (struct ether_vlan_header);
1034 1056 /*
1035 1057 * Check if the VID of the packet, if any,
1036 1058 * belongs to this client.
1037 1059 */
1038 1060 if (!mac_client_check_flow_vid(mcip,
1039 1061 VLAN_ID(ntohs(evhp->ether_tci)))) {
1040 1062 mac_rx_drop_pkt(mac_srs, mp);
1041 1063 continue;
1042 1064 }
1043 1065 } else {
1044 1066 hdrsize = sizeof (struct ether_header);
1045 1067 }
1046 1068 is_unicast =
1047 1069 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
1048 1070 dstaddr = (uint8_t *)&ehp->ether_dhost;
1049 1071 } else {
1050 1072 mac_header_info_t mhi;
1051 1073
1052 1074 if (mac_header_info((mac_handle_t)mcip->mci_mip,
1053 1075 mp, &mhi) != 0) {
1054 1076 mac_rx_drop_pkt(mac_srs, mp);
1055 1077 continue;
1056 1078 }
1057 1079 hdrsize = mhi.mhi_hdrsize;
1058 1080 sap = mhi.mhi_bindsap;
1059 1081 is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
1060 1082 dstaddr = (uint8_t *)mhi.mhi_daddr;
1061 1083 }
1062 1084
1063 1085 if (!dls_bypass) {
1064 1086 if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1065 1087 hdrsize, &type, &indx) == -1) {
1066 1088 mac_rx_drop_pkt(mac_srs, mp);
1067 1089 continue;
1068 1090 }
1069 1091
1070 1092 FANOUT_ENQUEUE_MP(headmp[type][indx],
1071 1093 tailmp[type][indx], cnt[type][indx], bw_ctl,
1072 1094 sz[type][indx], sz1, mp);
1073 1095 continue;
1074 1096 }
1075 1097
1076 1098
1077 1099 /*
1078 1100 * If we are using the default Rx ring where H/W or S/W
1079 1101 * classification has not happened, we need to verify if
1080 1102 * this unicast packet really belongs to us.
1081 1103 */
1082 1104 if (sap == ETHERTYPE_IP) {
1083 1105 /*
1084 1106 * If we are H/W classified, but we have promisc
1085 1107 * on, then we need to check for the unicast address.
1086 1108 */
1087 1109 if (hw_classified && mcip->mci_promisc_list != NULL) {
1088 1110 mac_address_t *map;
1089 1111
1090 1112 rw_enter(&mcip->mci_rw_lock, RW_READER);
1091 1113 map = mcip->mci_unicast;
1092 1114 if (bcmp(dstaddr, map->ma_addr,
1093 1115 map->ma_len) == 0)
1094 1116 type = UNDEF;
1095 1117 rw_exit(&mcip->mci_rw_lock);
1096 1118 } else if (is_unicast) {
1097 1119 type = UNDEF;
1098 1120 }
1099 1121 }
1100 1122
1101 1123 /*
1102 1124 * This needs to become a contract with the driver for
1103 1125 * the fast path.
1104 1126 */
1105 1127
1106 1128 ipha = (ipha_t *)(mp->b_rptr + hdrsize);
1107 1129 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
1108 1130 type = OTH;
1109 1131 fanout_oth1++;
1110 1132 }
1111 1133
1112 1134 if (type != OTH) {
1113 1135 uint16_t frag_offset_flags;
1114 1136
1115 1137 switch (ipha->ipha_protocol) {
1116 1138 case IPPROTO_TCP:
1117 1139 case IPPROTO_UDP:
1118 1140 case IPPROTO_SCTP:
1119 1141 case IPPROTO_ESP:
1120 1142 ipha_len = IPH_HDR_LENGTH(ipha);
1121 1143 if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
1122 1144 mp->b_wptr) {
1123 1145 type = OTH;
1124 1146 break;
1125 1147 }
1126 1148 frag_offset_flags =
1127 1149 ntohs(ipha->ipha_fragment_offset_and_flags);
1128 1150 if ((frag_offset_flags &
1129 1151 (IPH_MF | IPH_OFFSET)) != 0) {
1130 1152 type = OTH;
1131 1153 fanout_oth3++;
1132 1154 break;
1133 1155 }
1134 1156 ports_offset = hdrsize + ipha_len;
1135 1157 break;
1136 1158 default:
1137 1159 type = OTH;
1138 1160 fanout_oth4++;
1139 1161 break;
1140 1162 }
1141 1163 }
1142 1164
1143 1165 if (type == OTH) {
1144 1166 if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1145 1167 hdrsize, &type, &indx) == -1) {
1146 1168 mac_rx_drop_pkt(mac_srs, mp);
1147 1169 continue;
1148 1170 }
1149 1171
1150 1172 FANOUT_ENQUEUE_MP(headmp[type][indx],
1151 1173 tailmp[type][indx], cnt[type][indx], bw_ctl,
1152 1174 sz[type][indx], sz1, mp);
1153 1175 continue;
1154 1176 }
1155 1177
1156 1178 ASSERT(type == UNDEF);
1157 1179
1158 1180 /*
1159 1181 * XXX-Sunay: We should hold srs_lock since ring_count
1160 1182 * below can change. But if we are always called from
1161 1183 * mac_rx_srs_drain and SRS_PROC is set, then we can
1162 1184 * enforce that ring_count can't be changed i.e.
1163 1185 * to change fanout type or ring count, the calling
1164 1186 * thread needs to be behind SRS_PROC.
1165 1187 */
1166 1188 switch (ipha->ipha_protocol) {
1167 1189 case IPPROTO_TCP:
1168 1190 /*
1169 1191 * Note that for ESP, we fanout on SPI and it is at the
1170 1192 * same offset as the 2x16-bit ports. So it is clumped
1171 1193 * along with TCP, UDP and SCTP.
1172 1194 */
1173 1195 hash = HASH_ADDR(ipha->ipha_src,
1174 1196 *(uint32_t *)(mp->b_rptr + ports_offset));
1175 1197 indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
1176 1198 type = V4_TCP;
1177 1199 mp->b_rptr += hdrsize;
1178 1200 break;
1179 1201 case IPPROTO_UDP:
1180 1202 case IPPROTO_SCTP:
1181 1203 case IPPROTO_ESP:
1182 1204 if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
1183 1205 hash = HASH_ADDR(ipha->ipha_src,
1184 1206 *(uint32_t *)(mp->b_rptr + ports_offset));
1185 1207 indx = COMPUTE_INDEX(hash,
1186 1208 mac_srs->srs_udp_ring_count);
1187 1209 } else {
1188 1210 indx = mac_srs->srs_ind %
1189 1211 mac_srs->srs_udp_ring_count;
1190 1212 mac_srs->srs_ind++;
1191 1213 }
1192 1214 type = V4_UDP;
1193 1215 mp->b_rptr += hdrsize;
1194 1216 break;
1195 1217 default:
1196 1218 indx = 0;
1197 1219 type = OTH;
1198 1220 }
1199 1221
1200 1222 FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
1201 1223 cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
1202 1224 }
1203 1225
1204 1226 for (type = V4_TCP; type < UNDEF; type++) {
1205 1227 int i;
1206 1228
1207 1229 for (i = 0; i < fanout_cnt; i++) {
1208 1230 if (headmp[type][i] != NULL) {
1209 1231 mac_soft_ring_t *softring;
1210 1232
1211 1233 ASSERT(tailmp[type][i]->b_next == NULL);
1212 1234 switch (type) {
1213 1235 case V4_TCP:
1214 1236 softring =
1215 1237 mac_srs->srs_tcp_soft_rings[i];
1216 1238 break;
1217 1239 case V4_UDP:
1218 1240 softring =
1219 1241 mac_srs->srs_udp_soft_rings[i];
1220 1242 break;
1221 1243 case OTH:
1222 1244 softring =
1223 1245 mac_srs->srs_oth_soft_rings[i];
1224 1246 break;
1225 1247 }
1226 1248 mac_rx_soft_ring_process(mcip,
1227 1249 softring, headmp[type][i], tailmp[type][i],
1228 1250 cnt[type][i], sz[type][i]);
1229 1251 }
1230 1252 }
1231 1253 }
1232 1254 }
1233 1255
1234 1256 #define SRS_BYTES_TO_PICKUP 150000
1235 1257 ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;
1236 1258
1237 1259 /*
1238 1260 * mac_rx_srs_poll_ring
1239 1261 *
1240 1262 * This SRS Poll thread uses this routine to poll the underlying hardware
1241 1263 * Rx ring to get a chain of packets. It can inline process that chain
1242 1264 * if mac_latency_optimize is set (default) or signal the SRS worker thread
1243 1265 * to do the remaining processing.
1244 1266 *
1245 1267 * Since packets come in the system via interrupt or poll path, we also
1246 1268 * update the stats and deal with promiscous clients here.
1247 1269 */
1248 1270 void
1249 1271 mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
1250 1272 {
1251 1273 kmutex_t *lock = &mac_srs->srs_lock;
1252 1274 kcondvar_t *async = &mac_srs->srs_cv;
1253 1275 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
1254 1276 mblk_t *head, *tail, *mp;
1255 1277 callb_cpr_t cprinfo;
1256 1278 ssize_t bytes_to_pickup;
1257 1279 size_t sz;
1258 1280 int count;
1259 1281 mac_client_impl_t *smcip;
1260 1282
1261 1283 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
1262 1284 mutex_enter(lock);
1263 1285
1264 1286 start:
1265 1287 for (;;) {
1266 1288 if (mac_srs->srs_state & SRS_PAUSE)
1267 1289 goto done;
1268 1290
1269 1291 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1270 1292 cv_wait(async, lock);
1271 1293 CALLB_CPR_SAFE_END(&cprinfo, lock);
1272 1294
1273 1295 if (mac_srs->srs_state & SRS_PAUSE)
1274 1296 goto done;
1275 1297
1276 1298 check_again:
1277 1299 if (mac_srs->srs_type & SRST_BW_CONTROL) {
1278 1300 /*
1279 1301 * We pick as many bytes as we are allowed to queue.
1280 1302 * Its possible that we will exceed the total
1281 1303 * packets queued in case this SRS is part of the
1282 1304 * Rx ring group since > 1 poll thread can be pulling
1283 1305 * upto the max allowed packets at the same time
1284 1306 * but that should be OK.
1285 1307 */
1286 1308 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1287 1309 bytes_to_pickup =
1288 1310 mac_srs->srs_bw->mac_bw_drop_threshold -
1289 1311 mac_srs->srs_bw->mac_bw_sz;
1290 1312 /*
1291 1313 * We shouldn't have been signalled if we
1292 1314 * have 0 or less bytes to pick but since
1293 1315 * some of the bytes accounting is driver
1294 1316 * dependant, we do the safety check.
1295 1317 */
1296 1318 if (bytes_to_pickup < 0)
1297 1319 bytes_to_pickup = 0;
1298 1320 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1299 1321 } else {
1300 1322 /*
1301 1323 * ToDO: Need to change the polling API
1302 1324 * to add a packet count and a flag which
1303 1325 * tells the driver whether we want packets
1304 1326 * based on a count, or bytes, or all the
1305 1327 * packets queued in the driver/HW. This
1306 1328 * way, we never have to check the limits
1307 1329 * on poll path. We truly let only as many
1308 1330 * packets enter the system as we are willing
1309 1331 * to process or queue.
1310 1332 *
1311 1333 * Something along the lines of
1312 1334 * pkts_to_pickup = mac_soft_ring_max_q_cnt -
1313 1335 * mac_srs->srs_poll_pkt_cnt
1314 1336 */
1315 1337
1316 1338 /*
1317 1339 * Since we are not doing B/W control, pick
1318 1340 * as many packets as allowed.
1319 1341 */
1320 1342 bytes_to_pickup = max_bytes_to_pickup;
1321 1343 }
1322 1344
1323 1345 /* Poll the underlying Hardware */
1324 1346 mutex_exit(lock);
1325 1347 head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
1326 1348 mutex_enter(lock);
1327 1349
1328 1350 ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
1329 1351 SRS_POLL_THR_OWNER);
1330 1352
1331 1353 mp = tail = head;
1332 1354 count = 0;
1333 1355 sz = 0;
1334 1356 while (mp != NULL) {
1335 1357 tail = mp;
1336 1358 sz += msgdsize(mp);
1337 1359 mp = mp->b_next;
1338 1360 count++;
1339 1361 }
1340 1362
1341 1363 if (head != NULL) {
1342 1364 tail->b_next = NULL;
1343 1365 smcip = mac_srs->srs_mcip;
1344 1366
1345 1367 SRS_RX_STAT_UPDATE(mac_srs, pollbytes, sz);
1346 1368 SRS_RX_STAT_UPDATE(mac_srs, pollcnt, count);
1347 1369
1348 1370 /*
1349 1371 * If there are any promiscuous mode callbacks
1350 1372 * defined for this MAC client, pass them a copy
1351 1373 * if appropriate and also update the counters.
1352 1374 */
1353 1375 if (smcip != NULL) {
1354 1376 if (smcip->mci_mip->mi_promisc_list != NULL) {
1355 1377 mutex_exit(lock);
1356 1378 mac_promisc_dispatch(smcip->mci_mip,
1357 1379 head, NULL);
1358 1380 mutex_enter(lock);
1359 1381 }
1360 1382 }
1361 1383 if (mac_srs->srs_type & SRST_BW_CONTROL) {
1362 1384 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1363 1385 mac_srs->srs_bw->mac_bw_polled += sz;
1364 1386 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1365 1387 }
1366 1388 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
1367 1389 count, sz);
1368 1390 if (count <= 10)
1369 1391 srs_rx->sr_stat.mrs_chaincntundr10++;
1370 1392 else if (count > 10 && count <= 50)
1371 1393 srs_rx->sr_stat.mrs_chaincnt10to50++;
1372 1394 else
1373 1395 srs_rx->sr_stat.mrs_chaincntover50++;
1374 1396 }
1375 1397
1376 1398 /*
1377 1399 * We are guaranteed that SRS_PROC will be set if we
1378 1400 * are here. Also, poll thread gets to run only if
1379 1401 * the drain was being done by a worker thread although
1380 1402 * its possible that worker thread is still running
1381 1403 * and poll thread was sent down to keep the pipeline
1382 1404 * going instead of doing a complete drain and then
1383 1405 * trying to poll the NIC.
1384 1406 *
1385 1407 * So we need to check SRS_WORKER flag to make sure
1386 1408 * that the worker thread is not processing the queue
1387 1409 * in parallel to us. The flags and conditions are
1388 1410 * protected by the srs_lock to prevent any race. We
1389 1411 * ensure that we don't drop the srs_lock from now
1390 1412 * till the end and similarly we don't drop the srs_lock
1391 1413 * in mac_rx_srs_drain() till similar condition check
1392 1414 * are complete. The mac_rx_srs_drain() needs to ensure
1393 1415 * that SRS_WORKER flag remains set as long as its
1394 1416 * processing the queue.
1395 1417 */
1396 1418 if (!(mac_srs->srs_state & SRS_WORKER) &&
1397 1419 (mac_srs->srs_first != NULL)) {
1398 1420 /*
1399 1421 * We have packets to process and worker thread
1400 1422 * is not running. Check to see if poll thread is
1401 1423 * allowed to process.
1402 1424 */
1403 1425 if (mac_srs->srs_state & SRS_LATENCY_OPT) {
1404 1426 mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
1405 1427 if (!(mac_srs->srs_state & SRS_PAUSE) &&
1406 1428 srs_rx->sr_poll_pkt_cnt <=
1407 1429 srs_rx->sr_lowat) {
1408 1430 srs_rx->sr_poll_again++;
1409 1431 goto check_again;
1410 1432 }
1411 1433 /*
1412 1434 * We are already above low water mark
1413 1435 * so stay in the polling mode but no
1414 1436 * need to poll. Once we dip below
1415 1437 * the polling threshold, the processing
1416 1438 * thread (soft ring) will signal us
1417 1439 * to poll again (MAC_UPDATE_SRS_COUNT)
1418 1440 */
1419 1441 srs_rx->sr_poll_drain_no_poll++;
1420 1442 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1421 1443 /*
1422 1444 * In B/W control case, its possible
1423 1445 * that the backlog built up due to
1424 1446 * B/W limit being reached and packets
1425 1447 * are queued only in SRS. In this case,
1426 1448 * we should schedule worker thread
1427 1449 * since no one else will wake us up.
1428 1450 */
1429 1451 if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
1430 1452 (mac_srs->srs_tid == NULL)) {
1431 1453 mac_srs->srs_tid =
1432 1454 timeout(mac_srs_fire, mac_srs, 1);
1433 1455 srs_rx->sr_poll_worker_wakeup++;
1434 1456 }
1435 1457 } else {
1436 1458 /*
1437 1459 * Wakeup the worker thread for more processing.
1438 1460 * We optimize for throughput in this case.
1439 1461 */
1440 1462 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1441 1463 MAC_SRS_WORKER_WAKEUP(mac_srs);
1442 1464 srs_rx->sr_poll_sig_worker++;
1443 1465 }
1444 1466 } else if ((mac_srs->srs_first == NULL) &&
1445 1467 !(mac_srs->srs_state & SRS_WORKER)) {
1446 1468 /*
1447 1469 * There is nothing queued in SRS and
1448 1470 * no worker thread running. Plus we
1449 1471 * didn't get anything from the H/W
1450 1472 * as well (head == NULL);
1451 1473 */
1452 1474 ASSERT(head == NULL);
1453 1475 mac_srs->srs_state &=
1454 1476 ~(SRS_PROC|SRS_GET_PKTS);
1455 1477
1456 1478 /*
1457 1479 * If we have a packets in soft ring, don't allow
1458 1480 * more packets to come into this SRS by keeping the
1459 1481 * interrupts off but not polling the H/W. The
1460 1482 * poll thread will get signaled as soon as
1461 1483 * srs_poll_pkt_cnt dips below poll threshold.
1462 1484 */
1463 1485 if (srs_rx->sr_poll_pkt_cnt == 0) {
1464 1486 srs_rx->sr_poll_intr_enable++;
1465 1487 MAC_SRS_POLLING_OFF(mac_srs);
1466 1488 } else {
1467 1489 /*
1468 1490 * We know nothing is queued in SRS
1469 1491 * since we are here after checking
1470 1492 * srs_first is NULL. The backlog
1471 1493 * is entirely due to packets queued
1472 1494 * in Soft ring which will wake us up
1473 1495 * and get the interface out of polling
1474 1496 * mode once the backlog dips below
1475 1497 * sr_poll_thres.
1476 1498 */
1477 1499 srs_rx->sr_poll_no_poll++;
1478 1500 }
1479 1501 } else {
1480 1502 /*
1481 1503 * Worker thread is already running.
1482 1504 * Nothing much to do. If the polling
1483 1505 * was enabled, worker thread will deal
1484 1506 * with that.
1485 1507 */
1486 1508 mac_srs->srs_state &= ~SRS_GET_PKTS;
1487 1509 srs_rx->sr_poll_goto_sleep++;
1488 1510 }
1489 1511 }
1490 1512 done:
1491 1513 mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
1492 1514 cv_signal(&mac_srs->srs_async);
1493 1515 /*
1494 1516 * If this is a temporary quiesce then wait for the restart signal
1495 1517 * from the srs worker. Then clear the flags and signal the srs worker
1496 1518 * to ensure a positive handshake and go back to start.
1497 1519 */
1498 1520 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
1499 1521 cv_wait(async, lock);
1500 1522 if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
1501 1523 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
1502 1524 mac_srs->srs_state &=
1503 1525 ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
1504 1526 cv_signal(&mac_srs->srs_async);
1505 1527 goto start;
1506 1528 } else {
1507 1529 mac_srs->srs_state |= SRS_POLL_THR_EXITED;
1508 1530 cv_signal(&mac_srs->srs_async);
1509 1531 CALLB_CPR_EXIT(&cprinfo);
1510 1532 thread_exit();
1511 1533 }
1512 1534 }
1513 1535
1514 1536 /*
1515 1537 * mac_srs_pick_chain
1516 1538 *
1517 1539 * In Bandwidth control case, checks how many packets can be processed
1518 1540 * and return them in a sub chain.
1519 1541 */
1520 1542 static mblk_t *
1521 1543 mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
1522 1544 size_t *chain_sz, int *chain_cnt)
1523 1545 {
1524 1546 mblk_t *head = NULL;
1525 1547 mblk_t *tail = NULL;
1526 1548 size_t sz;
1527 1549 size_t tsz = 0;
1528 1550 int cnt = 0;
1529 1551 mblk_t *mp;
1530 1552
1531 1553 ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1532 1554 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1533 1555 if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
1534 1556 mac_srs->srs_bw->mac_bw_limit) ||
1535 1557 (mac_srs->srs_bw->mac_bw_limit == 0)) {
1536 1558 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1537 1559 head = mac_srs->srs_first;
1538 1560 mac_srs->srs_first = NULL;
1539 1561 *chain_tail = mac_srs->srs_last;
1540 1562 mac_srs->srs_last = NULL;
1541 1563 *chain_sz = mac_srs->srs_size;
1542 1564 *chain_cnt = mac_srs->srs_count;
1543 1565 mac_srs->srs_count = 0;
1544 1566 mac_srs->srs_size = 0;
1545 1567 return (head);
1546 1568 }
1547 1569
1548 1570 /*
1549 1571 * Can't clear the entire backlog.
1550 1572 * Need to find how many packets to pick
1551 1573 */
1552 1574 ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
1553 1575 while ((mp = mac_srs->srs_first) != NULL) {
1554 1576 sz = msgdsize(mp);
1555 1577 if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
1556 1578 mac_srs->srs_bw->mac_bw_limit) {
1557 1579 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
1558 1580 mac_srs->srs_bw->mac_bw_state |=
1559 1581 SRS_BW_ENFORCED;
1560 1582 break;
1561 1583 }
1562 1584
1563 1585 /*
1564 1586 * The _size & cnt is decremented from the softrings
1565 1587 * when they send up the packet for polling to work
1566 1588 * properly.
1567 1589 */
1568 1590 tsz += sz;
1569 1591 cnt++;
1570 1592 mac_srs->srs_count--;
1571 1593 mac_srs->srs_size -= sz;
1572 1594 if (tail != NULL)
1573 1595 tail->b_next = mp;
1574 1596 else
1575 1597 head = mp;
1576 1598 tail = mp;
1577 1599 mac_srs->srs_first = mac_srs->srs_first->b_next;
1578 1600 }
1579 1601 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1580 1602 if (mac_srs->srs_first == NULL)
1581 1603 mac_srs->srs_last = NULL;
1582 1604
1583 1605 if (tail != NULL)
1584 1606 tail->b_next = NULL;
1585 1607 *chain_tail = tail;
1586 1608 *chain_cnt = cnt;
1587 1609 *chain_sz = tsz;
1588 1610
1589 1611 return (head);
1590 1612 }
1591 1613
1592 1614 /*
1593 1615 * mac_rx_srs_drain
1594 1616 *
1595 1617 * The SRS drain routine. Gets to run to clear the queue. Any thread
1596 1618 * (worker, interrupt, poll) can call this based on processing model.
1597 1619 * The first thing we do is disable interrupts if possible and then
1598 1620 * drain the queue. we also try to poll the underlying hardware if
1599 1621 * there is a dedicated hardware Rx ring assigned to this SRS.
1600 1622 *
1601 1623 * There is a equivalent drain routine in bandwidth control mode
1602 1624 * mac_rx_srs_drain_bw. There is some code duplication between the two
1603 1625 * routines but they are highly performance sensitive and are easier
1604 1626 * to read/debug if they stay separate. Any code changes here might
1605 1627 * also apply to mac_rx_srs_drain_bw as well.
1606 1628 */
1607 1629 void
1608 1630 mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1609 1631 {
1610 1632 mblk_t *head;
1611 1633 mblk_t *tail;
1612 1634 timeout_id_t tid;
1613 1635 int cnt = 0;
1614 1636 mac_client_impl_t *mcip = mac_srs->srs_mcip;
1615 1637 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
1616 1638
1617 1639 ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1618 1640 ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));
1619 1641
1620 1642 /* If we are blanked i.e. can't do upcalls, then we are done */
1621 1643 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1622 1644 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1623 1645 (mac_srs->srs_state & SRS_PAUSE));
1624 1646 goto out;
1625 1647 }
1626 1648
1627 1649 if (mac_srs->srs_first == NULL)
1628 1650 goto out;
1629 1651
1630 1652 if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
1631 1653 (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
1632 1654 /*
1633 1655 * In the normal case, the SRS worker thread does no
1634 1656 * work and we wait for a backlog to build up before
1635 1657 * we switch into polling mode. In case we are
1636 1658 * optimizing for throughput, we use the worker thread
1637 1659 * as well. The goal is to let worker thread process
1638 1660 * the queue and poll thread to feed packets into
1639 1661 * the queue. As such, we should signal the poll
1640 1662 * thread to try and get more packets.
1641 1663 *
1642 1664 * We could have pulled this check in the POLL_RING
1643 1665 * macro itself but keeping it explicit here makes
1644 1666 * the architecture more human understandable.
1645 1667 */
1646 1668 MAC_SRS_POLL_RING(mac_srs);
1647 1669 }
1648 1670
1649 1671 again:
1650 1672 head = mac_srs->srs_first;
1651 1673 mac_srs->srs_first = NULL;
1652 1674 tail = mac_srs->srs_last;
1653 1675 mac_srs->srs_last = NULL;
1654 1676 cnt = mac_srs->srs_count;
1655 1677 mac_srs->srs_count = 0;
1656 1678
1657 1679 ASSERT(head != NULL);
1658 1680 ASSERT(tail != NULL);
1659 1681
1660 1682 if ((tid = mac_srs->srs_tid) != 0)
1661 1683 mac_srs->srs_tid = 0;
1662 1684
1663 1685 mac_srs->srs_state |= (SRS_PROC|proc_type);
1664 1686
1665 1687
1666 1688 /*
1667 1689 * mcip is NULL for broadcast and multicast flows. The promisc
1668 1690 * callbacks for broadcast and multicast packets are delivered from
1669 1691 * mac_rx() and we don't need to worry about that case in this path
1670 1692 */
1671 1693 if (mcip != NULL) {
1672 1694 if (mcip->mci_promisc_list != NULL) {
1673 1695 mutex_exit(&mac_srs->srs_lock);
1674 1696 mac_promisc_client_dispatch(mcip, head);
1675 1697 mutex_enter(&mac_srs->srs_lock);
1676 1698 }
1677 1699 if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
1678 1700 mutex_exit(&mac_srs->srs_lock);
1679 1701 mac_protect_intercept_dhcp(mcip, head);
1680 1702 mutex_enter(&mac_srs->srs_lock);
1681 1703 }
1682 1704 }
1683 1705
1684 1706 /*
1685 1707 * Check if SRS itself is doing the processing
1686 1708 * This direct path does not apply when subflows are present. In this
1687 1709 * case, packets need to be dispatched to a soft ring according to the
1688 1710 * flow's bandwidth and other resources contraints.
1689 1711 */
1690 1712 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1691 1713 mac_direct_rx_t proc;
1692 1714 void *arg1;
1693 1715 mac_resource_handle_t arg2;
1694 1716
1695 1717 /*
1696 1718 * This is the case when a Rx is directly
1697 1719 * assigned and we have a fully classified
1698 1720 * protocol chain. We can deal with it in
1699 1721 * one shot.
1700 1722 */
1701 1723 proc = srs_rx->sr_func;
1702 1724 arg1 = srs_rx->sr_arg1;
1703 1725 arg2 = srs_rx->sr_arg2;
1704 1726
1705 1727 mac_srs->srs_state |= SRS_CLIENT_PROC;
1706 1728 mutex_exit(&mac_srs->srs_lock);
1707 1729 if (tid != 0) {
1708 1730 (void) untimeout(tid);
1709 1731 tid = 0;
1710 1732 }
1711 1733
1712 1734 proc(arg1, arg2, head, NULL);
1713 1735 /*
1714 1736 * Decrement the size and count here itelf
1715 1737 * since the packet has been processed.
1716 1738 */
1717 1739 mutex_enter(&mac_srs->srs_lock);
1718 1740 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
1719 1741 if (mac_srs->srs_state & SRS_CLIENT_WAIT)
1720 1742 cv_signal(&mac_srs->srs_client_cv);
1721 1743 mac_srs->srs_state &= ~SRS_CLIENT_PROC;
1722 1744 } else {
1723 1745 /* Some kind of softrings based fanout is required */
1724 1746 mutex_exit(&mac_srs->srs_lock);
1725 1747 if (tid != 0) {
1726 1748 (void) untimeout(tid);
1727 1749 tid = 0;
1728 1750 }
1729 1751
1730 1752 /*
1731 1753 * Since the fanout routines can deal with chains,
1732 1754 * shoot the entire chain up.
1733 1755 */
1734 1756 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
1735 1757 mac_rx_srs_fanout(mac_srs, head);
1736 1758 else
1737 1759 mac_rx_srs_proto_fanout(mac_srs, head);
1738 1760 mutex_enter(&mac_srs->srs_lock);
1739 1761 }
1740 1762
1741 1763 if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
1742 1764 (mac_srs->srs_first != NULL)) {
1743 1765 /*
1744 1766 * More packets arrived while we were clearing the
1745 1767 * SRS. This can be possible because of one of
1746 1768 * three conditions below:
1747 1769 * 1) The driver is using multiple worker threads
1748 1770 * to send the packets to us.
1749 1771 * 2) The driver has a race in switching
1750 1772 * between interrupt and polling mode or
1751 1773 * 3) Packets are arriving in this SRS via the
1752 1774 * S/W classification as well.
1753 1775 *
1754 1776 * We should switch to polling mode and see if we
1755 1777 * need to send the poll thread down. Also, signal
1756 1778 * the worker thread to process whats just arrived.
1757 1779 */
1758 1780 MAC_SRS_POLLING_ON(mac_srs);
1759 1781 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
1760 1782 srs_rx->sr_drain_poll_sig++;
1761 1783 MAC_SRS_POLL_RING(mac_srs);
1762 1784 }
1763 1785
1764 1786 /*
1765 1787 * If we didn't signal the poll thread, we need
1766 1788 * to deal with the pending packets ourselves.
1767 1789 */
1768 1790 if (proc_type == SRS_WORKER) {
1769 1791 srs_rx->sr_drain_again++;
1770 1792 goto again;
1771 1793 } else {
1772 1794 srs_rx->sr_drain_worker_sig++;
1773 1795 cv_signal(&mac_srs->srs_async);
1774 1796 }
1775 1797 }
1776 1798
1777 1799 out:
1778 1800 if (mac_srs->srs_state & SRS_GET_PKTS) {
1779 1801 /*
1780 1802 * Poll thread is already running. Leave the
1781 1803 * SRS_RPOC set and hand over the control to
1782 1804 * poll thread.
1783 1805 */
1784 1806 mac_srs->srs_state &= ~proc_type;
1785 1807 srs_rx->sr_drain_poll_running++;
1786 1808 return;
1787 1809 }
1788 1810
1789 1811 /*
1790 1812 * Even if there are no packets queued in SRS, we
1791 1813 * need to make sure that the shared counter is
1792 1814 * clear and any associated softrings have cleared
1793 1815 * all the backlog. Otherwise, leave the interface
1794 1816 * in polling mode and the poll thread will get
1795 1817 * signalled once the count goes down to zero.
1796 1818 *
1797 1819 * If someone is already draining the queue (SRS_PROC is
1798 1820 * set) when the srs_poll_pkt_cnt goes down to zero,
1799 1821 * then it means that drain is already running and we
1800 1822 * will turn off polling at that time if there is
1801 1823 * no backlog.
1802 1824 *
1803 1825 * As long as there are packets queued either
1804 1826 * in soft ring set or its soft rings, we will leave
1805 1827 * the interface in polling mode (even if the drain
1806 1828 * was done being the interrupt thread). We signal
1807 1829 * the poll thread as well if we have dipped below
1808 1830 * low water mark.
1809 1831 *
1810 1832 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
1811 1833 * since that turn polling on only for worker thread.
1812 1834 * Its not worth turning polling on for interrupt
1813 1835 * thread (since NIC will not issue another interrupt)
1814 1836 * unless a backlog builds up.
1815 1837 */
1816 1838 if ((srs_rx->sr_poll_pkt_cnt > 0) &&
1817 1839 (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
1818 1840 mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1819 1841 srs_rx->sr_drain_keep_polling++;
1820 1842 MAC_SRS_POLLING_ON(mac_srs);
1821 1843 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
1822 1844 MAC_SRS_POLL_RING(mac_srs);
1823 1845 return;
1824 1846 }
1825 1847
1826 1848 /* Nothing else to do. Get out of poll mode */
1827 1849 MAC_SRS_POLLING_OFF(mac_srs);
1828 1850 mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1829 1851 srs_rx->sr_drain_finish_intr++;
1830 1852 }
1831 1853
/*
 * mac_rx_srs_drain_bw
 *
 * The SRS BW drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in non bandwidth control mode
 * mac_rx_srs_drain. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain as well.
 */
void
mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	size_t			sz = 0;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
	clock_t			now;

	/* Caller must hold srs_lock; this variant requires B/W control. */
	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
again:
	/* Check if we are doing B/W control */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	now = ddi_get_lbolt();
	if (mac_srs->srs_bw->mac_bw_curr_time != now) {
		/* New tick: reset usage and lift any enforcement. */
		mac_srs->srs_bw->mac_bw_curr_time = now;
		mac_srs->srs_bw->mac_bw_used = 0;
		if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
			mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
	} else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
		/* Still over budget for this tick; try again later. */
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	} else if (mac_srs->srs_bw->mac_bw_used >
	    mac_srs->srs_bw->mac_bw_limit) {
		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto done;
	}

	sz = 0;
	cnt = 0;
	if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
		/*
		 * We couldn't pick up a single packet.
		 */
		mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
		if ((mac_srs->srs_bw->mac_bw_used == 0) &&
		    (mac_srs->srs_size != 0) &&
		    !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
			/*
			 * Seems like configured B/W doesn't
			 * even allow processing of 1 packet
			 * per tick.
			 *
			 * XXX: raise the limit to processing
			 * at least 1 packet per tick.
			 */
			mac_srs->srs_bw->mac_bw_limit +=
			    mac_srs->srs_bw->mac_bw_limit;
			mac_srs->srs_bw->mac_bw_drop_threshold +=
			    mac_srs->srs_bw->mac_bw_drop_threshold;
			cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
			    "raised B/W limit to %d since not even a "
			    "single packet can be processed per "
			    "tick %d\n", (void *)mac_srs,
			    (int)mac_srs->srs_bw->mac_bw_limit,
			    (int)msgdsize(mac_srs->srs_first));
		}
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	}

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	/* zero bandwidth: drop all and return to interrupt mode */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_bw->mac_bw_limit == 0) {
		srs_rx->sr_stat.mrs_sdrops += cnt;
		ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
		mac_srs->srs_bw->mac_bw_sz -= sz;
		mac_srs->srs_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		mac_pkt_drop(NULL, NULL, head, B_FALSE);
		goto leave_poll;
	} else {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
	}

	/* Claim any pending wakeup timer; it is cancelled below, unlocked. */
	if ((tid = mac_srs->srs_tid) != 0)
		mac_srs->srs_tid = 0;

	mac_srs->srs_state |= (SRS_PROC|proc_type);
	MAC_SRS_WORKER_POLLING_ON(mac_srs);

	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL) {
		if (mcip->mci_promisc_list != NULL) {
			mutex_exit(&mac_srs->srs_lock);
			mac_promisc_client_dispatch(mcip, head);
			mutex_enter(&mac_srs->srs_lock);
		}
		if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
			mutex_exit(&mac_srs->srs_lock);
			mac_protect_intercept_dhcp(mcip, head);
			mutex_enter(&mac_srs->srs_lock);
		}
	}

	/*
	 * Check if SRS itself is doing the processing.
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resources constraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t proc;
		void *arg1;
		mac_resource_handle_t arg2;

		/*
		 * This is the case when a Rx is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/* Upcall into the client with the whole chain, unlocked. */
		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);

		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Send the poll thread to pick up any packets arrived
	 * so far. This also serves as the last check in case
	 * nothing else is queued in the SRS. The poll thread
	 * is signalled only in the case the drain was done
	 * by the worker thread and SRS_WORKER is set. The
	 * worker thread can run in parallel as long as the
	 * SRS_WORKER flag is set. When we have nothing else to
	 * process, we can exit while leaving SRS_PROC set
	 * which gives the poll thread control to process and
	 * cleanup once it returns from the NIC.
	 *
	 * If we have nothing else to process, we need to
	 * ensure that we keep holding the srs_lock till
	 * all the checks below are done and control is
	 * handed to the poll thread if it was running.
	 */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
		if (mac_srs->srs_first != NULL) {
			if (proc_type == SRS_WORKER) {
				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
				if (srs_rx->sr_poll_pkt_cnt <=
				    srs_rx->sr_lowat)
					MAC_SRS_POLL_RING(mac_srs);
				goto again;
			} else {
				cv_signal(&mac_srs->srs_async);
			}
		}
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

done:

	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * Poll thread is already running. Leave the
		 * SRS_PROC set and hand over the control to
		 * poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		return;
	}

	/*
	 * If we can't process packets because we have exceeded
	 * B/W limit for this tick, just set the timeout
	 * and leave.
	 *
	 * Even if there are no packets queued in SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog. As long as there are packets queued either
	 * in soft ring set or its soft rings, we will leave
	 * the interface in polling mode.
	 */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
	    ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
	    (srs_rx->sr_poll_pkt_cnt > 0))) {
		MAC_SRS_POLLING_ON(mac_srs);
		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
		if ((mac_srs->srs_first != NULL) &&
		    (mac_srs->srs_tid == NULL))
			mac_srs->srs_tid = timeout(mac_srs_fire,
			    mac_srs, 1);
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		return;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

leave_poll:

	/* Nothing else to do. Get out of poll mode */
	MAC_SRS_POLLING_OFF(mac_srs);
	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
}
2107 2129
/*
 * mac_srs_worker
 *
 * The SRS worker routine. Drains the queue when no one else is
 * processing it. Runs as a dedicated kernel thread per SRS; exits
 * only when the SRS is condemned.
 */
void
mac_srs_worker(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t		*lock = &mac_srs->srs_lock;
	kcondvar_t		*async = &mac_srs->srs_async;
	callb_cpr_t		cprinfo;
	boolean_t		bw_ctl_flag;

	/* Register with CPR (suspend/resume) so this thread can be parked. */
	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
	mutex_enter(lock);

start:
	for (;;) {
		bw_ctl_flag = B_FALSE;
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			MAC_SRS_BW_LOCK(mac_srs);
			MAC_SRS_CHECK_BW_CONTROL(mac_srs);
			if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
				bw_ctl_flag = B_TRUE;
			MAC_SRS_BW_UNLOCK(mac_srs);
		}
		/*
		 * The SRS_BW_ENFORCED flag may change since we have dropped
		 * the mac_bw_lock. However the drain function can handle both
		 * a drainable SRS or a bandwidth controlled SRS, and the
		 * effect of scheduling a timeout is to wakeup the worker
		 * thread which in turn will call the drain function. Since
		 * we release the srs_lock atomically only in the cv_wait there
		 * isn't a fear of waiting for ever.
		 */
		while (((mac_srs->srs_state & SRS_PROC) ||
		    (mac_srs->srs_first == NULL) || bw_ctl_flag ||
		    (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
		    !(mac_srs->srs_state & SRS_PAUSE)) {
			/*
			 * If we have packets queued and we are here
			 * because B/W control is in place, we better
			 * schedule the worker wakeup after 1 tick
			 * to see if bandwidth control can be relaxed.
			 */
			if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
				/*
				 * We need to ensure that a timer is already
				 * scheduled or we force schedule one for
				 * later so that we can continue processing
				 * after this quanta is over.
				 */
				mac_srs->srs_tid = timeout(mac_srs_fire,
				    mac_srs, 1);
			}
wait:
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(async, lock);
			CALLB_CPR_SAFE_END(&cprinfo, lock);

			if (mac_srs->srs_state & SRS_PAUSE)
				goto done;
			if (mac_srs->srs_state & SRS_PROC)
				goto wait;

			if (mac_srs->srs_first != NULL &&
			    mac_srs->srs_type & SRST_BW_CONTROL) {
				MAC_SRS_BW_LOCK(mac_srs);
				if (mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED) {
					MAC_SRS_CHECK_BW_CONTROL(mac_srs);
				}
				bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED;
				MAC_SRS_BW_UNLOCK(mac_srs);
			}
		}

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;
		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
	}
done:
	/*
	 * The Rx SRS quiesce logic first cuts off packet supply to the SRS
	 * from both hard and soft classifications and waits for such threads
	 * to finish before signaling the worker. So at this point the only
	 * thread left that could be competing with the worker is the poll
	 * thread. In the case of Tx, there shouldn't be any thread holding
	 * SRS_PROC at this point.
	 */
	if (!(mac_srs->srs_state & SRS_PROC)) {
		mac_srs->srs_state |= SRS_PROC;
	} else {
		ASSERT((mac_srs->srs_type & SRST_TX) == 0);
		/*
		 * Poll thread still owns the SRS and is still running
		 */
		ASSERT((mac_srs->srs_poll_thr == NULL) ||
		    ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER));
	}
	mac_srs_worker_quiesce(mac_srs);
	/*
	 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
	 * of the quiesce operation
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);

	if (mac_srs->srs_state & SRS_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs_worker_restart(mac_srs);
		mac_srs->srs_state &= ~SRS_PROC;
		goto start;
	}

	if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
		mac_srs_worker_quiesce(mac_srs);

	mac_srs->srs_state &= ~SRS_PROC;
	/* The macro drops the srs_lock */
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}
2234 2256
/*
 * mac_rx_srs_subflow_process
 *
 * Receive side routine called from interrupt path when there are
 * sub flows present on this SRS. Splits the incoming chain into
 * runs of packets that classify to the same subflow and dispatches
 * each run to its flow callback (or to this SRS for unclassified
 * packets).
 */
/* ARGSUSED */
void
mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
    mblk_t *mp_chain, boolean_t loopback)
{
	flow_entry_t		*flent = NULL;
	flow_entry_t		*prev_flent = NULL;
	mblk_t			*mp = NULL;
	mblk_t			*tail = NULL;
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mac_client_impl_t	*mcip;

	mcip = mac_srs->srs_mcip;
	ASSERT(mcip != NULL);

	/*
	 * We need to determine the SRS for every packet
	 * by walking the flow table, if we don't get any,
	 * then we proceed using the SRS we came with.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {

		/*
		 * We will increment the stats for the matching subflow
		 * when we get the bytes/pkt count for the classified
		 * packets later in mac_rx_srs_process.
		 */
		(void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
		    FLOW_INBOUND, &flent);

		/*
		 * Same flow as the previous packet (or first packet):
		 * extend the current run and keep walking.
		 */
		if (mp == mp_chain || flent == prev_flent) {
			if (prev_flent != NULL)
				FLOW_REFRELE(prev_flent);
			prev_flent = flent;
			flent = NULL;
			tail = mp;
			mp = mp->b_next;
			continue;
		}
		/* Flow changed: cut the run here and dispatch it. */
		tail->b_next = NULL;
		/*
		 * A null indicates, this is for the mac_srs itself.
		 * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
		 */
		if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
			mac_rx_srs_process(arg,
			    (mac_resource_handle_t)mac_srs, mp_chain,
			    loopback);
		} else {
			(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
			    prev_flent->fe_cb_arg2, mp_chain, loopback);
			FLOW_REFRELE(prev_flent);
		}
		prev_flent = flent;
		flent = NULL;
		mp_chain = mp;
		tail = mp;
		mp = mp->b_next;
	}
	/* Last chain */
	ASSERT(mp_chain != NULL);
	if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
		mac_rx_srs_process(arg,
		    (mac_resource_handle_t)mac_srs, mp_chain, loopback);
	} else {
		(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
		    prev_flent->fe_cb_arg2, mp_chain, loopback);
		FLOW_REFRELE(prev_flent);
	}
}
2312 2334
/*
 * mac_rx_srs_process
 *
 * Receive side routine called from the interrupt path.
 *
 * loopback is set to force a context switch on the loopback
 * path between MAC clients.
 */
/* ARGSUSED */
void
mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
    boolean_t loopback)
{
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mblk_t			*mp, *tail, *head;
	int			count = 0;
	int			count1;
	size_t			sz = 0;
	size_t			chain_sz, sz1;
	mac_bw_ctl_t		*mac_bw;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	/*
	 * Set the tail, count and sz. We set the sz irrespective
	 * of whether we are doing B/W control or not for the
	 * purpose of updating the stats.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {
		tail = mp;
		count++;
		sz += msgdsize(mp);
		mp = mp->b_next;
	}

	mutex_enter(&mac_srs->srs_lock);

	if (loopback) {
		SRS_RX_STAT_UPDATE(mac_srs, lclbytes, sz);
		SRS_RX_STAT_UPDATE(mac_srs, lclcnt, count);
	} else {
		SRS_RX_STAT_UPDATE(mac_srs, intrbytes, sz);
		SRS_RX_STAT_UPDATE(mac_srs, intrcnt, count);
	}

	/*
	 * If the SRS is already being processed; has been blanked;
	 * can be processed by worker thread only; or the B/W limit
	 * has been reached, then queue the chain and check if
	 * worker thread needs to be awakened.
	 */
	if (mac_srs->srs_type & SRST_BW_CONTROL) {
		mac_bw = mac_srs->srs_bw;
		ASSERT(mac_bw != NULL);
		mutex_enter(&mac_bw->mac_bw_lock);
		mac_bw->mac_bw_intr += sz;
		if (mac_bw->mac_bw_limit == 0) {
			/* zero bandwidth: drop all */
			srs_rx->sr_stat.mrs_sdrops += count;
			mac_bw->mac_bw_drop_bytes += sz;
			mutex_exit(&mac_bw->mac_bw_lock);
			mutex_exit(&mac_srs->srs_lock);
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
			return;
		} else {
			if ((mac_bw->mac_bw_sz + sz) <=
			    mac_bw->mac_bw_drop_threshold) {
				/* Whole chain fits under the threshold. */
				mutex_exit(&mac_bw->mac_bw_lock);
				MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
				    tail, count, sz);
			} else {
				/*
				 * Walk the chain and take only as many
				 * leading packets as fit under the drop
				 * threshold; the remainder is dropped.
				 */
				mp = mp_chain;
				chain_sz = 0;
				count1 = 0;
				tail = NULL;
				head = NULL;
				while (mp != NULL) {
					sz1 = msgdsize(mp);
					if (mac_bw->mac_bw_sz + chain_sz + sz1 >
					    mac_bw->mac_bw_drop_threshold)
						break;
					chain_sz += sz1;
					count1++;
					tail = mp;
					mp = mp->b_next;
				}
				mutex_exit(&mac_bw->mac_bw_lock);
				if (tail != NULL) {
					head = tail->b_next;
					tail->b_next = NULL;
					MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
					    mp_chain, tail, count1, chain_sz);
					sz -= chain_sz;
					count -= count1;
				} else {
					/* Can't pick up any */
					head = mp_chain;
				}
				if (head != NULL) {
					/* Drop any packet over the threshold */
					srs_rx->sr_stat.mrs_sdrops += count;
					mutex_enter(&mac_bw->mac_bw_lock);
					mac_bw->mac_bw_drop_bytes += sz;
					mutex_exit(&mac_bw->mac_bw_lock);
					freemsgchain(head);
				}
			}
			MAC_SRS_WORKER_WAKEUP(mac_srs);
			mutex_exit(&mac_srs->srs_lock);
			return;
		}
	}

	/*
	 * If the total number of packets queued in the SRS and
	 * its associated soft rings exceeds the max allowed,
	 * then drop the chain. If we are polling capable, this
	 * shouldn't be happening.
	 */
	if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
	    (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
		mac_bw = mac_srs->srs_bw;
		srs_rx->sr_stat.mrs_sdrops += count;
		mutex_enter(&mac_bw->mac_bw_lock);
		mac_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_bw->mac_bw_lock);
		freemsgchain(mp_chain);
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);

	if (!(mac_srs->srs_state & SRS_PROC)) {
		/*
		 * If we are coming via loopback, if we are not optimizing for
		 * latency, or if our stack is running deep, we should signal
		 * the worker thread.
		 */
		if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT) ||
		    MAC_RX_SRS_TOODEEP()) {
			/*
			 * For loopback, we need to let the worker take
			 * over as we don't want to continue in the same
			 * thread even if we can. This could lead to stack
			 * overflows and may also end up using
			 * resources (cpu) incorrectly.
			 */
			cv_signal(&mac_srs->srs_async);
		} else {
			/*
			 * Seems like no one is processing the SRS and
			 * there is no backlog. We also inline process
			 * our packet if its a single packet in non
			 * latency optimized case (in latency optimized
			 * case, we inline process chains of any size).
			 */
			mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
		}
	}
	mutex_exit(&mac_srs->srs_lock);
}
2476 2498
2477 2499 /* TX SIDE ROUTINES (RUNTIME) */
2478 2500
/*
 * mac_tx_srs_no_desc
 *
 * This routine is called by Tx single ring default mode
 * when Tx ring runs out of descs. Depending on 'flag' the chain is
 * dropped, queued on the SRS, or handed back to the caller via
 * 'ret_mp'. Returns a cookie indicating the blocked/queued state.
 */
mac_tx_cookie_t
mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
	mac_tx_cookie_t cookie = NULL;
	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
	boolean_t wakeup_worker = B_TRUE;
	uint32_t tx_mode = srs_tx->st_mode;
	int cnt, sz;
	mblk_t *tail;

	ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
	if (flag & MAC_DROP_ON_NO_DESC) {
		MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
	} else {
		/*
		 * If packets are already queued, the worker has been
		 * signalled before; avoid a redundant wakeup.
		 */
		if (mac_srs->srs_first != NULL)
			wakeup_worker = B_FALSE;
		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
		if (flag & MAC_TX_NO_ENQUEUE) {
			/*
			 * If TX_QUEUED is not set, queue the
			 * packet and let mac_tx_srs_drain()
			 * set the TX_BLOCKED bit for the
			 * reasons explained above. Otherwise,
			 * return the mblks.
			 */
			if (wakeup_worker) {
				MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
				    mp_chain, tail, cnt, sz);
			} else {
				MAC_TX_SET_NO_ENQUEUE(mac_srs,
				    mp_chain, ret_mp, cookie);
			}
		} else {
			MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
			    tail, cnt, sz, cookie);
		}
		if (wakeup_worker)
			cv_signal(&mac_srs->srs_async);
	}
	return (cookie);
}
2527 2549
2528 2550 /*
2529 2551 * mac_tx_srs_enqueue
2530 2552 *
2531 2553 * This routine is called when Tx SRS is operating in either serializer
2532 2554 * or bandwidth mode. In serializer mode, a packet will get enqueued
2533 2555 * when a thread cannot enter SRS exclusively. In bandwidth mode,
2534 2556 * packets gets queued if allowed byte-count limit for a tick is
2535 2557 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
2536 2558  * MAC_TX_NO_ENQUEUE is set is different than when operating in either
2537 2559 * the default mode or fanout mode. Here packets get dropped or
2538 2560 * returned back to the caller only after hi-watermark worth of data
2539 2561 * is queued.
2540 2562 */
2541 2563 static mac_tx_cookie_t
2542 2564 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2543 2565     uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
2544 2566 {
2545 2567 	mac_tx_cookie_t cookie = NULL;
2546 2568 	int cnt, sz;
2547 2569 	mblk_t *tail;
2548 2570 	boolean_t wakeup_worker = B_TRUE;
2549 2571
	/*
	 * NOTE: callers in this file hold srs_lock across this call.
	 * The fanout hint is stashed in b_prev of the chain head and
	 * recovered later by mac_tx_srs_drain() for the BW_FANOUT and
	 * BW_AGGR modes.
	 */
2550 2572 	/*
2551 2573 	 * Ignore fanout hint if we don't have multiple tx rings.
2552 2574 	 */
2553 2575 	if (!MAC_TX_SOFT_RINGS(mac_srs))
2554 2576 		fanout_hint = 0;
2555 2577
2556 2578 	if (mac_srs->srs_first != NULL)
2557 2579 		wakeup_worker = B_FALSE;
2558 2580 	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2559 2581 	if (flag & MAC_DROP_ON_NO_DESC) {
		/* Drop only once hi-watermark worth of data is queued. */
2560 2582 		if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
2561 2583 			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2562 2584 		} else {
2563 2585 			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2564 2586 			    mp_chain, tail, cnt, sz);
2565 2587 		}
2566 2588 	} else if (flag & MAC_TX_NO_ENQUEUE) {
2567 2589 		if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
2568 2590 		    (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
2569 2591 			MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
2570 2592 			    ret_mp, cookie);
2571 2593 		} else {
2572 2594 			mp_chain->b_prev = (mblk_t *)fanout_hint;
2573 2595 			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2574 2596 			    mp_chain, tail, cnt, sz);
2575 2597 		}
2576 2598 	} else {
2577 2599 		/*
2578 2600 		 * If you are BW_ENFORCED, just enqueue the
2579 2601 		 * packet. srs_worker will drain it at the
2580 2602 		 * prescribed rate. Before enqueueing, save
2581 2603 		 * the fanout hint.
2582 2604 		 */
2583 2605 		mp_chain->b_prev = (mblk_t *)fanout_hint;
2584 2606 		MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2585 2607 		    tail, cnt, sz, cookie);
2586 2608 	}
2587 2609 	if (wakeup_worker)
2588 2610 		cv_signal(&mac_srs->srs_async);
2589 2611 	return (cookie);
2590 2612 }
2591 2613
2592 2614 /*
2593 2615 * There are seven tx modes:
2594 2616 *
2595 2617 * 1) Default mode (SRS_TX_DEFAULT)
2596 2618 * 2) Serialization mode (SRS_TX_SERIALIZE)
2597 2619 * 3) Fanout mode (SRS_TX_FANOUT)
2598 2620  * 4) Bandwidth mode (SRS_TX_BW)
2599 2621 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
2600 2622 * 6) aggr Tx mode (SRS_TX_AGGR)
2601 2623 * 7) aggr Tx bw mode (SRS_TX_BW_AGGR)
2602 2624 *
2603 2625 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
2604 2626 * based on the number of Tx rings requested for an SRS and whether
2605 2627 * bandwidth control is requested or not.
2606 2628 *
2607 2629 * The default mode (i.e., no fanout/no bandwidth) is used when the
2608 2630 * underlying NIC does not have Tx rings or just one Tx ring. In this mode,
2609 2631 * the SRS acts as a pass-thru. Packets will go directly to mac_tx_send().
2610 2632 * When the underlying Tx ring runs out of Tx descs, it starts queueing up
2611 2633 * packets in SRS. When flow-control is relieved, the srs_worker drains
2612 2634 * the queued packets and informs blocked clients to restart sending
2613 2635 * packets.
2614 2636 *
2615 2637 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. This
2616 2638 * mode is used when the link has no Tx rings or only one Tx ring.
2617 2639 *
2618 2640 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
2619 2641 * Tx rings. Each Tx ring will have a soft ring associated with it.
2620 2642 * These soft rings will be hung off the Tx SRS. Queueing if it happens
2621 2643 * due to lack of Tx desc will be in individual soft ring (and not srs)
2622 2644 * associated with Tx ring.
2623 2645 *
2624 2646 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
2625 2647 * only if bw is available. Otherwise the packets will be queued in
2626 2648 * SRS. If fanout to multiple Tx rings is configured, the packets will
2627 2649 * be fanned out among the soft rings associated with the Tx rings.
2628 2650 *
2629 2651 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
2630 2652 * invokes an aggr function, aggr_find_tx_ring(), to find a pseudo Tx ring
2631 2653 * belonging to a port on which the packet has to be sent. Aggr will
2632 2654 * always have a pseudo Tx ring associated with it even when it is an
2633 2655 * aggregation over a single NIC that has no Tx rings. Even in such a
2634 2656 * case, the single pseudo Tx ring will have a soft ring associated with
2635 2657 * it and the soft ring will hang off the SRS.
2636 2658 *
2637 2659 * If a bandwidth is specified for an aggr, SRS_TX_BW_AGGR mode is used.
2638 2660 * In this mode, the bandwidth is first applied on the outgoing packets
2639 2661  * and later mac_tx_aggr_mode() function is called to send the packet out
2640 2662 * of one of the pseudo Tx rings.
2641 2663 *
2642 2664 * Four flags are used in srs_state for indicating flow control
2643 2665 * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
2644 2666 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
2645 2667 * driver below.
2646 2668 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
2647 2669 * and flow-control pressure is applied back to clients. The clients expect
2648 2670 * wakeup when flow-control is relieved.
2649 2671 * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk
2650 2672 * got returned back to client either due to lack of Tx descs or due to bw
2651 2673 * control reasons. The clients expect a wakeup when condition is relieved.
2652 2674 *
2653 2675 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
2654 2676 * some clients set the following values too: MAC_DROP_ON_NO_DESC,
2655 2677 * MAC_TX_NO_ENQUEUE
2656 2678 * Mac clients that do not want packets to be enqueued in the mac layer set
2657 2679 * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
2658 2680 * Tx soft rings but instead get dropped when the NIC runs out of desc. The
2659 2681 * behaviour of this flag is different when the Tx is running in serializer
2660 2682 * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packet
2661 2683 * get dropped when Tx high watermark is reached.
2662 2684 * There are some mac clients like vsw, aggr that want the mblks to be
2663 2685 * returned back to clients instead of being queued in Tx SRS (or Tx soft
2664 2686 * rings) under flow-control (i.e., out of desc or exceeding bw limits)
2665 2687 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
2666 2688 * In the default and Tx fanout mode, the un-transmitted mblks will be
2667 2689 * returned back to the clients when the driver runs out of Tx descs.
2668 2690 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
2669 2691 * soft ring) so that the clients can be woken up when Tx desc become
2670 2692  * available. When running in serializer or bandwidth mode,
2671 2693 * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
2672 2694 */
2673 2695
/*
 * Return the Tx processing function registered for the given SRS Tx
 * mode (one of the seven SRS_TX_* modes described above).
 */
2674 2696 mac_tx_func_t
2675 2697 mac_tx_get_func(uint32_t mode)
2676 2698 {
2677 2699 	return (mac_tx_mode_list[mode].mac_tx_func);
2678 2700 }
2679 2701
2680 2702 /* ARGSUSED */
2681 2703 static mac_tx_cookie_t
2682 2704 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2683 2705 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2684 2706 {
2685 2707 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2686 2708 mac_tx_stats_t stats;
2687 2709 mac_tx_cookie_t cookie = NULL;
2688 2710
2689 2711 ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
2690 2712
2691 2713 /* Regular case with a single Tx ring */
2692 2714 /*
2693 2715 * SRS_TX_BLOCKED is set when underlying NIC runs
2694 2716 * out of Tx descs and messages start getting
2695 2717 * queued. It won't get reset until
2696 2718 * tx_srs_drain() completely drains out the
2697 2719 * messages.
2698 2720 */
2699 2721 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2700 2722 /* Tx descs/resources not available */
2701 2723 mutex_enter(&mac_srs->srs_lock);
2702 2724 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2703 2725 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
2704 2726 flag, ret_mp);
2705 2727 mutex_exit(&mac_srs->srs_lock);
2706 2728 return (cookie);
2707 2729 }
2708 2730 /*
2709 2731 * While we were computing mblk count, the
2710 2732 * flow control condition got relieved.
2711 2733 * Continue with the transmission.
2712 2734 */
2713 2735 mutex_exit(&mac_srs->srs_lock);
2714 2736 }
2715 2737
2716 2738 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2717 2739 mp_chain, &stats);
2718 2740
2719 2741 /*
2720 2742 * Multiple threads could be here sending packets.
2721 2743 * Under such conditions, it is not possible to
2722 2744 * automically set SRS_TX_BLOCKED bit to indicate
2723 2745 * out of tx desc condition. To atomically set
2724 2746 * this, we queue the returned packet and do
2725 2747 * the setting of SRS_TX_BLOCKED in
2726 2748 * mac_tx_srs_drain().
2727 2749 */
2728 2750 if (mp_chain != NULL) {
2729 2751 mutex_enter(&mac_srs->srs_lock);
2730 2752 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
2731 2753 mutex_exit(&mac_srs->srs_lock);
2732 2754 return (cookie);
2733 2755 }
2734 2756 SRS_TX_STATS_UPDATE(mac_srs, &stats);
2735 2757
2736 2758 return (NULL);
2737 2759 }
2738 2760
2739 2761 /*
2740 2762 * mac_tx_serialize_mode
2741 2763 *
2742 2764 * This is an experimental mode implemented as per the request of PAE.
2743 2765 * In this mode, all callers attempting to send a packet to the NIC
2744 2766 * will get serialized. Only one thread at any time will access the
2745 2767 * NIC to send the packet out.
2746 2768 */
2747 2769 /* ARGSUSED */
2748 2770 static mac_tx_cookie_t
2749 2771 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2750 2772     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2751 2773 {
2752 2774 	mac_tx_stats_t stats;
2753 2775 	mac_tx_cookie_t cookie = NULL;
2754 2776 	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2755 2777
2756 2778 	/* Single ring, serialize below */
2757 2779 	ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
2758 2780 	mutex_enter(&mac_srs->srs_lock);
2759 2781 	if ((mac_srs->srs_first != NULL) ||
2760 2782 	    (mac_srs->srs_state & SRS_PROC)) {
2761 2783 		/*
2762 2784 		 * In serialization mode, queue all packets until
2763 2785 		 * TX_HIWAT is set.
2764 2786 		 * If drop bit is set, drop if TX_HIWAT is set.
2765 2787 		 * If no_enqueue is set, still enqueue until hiwat
2766 2788 		 * is set and return mblks after TX_HIWAT is set.
2767 2789 		 */
2768 2790 		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
2769 2791 		    flag, NULL, ret_mp);
2770 2792 		mutex_exit(&mac_srs->srs_lock);
2771 2793 		return (cookie);
2772 2794 	}
2773 2795 	/*
2774 2796 	 * No packets queued, nothing on proc and no flow
2775 2797 	 * control condition. Fast-path, ok. Do inline
2776 2798 	 * processing.
2777 2799 	 */
	/* SRS_PROC gives this thread exclusive access to the NIC. */
2778 2800 	mac_srs->srs_state |= SRS_PROC;
2779 2801 	mutex_exit(&mac_srs->srs_lock);
2780 2802
2781 2803 	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2782 2804 	    mp_chain, &stats);
2783 2805
2784 2806 	mutex_enter(&mac_srs->srs_lock);
2785 2807 	mac_srs->srs_state &= ~SRS_PROC;
2786 2808 	if (mp_chain != NULL) {
		/* Driver could not take the whole chain; queue the rest. */
2787 2809 		cookie = mac_tx_srs_enqueue(mac_srs,
2788 2810 		    mp_chain, flag, NULL, ret_mp);
2789 2811 	}
2790 2812 	if (mac_srs->srs_first != NULL) {
2791 2813 		/*
2792 2814 		 * We processed inline our packet and a new
2793 2815 		 * packet/s got queued while we were
2794 2816 		 * processing. Wakeup srs worker
2795 2817 		 */
2796 2818 		cv_signal(&mac_srs->srs_async);
2797 2819 	}
2798 2820 	mutex_exit(&mac_srs->srs_lock);
2799 2821
	/* Stats are counted only when the whole chain went out inline. */
2800 2822 	if (cookie == NULL)
2801 2823 		SRS_TX_STATS_UPDATE(mac_srs, &stats);
2802 2824
2803 2825 	return (cookie);
2804 2826 }
2805 2827
2806 2828 /*
2807 2829 * mac_tx_fanout_mode
2808 2830 *
2809 2831 * In this mode, the SRS will have access to multiple Tx rings to send
2810 2832 * the packet out. The fanout hint that is passed as an argument is
2811 2833 * used to find an appropriate ring to fanout the traffic. Each Tx
2812 2834 * ring, in turn, will have a soft ring associated with it. If a Tx
2813 2835 * ring runs out of Tx desc's the returned packet will be queued in
2814 2836 * the soft ring associated with that Tx ring. The srs itself will not
2815 2837 * queue any packets.
2816 2838 */
2817 2839
/*
 * Pick a Tx soft ring from 'hash' and hand 'chain' to it via
 * mac_tx_soft_ring_process(), setting 'cookie'.  Relies on the locals
 * (hash, index, softring, flag, ret_mp, cookie) of the enclosing
 * function; used only by mac_tx_fanout_mode() below.
 */
2818 2840 #define	MAC_TX_SOFT_RING_PROCESS(chain) {		       	\
2819 2841 	index = COMPUTE_INDEX(hash, mac_srs->srs_tx_ring_count),	\
2820 2842 	softring = mac_srs->srs_tx_soft_rings[index];			\
2821 2843 	cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \
2822 2844 	DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index);	\
2823 2845 }
2824 2846
2825 2847 static mac_tx_cookie_t
2826 2848 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2827 2849 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2828 2850 {
2829 2851 mac_soft_ring_t *softring;
2830 2852 uint64_t hash;
2831 2853 uint_t index;
2832 2854 mac_tx_cookie_t cookie = NULL;
2833 2855
2834 2856 ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
2835 2857 mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
2836 2858 if (fanout_hint != 0) {
2837 2859 /*
2838 2860 * The hint is specified by the caller, simply pass the
2839 2861 * whole chain to the soft ring.
2840 2862 */
2841 2863 hash = HASH_HINT(fanout_hint);
2842 2864 MAC_TX_SOFT_RING_PROCESS(mp_chain);
2843 2865 } else {
2844 2866 mblk_t *last_mp, *cur_mp, *sub_chain;
2845 2867 uint64_t last_hash = 0;
2846 2868 uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media;
2847 2869
2848 2870 /*
2849 2871 * Compute the hash from the contents (headers) of the
2850 2872 * packets of the mblk chain. Split the chains into
2851 2873 * subchains of the same conversation.
2852 2874 *
2853 2875 * Since there may be more than one ring used for
2854 2876 * sub-chains of the same call, and since the caller
2855 2877 * does not maintain per conversation state since it
2856 2878 * passed a zero hint, unsent subchains will be
2857 2879 * dropped.
2858 2880 */
2859 2881
2860 2882 flag |= MAC_DROP_ON_NO_DESC;
2861 2883 ret_mp = NULL;
2862 2884
2863 2885 ASSERT(ret_mp == NULL);
2864 2886
2865 2887 sub_chain = NULL;
2866 2888 last_mp = NULL;
2867 2889
2868 2890 for (cur_mp = mp_chain; cur_mp != NULL;
2869 2891 cur_mp = cur_mp->b_next) {
2870 2892 hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4,
2871 2893 B_TRUE);
2872 2894 if (last_hash != 0 && hash != last_hash) {
2873 2895 /*
2874 2896 * Starting a different subchain, send current
2875 2897 * chain out.
2876 2898 */
2877 2899 ASSERT(last_mp != NULL);
2878 2900 last_mp->b_next = NULL;
2879 2901 MAC_TX_SOFT_RING_PROCESS(sub_chain);
2880 2902 sub_chain = NULL;
2881 2903 }
2882 2904
2883 2905 /* add packet to subchain */
2884 2906 if (sub_chain == NULL)
2885 2907 sub_chain = cur_mp;
2886 2908 last_mp = cur_mp;
2887 2909 last_hash = hash;
2888 2910 }
2889 2911
2890 2912 if (sub_chain != NULL) {
2891 2913 /* send last subchain */
2892 2914 ASSERT(last_mp != NULL);
2893 2915 last_mp->b_next = NULL;
2894 2916 MAC_TX_SOFT_RING_PROCESS(sub_chain);
2895 2917 }
2896 2918
2897 2919 cookie = NULL;
2898 2920 }
2899 2921
2900 2922 return (cookie);
2901 2923 }
2902 2924
2903 2925 /*
2904 2926 * mac_tx_bw_mode
2905 2927 *
2906 2928 * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
2907 2929 * only if bw is available. Otherwise the packets will be queued in
2908 2930 * SRS. If the SRS has multiple Tx rings, then packets will get fanned
2909 2931 * out to a Tx rings.
2910 2932 */
2911 2933 static mac_tx_cookie_t
2912 2934 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2913 2935     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2914 2936 {
2915 2937 	int cnt, sz;
2916 2938 	mblk_t *tail;
2917 2939 	mac_tx_cookie_t cookie = NULL;
2918 2940 	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2919 2941 	clock_t now;
2920 2942
2921 2943 	ASSERT(TX_BANDWIDTH_MODE(mac_srs));
2922 2944 	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
2923 2945 	mutex_enter(&mac_srs->srs_lock);
2924 2946 	if (mac_srs->srs_bw->mac_bw_limit == 0) {
2925 2947 		/*
2926 2948 		 * zero bandwidth, no traffic is sent: drop the packets,
2927 2949 		 * or return the whole chain if the caller requests all
2928 2950 		 * unsent packets back.
2929 2951 		 */
2930 2952 		if (flag & MAC_TX_NO_ENQUEUE) {
2931 2953 			cookie = (mac_tx_cookie_t)mac_srs;
2932 2954 			*ret_mp = mp_chain;
2933 2955 		} else {
2934 2956 			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2935 2957 		}
2936 2958 		mutex_exit(&mac_srs->srs_lock);
2937 2959 		return (cookie);
2938 2960 	} else if ((mac_srs->srs_first != NULL) ||
2939 2961 	    (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
		/* Backlog exists or bw exhausted: queue behind it. */
2940 2962 		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2941 2963 		    fanout_hint, ret_mp);
2942 2964 		mutex_exit(&mac_srs->srs_lock);
2943 2965 		return (cookie);
2944 2966 	}
2945 2967 	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	/* New tick: reset the per-tick byte budget. */
2946 2968 	now = ddi_get_lbolt();
2947 2969 	if (mac_srs->srs_bw->mac_bw_curr_time != now) {
2948 2970 		mac_srs->srs_bw->mac_bw_curr_time = now;
2949 2971 		mac_srs->srs_bw->mac_bw_used = 0;
2950 2972 	} else if (mac_srs->srs_bw->mac_bw_used >
2951 2973 	    mac_srs->srs_bw->mac_bw_limit) {
2952 2974 		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
2953 2975 		MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2954 2976 		    mp_chain, tail, cnt, sz);
2955 2977 		/*
2956 2978 		 * Wakeup worker thread. Note that worker
2957 2979 		 * thread has to be woken up so that it
2958 2980 		 * can fire up the timer to be woken up
2959 2981 		 * on the next tick. Also once
2960 2982 		 * BW_ENFORCED is set, it can only be
2961 2983 		 * reset by srs_worker thread. Until then
2962 2984 		 * all packets will get queued up in SRS
2963 2985 		 * and hence this code path won't be
2964 2986 		 * entered until BW_ENFORCED is reset.
2965 2987 		 */
2966 2988 		cv_signal(&mac_srs->srs_async);
2967 2989 		mutex_exit(&mac_srs->srs_lock);
2968 2990 		return (cookie);
2969 2991 	}
2970 2992
2971 2993 	mac_srs->srs_bw->mac_bw_used += sz;
2972 2994 	mutex_exit(&mac_srs->srs_lock);
2973 2995
2974 2996 	if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
2975 2997 		mac_soft_ring_t *softring;
2976 2998 		uint_t indx, hash;
2977 2999
2978 3000 		hash = HASH_HINT(fanout_hint);
2979 3001 		indx = COMPUTE_INDEX(hash,
2980 3002 		    mac_srs->srs_tx_ring_count);
2981 3003 		softring = mac_srs->srs_tx_soft_rings[indx];
2982 3004 		return (mac_tx_soft_ring_process(softring, mp_chain, flag,
2983 3005 		    ret_mp));
2984 3006 	} else if (srs_tx->st_mode == SRS_TX_BW_AGGR) {
2985 3007 		return (mac_tx_aggr_mode(mac_srs, mp_chain,
2986 3008 		    fanout_hint, flag, ret_mp));
2987 3009 	} else {
2988 3010 		mac_tx_stats_t stats;
2989 3011
2990 3012 		mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2991 3013 		    mp_chain, &stats);
2992 3014
2993 3015 		if (mp_chain != NULL) {
			/* Refund the budget for the bytes not sent. */
2994 3016 			mutex_enter(&mac_srs->srs_lock);
2995 3017 			MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2996 3018 			if (mac_srs->srs_bw->mac_bw_used > sz)
2997 3019 				mac_srs->srs_bw->mac_bw_used -= sz;
2998 3020 			else
2999 3021 				mac_srs->srs_bw->mac_bw_used = 0;
3000 3022 			cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
3001 3023 			    fanout_hint, ret_mp);
3002 3024 			mutex_exit(&mac_srs->srs_lock);
3003 3025 			return (cookie);
3004 3026 		}
3005 3027 		SRS_TX_STATS_UPDATE(mac_srs, &stats);
3006 3028
3007 3029 		return (NULL);
3008 3030 	}
3009 3031 }
3010 3032
3011 3033 /*
3012 3034 * mac_tx_aggr_mode
3013 3035 *
3014 3036 * This routine invokes an aggr function, aggr_find_tx_ring(), to find
3015 3037 * a (pseudo) Tx ring belonging to a port on which the packet has to
3016 3038 * be sent. aggr_find_tx_ring() first finds the outgoing port based on
3017 3039 * L2/L3/L4 policy and then uses the fanout_hint passed to it to pick
3018 3040 * a Tx ring from the selected port.
3019 3041 *
3020 3042 * Note that a port can be deleted from the aggregation. In such a case,
3021 3043 * the aggregation layer first separates the port from the rest of the
3022 3044 * ports making sure that port (and thus any Tx rings associated with
3023 3045 * it) won't get selected in the call to aggr_find_tx_ring() function.
3024 3046 * Later calls are made to mac_group_rem_ring() passing pseudo Tx ring
3025 3047 * handles one by one which in turn will quiesce the Tx SRS and remove
3026 3048 * the soft ring associated with the pseudo Tx ring. Unlike Rx side
3027 3049 * where a cookie is used to protect against mac_rx_ring() calls on
3028 3050 * rings that have been removed, no such cookie is needed on the Tx
3029 3051 * side as the pseudo Tx ring won't be available anymore to
3030 3052 * aggr_find_tx_ring() once the port has been removed.
3031 3053 */
3032 3054 static mac_tx_cookie_t
3033 3055 mac_tx_aggr_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
3034 3056     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
3035 3057 {
3036 3058 	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3037 3059 	mac_tx_ring_fn_t find_tx_ring_fn;
3038 3060 	mac_ring_handle_t ring = NULL;
3039 3061 	void *arg;
3040 3062 	mac_soft_ring_t *sringp;
3041 3063
3042 3064 	find_tx_ring_fn = srs_tx->st_capab_aggr.mca_find_tx_ring_fn;
3043 3065 	arg = srs_tx->st_capab_aggr.mca_arg;
	/*
	 * NOTE(review): a NULL return means the aggr could not select a
	 * (pseudo) Tx ring; presumably the callback disposed of
	 * mp_chain in that case — confirm against aggr_find_tx_ring().
	 */
3044 3066 	if (find_tx_ring_fn(arg, mp_chain, fanout_hint, &ring) == NULL)
3045 3067 		return (NULL);
	/* Hand the chain to the soft ring hung off the selected ring. */
3046 3068 	sringp = srs_tx->st_soft_rings[((mac_ring_t *)ring)->mr_index];
3047 3069 	return (mac_tx_soft_ring_process(sringp, mp_chain, flag, ret_mp));
3048 3070 }
3049 3071
3050 3072 void
3051 3073 mac_tx_invoke_callbacks(mac_client_impl_t *mcip, mac_tx_cookie_t cookie)
3052 3074 {
3053 3075 mac_cb_t *mcb;
3054 3076 mac_tx_notify_cb_t *mtnfp;
3055 3077
3056 3078 /* Wakeup callback registered clients */
3057 3079 MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
3058 3080 for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
3059 3081 mcb = mcb->mcb_nextp) {
3060 3082 mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
3061 3083 mtnfp->mtnf_fn(mtnfp->mtnf_arg, cookie);
3062 3084 }
3063 3085 MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
3064 3086 &mcip->mci_tx_notify_cb_list);
3065 3087 }
3066 3088
3067 3089 /* ARGSUSED */
3068 3090 void
3069 3091 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
3070 3092 {
3071 3093 	mblk_t *head, *tail;
3072 3094 	size_t sz;
3073 3095 	uint32_t tx_mode;
3074 3096 	uint_t saved_pkt_count;
3075 3097 	mac_tx_stats_t stats;
3076 3098 	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3077 3099 	clock_t now;
3078 3100
3079 3101 	saved_pkt_count = 0;
3080 3102 	ASSERT(mutex_owned(&mac_srs->srs_lock));
3081 3103 	ASSERT(!(mac_srs->srs_state & SRS_PROC));
3082 3104
3083 3105 	mac_srs->srs_state |= SRS_PROC;
3084 3106
3085 3107 	tx_mode = srs_tx->st_mode;
3086 3108 	if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
3087 3109 		if (mac_srs->srs_first != NULL) {
			/* Detach the whole backlog, then send unlocked. */
3088 3110 			head = mac_srs->srs_first;
3089 3111 			tail = mac_srs->srs_last;
3090 3112 			saved_pkt_count = mac_srs->srs_count;
3091 3113 			mac_srs->srs_first = NULL;
3092 3114 			mac_srs->srs_last = NULL;
3093 3115 			mac_srs->srs_count = 0;
3094 3116 			mutex_exit(&mac_srs->srs_lock);
3095 3117
3096 3118 			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3097 3119 			    head, &stats);
3098 3120
3099 3121 			mutex_enter(&mac_srs->srs_lock);
3100 3122 			if (head != NULL) {
3101 3123 				/* Device out of tx desc, set block */
3102 3124 				if (head->b_next == NULL)
3103 3125 					VERIFY(head == tail);
3104 3126 				tail->b_next = mac_srs->srs_first;
3105 3127 				mac_srs->srs_first = head;
3106 3128 				mac_srs->srs_count +=
3107 3129 				    (saved_pkt_count - stats.mts_opackets);
3108 3130 				if (mac_srs->srs_last == NULL)
3109 3131 					mac_srs->srs_last = tail;
3110 3132 				MAC_TX_SRS_BLOCK(mac_srs, head);
3111 3133 			} else {
3112 3134 				srs_tx->st_woken_up = B_FALSE;
3113 3135 				SRS_TX_STATS_UPDATE(mac_srs, &stats);
3114 3136 			}
3115 3137 		}
3116 3138 	} else if (tx_mode == SRS_TX_BW) {
3117 3139 		/*
3118 3140 		 * We are here because the timer fired and we have some data
3119 3141 		 * to transmit. Also mac_tx_srs_worker should have reset
3120 3142 		 * SRS_BW_ENFORCED flag
3121 3143 		 */
3122 3144 		ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
		/* Dequeue packets until the per-tick byte budget runs out. */
3123 3145 		head = tail = mac_srs->srs_first;
3124 3146 		while (mac_srs->srs_first != NULL) {
3125 3147 			tail = mac_srs->srs_first;
3126 3148 			tail->b_prev = NULL;
3127 3149 			mac_srs->srs_first = tail->b_next;
3128 3150 			if (mac_srs->srs_first == NULL)
3129 3151 				mac_srs->srs_last = NULL;
3130 3152 			mac_srs->srs_count--;
3131 3153 			sz = msgdsize(tail);
3132 3154 			mac_srs->srs_size -= sz;
3133 3155 			saved_pkt_count++;
3134 3156 			MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
3135 3157
3136 3158 			if (mac_srs->srs_bw->mac_bw_used <
3137 3159 			    mac_srs->srs_bw->mac_bw_limit)
3138 3160 				continue;
3139 3161
3140 3162 			now = ddi_get_lbolt();
3141 3163 			if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3142 3164 				mac_srs->srs_bw->mac_bw_curr_time = now;
3143 3165 				mac_srs->srs_bw->mac_bw_used = sz;
3144 3166 				continue;
3145 3167 			}
3146 3168 			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3147 3169 			break;
3148 3170 		}
3149 3171
3150 3172 		ASSERT((head == NULL && tail == NULL) ||
3151 3173 		    (head != NULL && tail != NULL));
3152 3174 		if (tail != NULL) {
3153 3175 			tail->b_next = NULL;
3154 3176 			mutex_exit(&mac_srs->srs_lock);
3155 3177
3156 3178 			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3157 3179 			    head, &stats);
3158 3180
3159 3181 			mutex_enter(&mac_srs->srs_lock);
3160 3182 			if (head != NULL) {
3161 3183 				uint_t size_sent;
3162 3184
3163 3185 				/* Device out of tx desc, set block */
3164 3186 				if (head->b_next == NULL)
3165 3187 					VERIFY(head == tail);
3166 3188 				tail->b_next = mac_srs->srs_first;
3167 3189 				mac_srs->srs_first = head;
3168 3190 				mac_srs->srs_count +=
3169 3191 				    (saved_pkt_count - stats.mts_opackets);
3170 3192 				if (mac_srs->srs_last == NULL)
3171 3193 					mac_srs->srs_last = tail;
3172 3194 				size_sent = sz - stats.mts_obytes;
3173 3195 				mac_srs->srs_size += size_sent;
3174 3196 				mac_srs->srs_bw->mac_bw_sz += size_sent;
3175 3197 				if (mac_srs->srs_bw->mac_bw_used > size_sent) {
3176 3198 					mac_srs->srs_bw->mac_bw_used -=
3177 3199 					    size_sent;
3178 3200 				} else {
3179 3201 					mac_srs->srs_bw->mac_bw_used = 0;
3180 3202 				}
3181 3203 				MAC_TX_SRS_BLOCK(mac_srs, head);
3182 3204 			} else {
3183 3205 				srs_tx->st_woken_up = B_FALSE;
3184 3206 				SRS_TX_STATS_UPDATE(mac_srs, &stats);
3185 3207 			}
3186 3208 		}
3187 3209 	} else if (tx_mode == SRS_TX_BW_FANOUT || tx_mode == SRS_TX_BW_AGGR) {
3188 3210 		mblk_t *prev;
3189 3211 		uint64_t hint;
3190 3212
3191 3213 		/*
3192 3214 		 * We are here because the timer fired and we
3193 3215 		 * have some quota to transmit.
3194 3216 		 */
3195 3217 		prev = NULL;
3196 3218 		head = tail = mac_srs->srs_first;
3197 3219 		while (mac_srs->srs_first != NULL) {
3198 3220 			tail = mac_srs->srs_first;
3199 3221 			mac_srs->srs_first = tail->b_next;
3200 3222 			if (mac_srs->srs_first == NULL)
3201 3223 				mac_srs->srs_last = NULL;
3202 3224 			mac_srs->srs_count--;
3203 3225 			sz = msgdsize(tail);
3204 3226 			mac_srs->srs_size -= sz;
3205 3227 			mac_srs->srs_bw->mac_bw_used += sz;
			/*
			 * b_prev carries the fanout hint stashed by
			 * mac_tx_srs_enqueue(); flush the accumulated
			 * sub-chain whenever the hint changes.
			 */
3206 3228 			if (prev == NULL)
3207 3229 				hint = (ulong_t)tail->b_prev;
3208 3230 			if (hint != (ulong_t)tail->b_prev) {
3209 3231 				prev->b_next = NULL;
3210 3232 				mutex_exit(&mac_srs->srs_lock);
3211 3233 				TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3212 3234 				head = tail;
3213 3235 				hint = (ulong_t)tail->b_prev;
3214 3236 				mutex_enter(&mac_srs->srs_lock);
3215 3237 			}
3216 3238
3217 3239 			prev = tail;
3218 3240 			tail->b_prev = NULL;
3219 3241 			if (mac_srs->srs_bw->mac_bw_used <
3220 3242 			    mac_srs->srs_bw->mac_bw_limit)
3221 3243 				continue;
3222 3244
3223 3245 			now = ddi_get_lbolt();
3224 3246 			if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3225 3247 				mac_srs->srs_bw->mac_bw_curr_time = now;
3226 3248 				mac_srs->srs_bw->mac_bw_used = 0;
3227 3249 				continue;
3228 3250 			}
3229 3251 			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3230 3252 			break;
3231 3253 		}
3232 3254 		ASSERT((head == NULL && tail == NULL) ||
3233 3255 		    (head != NULL && tail != NULL));
3234 3256 		if (tail != NULL) {
3235 3257 			tail->b_next = NULL;
3236 3258 			mutex_exit(&mac_srs->srs_lock);
3237 3259 			TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3238 3260 			mutex_enter(&mac_srs->srs_lock);
3239 3261 		}
3240 3262 	}
3241 3263 	/*
3242 3264 	 * SRS_TX_FANOUT case not considered here because packets
3243 3265 	 * won't be queued in the SRS for this case. Packets will
3244 3266 	 * be sent directly to soft rings underneath and if there
3245 3267 	 * is any queueing at all, it would be in Tx side soft
3246 3268 	 * rings.
3247 3269 	 */
3248 3270
3249 3271 	/*
3250 3272 	 * When srs_count becomes 0, reset SRS_TX_HIWAT and
3251 3273 	 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients.
3252 3274 	 */
3253 3275 	if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
3254 3276 	    (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
3255 3277 		mac_client_impl_t *mcip = mac_srs->srs_mcip;
3256 3278 		boolean_t wakeup_required = B_FALSE;
3257 3279
3258 3280 		if (mac_srs->srs_state &
3259 3281 		    (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
3260 3282 			wakeup_required = B_TRUE;
3261 3283 		}
3262 3284 		mac_srs->srs_state &= ~(SRS_TX_HIWAT |
3263 3285 		    SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
		/* Drop srs_lock while calling out to client callbacks. */
3264 3286 		mutex_exit(&mac_srs->srs_lock);
3265 3287 		if (wakeup_required) {
3266 3288 			mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)mac_srs);
3267 3289 			/*
3268 3290 			 * If the client is not the primary MAC client, then we
3269 3291 			 * need to send the notification to the clients upper
3270 3292 			 * MAC, i.e. mci_upper_mip.
3271 3293 			 */
3272 3294 			mac_tx_notify(mcip->mci_upper_mip != NULL ?
3273 3295 			    mcip->mci_upper_mip : mcip->mci_mip);
3274 3296 		}
3275 3297 		mutex_enter(&mac_srs->srs_lock);
3276 3298 	}
3277 3299 	mac_srs->srs_state &= ~SRS_PROC;
3278 3300 }
3279 3301
3280 3302 /*
3281 3303 * Given a packet, get the flow_entry that identifies the flow
3282 3304 * to which that packet belongs. The flow_entry will contain
3283 3305 * the transmit function to be used to send the packet. If the
3284 3306 * function returns NULL, the packet should be sent using the
3285 3307 * underlying NIC.
3286 3308 */
3287 3309 static flow_entry_t *
3288 3310 mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
3289 3311 {
3290 3312 	flow_entry_t *flent = NULL;
3291 3313 	mac_client_impl_t *mcip;
3292 3314 	int err;
3293 3315
3294 3316 	/*
3295 3317 	 * Do classification on the packet.
3296 3318 	 */
3297 3319 	err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
3298 3320 	if (err != 0)
3299 3321 		return (NULL);
3300 3322
3301 3323 	/*
3302 3324 	 * This flent might just be an additional one on the MAC client,
3303 3325 	 * i.e. for classification purposes (different fdesc), however
3304 3326 	 * the resources, SRS et. al., are in the mci_flent, so if
3305 3327 	 * this isn't the mci_flent, we need to get it.
3306 3328 	 */
3307 3329 	if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
		/* Drop the hold from the lookup before switching flents. */
3308 3330 		FLOW_REFRELE(flent);
3309 3331 		flent = mcip->mci_flent;
3310 3332 		FLOW_TRY_REFHOLD(flent, err);
3311 3333 		if (err != 0)
3312 3334 			return (NULL);
3313 3335 	}
3314 3336
	/* Returned flent carries a hold; the caller must release it. */
3315 3337 	return (flent);
3316 3338 }
3317 3339
/*
 * This macro is only meant to be used by mac_tx_send().  It relies on
 * locals of that function being in scope: vid_check, add_tag, vid,
 * src_mcip, next and oerrors.  On a VID-check failure or a failed tag
 * insertion it frees the packet, advances (mp) to 'next', bumps the
 * error count and issues 'continue' in the caller's loop.  On success,
 * (mp) may be replaced by the newly tagged mblk.
 */
#define CHECK_VID_AND_ADD_TAG(mp) {			\
	if (vid_check) {				\
		int err = 0;				\
							\
		MAC_VID_CHECK(src_mcip, (mp), err);	\
		if (err != 0) {				\
			freemsg((mp));			\
			(mp) = next;			\
			oerrors++;			\
			continue;			\
		}					\
	}						\
	if (add_tag) {					\
		(mp) = mac_add_vlan_tag((mp), 0, vid);	\
		if ((mp) == NULL) {			\
			(mp) = next;			\
			oerrors++;			\
			continue;			\
		}					\
	}						\
}
3342 3364
/*
 * Transmit a chain of packets for the given MAC client, performing any
 * required per-client VID checking and VLAN tag insertion.  With a
 * single active client the chain goes straight to the NIC (fastpath);
 * otherwise each packet is classified and either looped back to a local
 * MAC client, sent to a broadcast/multicast flow, or sent to the NIC.
 * Returns the unsent remainder of the chain (NULL if fully consumed)
 * and fills 'stats' with the packet/byte/error counts for what was sent.
 */
mblk_t *
mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
    mac_tx_stats_t *stats)
{
	mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = src_mcip->mci_mip;
	uint_t obytes = 0, opackets = 0, oerrors = 0;
	mblk_t *mp = NULL, *next;
	boolean_t vid_check, add_tag;
	uint16_t vid = 0;

	/*
	 * VID checking/tagging is only relevant when multiple clients
	 * share the MAC; with one client neither is needed.
	 */
	if (mip->mi_nclients > 1) {
		vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
		add_tag = MAC_TAG_NEEDED(src_mcip);
		if (add_tag)
			vid = mac_client_vid(mch);
	} else {
		ASSERT(mip->mi_nclients == 1);
		vid_check = add_tag = B_FALSE;
	}

	/*
	 * Fastpath: if there's only one client, we simply send
	 * the packet down to the underlying NIC.
	 */
	if (mip->mi_nactiveclients == 1) {
		DTRACE_PROBE2(fastpath,
		    mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);

		mp = mp_chain;
		while (mp != NULL) {
			next = mp->b_next;
			mp->b_next = NULL;
			opackets++;
			obytes += (mp->b_cont == NULL ? MBLKL(mp) :
			    msgdsize(mp));

			CHECK_VID_AND_ADD_TAG(mp);
			MAC_TX(mip, ring, mp, src_mcip);

			/*
			 * If the driver is out of descriptors and does a
			 * partial send it will return a chain of unsent
			 * mblks. Adjust the accounting stats.
			 */
			if (mp != NULL) {
				opackets--;
				obytes -= msgdsize(mp);
				mp->b_next = next;
				break;
			}
			mp = next;
		}
		goto done;
	}

	/*
	 * No fastpath, we either have more than one MAC client
	 * defined on top of the same MAC, or one or more MAC
	 * client promiscuous callbacks.
	 */
	DTRACE_PROBE3(slowpath, mac_client_impl_t *,
	    src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);

	mp = mp_chain;
	while (mp != NULL) {
		flow_entry_t *dst_flow_ent;
		void *flow_cookie;
		size_t	pkt_size;
		mblk_t *mp1;

		next = mp->b_next;
		mp->b_next = NULL;
		opackets++;
		pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
		obytes += pkt_size;
		CHECK_VID_AND_ADD_TAG(mp);

		/*
		 * Find the destination.
		 */
		dst_flow_ent = mac_tx_classify(mip, mp);

		if (dst_flow_ent != NULL) {
			size_t	hdrsize;
			int	err = 0;

			/*
			 * Determine the link-layer header size so the
			 * payload length can be checked against the SDU.
			 * For Ethernet, account for an optional VLAN tag.
			 */
			if (mip->mi_info.mi_nativemedia == DL_ETHER) {
				struct ether_vlan_header *evhp =
				    (struct ether_vlan_header *)mp->b_rptr;

				if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
					hdrsize = sizeof (*evhp);
				else
					hdrsize = sizeof (struct ether_header);
			} else {
				mac_header_info_t	mhi;

				err = mac_header_info((mac_handle_t)mip,
				    mp, &mhi);
				if (err == 0)
					hdrsize = mhi.mhi_hdrsize;
			}

			/*
			 * Got a matching flow. It's either another
			 * MAC client, or a broadcast/multicast flow.
			 * Make sure the packet size is within the
			 * allowed size. If not drop the packet and
			 * move to next packet.  (When err != 0 the
			 * short-circuit keeps hdrsize from being read.)
			 */
			if (err != 0 ||
			    (pkt_size - hdrsize) > mip->mi_sdu_max) {
				oerrors++;
				DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
				    mblk_t *, mp);
				freemsg(mp);
				mp = next;
				FLOW_REFRELE(dst_flow_ent);
				continue;
			}
			flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
			if (flow_cookie != NULL) {
				/*
				 * The vnic_bcast_send function expects
				 * to receive the sender MAC client
				 * as value for arg2.
				 */
				mac_bcast_send(flow_cookie, src_mcip, mp,
				    B_TRUE);
			} else {
				/*
				 * loopback the packet to a local MAC
				 * client. We force a context switch
				 * if both source and destination MAC
				 * clients are used by IP, i.e.
				 * bypass is set.
				 */
				boolean_t do_switch;
				mac_client_impl_t *dst_mcip =
				    dst_flow_ent->fe_mcip;

				/*
				 * Check if there are promiscuous mode
				 * callbacks defined. This check is
				 * done here in the 'else' case and
				 * not in other cases because this
				 * path is for local loopback
				 * communication which does not go
				 * through MAC_TX(). For paths that go
				 * through MAC_TX(), the promisc_list
				 * check is done inside the MAC_TX()
				 * macro.
				 */
				if (mip->mi_promisc_list != NULL)
					mac_promisc_dispatch(mip, mp, src_mcip);

				do_switch = ((src_mcip->mci_state_flags &
				    dst_mcip->mci_state_flags &
				    MCIS_CLIENT_POLL_CAPABLE) != 0);

				if ((mp1 = mac_fix_cksum(mp)) != NULL) {
					(dst_flow_ent->fe_cb_fn)(
					    dst_flow_ent->fe_cb_arg1,
					    dst_flow_ent->fe_cb_arg2,
					    mp1, do_switch);
				}
			}
			FLOW_REFRELE(dst_flow_ent);
		} else {
			/*
			 * Unknown destination, send via the underlying
			 * NIC.
			 */
			MAC_TX(mip, ring, mp, src_mcip);
			if (mp != NULL) {
				/*
				 * Adjust for the last packet that
				 * could not be transmitted
				 */
				opackets--;
				obytes -= pkt_size;
				mp->b_next = next;
				break;
			}
		}
		mp = next;
	}

done:
	stats->mts_obytes = obytes;
	stats->mts_opackets = opackets;
	stats->mts_oerrors = oerrors;
	return (mp);
}
3538 3560
3539 3561 /*
3540 3562 * mac_tx_srs_ring_present
3541 3563 *
3542 3564 * Returns whether the specified ring is part of the specified SRS.
3543 3565 */
3544 3566 boolean_t
3545 3567 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3546 3568 {
3547 3569 int i;
3548 3570 mac_soft_ring_t *soft_ring;
3549 3571
3550 3572 if (srs->srs_tx.st_arg2 == tx_ring)
3551 3573 return (B_TRUE);
3552 3574
3553 3575 for (i = 0; i < srs->srs_tx_ring_count; i++) {
3554 3576 soft_ring = srs->srs_tx_soft_rings[i];
3555 3577 if (soft_ring->s_ring_tx_arg2 == tx_ring)
3556 3578 return (B_TRUE);
3557 3579 }
3558 3580
3559 3581 return (B_FALSE);
3560 3582 }
3561 3583
3562 3584 /*
3563 3585 * mac_tx_srs_get_soft_ring
3564 3586 *
3565 3587 * Returns the TX soft ring associated with the given ring, if present.
3566 3588 */
3567 3589 mac_soft_ring_t *
3568 3590 mac_tx_srs_get_soft_ring(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3569 3591 {
3570 3592 int i;
3571 3593 mac_soft_ring_t *soft_ring;
3572 3594
3573 3595 if (srs->srs_tx.st_arg2 == tx_ring)
3574 3596 return (NULL);
3575 3597
3576 3598 for (i = 0; i < srs->srs_tx_ring_count; i++) {
3577 3599 soft_ring = srs->srs_tx_soft_rings[i];
3578 3600 if (soft_ring->s_ring_tx_arg2 == tx_ring)
3579 3601 return (soft_ring);
3580 3602 }
3581 3603
3582 3604 return (NULL);
3583 3605 }
3584 3606
3585 3607 /*
3586 3608 * mac_tx_srs_wakeup
3587 3609 *
3588 3610 * Called when Tx desc become available. Wakeup the appropriate worker
3589 3611 * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
3590 3612 * state field.
3591 3613 */
3592 3614 void
3593 3615 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
3594 3616 {
3595 3617 int i;
3596 3618 mac_soft_ring_t *sringp;
3597 3619 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3598 3620
3599 3621 mutex_enter(&mac_srs->srs_lock);
3600 3622 /*
3601 3623 * srs_tx_ring_count == 0 is the single ring mode case. In
3602 3624 * this mode, there will not be Tx soft rings associated
3603 3625 * with the SRS.
3604 3626 */
3605 3627 if (!MAC_TX_SOFT_RINGS(mac_srs)) {
3606 3628 if (srs_tx->st_arg2 == ring &&
3607 3629 mac_srs->srs_state & SRS_TX_BLOCKED) {
3608 3630 mac_srs->srs_state &= ~SRS_TX_BLOCKED;
3609 3631 srs_tx->st_stat.mts_unblockcnt++;
3610 3632 cv_signal(&mac_srs->srs_async);
3611 3633 }
3612 3634 /*
3613 3635 * A wakeup can come before tx_srs_drain() could
3614 3636 * grab srs lock and set SRS_TX_BLOCKED. So
3615 3637 * always set woken_up flag when we come here.
3616 3638 */
3617 3639 srs_tx->st_woken_up = B_TRUE;
3618 3640 mutex_exit(&mac_srs->srs_lock);
3619 3641 return;
3620 3642 }
3621 3643
3622 3644 /*
3623 3645 * If you are here, it is for FANOUT, BW_FANOUT,
3624 3646 * AGGR_MODE or AGGR_BW_MODE case
3625 3647 */
3626 3648 for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
3627 3649 sringp = mac_srs->srs_tx_soft_rings[i];
3628 3650 mutex_enter(&sringp->s_ring_lock);
3629 3651 if (sringp->s_ring_tx_arg2 == ring) {
3630 3652 if (sringp->s_ring_state & S_RING_BLOCK) {
3631 3653 sringp->s_ring_state &= ~S_RING_BLOCK;
3632 3654 sringp->s_st_stat.mts_unblockcnt++;
3633 3655 cv_signal(&sringp->s_ring_async);
3634 3656 }
3635 3657 sringp->s_ring_tx_woken_up = B_TRUE;
3636 3658 }
3637 3659 mutex_exit(&sringp->s_ring_lock);
3638 3660 }
3639 3661 mutex_exit(&mac_srs->srs_lock);
3640 3662 }
3641 3663
/*
 * Once the driver is done draining, send a MAC_NOTE_TX notification to
 * unleash the blocked clients again.  Thin wrapper around
 * i_mac_notify() with the MAC_NOTE_TX event.
 */
void
mac_tx_notify(mac_impl_t *mip)
{
	i_mac_notify(mip, MAC_NOTE_TX);
}
3651 3673
3652 3674 /*
3653 3675 * RX SOFTRING RELATED FUNCTIONS
3654 3676 *
3655 3677 * These functions really belong in mac_soft_ring.c and here for
3656 3678 * a short period.
3657 3679 */
3658 3680
/*
 * Append the mblk chain (mp .. tail) of 'cnt' packets and 'sz' bytes to
 * the soft ring's queue.  The caller must hold s_ring_lock.  The byte
 * count is only tracked for rings doing bandwidth control.
 */
#define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {	       	\
	/* \
	 * Enqueue our mblk chain. \
	 */ \
	ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));			\
									\
	if ((ringp)->s_ring_last != NULL)				\
		(ringp)->s_ring_last->b_next = (mp);			\
	else								\
		(ringp)->s_ring_first = (mp);				\
	(ringp)->s_ring_last = (tail);					\
	(ringp)->s_ring_count += (cnt);					\
	ASSERT((ringp)->s_ring_count > 0);				\
	if ((ringp)->s_ring_type & ST_RING_BW_CTL) {			\
		(ringp)->s_ring_size += sz;				\
	}								\
}
3676 3698
3677 3699 /*
3678 3700 * Default entry point to deliver a packet chain to a MAC client.
3679 3701 * If the MAC client has flows, do the classification with these
3680 3702 * flows as well.
3681 3703 */
3682 3704 /* ARGSUSED */
3683 3705 void
3684 3706 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
3685 3707 mac_header_info_t *arg3)
3686 3708 {
3687 3709 mac_client_impl_t *mcip = arg1;
3688 3710
3689 3711 if (mcip->mci_nvids == 1 &&
3690 3712 !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
3691 3713 /*
3692 3714 * If the client has exactly one VID associated with it
3693 3715 * and striping of VLAN header is not disabled,
3694 3716 * remove the VLAN tag from the packet before
3695 3717 * passing it on to the client's receive callback.
3696 3718 * Note that this needs to be done after we dispatch
3697 3719 * the packet to the promiscuous listeners of the
3698 3720 * client, since they expect to see the whole
3699 3721 * frame including the VLAN headers.
3700 3722 */
3701 3723 mp_chain = mac_strip_vlan_tag_chain(mp_chain);
3702 3724 }
3703 3725
3704 3726 mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
3705 3727 }
3706 3728
/*
 * mac_rx_soft_ring_process
 *
 * Process a chain for a given soft ring.  If the number of packets
 * queued in the SRS and its associated soft rings (including this one)
 * is very small (tracked by srs_poll_pkt_cnt), then allow the entering
 * thread (interrupt or poll thread) to do inline processing.  This
 * helps keep the latency down under low load.
 *
 * The proc and arg for each mblk is already stored in the mblk in
 * appropriate places.
 */
/* ARGSUSED */
void
mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
    mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
{
	mac_direct_rx_t		proc;
	void			*arg1;
	mac_resource_handle_t	arg2;
	mac_soft_ring_set_t	*mac_srs = ringp->s_ring_set;

	ASSERT(ringp != NULL);
	ASSERT(mp_chain != NULL);
	ASSERT(tail != NULL);
	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));

	mutex_enter(&ringp->s_ring_lock);
	ringp->s_ring_total_inpkt += cnt;
	ringp->s_ring_total_rbytes += sz;
	if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
	    !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
		/* If on processor or blanking on, then enqueue and return */
		if (ringp->s_ring_state & S_RING_BLANK ||
		    ringp->s_ring_state & S_RING_PROC) {
			SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
			mutex_exit(&ringp->s_ring_lock);
			return;
		}
		proc = ringp->s_ring_rx_func;
		arg1 = ringp->s_ring_rx_arg1;
		arg2 = ringp->s_ring_rx_arg2;
		/*
		 * See if anything is already queued. If we are the
		 * first packet, do inline processing else queue the
		 * packet and do the drain.
		 */
		if (ringp->s_ring_first == NULL) {
			/*
			 * Fast-path, ok to process and nothing queued.
			 */
			ringp->s_ring_run = curthread;
			ringp->s_ring_state |= (S_RING_PROC);

			mutex_exit(&ringp->s_ring_lock);

			/*
			 * We are the chain of 1 packet so
			 * go through this fast path.
			 */
			ASSERT(mp_chain->b_next == NULL);

			(*proc)(arg1, arg2, mp_chain, NULL);

			ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
			/*
			 * If we have a soft ring set which is doing
			 * bandwidth control, we need to decrement
			 * srs_size and count so it the SRS can have a
			 * accurate idea of what is the real data
			 * queued between SRS and its soft rings. We
			 * decrement the counters only when the packet
			 * gets processed by both SRS and the soft ring.
			 */
			mutex_enter(&mac_srs->srs_lock);
			MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
			MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
			mutex_exit(&mac_srs->srs_lock);

			/* Re-acquire to clear the in-processing state. */
			mutex_enter(&ringp->s_ring_lock);
			ringp->s_ring_run = NULL;
			ringp->s_ring_state &= ~S_RING_PROC;
			if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
				cv_signal(&ringp->s_ring_client_cv);

			if ((ringp->s_ring_first == NULL) ||
			    (ringp->s_ring_state & S_RING_BLANK)) {
				/*
				 * We processed inline our packet and
				 * nothing new has arrived or our
				 * receiver doesn't want to receive
				 * any packets. We are done.
				 */
				mutex_exit(&ringp->s_ring_lock);
				return;
			}
		} else {
			SOFT_RING_ENQUEUE_CHAIN(ringp,
			    mp_chain, tail, cnt, sz);
		}

		/*
		 * We are here because either we couldn't do inline
		 * processing (because something was already
		 * queued), or we had a chain of more than one
		 * packet, or something else arrived after we were
		 * done with inline processing.
		 */
		ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
		ASSERT(ringp->s_ring_first != NULL);

		ringp->s_ring_drain_func(ringp);
		mutex_exit(&ringp->s_ring_lock);
		return;
	} else {
		/* ST_RING_WORKER_ONLY case */
		SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
		mac_soft_ring_worker_wakeup(ringp);
		mutex_exit(&ringp->s_ring_lock);
	}
}
3828 3850
3829 3851 /*
3830 3852 * TX SOFTRING RELATED FUNCTIONS
3831 3853 *
3832 3854 * These functions really belong in mac_soft_ring.c and here for
3833 3855 * a short period.
3834 3856 */
3835 3857
/*
 * Mark the soft ring as having queued packets and append the chain
 * (mp .. tail).  Caller must hold s_ring_lock.
 *
 * Note: the body now expands the 'mp' parameter.  The previous version
 * referenced the caller's local 'mp_chain' directly, which only worked
 * because every call site happened to pass mp_chain as the argument.
 */
#define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {		\
	ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));			\
	(ringp)->s_ring_state |= S_RING_ENQUEUED;			\
	SOFT_RING_ENQUEUE_CHAIN(ringp, (mp), tail, cnt, sz);		\
}
3841 3863
/*
 * mac_tx_sring_enqueue
 *
 * When we are out of transmit descriptors and we already have a
 * queue that exceeds hiwat (or the client called us with
 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
 * soft ring pointer as the opaque cookie for the client to enable
 * flow control.  With MAC_TX_NO_ENQUEUE the unsent chain is handed
 * back via ret_mp instead of being queued.  Caller holds s_ring_lock.
 */
static mac_tx_cookie_t
mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
    mblk_t **ret_mp)
{
	int cnt;
	size_t sz;
	mblk_t *tail;
	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
	mac_tx_cookie_t cookie = NULL;
	boolean_t wakeup_worker = B_TRUE;

	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	if (flag & MAC_DROP_ON_NO_DESC) {
		/* Client asked to drop rather than queue on back-pressure. */
		mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
		/* increment freed stats */
		ringp->s_ring_drops += cnt;
		cookie = (mac_tx_cookie_t)ringp;
	} else {
		/* Only wake the worker if the queue was previously empty. */
		if (ringp->s_ring_first != NULL)
			wakeup_worker = B_FALSE;

		if (flag & MAC_TX_NO_ENQUEUE) {
			/*
			 * If QUEUED is not set, queue the packet
			 * and let mac_tx_soft_ring_drain() set
			 * the TX_BLOCKED bit for the reasons
			 * explained above. Otherwise, return the
			 * mblks.
			 */
			if (wakeup_worker) {
				TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
				    mp_chain, tail, cnt, sz);
			} else {
				ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
				cookie = (mac_tx_cookie_t)ringp;
				*ret_mp = mp_chain;
			}
		} else {
			boolean_t enqueue = B_TRUE;

			if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
				/*
				 * flow-controlled. Store ringp in cookie
				 * so that it can be returned as
				 * mac_tx_cookie_t to client
				 */
				ringp->s_ring_state |= S_RING_TX_HIWAT;
				cookie = (mac_tx_cookie_t)ringp;
				ringp->s_ring_hiwat_cnt++;
				if (ringp->s_ring_count >
				    ringp->s_ring_tx_max_q_cnt) {
					/* increment freed stats */
					ringp->s_ring_drops += cnt;
					/*
					 * b_prev may be set to the fanout hint
					 * hence can't use freemsg directly
					 */
					mac_pkt_drop(NULL, NULL,
					    mp_chain, B_FALSE);
					DTRACE_PROBE1(tx_queued_hiwat,
					    mac_soft_ring_t *, ringp);
					enqueue = B_FALSE;
				}
			}
			if (enqueue) {
				TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
				    tail, cnt, sz);
			}
		}
		if (wakeup_worker)
			cv_signal(&ringp->s_ring_async);
	}
	return (cookie);
}
3926 3948
3927 3949
/*
 * mac_tx_soft_ring_process
 *
 * This routine is called when fanning out outgoing traffic among
 * multiple Tx rings.
 * Note that a soft ring is associated with a h/w Tx ring.  Returns a
 * non-NULL flow-control cookie when the chain could not be (fully)
 * sent; any unsent packets are either queued or handed back per 'flag'.
 */
mac_tx_cookie_t
mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
	int	cnt;
	size_t	sz;
	mblk_t	*tail;
	mac_tx_cookie_t cookie = NULL;

	ASSERT(ringp != NULL);
	ASSERT(mp_chain != NULL);
	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
	/*
	 * The following modes can come here: SRS_TX_BW_FANOUT,
	 * SRS_TX_FANOUT, SRS_TX_AGGR, SRS_TX_BW_AGGR.
	 */
	ASSERT(MAC_TX_SOFT_RINGS(mac_srs));
	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
	    mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT ||
	    mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
	    mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);

	if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
		/* Serialization mode */

		mutex_enter(&ringp->s_ring_lock);
		if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
			cookie = mac_tx_sring_enqueue(ringp, mp_chain,
			    flag, ret_mp);
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
		TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
		if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
			/*
			 * If ring is blocked due to lack of Tx
			 * descs, just return. Worker thread
			 * will get scheduled when Tx desc's
			 * become available.
			 */
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		mac_soft_ring_worker_wakeup(ringp);
		mutex_exit(&ringp->s_ring_lock);
		return (cookie);
	} else {
		/* Default fanout mode */
		/*
		 * S_RING_BLOCKED is set when underlying NIC runs
		 * out of Tx descs and messages start getting
		 * queued. It won't get reset until
		 * tx_srs_drain() completely drains out the
		 * messages.
		 */
		mac_tx_stats_t		stats;

		if (ringp->s_ring_state & S_RING_ENQUEUED) {
			/* Tx descs/resources not available */
			mutex_enter(&ringp->s_ring_lock);
			/* Re-check under the lock before enqueueing. */
			if (ringp->s_ring_state & S_RING_ENQUEUED) {
				cookie = mac_tx_sring_enqueue(ringp, mp_chain,
				    flag, ret_mp);
				mutex_exit(&ringp->s_ring_lock);
				return (cookie);
			}
			/*
			 * While we were computing mblk count, the
			 * flow control condition got relieved.
			 * Continue with the transmission.
			 */
			mutex_exit(&ringp->s_ring_lock);
		}

		mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
		    ringp->s_ring_tx_arg2, mp_chain, &stats);

		/*
		 * Multiple threads could be here sending packets.
		 * Under such conditions, it is not possible to
		 * atomically set the S_RING_BLOCKED bit to indicate
		 * out of tx desc condition. To atomically set
		 * this, we queue the returned packet and do
		 * the setting of S_RING_BLOCKED in
		 * mac_tx_soft_ring_drain().
		 */
		if (mp_chain != NULL) {
			mutex_enter(&ringp->s_ring_lock);
			cookie =
			    mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		SRS_TX_STATS_UPDATE(mac_srs, &stats);
		SOFTRING_TX_STATS_UPDATE(ringp, &stats);

		return (NULL);
	}
}
|
↓ open down ↓ |
3123 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX