1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
25 */
26
27 /*
28 * MAC Services Module
29 *
30 * The GLDv3 framework locking - The MAC layer
31 * --------------------------------------------
32 *
33 * The MAC layer is central to the GLD framework and can provide the locking
34 * framework needed for itself and for the use of MAC clients. MAC end points
35 * are fairly disjoint and don't share a lot of state. So a coarse grained
36 * multi-threading scheme is to single thread all create/modify/delete or set
37 * type of control operations on a per mac end point while allowing data threads
38 * concurrently.
39 *
40 * Control operations (set) that modify a mac end point are always serialized on
41 * a per mac end point basis, We have at most 1 such thread per mac end point
42 * at a time.
43 *
44 * All other operations that are not serialized are essentially multi-threaded.
45 * For example a control operation (get) like getting statistics which may not
46 * care about reading values atomically or data threads sending or receiving
47 * data. Mostly these type of operations don't modify the control state. Any
48 * state these operations care about are protected using traditional locks.
49 *
50 * The perimeter only serializes serial operations. It does not imply there
51 * aren't any other concurrent operations. However a serialized operation may
52 * sometimes need to make sure it is the only thread. In this case it needs
53 * to use reference counting mechanisms to cv_wait until any current data
54 * threads are done.
55 *
56 * The mac layer itself does not hold any locks across a call to another layer.
57 * The perimeter is however held across a down call to the driver to make the
58 * whole control operation atomic with respect to other control operations.
59 * Also the data path and get type control operations may proceed concurrently.
60 * These operations synchronize with the single serial operation on a given mac
61 * end point using regular locks. The perimeter ensures that conflicting
62 * operations like say a mac_multicast_add and a mac_multicast_remove on the
63 * same mac end point don't interfere with each other and also ensures that the
64 * changes in the mac layer and the call to the underlying driver to say add a
65 * multicast address are done atomically without interference from a thread
66 * trying to delete the same address.
67 *
68 * For example, consider
69 * mac_multicst_add()
70 * {
71 * mac_perimeter_enter(); serialize all control operations
72 *
73 * grab list lock protect against access by data threads
74 * add to list
75 * drop list lock
76 *
77 * call driver's mi_multicst
78 *
79 * mac_perimeter_exit();
80 * }
81 *
82 * To lessen the number of serialization locks and simplify the lock hierarchy,
83 * we serialize all the control operations on a per mac end point by using a
84 * single serialization lock called the perimeter. We allow recursive entry into
85 * the perimeter to facilitate use of this mechanism by both the mac client and
86 * the MAC layer itself.
87 *
88 * MAC client means an entity that does an operation on a mac handle
89 * obtained from a mac_open/mac_client_open. Similarly MAC driver means
90 * an entity that does an operation on a mac handle obtained from a
91 * mac_register. An entity could be both client and driver but on different
92 * handles eg. aggr. and should only make the corresponding mac interface calls
93 * i.e. mac driver interface or mac client interface as appropriate for that
94 * mac handle.
95 *
96 * General rules.
97 * -------------
98 *
 * R1. The lock order of upcall threads is naturally opposite to downcall
100 * threads. Hence upcalls must not hold any locks across layers for fear of
101 * recursive lock enter and lock order violation. This applies to all layers.
102 *
103 * R2. The perimeter is just another lock. Since it is held in the down
104 * direction, acquiring the perimeter in an upcall is prohibited as it would
105 * cause a deadlock. This applies to all layers.
106 *
107 * Note that upcalls that need to grab the mac perimeter (for example
108 * mac_notify upcalls) can still achieve that by posting the request to a
109 * thread, which can then grab all the required perimeters and locks in the
 * right global order. Note that in the above example the mac layer itself
111 * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
112 * to the client must do that. Please see the aggr code for an example.
113 *
114 * MAC client rules
115 * ----------------
116 *
117 * R3. A MAC client may use the MAC provided perimeter facility to serialize
 * control operations on a per mac end point. It does this by acquiring
119 * and holding the perimeter across a sequence of calls to the mac layer.
120 * This ensures atomicity across the entire block of mac calls. In this
121 * model the MAC client must not hold any client locks across the calls to
122 * the mac layer. This model is the preferred solution.
123 *
124 * R4. However if a MAC client has a lot of global state across all mac end
125 * points the per mac end point serialization may not be sufficient. In this
126 * case the client may choose to use global locks or use its own serialization.
127 * To avoid deadlocks, these client layer locks held across the mac calls
128 * in the control path must never be acquired by the data path for the reason
129 * mentioned below.
130 *
131 * (Assume that a control operation that holds a client lock blocks in the
132 * mac layer waiting for upcall reference counts to drop to zero. If an upcall
133 * data thread that holds this reference count, tries to acquire the same
134 * client lock subsequently it will deadlock).
135 *
136 * A MAC client may follow either the R3 model or the R4 model, but can't
137 * mix both. In the former, the hierarchy is Perim -> client locks, but in
138 * the latter it is client locks -> Perim.
139 *
140 * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
141 * context since they may block while trying to acquire the perimeter.
142 * In addition some calls may block waiting for upcall refcnts to come down to
143 * zero.
144 *
145 * R6. MAC clients must make sure that they are single threaded and all threads
146 * from the top (in particular data threads) have finished before calling
147 * mac_client_close. The MAC framework does not track the number of client
148 * threads using the mac client handle. Also mac clients must make sure
149 * they have undone all the control operations before calling mac_client_close.
150 * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
151 * mac_unicast_add/mac_multicast_add.
152 *
153 * MAC framework rules
154 * -------------------
155 *
156 * R7. The mac layer itself must not hold any mac layer locks (except the mac
157 * perimeter) across a call to any other layer from the mac layer. The call to
158 * any other layer could be via mi_* entry points, classifier entry points into
159 * the driver or via upcall pointers into layers above. The mac perimeter may
160 * be acquired or held only in the down direction, for e.g. when calling into
 * a mi_* driver entry point to provide atomicity of the operation.
162 *
163 * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
164 * mac driver interfaces, the MAC layer must provide a cut out for control
165 * interfaces like upcall notifications and start them in a separate thread.
166 *
167 * R9. Note that locking order also implies a plumbing order. For example
168 * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
169 * to plumb in any other order must be failed at mac_open time, otherwise it
170 * could lead to deadlocks due to inverse locking order.
171 *
172 * R10. MAC driver interfaces must not block since the driver could call them
173 * in interrupt context.
174 *
175 * R11. Walkers must preferably not hold any locks while calling walker
176 * callbacks. Instead these can operate on reference counts. In simple
177 * callbacks it may be ok to hold a lock and call the callbacks, but this is
178 * harder to maintain in the general case of arbitrary callbacks.
179 *
180 * R12. The MAC layer must protect upcall notification callbacks using reference
181 * counts rather than holding locks across the callbacks.
182 *
183 * R13. Given the variety of drivers, it is preferable if the MAC layer can make
184 * sure that any pointers (such as mac ring pointers) it passes to the driver
185 * remain valid until mac unregister time. Currently the mac layer achieves
186 * this by using generation numbers for rings and freeing the mac rings only
187 * at unregister time. The MAC layer must provide a layer of indirection and
188 * must not expose underlying driver rings or driver data structures/pointers
189 * directly to MAC clients.
190 *
191 * MAC driver rules
192 * ----------------
193 *
194 * R14. It would be preferable if MAC drivers don't hold any locks across any
195 * mac call. However at a minimum they must not hold any locks across data
196 * upcalls. They must also make sure that all references to mac data structures
197 * are cleaned up and that it is single threaded at mac_unregister time.
198 *
199 * R15. MAC driver interfaces don't block and so the action may be done
200 * asynchronously in a separate thread as for example handling notifications.
201 * The driver must not assume that the action is complete when the call
202 * returns.
203 *
204 * R16. Drivers must maintain a generation number per Rx ring, and pass it
205 * back to mac_rx_ring(); They are expected to increment the generation
206 * number whenever the ring's stop routine is invoked.
207 * See comments in mac_rx_ring();
208 *
209 * R17 Similarly mi_stop is another synchronization point and the driver must
210 * ensure that all upcalls are done and there won't be any future upcall
211 * before returning from mi_stop.
212 *
213 * R18. The driver may assume that all set/modify control operations via
214 * the mi_* entry points are single threaded on a per mac end point.
215 *
216 * Lock and Perimeter hierarchy scenarios
217 * ---------------------------------------
218 *
219 * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
220 *
221 * ft_lock -> fe_lock [mac_flow_lookup]
222 *
223 * mi_rw_lock -> fe_lock [mac_bcast_send]
224 *
225 * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
226 *
227 * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
228 *
229 * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
230 *
231 * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
 * client to driver. In the case of clients that explicitly use the mac provided
233 * perimeter mechanism for its serialization, the hierarchy is
234 * Perimeter -> mac layer locks, since the client never holds any locks across
235 * the mac calls. In the case of clients that use its own locks the hierarchy
236 * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
237 * calls mac_perim_enter/exit in this case.
238 *
239 * Subflow creation rules
240 * ---------------------------
241 * o In case of a user specified cpulist present on underlying link and flows,
242 * the flows cpulist must be a subset of the underlying link.
243 * o In case of a user specified fanout mode present on link and flow, the
244 * subflow fanout count has to be less than or equal to that of the
245 * underlying link. The cpu-bindings for the subflows will be a subset of
246 * the underlying link.
247 * o In case if no cpulist specified on both underlying link and flow, the
248 * underlying link relies on a MAC tunable to provide out of box fanout.
249 * The subflow will have no cpulist (the subflow will be unbound)
250 * o In case if no cpulist is specified on the underlying link, a subflow can
251 * carry either a user-specified cpulist or fanout count. The cpu-bindings
252 * for the subflow will not adhere to restriction that they need to be subset
253 * of the underlying link.
254 * o In case where the underlying link is carrying either a user specified
255 * cpulist or fanout mode and for a unspecified subflow, the subflow will be
256 * created unbound.
257 * o While creating unbound subflows, bandwidth mode changes attempt to
258 * figure a right fanout count. In such cases the fanout count will override
259 * the unbound cpu-binding behavior.
260 * o In addition to this, while cycling between flow and link properties, we
261 * impose a restriction that if a link property has a subflow with
262 * user-specified attributes, we will not allow changing the link property.
263 * The administrator needs to reset all the user specified properties for the
264 * subflows before attempting a link property change.
265 * Some of the above rules can be overridden by specifying additional command
266 * line options while creating or modifying link or subflow properties.
267 */
268
269 #include <sys/types.h>
270 #include <sys/conf.h>
271 #include <sys/id_space.h>
272 #include <sys/esunddi.h>
273 #include <sys/stat.h>
274 #include <sys/mkdev.h>
275 #include <sys/stream.h>
276 #include <sys/strsun.h>
277 #include <sys/strsubr.h>
278 #include <sys/dlpi.h>
279 #include <sys/list.h>
280 #include <sys/modhash.h>
281 #include <sys/mac_provider.h>
282 #include <sys/mac_client_impl.h>
283 #include <sys/mac_soft_ring.h>
284 #include <sys/mac_stat.h>
285 #include <sys/mac_impl.h>
286 #include <sys/mac.h>
287 #include <sys/dls.h>
288 #include <sys/dld.h>
289 #include <sys/modctl.h>
290 #include <sys/fs/dv_node.h>
291 #include <sys/thread.h>
292 #include <sys/proc.h>
293 #include <sys/callb.h>
294 #include <sys/cpuvar.h>
295 #include <sys/atomic.h>
296 #include <sys/bitmap.h>
297 #include <sys/sdt.h>
298 #include <sys/mac_flow.h>
299 #include <sys/ddi_intr_impl.h>
300 #include <sys/disp.h>
301 #include <sys/sdt.h>
302 #include <sys/vnic.h>
303 #include <sys/vnic_impl.h>
304 #include <sys/vlan.h>
305 #include <inet/ip.h>
306 #include <inet/ip6.h>
307 #include <sys/exacct.h>
308 #include <sys/exacct_impl.h>
309 #include <inet/nd.h>
310 #include <sys/ethernet.h>
311 #include <sys/pool.h>
312 #include <sys/pool_pset.h>
313 #include <sys/cpupart.h>
314 #include <inet/wifi_ioctl.h>
315 #include <net/wpa.h>
316
317 #define IMPL_HASHSZ 67 /* prime */
318
319 kmem_cache_t *i_mac_impl_cachep;
320 mod_hash_t *i_mac_impl_hash;
321 krwlock_t i_mac_impl_lock;
322 uint_t i_mac_impl_count;
323 static kmem_cache_t *mac_ring_cache;
324 static id_space_t *minor_ids;
325 static uint32_t minor_count;
326 static pool_event_cb_t mac_pool_event_reg;
327
328 /*
329 * Logging stuff. Perhaps mac_logging_interval could be broken into
330 * mac_flow_log_interval and mac_link_log_interval if we want to be
331 * able to schedule them differently.
332 */
333 uint_t mac_logging_interval;
334 boolean_t mac_flow_log_enable;
335 boolean_t mac_link_log_enable;
336 timeout_id_t mac_logging_timer;
337
338 /* for debugging, see MAC_DBG_PRT() in mac_impl.h */
339 int mac_dbg = 0;
340
341 #define MACTYPE_KMODDIR "mac"
342 #define MACTYPE_HASHSZ 67
343 static mod_hash_t *i_mactype_hash;
344 /*
345 * i_mactype_lock synchronizes threads that obtain references to mactype_t
346 * structures through i_mactype_getplugin().
347 */
348 static kmutex_t i_mactype_lock;
349
350 /*
351 * mac_tx_percpu_cnt
352 *
353 * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
354 * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
355 * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
356 * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
357 */
358 int mac_tx_percpu_cnt;
359 int mac_tx_percpu_cnt_max = 128;
360
361 /*
362 * Call back functions for the bridge module. These are guaranteed to be valid
363 * when holding a reference on a link or when holding mip->mi_bridge_lock and
364 * mi_bridge_link is non-NULL.
365 */
366 mac_bridge_tx_t mac_bridge_tx_cb;
367 mac_bridge_rx_t mac_bridge_rx_cb;
368 mac_bridge_ref_t mac_bridge_ref_cb;
369 mac_bridge_ls_t mac_bridge_ls_cb;
370
371 static int i_mac_constructor(void *, void *, int);
372 static void i_mac_destructor(void *, void *);
373 static int i_mac_ring_ctor(void *, void *, int);
374 static void i_mac_ring_dtor(void *, void *);
375 static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
376 void mac_tx_client_flush(mac_client_impl_t *);
377 void mac_tx_client_block(mac_client_impl_t *);
378 static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
379 static int mac_start_group_and_rings(mac_group_t *);
380 static void mac_stop_group_and_rings(mac_group_t *);
381 static void mac_pool_event_cb(pool_event_t, int, void *);
382
/*
 * A single queued logging record. ni_record points to a ni_size-byte
 * payload whose layout is identified by ni_type; entries are chained
 * through ni_link. (Presumably consumed by the flow/link logging path
 * driven by mac_logging_timer -- confirm against the logging code.)
 */
typedef struct netinfo_s {
	list_node_t	ni_link;	/* linkage onto the pending list */
	void		*ni_record;	/* the record payload */
	int		ni_size;	/* size of *ni_record in bytes */
	int		ni_type;	/* discriminator for the payload */
} netinfo_t;
389
390 /*
391 * Module initialization functions.
392 */
393
/*
 * One-time module initialization, called at mac module load. Sizes the
 * per-cpu tx lock count, creates the kmem caches and hashes used by the
 * framework, initializes the sub-modules (flows, soft rings, bcast,
 * clients), sets up the minor-number id space and logging defaults, and
 * registers for pool events.
 */
void
mac_init(void)
{
	/* Base the per-cpu count on the boot-time cpu count if known. */
	mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
	    boot_max_ncpus);

	/* Upper bound is mac_tx_percpu_cnt_max */
	if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
		mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;

	if (mac_tx_percpu_cnt < 1) {
		/* Someone set max_tx_percpu_cnt_max to 0 or less */
		mac_tx_percpu_cnt = 1;
	}

	ASSERT(mac_tx_percpu_cnt >= 1);
	/* Round up to the next power of two ... */
	mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
	/*
	 * Make it of the form 2**N - 1 in the range
	 * [0 .. mac_tx_percpu_cnt_max - 1]
	 */
	mac_tx_percpu_cnt--;

	i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
	    sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
	    NULL, NULL, NULL, 0);
	ASSERT(i_mac_impl_cachep != NULL);

	mac_ring_cache = kmem_cache_create("mac_ring_cache",
	    sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
	    NULL, NULL, 0);
	ASSERT(mac_ring_cache != NULL);

	/* Hash of mac_impl_t's keyed by driver name, see mac_register */
	i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);

	/* Bring up the framework sub-modules. */
	mac_flow_init();
	mac_soft_ring_init();
	mac_bcast_init();
	mac_client_init();

	i_mac_impl_count = 0;

	i_mactype_hash = mod_hash_create_extended("mactype_hash",
	    MACTYPE_HASHSZ,
	    mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);

	/*
	 * Allocate an id space to manage minor numbers. The range of the
	 * space will be from MAC_MAX_MINOR+1 to MAC_PRIVATE_MINOR-1. This
	 * leaves half of the 32-bit minors available for driver private use.
	 */
	minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1,
	    MAC_PRIVATE_MINOR-1);
	ASSERT(minor_ids != NULL);
	minor_count = 0;

	/* Let's default to 20 seconds */
	mac_logging_interval = 20;
	mac_flow_log_enable = B_FALSE;
	mac_link_log_enable = B_FALSE;
	mac_logging_timer = 0;

	/* Register to be notified of noteworthy pools events */
	mac_pool_event_reg.pec_func = mac_pool_event_cb;
	mac_pool_event_reg.pec_arg = NULL;
	pool_event_cb_register(&mac_pool_event_reg);
}
465
/*
 * Module teardown; undoes mac_init(). Returns EBUSY if any mac instances
 * or minor numbers are still outstanding, otherwise tears down the
 * sub-modules and caches in (roughly) reverse order of creation and
 * returns 0.
 */
int
mac_fini(void)
{

	/* Refuse to unload while any mac or minor is still in use. */
	if (i_mac_impl_count > 0 || minor_count > 0)
		return (EBUSY);

	pool_event_cb_unregister(&mac_pool_event_reg);

	id_space_destroy(minor_ids);
	mac_flow_fini();

	mod_hash_destroy_hash(i_mac_impl_hash);
	rw_destroy(&i_mac_impl_lock);

	mac_client_fini();
	kmem_cache_destroy(mac_ring_cache);

	mod_hash_destroy_hash(i_mactype_hash);
	mac_soft_ring_finish();


	return (0);
}
490
491 /*
492 * Initialize a GLDv3 driver's device ops. A driver that manages its own ops
493 * (e.g. softmac) may pass in a NULL ops argument.
494 */
/*
 * Initialize a GLDv3 driver's device ops. A driver that manages its own ops
 * (e.g. softmac) may pass in a NULL ops argument.
 */
void
mac_init_ops(struct dev_ops *ops, const char *name)
{
	major_t major = ddi_name_to_major((char *)name);

	/*
	 * By returning on error below, we are not letting the driver continue
	 * in an undefined context. The mac_register() function will fail if
	 * DN_GLDV3_DRIVER isn't set.
	 */
	if (major == DDI_MAJOR_T_NONE)
		return;
	/* Mark the driver as a GLDv3 network driver in devnamesp. */
	LOCK_DEV_OPS(&devnamesp[major].dn_lock);
	devnamesp[major].dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER);
	UNLOCK_DEV_OPS(&devnamesp[major].dn_lock);
	if (ops != NULL)
		dld_init_ops(ops, name);
}
513
/*
 * Undo mac_init_ops(): release the dld ops state for the driver.
 */
void
mac_fini_ops(struct dev_ops *ops)
{
	dld_fini_ops(ops);
}
519
/*ARGSUSED*/
/*
 * kmem cache constructor for mac_impl_t. Zeroes the buffer and
 * initializes the locks, condition variables and callback-info lock
 * pointers that persist across alloc/free cycles of the cache object.
 */
static int
i_mac_constructor(void *buf, void *arg, int kmflag)
{
	mac_impl_t *mip = buf;

	bzero(buf, sizeof (mac_impl_t));

	mip->mi_linkstate = LINK_STATE_UNKNOWN;

	rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
	mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);

	/* Point the callback-info structures at their backing locks. */
	mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
	cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
	mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
	cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&mip->mi_bridge_lock, NULL, MUTEX_DEFAULT, NULL);

	return (0);
}
544
545 /*ARGSUSED*/
546 static void
547 i_mac_destructor(void *buf, void *arg)
548 {
549 mac_impl_t *mip = buf;
550 mac_cb_info_t *mcbi;
551
552 ASSERT(mip->mi_ref == 0);
553 ASSERT(mip->mi_active == 0);
554 ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
555 ASSERT(mip->mi_devpromisc == 0);
556 ASSERT(mip->mi_ksp == NULL);
557 ASSERT(mip->mi_kstat_count == 0);
558 ASSERT(mip->mi_nclients == 0);
559 ASSERT(mip->mi_nactiveclients == 0);
560 ASSERT(mip->mi_single_active_client == NULL);
561 ASSERT(mip->mi_state_flags == 0);
562 ASSERT(mip->mi_factory_addr == NULL);
563 ASSERT(mip->mi_factory_addr_num == 0);
564 ASSERT(mip->mi_default_tx_ring == NULL);
565
566 mcbi = &mip->mi_notify_cb_info;
567 ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
568 ASSERT(mip->mi_notify_bits == 0);
569 ASSERT(mip->mi_notify_thread == NULL);
570 ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
571 mcbi->mcbi_lockp = NULL;
572
573 mcbi = &mip->mi_promisc_cb_info;
574 ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
575 ASSERT(mip->mi_promisc_list == NULL);
576 ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
577 mcbi->mcbi_lockp = NULL;
578
579 ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
580 ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);
581
582 rw_destroy(&mip->mi_rw_lock);
583
584 mutex_destroy(&mip->mi_promisc_lock);
585 cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
586 mutex_destroy(&mip->mi_notify_lock);
587 cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
588 mutex_destroy(&mip->mi_ring_lock);
589
590 ASSERT(mip->mi_bridge_link == NULL);
591 }
592
593 /* ARGSUSED */
594 static int
595 i_mac_ring_ctor(void *buf, void *arg, int kmflag)
596 {
597 mac_ring_t *ring = (mac_ring_t *)buf;
598
599 bzero(ring, sizeof (mac_ring_t));
600 cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
601 mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
602 ring->mr_state = MR_FREE;
603 return (0);
604 }
605
606 /* ARGSUSED */
607 static void
608 i_mac_ring_dtor(void *buf, void *arg)
609 {
610 mac_ring_t *ring = (mac_ring_t *)buf;
611
612 cv_destroy(&ring->mr_cv);
613 mutex_destroy(&ring->mr_lock);
614 }
615
616 /*
617 * Common functions to do mac callback addition and deletion. Currently this is
618 * used by promisc callbacks and notify callbacks. List addition and deletion
619 * need to take care of list walkers. List walkers in general, can't hold list
620 * locks and make upcall callbacks due to potential lock order and recursive
621 * reentry issues. Instead list walkers increment the list walker count to mark
622 * the presence of a walker thread. Addition can be carefully done to ensure
623 * that the list walker always sees either the old list or the new list.
624 * However the deletion can't be done while the walker is active, instead the
625 * deleting thread simply marks the entry as logically deleted. The last walker
626 * physically deletes and frees up the logically deleted entries when the walk
627 * is complete.
628 */
629 void
630 mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
631 mac_cb_t *mcb_elem)
632 {
633 mac_cb_t *p;
634 mac_cb_t **pp;
635
636 /* Verify it is not already in the list */
637 for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
638 if (p == mcb_elem)
639 break;
640 }
641 VERIFY(p == NULL);
642
643 /*
644 * Add it to the head of the callback list. The membar ensures that
645 * the following list pointer manipulations reach global visibility
646 * in exactly the program order below.
647 */
648 ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
649
650 mcb_elem->mcb_nextp = *mcb_head;
651 membar_producer();
652 *mcb_head = mcb_elem;
653 }
654
655 /*
656 * Mark the entry as logically deleted. If there aren't any walkers unlink
657 * from the list. In either case return the corresponding status.
658 */
659 boolean_t
660 mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
661 mac_cb_t *mcb_elem)
662 {
663 mac_cb_t *p;
664 mac_cb_t **pp;
665
666 ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
667 /*
668 * Search the callback list for the entry to be removed
669 */
670 for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
671 if (p == mcb_elem)
672 break;
673 }
674 VERIFY(p != NULL);
675
676 /*
677 * If there are walkers just mark it as deleted and the last walker
678 * will remove from the list and free it.
679 */
680 if (mcbi->mcbi_walker_cnt != 0) {
681 p->mcb_flags |= MCB_CONDEMNED;
682 mcbi->mcbi_del_cnt++;
683 return (B_FALSE);
684 }
685
686 ASSERT(mcbi->mcbi_del_cnt == 0);
687 *pp = p->mcb_nextp;
688 p->mcb_nextp = NULL;
689 return (B_TRUE);
690 }
691
692 /*
693 * Wait for all pending callback removals to be completed
694 */
/*
 * Wait for all pending callback removals to be completed
 */
void
mac_callback_remove_wait(mac_cb_info_t *mcbi)
{
	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
	/* The last walker broadcasts mcbi_cv once mcbi_del_cnt drains. */
	while (mcbi->mcbi_del_cnt != 0) {
		DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
	}
}
704
705 /*
 * The last mac callback walker does the cleanup. Walk the list and unlink
707 * all the logically deleted entries and construct a temporary list of
708 * removed entries. Return the list of removed entries to the caller.
709 */
710 mac_cb_t *
711 mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
712 {
713 mac_cb_t *p;
714 mac_cb_t **pp;
715 mac_cb_t *rmlist = NULL; /* List of removed elements */
716 int cnt = 0;
717
718 ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
719 ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
720
721 pp = mcb_head;
722 while (*pp != NULL) {
723 if ((*pp)->mcb_flags & MCB_CONDEMNED) {
724 p = *pp;
725 *pp = p->mcb_nextp;
726 p->mcb_nextp = rmlist;
727 rmlist = p;
728 cnt++;
729 continue;
730 }
731 pp = &(*pp)->mcb_nextp;
732 }
733
734 ASSERT(mcbi->mcbi_del_cnt == cnt);
735 mcbi->mcbi_del_cnt = 0;
736 return (rmlist);
737 }
738
739 boolean_t
740 mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
741 {
742 mac_cb_t *mcb;
743
744 /* Verify it is not already in the list */
745 for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
746 if (mcb == mcb_elem)
747 return (B_TRUE);
748 }
749
750 return (B_FALSE);
751 }
752
753 boolean_t
754 mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
755 {
756 boolean_t found;
757
758 mutex_enter(mcbi->mcbi_lockp);
759 found = mac_callback_lookup(mcb_headp, mcb_elem);
760 mutex_exit(mcbi->mcbi_lockp);
761
762 return (found);
763 }
764
765 /* Free the list of removed callbacks */
766 void
767 mac_callback_free(mac_cb_t *rmlist)
768 {
769 mac_cb_t *mcb;
770 mac_cb_t *mcb_next;
771
772 for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
773 mcb_next = mcb->mcb_nextp;
774 kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
775 }
776 }
777
778 /*
779 * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
780 * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
781 * is only a single shared total walker count, and an entry can't be physically
782 * unlinked if a walker is active on either list. The last walker does this
783 * cleanup of logically deleted entries.
784 */
785 void
786 i_mac_promisc_walker_cleanup(mac_impl_t *mip)
787 {
788 mac_cb_t *rmlist;
789 mac_cb_t *mcb;
790 mac_cb_t *mcb_next;
791 mac_promisc_impl_t *mpip;
792
793 /*
794 * Construct a temporary list of deleted callbacks by walking the
795 * the mi_promisc_list. Then for each entry in the temporary list,
796 * remove it from the mci_promisc_list and free the entry.
797 */
798 rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
799 &mip->mi_promisc_list);
800
801 for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
802 mcb_next = mcb->mcb_nextp;
803 mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
804 VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
805 &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
806 mcb->mcb_flags = 0;
807 mcb->mcb_nextp = NULL;
808 kmem_cache_free(mac_promisc_impl_cache, mpip);
809 }
810 }
811
812 void
813 i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
814 {
815 mac_cb_info_t *mcbi;
816
817 /*
818 * Signal the notify thread even after mi_ref has become zero and
819 * mi_disabled is set. The synchronization with the notify thread
820 * happens in mac_unregister and that implies the driver must make
821 * sure it is single-threaded (with respect to mac calls) and that
822 * all pending mac calls have returned before it calls mac_unregister
823 */
824 rw_enter(&i_mac_impl_lock, RW_READER);
825 if (mip->mi_state_flags & MIS_DISABLED)
826 goto exit;
827
828 /*
829 * Guard against incorrect notifications. (Running a newer
830 * mac client against an older implementation?)
831 */
832 if (type >= MAC_NNOTE)
833 goto exit;
834
835 mcbi = &mip->mi_notify_cb_info;
836 mutex_enter(mcbi->mcbi_lockp);
837 mip->mi_notify_bits |= (1 << type);
838 cv_broadcast(&mcbi->mcbi_cv);
839 mutex_exit(mcbi->mcbi_lockp);
840
841 exit:
842 rw_exit(&i_mac_impl_lock);
843 }
844
845 /*
846 * Mac serialization primitives. Please see the block comment at the
847 * top of the file.
848 */
/*
 * Mac serialization primitives. Please see the block comment at the
 * top of the file.
 */
/*
 * Enter the per-mac perimeter, blocking until it is available. Entry is
 * recursive for the owning thread (mi_perim_ocnt counts the depth). For
 * a VNIC the perimeter of the lower mac is entered instead, since that
 * is what serialization is based on.
 */
void
i_mac_perim_enter(mac_impl_t *mip)
{
	mac_client_impl_t *mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Return the lower mac since that is what
		 * we want to serialize on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}

	mutex_enter(&mip->mi_perim_lock);
	if (mip->mi_perim_owner == curthread) {
		/* Recursive entry by the current owner. */
		mip->mi_perim_ocnt++;
		mutex_exit(&mip->mi_perim_lock);
		return;
	}

	/* Wait for the current owner (if any) to exit completely. */
	while (mip->mi_perim_owner != NULL)
		cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);

	mip->mi_perim_owner = curthread;
	ASSERT(mip->mi_perim_ocnt == 0);
	mip->mi_perim_ocnt++;
#ifdef DEBUG
	/* Record the entering stack to aid perimeter-deadlock debugging. */
	mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
	    MAC_PERIM_STACK_DEPTH);
#endif
	mutex_exit(&mip->mi_perim_lock);
}
882
/*
 * Non-blocking variant of i_mac_perim_enter().  Returns 0 with the
 * perimeter held, or EBUSY if another thread currently owns it.
 */
int
i_mac_perim_enter_nowait(mac_impl_t *mip)
{
	/*
	 * The vnic is a special case, since the serialization is done based
	 * on the lower mac. If the lower mac is busy, it does not imply the
	 * vnic can't be unregistered. But in the case of other drivers,
	 * a busy perimeter or open mac handles implies that the mac is busy
	 * and can't be unregistered.
	 */
	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/* For a VNIC it is safe to block; delegate to the lower mac */
		i_mac_perim_enter(mip);
		return (0);
	}

	mutex_enter(&mip->mi_perim_lock);
	if (mip->mi_perim_owner != NULL) {
		/* Someone else holds the perimeter; don't wait. */
		mutex_exit(&mip->mi_perim_lock);
		return (EBUSY);
	}
	ASSERT(mip->mi_perim_ocnt == 0);
	mip->mi_perim_owner = curthread;
	mip->mi_perim_ocnt++;
	mutex_exit(&mip->mi_perim_lock);

	return (0);
}
910
/*
 * Exit the per-MAC perimeter acquired via i_mac_perim_enter().  Only
 * the outermost exit of a nested entry releases ownership and wakes a
 * waiter.
 */
void
i_mac_perim_exit(mac_impl_t *mip)
{
	mac_client_impl_t	*mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Use the lower mac since that is what
		 * i_mac_perim_enter() serialized on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}

	/*
	 * Checked without mi_perim_lock: per the assertion itself, the
	 * caller is the owner, and only the owner modifies these fields.
	 */
	ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);

	mutex_enter(&mip->mi_perim_lock);
	if (--mip->mi_perim_ocnt == 0) {
		mip->mi_perim_owner = NULL;
		cv_signal(&mip->mi_perim_cv);
	}
	mutex_exit(&mip->mi_perim_lock);
}
934
935 /*
936 * Returns whether the current thread holds the mac perimeter. Used in making
937 * assertions.
938 */
939 boolean_t
940 mac_perim_held(mac_handle_t mh)
941 {
942 mac_impl_t *mip = (mac_impl_t *)mh;
943 mac_client_impl_t *mcip;
944
945 if (mip->mi_state_flags & MIS_IS_VNIC) {
946 /*
947 * This is a VNIC. Return the lower mac since that is what
948 * we want to serialize on.
949 */
950 mcip = mac_vnic_lower(mip);
951 mip = mcip->mci_mip;
952 }
953 return (mip->mi_perim_owner == curthread);
954 }
955
956 /*
957 * mac client interfaces to enter the mac perimeter of a mac end point, given
958 * its mac handle, or macname or linkid.
959 */
/*
 * Enter the perimeter of the MAC identified by the given mac handle and
 * return an encoded perimeter handle through *mphp.
 */
void
mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	i_mac_perim_enter(mip);
	/*
	 * The mac_perim_handle_t returned encodes the 'mip' and whether a
	 * mac_open has been done internally while entering the perimeter.
	 * This information is used in mac_perim_exit
	 */
	MAC_ENCODE_MPH(*mphp, mip, 0);
}
973
/*
 * Enter the perimeter of the MAC named 'name'.  Returns the mac_open()
 * error on failure; on success the encoded handle in *mphp records that
 * mac_perim_exit() must also perform the matching mac_close().
 */
int
mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
{
	int	err;
	mac_handle_t	mh;

	if ((err = mac_open(name, &mh)) != 0)
		return (err);

	mac_perim_enter_by_mh(mh, mphp);
	/*
	 * Re-encode the handle with the need-close flag set, overriding
	 * the encoding done inside mac_perim_enter_by_mh(), so that
	 * mac_perim_exit() undoes the mac_open() done above.
	 */
	MAC_ENCODE_MPH(*mphp, mh, 1);
	return (0);
}
987
/*
 * Enter the perimeter of the MAC identified by 'linkid'.  Returns the
 * mac_open_by_linkid() error on failure; on success the encoded handle
 * records that mac_perim_exit() must also perform the matching
 * mac_close().
 */
int
mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
{
	int	err;
	mac_handle_t	mh;

	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
		return (err);

	mac_perim_enter_by_mh(mh, mphp);
	/*
	 * Re-encode the handle with the need-close flag set so that
	 * mac_perim_exit() undoes the mac_open_by_linkid() done above.
	 */
	MAC_ENCODE_MPH(*mphp, mh, 1);
	return (0);
}
1001
/*
 * Exit the perimeter entered via one of the mac_perim_enter_by_*()
 * interfaces, closing the MAC if it was opened internally on entry.
 */
void
mac_perim_exit(mac_perim_handle_t mph)
{
	mac_impl_t	*mip;
	boolean_t	need_close;

	/* Recover the mip and the internally-opened flag from the handle. */
	MAC_DECODE_MPH(mph, mip, need_close);
	i_mac_perim_exit(mip);
	if (need_close)
		mac_close((mac_handle_t)mip);
}
1013
1014 int
1015 mac_hold(const char *macname, mac_impl_t **pmip)
1016 {
1017 mac_impl_t *mip;
1018 int err;
1019
1020 /*
1021 * Check the device name length to make sure it won't overflow our
1022 * buffer.
1023 */
1024 if (strlen(macname) >= MAXNAMELEN)
1025 return (EINVAL);
1026
1027 /*
1028 * Look up its entry in the global hash table.
1029 */
1030 rw_enter(&i_mac_impl_lock, RW_WRITER);
1031 err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
1032 (mod_hash_val_t *)&mip);
1033
1034 if (err != 0) {
1035 rw_exit(&i_mac_impl_lock);
1036 return (ENOENT);
1037 }
1038
1039 if (mip->mi_state_flags & MIS_DISABLED) {
1040 rw_exit(&i_mac_impl_lock);
1041 return (ENOENT);
1042 }
1043
1044 if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
1045 rw_exit(&i_mac_impl_lock);
1046 return (EBUSY);
1047 }
1048
1049 mip->mi_ref++;
1050 rw_exit(&i_mac_impl_lock);
1051
1052 *pmip = mip;
1053 return (0);
1054 }
1055
/*
 * Release a hold acquired via mac_hold().
 */
void
mac_rele(mac_impl_t *mip)
{
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	ASSERT(mip->mi_ref != 0);
	if (--mip->mi_ref == 0) {
		/* Sanity: a fully released MAC has no active clients. */
		ASSERT(mip->mi_nactiveclients == 0 &&
		    !(mip->mi_state_flags & MIS_EXCLUSIVE));
	}
	rw_exit(&i_mac_impl_lock);
}
1067
1068 /*
1069 * Private GLDv3 function to start a MAC instance.
1070 */
int
mac_start(mac_handle_t mh)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	int		err = 0;
	mac_group_t	*defgrp;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mip->mi_start != NULL);

	/*
	 * Check whether the device is already started; mi_active is a
	 * start/stop reference count, so only the first mac_start()
	 * actually starts the driver, the default tx ring and the
	 * default Rx group.
	 */
	if (mip->mi_active++ == 0) {
		mac_ring_t *ring = NULL;

		/*
		 * Start the device.
		 */
		err = mip->mi_start(mip->mi_driver);
		if (err != 0) {
			/* Undo the reference taken above. */
			mip->mi_active--;
			return (err);
		}

		/*
		 * Start the default tx ring.
		 */
		if (mip->mi_default_tx_ring != NULL) {

			ring = (mac_ring_t *)mip->mi_default_tx_ring;
			if (ring->mr_state != MR_INUSE) {
				err = mac_start_ring(ring);
				if (err != 0) {
					/* Undo the reference taken above. */
					mip->mi_active--;
					return (err);
				}
			}
		}

		if ((defgrp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
			/*
			 * Start the default ring, since it will be needed
			 * to receive broadcast and multicast traffic for
			 * both primary and non-primary MAC clients.
			 */
			ASSERT(defgrp->mrg_state == MAC_GROUP_STATE_REGISTERED);
			err = mac_start_group_and_rings(defgrp);
			if (err != 0) {
				/*
				 * Unwind: drop the reference and stop the
				 * default tx ring started above, if any.
				 */
				mip->mi_active--;
				if ((ring != NULL) &&
				    (ring->mr_state == MR_INUSE))
					mac_stop_ring(ring);
				return (err);
			}
			mac_set_group_state(defgrp, MAC_GROUP_STATE_SHARED);
		}
	}

	return (err);
}
1132
1133 /*
1134 * Private GLDv3 function to stop a MAC instance.
1135 */
void
mac_stop(mac_handle_t mh)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	mac_group_t	*grp;

	ASSERT(mip->mi_stop != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * Check whether the device is still needed; mi_active counts
	 * mac_start() calls, so only the last mac_stop() actually stops
	 * the rings, the default Rx group and the driver.
	 */
	ASSERT(mip->mi_active != 0);
	if (--mip->mi_active == 0) {
		if ((grp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
			/*
			 * There should be no more active clients since the
			 * MAC is being stopped. Stop the default RX group
			 * and transition it back to registered state.
			 *
			 * When clients are torn down, the groups
			 * are released via mac_release_rx_group which
			 * knows that the default group is always in
			 * started mode since broadcast uses it. So
			 * we can assert that there are no clients
			 * (since mac_bcast_add doesn't register itself
			 * as a client) and group is in SHARED state.
			 */
			ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
			ASSERT(MAC_GROUP_NO_CLIENT(grp) &&
			    mip->mi_nactiveclients == 0);
			mac_stop_group_and_rings(grp);
			mac_set_group_state(grp, MAC_GROUP_STATE_REGISTERED);
		}

		/* Stop the default tx ring, undoing mac_start(). */
		if (mip->mi_default_tx_ring != NULL) {
			mac_ring_t *ring;

			ring = (mac_ring_t *)mip->mi_default_tx_ring;
			if (ring->mr_state == MR_INUSE) {
				mac_stop_ring(ring);
				ring->mr_flag = 0;
			}
		}

		/*
		 * Stop the device.
		 */
		mip->mi_stop(mip->mi_driver);
	}
}
1187
/*
 * Reference-counted enable/disable of device-level promiscuous mode.
 * mi_devpromisc counts the enablings; the driver's mi_setpromisc entry
 * point is only invoked on the 0->1 and 1->0 transitions, and a
 * MAC_NOTE_DEVPROMISC notification is posted on each transition.
 * Returns EPROTO on an unbalanced disable, or the driver's error.
 */
int
i_mac_promisc_set(mac_impl_t *mip, boolean_t on)
{
	int err = 0;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mip->mi_setpromisc != NULL);

	if (on) {
		/*
		 * Enable promiscuous mode on the device if not yet enabled.
		 */
		if (mip->mi_devpromisc++ == 0) {
			err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
			if (err != 0) {
				/* Driver refused; undo the count. */
				mip->mi_devpromisc--;
				return (err);
			}
			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
		}
	} else {
		/* Disable without a matching enable is a protocol error. */
		if (mip->mi_devpromisc == 0)
			return (EPROTO);

		/*
		 * Disable promiscuous mode on the device if this is the last
		 * enabling.
		 */
		if (--mip->mi_devpromisc == 0) {
			err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
			if (err != 0) {
				/* Driver refused; restore the count. */
				mip->mi_devpromisc++;
				return (err);
			}
			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
		}
	}

	return (0);
}
1228
1229 /*
1230 * The promiscuity state can change any time. If the caller needs to take
1231 * actions that are atomic with the promiscuity state, then the caller needs
1232 * to bracket the entire sequence with mac_perim_enter/exit
1233 */
1234 boolean_t
1235 mac_promisc_get(mac_handle_t mh)
1236 {
1237 mac_impl_t *mip = (mac_impl_t *)mh;
1238
1239 /*
1240 * Return the current promiscuity.
1241 */
1242 return (mip->mi_devpromisc != 0);
1243 }
1244
1245 /*
1246 * Invoked at MAC instance attach time to initialize the list
1247 * of factory MAC addresses supported by a MAC instance. This function
1248 * builds a local cache in the mac_impl_t for the MAC addresses
1249 * supported by the underlying hardware. The MAC clients themselves
1250 * use the mac_addr_factory*() functions to query and reserve
1251 * factory MAC addresses.
1252 */
void
mac_addr_factory_init(mac_impl_t *mip)
{
	mac_capab_multifactaddr_t capab;
	uint8_t *addr;
	int i;

	/*
	 * First round to see how many factory MAC addresses are available.
	 */
	bzero(&capab, sizeof (capab));
	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
	    &capab) || (capab.mcm_naddr == 0)) {
		/*
		 * The MAC instance doesn't support multiple factory
		 * MAC addresses, we're done here.
		 */
		return;
	}

	/*
	 * Allocate the space and get all the factory addresses.
	 */
	addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
	capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);

	/* Cache the addresses in mi_factory_addr; all slots start free. */
	mip->mi_factory_addr_num = capab.mcm_naddr;
	mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
	    sizeof (mac_factory_addr_t), KM_SLEEP);

	for (i = 0; i < capab.mcm_naddr; i++) {
		/*
		 * The driver fills MAXMACADDRLEN-sized entries; only the
		 * first mt_addr_length bytes of each are meaningful.
		 */
		bcopy(addr + i * MAXMACADDRLEN,
		    mip->mi_factory_addr[i].mfa_addr,
		    mip->mi_type->mt_addr_length);
		mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
	}

	/* The temporary driver buffer is no longer needed. */
	kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
}
1292
1293 void
1294 mac_addr_factory_fini(mac_impl_t *mip)
1295 {
1296 if (mip->mi_factory_addr == NULL) {
1297 ASSERT(mip->mi_factory_addr_num == 0);
1298 return;
1299 }
1300
1301 kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
1302 sizeof (mac_factory_addr_t));
1303
1304 mip->mi_factory_addr = NULL;
1305 mip->mi_factory_addr_num = 0;
1306 }
1307
1308 /*
1309 * Reserve a factory MAC address. If *slot is set to -1, the function
1310 * attempts to reserve any of the available factory MAC addresses and
1311 * returns the reserved slot id. If no slots are available, the function
1312 * returns ENOSPC. If *slot is not set to -1, the function reserves
 * the specified slot if it is available, or returns EBUSY if the slot
1314 * is already used. Returns ENOTSUP if the underlying MAC does not
1315 * support multiple factory addresses. If the slot number is not -1 but
1316 * is invalid, returns EINVAL.
1317 */
int
mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;
	int i, ret = 0;

	i_mac_perim_enter(mip);
	/*
	 * Protect against concurrent readers that may need a self-consistent
	 * view of the factory addresses
	 */
	rw_enter(&mip->mi_rw_lock, RW_WRITER);

	/* No factory addresses cached: the MAC doesn't support them. */
	if (mip->mi_factory_addr_num == 0) {
		ret = ENOTSUP;
		goto bail;
	}

	if (*slot != -1) {
		/* check the specified slot (slot numbers are 1-based) */
		if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
			ret = EINVAL;
			goto bail;
		}
		if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
			ret = EBUSY;
			goto bail;
		}
	} else {
		/* pick the next available slot */
		for (i = 0; i < mip->mi_factory_addr_num; i++) {
			if (!mip->mi_factory_addr[i].mfa_in_use)
				break;
		}

		if (i == mip->mi_factory_addr_num) {
			ret = ENOSPC;
			goto bail;
		}
		/* return the chosen slot to the caller, 1-based */
		*slot = i+1;
	}

	/* Mark the slot reserved and record the owning client. */
	mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
	mip->mi_factory_addr[*slot-1].mfa_client = mcip;

bail:
	rw_exit(&mip->mi_rw_lock);
	i_mac_perim_exit(mip);
	return (ret);
}
1369
1370 /*
1371 * Release the specified factory MAC address slot.
1372 */
void
mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;

	i_mac_perim_enter(mip);
	/*
	 * Protect against concurrent readers that may need a self-consistent
	 * view of the factory addresses
	 */
	rw_enter(&mip->mi_rw_lock, RW_WRITER);

	/* Slot numbers are 1-based and must refer to a reserved slot. */
	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
	ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);

	mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;

	rw_exit(&mip->mi_rw_lock);
	i_mac_perim_exit(mip);
}
1394
1395 /*
1396 * Stores in mac_addr the value of the specified MAC address. Returns
1397 * 0 on success, or EINVAL if the slot number is not valid for the MAC.
1398 * The caller must provide a string of at least MAXNAMELEN bytes.
1399 */
void
mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
    uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
{
	mac_impl_t *mip = (mac_impl_t *)mh;
	boolean_t in_use;

	/* Slot numbers are 1-based. */
	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);

	/*
	 * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
	 * and mi_rw_lock
	 */
	rw_enter(&mip->mi_rw_lock, RW_READER);
	bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
	*addr_len = mip->mi_type->mt_addr_length;
	in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
	if (in_use && client_name != NULL) {
		/* mfa_client is only meaningful while the slot is in use. */
		bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
		    client_name, MAXNAMELEN);
	}
	if (in_use_arg != NULL)
		*in_use_arg = in_use;
	rw_exit(&mip->mi_rw_lock);
}
1425
1426 /*
1427 * Returns the number of factory MAC addresses (in addition to the
1428 * primary MAC address), 0 if the underlying MAC doesn't support
1429 * that feature.
1430 */
1431 uint_t
1432 mac_addr_factory_num(mac_handle_t mh)
1433 {
1434 mac_impl_t *mip = (mac_impl_t *)mh;
1435
1436 return (mip->mi_factory_addr_num);
1437 }
1438
1439
1440 void
1441 mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
1442 {
1443 mac_ring_t *ring;
1444
1445 for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
1446 ring->mr_flag &= ~flag;
1447 }
1448
1449 /*
1450 * The following mac_hwrings_xxx() functions are private mac client functions
1451 * used by the aggr driver to access and control the underlying HW Rx group
1452 * and rings. In this case, the aggr driver has exclusive control of the
1453 * underlying HW Rx group/rings, it calls the following functions to
 * start/stop the HW Rx rings, disable/enable polling, add/remove MAC
1455 * addresses, or set up the Rx callback.
1456 */
1457 /* ARGSUSED */
1458 static void
1459 mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
1460 mblk_t *mp_chain, boolean_t loopback)
1461 {
1462 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs;
1463 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
1464 mac_direct_rx_t proc;
1465 void *arg1;
1466 mac_resource_handle_t arg2;
1467
1468 proc = srs_rx->sr_func;
1469 arg1 = srs_rx->sr_arg1;
1470 arg2 = mac_srs->srs_mrh;
1471
1472 proc(arg1, arg2, mp_chain, NULL);
1473 }
1474
1475 /*
1476 * This function is called to get the list of HW rings that are reserved by
1477 * an exclusive mac client.
1478 *
1479 * Return value: the number of HW rings.
1480 */
1481 int
1482 mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
1483 mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
1484 {
1485 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1486 flow_entry_t *flent = mcip->mci_flent;
1487 mac_group_t *grp;
1488 mac_ring_t *ring;
1489 int cnt = 0;
1490
1491 if (rtype == MAC_RING_TYPE_RX) {
1492 grp = flent->fe_rx_ring_group;
1493 } else if (rtype == MAC_RING_TYPE_TX) {
1494 grp = flent->fe_tx_ring_group;
1495 } else {
1496 ASSERT(B_FALSE);
1497 return (-1);
1498 }
1499 /*
1500 * The mac client did not reserve any RX group, return directly.
1501 * This is probably because the underlying MAC does not support
1502 * any groups.
1503 */
1504 if (hwgh != NULL)
1505 *hwgh = NULL;
1506 if (grp == NULL)
1507 return (0);
1508 /*
1509 * This group must be reserved by this mac client.
1510 */
1511 ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
1512 (mcip == MAC_GROUP_ONLY_CLIENT(grp)));
1513
1514 for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) {
1515 ASSERT(cnt < MAX_RINGS_PER_GROUP);
1516 hwrh[cnt] = (mac_ring_handle_t)ring;
1517 }
1518 if (hwgh != NULL)
1519 *hwgh = (mac_group_handle_t)grp;
1520
1521 return (cnt);
1522 }
1523
1524 /*
1525 * This function is called to get info about Tx/Rx rings.
1526 *
1527 * Return value: returns uint_t which will have various bits set
1528 * that indicates different properties of the ring.
1529 */
1530 uint_t
1531 mac_hwring_getinfo(mac_ring_handle_t rh)
1532 {
1533 mac_ring_t *ring = (mac_ring_t *)rh;
1534 mac_ring_info_t *info = &ring->mr_info;
1535
1536 return (info->mri_flags);
1537 }
1538
1539 /*
1540 * Export ddi interrupt handles from the HW ring to the pseudo ring and
1541 * setup the RX callback of the mac client which exclusively controls
1542 * HW ring.
1543 */
void
mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh,
    mac_ring_handle_t pseudo_rh)
{
	mac_ring_t *hw_ring = (mac_ring_t *)hwrh;
	mac_ring_t *pseudo_ring;
	mac_soft_ring_set_t *mac_srs = hw_ring->mr_srs;

	if (pseudo_rh != NULL) {
		pseudo_ring = (mac_ring_t *)pseudo_rh;
		/* Export the ddi handles to pseudo ring */
		pseudo_ring->mr_info.mri_intr.mi_ddi_handle =
		    hw_ring->mr_info.mri_intr.mi_ddi_handle;
		pseudo_ring->mr_info.mri_intr.mi_ddi_shared =
		    hw_ring->mr_info.mri_intr.mi_ddi_shared;
		/*
		 * Save a pointer to pseudo ring in the hw ring. If
		 * interrupt handle changes, the hw ring will be
		 * notified of the change (see mac_ring_intr_set())
		 * and the appropriate change has to be made to
		 * the pseudo ring that has exported the ddi handle.
		 */
		hw_ring->mr_prh = pseudo_rh;
	}

	if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
		ASSERT(!(mac_srs->srs_type & SRST_TX));
		/*
		 * Redirect the SRS's lower Rx path to the client's
		 * processing function; undone by mac_hwring_teardown().
		 */
		mac_srs->srs_mrh = prh;
		mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
	}
}
1575
/*
 * Undo mac_hwring_setup(): break the hw ring -> pseudo ring link and
 * restore the default SRS Rx processing path.
 */
void
mac_hwring_teardown(mac_ring_handle_t hwrh)
{
	mac_ring_t *hw_ring = (mac_ring_t *)hwrh;
	mac_soft_ring_set_t *mac_srs;

	if (hw_ring == NULL)
		return;
	hw_ring->mr_prh = NULL;
	if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
		mac_srs = hw_ring->mr_srs;
		ASSERT(!(mac_srs->srs_type & SRST_TX));
		/* Route Rx back through the normal SRS processing. */
		mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
		mac_srs->srs_mrh = NULL;
	}
}
1592
1593 int
1594 mac_hwring_disable_intr(mac_ring_handle_t rh)
1595 {
1596 mac_ring_t *rr_ring = (mac_ring_t *)rh;
1597 mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1598
1599 return (intr->mi_disable(intr->mi_handle));
1600 }
1601
1602 int
1603 mac_hwring_enable_intr(mac_ring_handle_t rh)
1604 {
1605 mac_ring_t *rr_ring = (mac_ring_t *)rh;
1606 mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1607
1608 return (intr->mi_enable(intr->mi_handle));
1609 }
1610
1611 int
1612 mac_hwring_start(mac_ring_handle_t rh)
1613 {
1614 mac_ring_t *rr_ring = (mac_ring_t *)rh;
1615
1616 MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
1617 return (0);
1618 }
1619
1620 void
1621 mac_hwring_stop(mac_ring_handle_t rh)
1622 {
1623 mac_ring_t *rr_ring = (mac_ring_t *)rh;
1624
1625 mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
1626 }
1627
1628 mblk_t *
1629 mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
1630 {
1631 mac_ring_t *rr_ring = (mac_ring_t *)rh;
1632 mac_ring_info_t *info = &rr_ring->mr_info;
1633
1634 return (info->mri_poll(info->mri_driver, bytes_to_pickup));
1635 }
1636
1637 /*
1638 * Send packets through a selected tx ring.
1639 */
1640 mblk_t *
1641 mac_hwring_tx(mac_ring_handle_t rh, mblk_t *mp)
1642 {
1643 mac_ring_t *ring = (mac_ring_t *)rh;
1644 mac_ring_info_t *info = &ring->mr_info;
1645
1646 ASSERT(ring->mr_type == MAC_RING_TYPE_TX &&
1647 ring->mr_state >= MR_INUSE);
1648 return (info->mri_tx(info->mri_driver, mp));
1649 }
1650
1651 /*
1652 * Query stats for a particular rx/tx ring
1653 */
1654 int
1655 mac_hwring_getstat(mac_ring_handle_t rh, uint_t stat, uint64_t *val)
1656 {
1657 mac_ring_t *ring = (mac_ring_t *)rh;
1658 mac_ring_info_t *info = &ring->mr_info;
1659
1660 return (info->mri_stat(info->mri_driver, stat, val));
1661 }
1662
1663 /*
1664 * Private function that is only used by aggr to send packets through
1665 * a port/Tx ring. Since aggr exposes a pseudo Tx ring even for ports
1666 * that does not expose Tx rings, aggr_ring_tx() entry point needs
1667 * access to mac_impl_t to send packets through m_tx() entry point.
1668 * It accomplishes this by calling mac_hwring_send_priv() function.
1669 */
mblk_t *
mac_hwring_send_priv(mac_client_handle_t mch, mac_ring_handle_t rh, mblk_t *mp)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;

	/*
	 * NOTE(review): MAC_TX() is a macro and appears to update 'mp',
	 * which is then returned to the caller (presumably the unsent
	 * remainder of the chain) -- confirm against the MAC_TX()
	 * definition.
	 */
	MAC_TX(mip, rh, mp, mcip);
	return (mp);
}
1679
1680 int
1681 mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
1682 {
1683 mac_group_t *group = (mac_group_t *)gh;
1684
1685 return (mac_group_addmac(group, addr));
1686 }
1687
1688 int
1689 mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
1690 {
1691 mac_group_t *group = (mac_group_t *)gh;
1692
1693 return (mac_group_remmac(group, addr));
1694 }
1695
1696 /*
1697 * Set the RX group to be shared/reserved. Note that the group must be
1698 * started/stopped outside of this function.
1699 */
void
mac_set_group_state(mac_group_t *grp, mac_group_state_t state)
{
	/*
	 * If there is no change in the group state, just return.
	 */
	if (grp->mrg_state == state)
		return;

	switch (state) {
	case MAC_GROUP_STATE_RESERVED:
		/*
		 * Successfully reserved the group.
		 *
		 * Given that there is an exclusive client controlling this
		 * group, we enable the group level polling when available,
		 * so that SRSs get to turn on/off individual rings they're
		 * assigned to.
		 */
		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));

		if (grp->mrg_type == MAC_RING_TYPE_RX &&
		    GROUP_INTR_DISABLE_FUNC(grp) != NULL) {
			GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
		}
		break;

	case MAC_GROUP_STATE_SHARED:
		/*
		 * Set all rings of this group to software classified.
		 * If the group has an overriding interrupt, then re-enable it.
		 */
		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));

		if (grp->mrg_type == MAC_RING_TYPE_RX &&
		    GROUP_INTR_ENABLE_FUNC(grp) != NULL) {
			GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
		}
		/* The ring is not available for reservations any more */
		break;

	case MAC_GROUP_STATE_REGISTERED:
		/* Also callable from mac_register, perim is not held */
		break;

	default:
		/* No other target states are valid. */
		ASSERT(B_FALSE);
		break;
	}

	grp->mrg_state = state;
}
1752
1753 /*
1754 * Quiesce future hardware classified packets for the specified Rx ring
1755 */
static void
mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
{
	ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
	ASSERT(ring_flag == MR_CONDEMNED || ring_flag == MR_QUIESCE);

	/*
	 * Mark the ring so no new driver upcall threads enter it, then
	 * wait for the in-flight ones (tracked by mr_refcnt) to drain.
	 */
	mutex_enter(&rx_ring->mr_lock);
	rx_ring->mr_flag |= ring_flag;
	while (rx_ring->mr_refcnt != 0)
		cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
	mutex_exit(&rx_ring->mr_lock);
}
1768
1769 /*
1770 * Please see mac_tx for details about the per cpu locking scheme
1771 */
1772 static void
1773 mac_tx_lock_all(mac_client_impl_t *mcip)
1774 {
1775 int i;
1776
1777 for (i = 0; i <= mac_tx_percpu_cnt; i++)
1778 mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1779 }
1780
1781 static void
1782 mac_tx_unlock_all(mac_client_impl_t *mcip)
1783 {
1784 int i;
1785
1786 for (i = mac_tx_percpu_cnt; i >= 0; i--)
1787 mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1788 }
1789
1790 static void
1791 mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
1792 {
1793 int i;
1794
1795 for (i = mac_tx_percpu_cnt; i > 0; i--)
1796 mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1797 }
1798
1799 static int
1800 mac_tx_sum_refcnt(mac_client_impl_t *mcip)
1801 {
1802 int i;
1803 int refcnt = 0;
1804
1805 for (i = 0; i <= mac_tx_percpu_cnt; i++)
1806 refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
1807
1808 return (refcnt);
1809 }
1810
1811 /*
1812 * Stop future Tx packets coming down from the client in preparation for
1813 * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
1814 * of rings between clients
1815 */
void
mac_tx_client_block(mac_client_impl_t *mcip)
{
	/* Hold all per-cpu Tx locks while setting the quiesce flag. */
	mac_tx_lock_all(mcip);
	mcip->mci_tx_flag |= MCI_TX_QUIESCE;
	while (mac_tx_sum_refcnt(mcip) != 0) {
		/*
		 * Tx threads are still active.  Drop all but pcpu[0]'s
		 * lock and wait on mci_tx_cv (paired with pcpu[0]'s
		 * lock) for them to drain, then retake all the locks
		 * and re-check the summed refcnt.
		 */
		mac_tx_unlock_allbutzero(mcip);
		cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
		mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
		mac_tx_lock_all(mcip);
	}
	mac_tx_unlock_all(mcip);
}
1829
/*
 * Undo mac_tx_client_block(): clear the quiesce flag so client Tx
 * traffic can flow again, and re-post the Tx notification.
 */
void
mac_tx_client_unblock(mac_client_impl_t *mcip)
{
	mac_tx_lock_all(mcip);
	mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
	mac_tx_unlock_all(mcip);
	/*
	 * We may fail to disable flow control for the last MAC_NOTE_TX
	 * notification because the MAC client is quiesced. Send the
	 * notification again.
	 */
	i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
}
1843
1844 /*
1845 * Wait for an SRS to quiesce. The SRS worker will signal us when the
1846 * quiesce is done.
1847 */
static void
mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
{
	/* Block until the SRS worker sets srs_flag in srs_state. */
	mutex_enter(&srs->srs_lock);
	while (!(srs->srs_state & srs_flag))
		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
	mutex_exit(&srs->srs_lock);
}
1856
1857 /*
1858 * Quiescing an Rx SRS is achieved by the following sequence. The protocol
1859 * works bottom up by cutting off packet flow from the bottommost point in the
1860 * mac, then the SRS, and then the soft rings. There are 2 use cases of this
1861 * mechanism. One is a temporary quiesce of the SRS, such as say while changing
1862 * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
1863 * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
1864 * for the SRS and MR flags. In the former case the threads pause waiting for
1865 * a restart, while in the latter case the threads exit. The Tx SRS teardown
1866 * is also mostly similar to the above.
1867 *
1868 * 1. Stop future hardware classified packets at the lowest level in the mac.
1869 * Remove any hardware classification rule (CONDEMNED case) and mark the
1870 * rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
1871 * from increasing. Upcalls from the driver that come through hardware
1872 * classification will be dropped in mac_rx from now on. Then we wait for
1873 * the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
1874 * sure there aren't any upcall threads from the driver through hardware
1875 * classification. In the case of SRS teardown we also remove the
1876 * classification rule in the driver.
1877 *
1878 * 2. Stop future software classified packets by marking the flow entry with
1879 * FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
1880 * increasing. We also remove the flow entry from the table in the latter
1881 * case. Then wait for the fe_refcnt to reach an appropriate quiescent value
1882 * that indicates there aren't any active threads using that flow entry.
1883 *
1884 * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
1885 * SRS worker thread, and the soft ring threads are quiesced in sequence
1886 * with the SRS worker thread serving as a master controller. This
 *    mechanism is explained in mac_srs_worker_quiesce().
1888 *
1889 * The restart mechanism to reactivate the SRS and softrings is explained
1890 * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
1891 * restart sequence.
1892 */
void
mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
{
	flow_entry_t	*flent = srs->srs_flent;
	uint_t	mr_flag, srs_done_flag;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
	ASSERT(!(srs->srs_type & SRST_TX));

	/*
	 * Pick the ring flag and the SRS completion flag matching the
	 * requested quiesce mode: CONDEMNED for teardown (threads exit),
	 * QUIESCE for a temporary pause (threads wait for a restart).
	 */
	if (srs_quiesce_flag == SRS_CONDEMNED) {
		mr_flag = MR_CONDEMNED;
		srs_done_flag = SRS_CONDEMNED_DONE;
		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
			mac_srs_client_poll_disable(srs->srs_mcip, srs);
	} else {
		ASSERT(srs_quiesce_flag == SRS_QUIESCE);
		mr_flag = MR_QUIESCE;
		srs_done_flag = SRS_QUIESCE_DONE;
		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
			mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
	}

	if (srs->srs_ring != NULL) {
		/* Step 1 of the block comment: stop HW classified packets */
		mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
	} else {
		/*
		 * SRS is driven by software classification. In case
		 * of CONDEMNED, the top level teardown functions will
		 * deal with flow removal.
		 */
		if (srs_quiesce_flag != SRS_CONDEMNED) {
			FLOW_MARK(flent, FE_QUIESCE);
			mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
		}
	}

	/*
	 * Signal the SRS to quiesce itself, and then cv_wait for the
	 * SRS quiesce to complete. The SRS worker thread will wake us
	 * up when the quiesce is complete
	 */
	mac_srs_signal(srs, srs_quiesce_flag);
	mac_srs_quiesce_wait(srs, srs_done_flag);
}
1937
1938 /*
1939 * Remove an SRS.
1940 */
void
mac_rx_srs_remove(mac_soft_ring_set_t *srs)
{
	flow_entry_t *flent = srs->srs_flent;
	int i;

	/* Permanently quiesce the SRS before unlinking it from the flow. */
	mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
	/*
	 * Locate and remove our entry in the fe_rx_srs[] array, and
	 * adjust the fe_rx_srs array entries and array count by
	 * moving the last entry into the vacated spot.
	 */
	mutex_enter(&flent->fe_lock);
	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
		if (flent->fe_rx_srs[i] == srs)
			break;
	}

	/*
	 * The SRS must be found, and per the assertion must not be at
	 * index 0 (presumably reserved for the flow's primary SRS --
	 * see the fe_rx_srs users elsewhere in this file).
	 */
	ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
	if (i != flent->fe_rx_srs_cnt - 1) {
		/* Fill the hole with the last entry. */
		flent->fe_rx_srs[i] =
		    flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
		i = flent->fe_rx_srs_cnt - 1;
	}

	flent->fe_rx_srs[i] = NULL;
	flent->fe_rx_srs_cnt--;
	mutex_exit(&flent->fe_lock);

	mac_srs_free(srs);
}
1972
1973 static void
1974 mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
1975 {
1976 mutex_enter(&srs->srs_lock);
1977 srs->srs_state &= ~flag;
1978 mutex_exit(&srs->srs_lock);
1979 }
1980
/*
 * Restart an Rx SRS previously quiesced via mac_rx_srs_quiesce(),
 * re-enabling packet flow at the ring (or flow entry) level.
 */
void
mac_rx_srs_restart(mac_soft_ring_set_t *srs)
{
	flow_entry_t	*flent = srs->srs_flent;
	mac_ring_t	*mr;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
	ASSERT((srs->srs_type & SRST_TX) == 0);

	/*
	 * This handles a change in the number of SRSs between the quiesce
	 * and restart operation of a flow.
	 */
	if (!SRS_QUIESCED(srs))
		return;

	/*
	 * Signal the SRS to restart itself. Wait for the restart to complete
	 * Note that we only restart the SRS if it is not marked as
	 * permanently quiesced.
	 */
	if (!SRS_QUIESCED_PERMANENT(srs)) {
		mac_srs_signal(srs, SRS_RESTART);
		mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
		mac_srs_clear_flag(srs, SRS_RESTART_DONE);

		mac_srs_client_poll_restart(srs->srs_mcip, srs);
	}

	/* Finally clear the flags to let the packets in */
	mr = srs->srs_ring;
	if (mr != NULL) {
		MAC_RING_UNMARK(mr, MR_QUIESCE);
		/* In case the ring was stopped, safely restart it */
		if (mr->mr_state != MR_INUSE)
			(void) mac_start_ring(mr);
	} else {
		/* Software classified flow: just drop the quiesce mark. */
		FLOW_UNMARK(flent, FE_QUIESCE);
	}
}
2021
2022 /*
2023 * Temporary quiesce of a flow and associated Rx SRS.
2024 * Please see block comment above mac_rx_classify_flow_rem.
2025 */
2026 /* ARGSUSED */
2027 int
2028 mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
2029 {
2030 int i;
2031
2032 for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2033 mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
2034 SRS_QUIESCE);
2035 }
2036 return (0);
2037 }
2038
2039 /*
2040 * Restart a flow and associated Rx SRS that has been quiesced temporarily
2041 * Please see block comment above mac_rx_classify_flow_rem
2042 */
2043 /* ARGSUSED */
2044 int
2045 mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
2046 {
2047 int i;
2048
2049 for (i = 0; i < flent->fe_rx_srs_cnt; i++)
2050 mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
2051
2052 return (0);
2053 }
2054
2055 void
2056 mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
2057 {
2058 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
2059 flow_entry_t *flent = mcip->mci_flent;
2060 mac_impl_t *mip = mcip->mci_mip;
2061 mac_soft_ring_set_t *mac_srs;
2062 int i;
2063
2064 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2065
2066 if (flent == NULL)
2067 return;
2068
2069 for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2070 mac_srs = flent->fe_rx_srs[i];
2071 mutex_enter(&mac_srs->srs_lock);
2072 if (on)
2073 mac_srs->srs_state |= SRS_QUIESCE_PERM;
2074 else
2075 mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
2076 mutex_exit(&mac_srs->srs_lock);
2077 }
2078 }
2079
2080 void
2081 mac_rx_client_quiesce(mac_client_handle_t mch)
2082 {
2083 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
2084 mac_impl_t *mip = mcip->mci_mip;
2085
2086 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2087
2088 if (MCIP_DATAPATH_SETUP(mcip)) {
2089 (void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
2090 NULL);
2091 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2092 mac_rx_classify_flow_quiesce, NULL);
2093 }
2094 }
2095
2096 void
2097 mac_rx_client_restart(mac_client_handle_t mch)
2098 {
2099 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
2100 mac_impl_t *mip = mcip->mci_mip;
2101
2102 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2103
2104 if (MCIP_DATAPATH_SETUP(mcip)) {
2105 (void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
2106 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2107 mac_rx_classify_flow_restart, NULL);
2108 }
2109 }
2110
2111 /*
2112 * This function only quiesces the Tx SRS and softring worker threads. Callers
2113 * need to make sure that there aren't any mac client threads doing current or
2114 * future transmits in the mac before calling this function.
2115 */
2116 void
2117 mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
2118 {
2119 mac_client_impl_t *mcip = srs->srs_mcip;
2120
2121 ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2122
2123 ASSERT(srs->srs_type & SRST_TX);
2124 ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
2125 srs_quiesce_flag == SRS_QUIESCE);
2126
2127 /*
2128 * Signal the SRS to quiesce itself, and then cv_wait for the
2129 * SRS quiesce to complete. The SRS worker thread will wake us
2130 * up when the quiesce is complete
2131 */
2132 mac_srs_signal(srs, srs_quiesce_flag);
2133 mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
2134 SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
2135 }
2136
2137 void
2138 mac_tx_srs_restart(mac_soft_ring_set_t *srs)
2139 {
2140 /*
2141 * Resizing the fanout could result in creation of new SRSs.
2142 * They may not necessarily be in the quiesced state in which
2143 * case it need be restarted
2144 */
2145 if (!SRS_QUIESCED(srs))
2146 return;
2147
2148 mac_srs_signal(srs, SRS_RESTART);
2149 mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2150 mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2151 }
2152
2153 /*
2154 * Temporary quiesce of a flow and associated Rx SRS.
2155 * Please see block comment above mac_rx_srs_quiesce
2156 */
2157 /* ARGSUSED */
2158 int
2159 mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
2160 {
2161 /*
2162 * The fe_tx_srs is null for a subflow on an interface that is
2163 * not plumbed
2164 */
2165 if (flent->fe_tx_srs != NULL)
2166 mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
2167 return (0);
2168 }
2169
2170 /* ARGSUSED */
2171 int
2172 mac_tx_flow_restart(flow_entry_t *flent, void *arg)
2173 {
2174 /*
2175 * The fe_tx_srs is null for a subflow on an interface that is
2176 * not plumbed
2177 */
2178 if (flent->fe_tx_srs != NULL)
2179 mac_tx_srs_restart(flent->fe_tx_srs);
2180 return (0);
2181 }
2182
2183 static void
2184 i_mac_tx_client_quiesce(mac_client_handle_t mch, uint_t srs_quiesce_flag)
2185 {
2186 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
2187
2188 ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2189
2190 mac_tx_client_block(mcip);
2191 if (MCIP_TX_SRS(mcip) != NULL) {
2192 mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
2193 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2194 mac_tx_flow_quiesce, NULL);
2195 }
2196 }
2197
/*
 * Temporarily (restartably) quiesce the client's Tx side.
 */
void
mac_tx_client_quiesce(mac_client_handle_t mch)
{
	i_mac_tx_client_quiesce(mch, SRS_QUIESCE);
}
2203
/*
 * Permanently quiesce (condemn) the client's Tx side prior to teardown.
 */
void
mac_tx_client_condemn(mac_client_handle_t mch)
{
	i_mac_tx_client_quiesce(mch, SRS_CONDEMNED);
}
2209
2210 void
2211 mac_tx_client_restart(mac_client_handle_t mch)
2212 {
2213 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
2214
2215 ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2216
2217 mac_tx_client_unblock(mcip);
2218 if (MCIP_TX_SRS(mcip) != NULL) {
2219 mac_tx_srs_restart(MCIP_TX_SRS(mcip));
2220 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2221 mac_tx_flow_restart, NULL);
2222 }
2223 }
2224
/*
 * Flush the client's Tx path by quiescing it and immediately restarting
 * it; any in-flight packets are drained by the quiesce.
 */
void
mac_tx_client_flush(mac_client_impl_t *mcip)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	mac_tx_client_quiesce((mac_client_handle_t)mcip);
	mac_tx_client_restart((mac_client_handle_t)mcip);
}
2233
/*
 * Quiesce both the Rx and Tx sides of a mac client.
 */
void
mac_client_quiesce(mac_client_impl_t *mcip)
{
	mac_rx_client_quiesce((mac_client_handle_t)mcip);
	mac_tx_client_quiesce((mac_client_handle_t)mcip);
}
2240
/*
 * Restart both the Rx and Tx sides of a quiesced mac client.
 */
void
mac_client_restart(mac_client_impl_t *mcip)
{
	mac_rx_client_restart((mac_client_handle_t)mcip);
	mac_tx_client_restart((mac_client_handle_t)mcip);
}
2247
2248 /*
2249 * Allocate a minor number.
2250 */
2251 minor_t
2252 mac_minor_hold(boolean_t sleep)
2253 {
2254 minor_t minor;
2255
2256 /*
2257 * Grab a value from the arena.
2258 */
2259 atomic_inc_32(&minor_count);
2260
2261 if (sleep)
2262 minor = (uint_t)id_alloc(minor_ids);
2263 else
2264 minor = (uint_t)id_alloc_nosleep(minor_ids);
2265
2266 if (minor == 0) {
2267 atomic_dec_32(&minor_count);
2268 return (0);
2269 }
2270
2271 return (minor);
2272 }
2273
2274 /*
2275 * Release a previously allocated minor number.
2276 */
2277 void
2278 mac_minor_rele(minor_t minor)
2279 {
2280 /*
2281 * Return the value to the arena.
2282 */
2283 id_free(minor_ids, minor);
2284 atomic_dec_32(&minor_count);
2285 }
2286
2287 uint32_t
2288 mac_no_notification(mac_handle_t mh)
2289 {
2290 mac_impl_t *mip = (mac_impl_t *)mh;
2291
2292 return (((mip->mi_state_flags & MIS_LEGACY) != 0) ?
2293 mip->mi_capab_legacy.ml_unsup_note : 0);
2294 }
2295
2296 /*
2297 * Prevent any new opens of this mac in preparation for unregister
2298 */
2299 int
2300 i_mac_disable(mac_impl_t *mip)
2301 {
2302 mac_client_impl_t *mcip;
2303
2304 rw_enter(&i_mac_impl_lock, RW_WRITER);
2305 if (mip->mi_state_flags & MIS_DISABLED) {
2306 /* Already disabled, return success */
2307 rw_exit(&i_mac_impl_lock);
2308 return (0);
2309 }
2310 /*
2311 * See if there are any other references to this mac_t (e.g., VLAN's).
2312 * If so return failure. If all the other checks below pass, then
2313 * set mi_disabled atomically under the i_mac_impl_lock to prevent
2314 * any new VLAN's from being created or new mac client opens of this
2315 * mac end point.
2316 */
2317 if (mip->mi_ref > 0) {
2318 rw_exit(&i_mac_impl_lock);
2319 return (EBUSY);
2320 }
2321
2322 /*
2323 * mac clients must delete all multicast groups they join before
2324 * closing. bcast groups are reference counted, the last client
2325 * to delete the group will wait till the group is physically
2326 * deleted. Since all clients have closed this mac end point
2327 * mi_bcast_ngrps must be zero at this point
2328 */
2329 ASSERT(mip->mi_bcast_ngrps == 0);
2330
2331 /*
2332 * Don't let go of this if it has some flows.
2333 * All other code guarantees no flows are added to a disabled
2334 * mac, therefore it is sufficient to check for the flow table
2335 * only here.
2336 */
2337 mcip = mac_primary_client_handle(mip);
2338 if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
2339 rw_exit(&i_mac_impl_lock);
2340 return (ENOTEMPTY);
2341 }
2342
2343 mip->mi_state_flags |= MIS_DISABLED;
2344 rw_exit(&i_mac_impl_lock);
2345 return (0);
2346 }
2347
2348 int
2349 mac_disable_nowait(mac_handle_t mh)
2350 {
2351 mac_impl_t *mip = (mac_impl_t *)mh;
2352 int err;
2353
2354 if ((err = i_mac_perim_enter_nowait(mip)) != 0)
2355 return (err);
2356 err = i_mac_disable(mip);
2357 i_mac_perim_exit(mip);
2358 return (err);
2359 }
2360
2361 int
2362 mac_disable(mac_handle_t mh)
2363 {
2364 mac_impl_t *mip = (mac_impl_t *)mh;
2365 int err;
2366
2367 i_mac_perim_enter(mip);
2368 err = i_mac_disable(mip);
2369 i_mac_perim_exit(mip);
2370
2371 /*
2372 * Clean up notification thread and wait for it to exit.
2373 */
2374 if (err == 0)
2375 i_mac_notify_exit(mip);
2376
2377 return (err);
2378 }
2379
2380 /*
2381 * Called when the MAC instance has a non empty flow table, to de-multiplex
2382 * incoming packets to the right flow.
2383 * The MAC's rw lock is assumed held as a READER.
2384 */
2385 /* ARGSUSED */
2386 static mblk_t *
2387 mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
2388 {
2389 flow_entry_t *flent = NULL;
2390 uint_t flags = FLOW_INBOUND;
2391 int err;
2392
2393 /*
2394 * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
2395 * to mac_flow_lookup() so that the VLAN packets can be successfully
2396 * passed to the non-VLAN aggregation flows.
2397 *
2398 * Note that there is possibly a race between this and
2399 * mac_unicast_remove/add() and VLAN packets could be incorrectly
2400 * classified to non-VLAN flows of non-aggregation mac clients. These
2401 * VLAN packets will be then filtered out by the mac module.
2402 */
2403 if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
2404 flags |= FLOW_IGNORE_VLAN;
2405
2406 err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
2407 if (err != 0) {
2408 /* no registered receive function */
2409 return (mp);
2410 } else {
2411 mac_client_impl_t *mcip;
2412
2413 /*
2414 * This flent might just be an additional one on the MAC client,
2415 * i.e. for classification purposes (different fdesc), however
2416 * the resources, SRS et. al., are in the mci_flent, so if
2417 * this isn't the mci_flent, we need to get it.
2418 */
2419 if ((mcip = flent->fe_mcip) != NULL &&
2420 mcip->mci_flent != flent) {
2421 FLOW_REFRELE(flent);
2422 flent = mcip->mci_flent;
2423 FLOW_TRY_REFHOLD(flent, err);
2424 if (err != 0)
2425 return (mp);
2426 }
2427 (flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
2428 B_FALSE);
2429 FLOW_REFRELE(flent);
2430 }
2431 return (NULL);
2432 }
2433
2434 mblk_t *
2435 mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
2436 {
2437 mac_impl_t *mip = (mac_impl_t *)mh;
2438 mblk_t *bp, *bp1, **bpp, *list = NULL;
2439
2440 /*
2441 * We walk the chain and attempt to classify each packet.
2442 * The packets that couldn't be classified will be returned
2443 * back to the caller.
2444 */
2445 bp = mp_chain;
2446 bpp = &list;
2447 while (bp != NULL) {
2448 bp1 = bp;
2449 bp = bp->b_next;
2450 bp1->b_next = NULL;
2451
2452 if (mac_rx_classify(mip, mrh, bp1) != NULL) {
2453 *bpp = bp1;
2454 bpp = &bp1->b_next;
2455 }
2456 }
2457 return (list);
2458 }
2459
2460 static int
2461 mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
2462 {
2463 mac_ring_handle_t ring = arg;
2464
2465 if (flent->fe_tx_srs)
2466 mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
2467 return (0);
2468 }
2469
/*
 * Notify every client of this mac that the given Tx ring has resources
 * again: wake each client's Tx SRS (or invoke its flow-control callbacks
 * for exclusive-mode clients such as aggr ports), plus the Tx SRSs of all
 * subflows.
 */
void
i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
{
	mac_client_impl_t	*cclient;
	mac_soft_ring_set_t	*mac_srs;

	/*
	 * After grabbing the mi_rw_lock, the list of clients can't change.
	 * If there are any clients mi_disabled must be B_FALSE and can't
	 * get set since there are clients. If there aren't any clients we
	 * don't do anything. In any case the mip has to be valid. The driver
	 * must make sure that it goes single threaded (with respect to mac
	 * calls) and wait for all pending mac calls to finish before calling
	 * mac_unregister.
	 */
	rw_enter(&i_mac_impl_lock, RW_READER);
	if (mip->mi_state_flags & MIS_DISABLED) {
		rw_exit(&i_mac_impl_lock);
		return;
	}

	/*
	 * Get MAC tx srs from walking mac_client_handle list.
	 */
	rw_enter(&mip->mi_rw_lock, RW_READER);
	for (cclient = mip->mi_clients_list; cclient != NULL;
	    cclient = cclient->mci_client_next) {
		if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL) {
			mac_tx_srs_wakeup(mac_srs, ring);
		} else {
			/*
			 * Aggr opens underlying ports in exclusive mode
			 * and registers flow control callbacks using
			 * mac_tx_client_notify(). When opened in
			 * exclusive mode, Tx SRS won't be created
			 * during mac_unicast_add().
			 */
			if (cclient->mci_state_flags & MCIS_EXCLUSIVE) {
				mac_tx_invoke_callbacks(cclient,
				    (mac_tx_cookie_t)ring);
			}
		}
		/* Also wake the Tx SRSs of this client's subflows */
		(void) mac_flow_walk(cclient->mci_subflow_tab,
		    mac_tx_flow_srs_wakeup, ring);
	}
	rw_exit(&mip->mi_rw_lock);
	rw_exit(&i_mac_impl_lock);
}
2518
2519 /* ARGSUSED */
2520 void
2521 mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
2522 boolean_t add)
2523 {
2524 mac_impl_t *mip = (mac_impl_t *)mh;
2525
2526 i_mac_perim_enter((mac_impl_t *)mh);
2527 /*
2528 * If no specific refresh function was given then default to the
2529 * driver's m_multicst entry point.
2530 */
2531 if (refresh == NULL) {
2532 refresh = mip->mi_multicst;
2533 arg = mip->mi_driver;
2534 }
2535
2536 mac_bcast_refresh(mip, refresh, arg, add);
2537 i_mac_perim_exit((mac_impl_t *)mh);
2538 }
2539
2540 void
2541 mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
2542 {
2543 mac_impl_t *mip = (mac_impl_t *)mh;
2544
2545 /*
2546 * If no specific refresh function was given then default to the
2547 * driver's m_promisc entry point.
2548 */
2549 if (refresh == NULL) {
2550 refresh = mip->mi_setpromisc;
2551 arg = mip->mi_driver;
2552 }
2553 ASSERT(refresh != NULL);
2554
2555 /*
2556 * Call the refresh function with the current promiscuity.
2557 */
2558 refresh(arg, (mip->mi_devpromisc != 0));
2559 }
2560
2561 /*
2562 * The mac client requests that the mac not to change its margin size to
2563 * be less than the specified value. If "current" is B_TRUE, then the client
2564 * requests the mac not to change its margin size to be smaller than the
2565 * current size. Further, return the current margin size value in this case.
2566 *
2567 * We keep every requested size in an ordered list from largest to smallest.
2568 */
2569 int
2570 mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
2571 {
2572 mac_impl_t *mip = (mac_impl_t *)mh;
2573 mac_margin_req_t **pp, *p;
2574 int err = 0;
2575
2576 rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2577 if (current)
2578 *marginp = mip->mi_margin;
2579
2580 /*
2581 * If the current margin value cannot satisfy the margin requested,
2582 * return ENOTSUP directly.
2583 */
2584 if (*marginp > mip->mi_margin) {
2585 err = ENOTSUP;
2586 goto done;
2587 }
2588
2589 /*
2590 * Check whether the given margin is already in the list. If so,
2591 * bump the reference count.
2592 */
2593 for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
2594 if (p->mmr_margin == *marginp) {
2595 /*
2596 * The margin requested is already in the list,
2597 * so just bump the reference count.
2598 */
2599 p->mmr_ref++;
2600 goto done;
2601 }
2602 if (p->mmr_margin < *marginp)
2603 break;
2604 }
2605
2606
2607 p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
2608 p->mmr_margin = *marginp;
2609 p->mmr_ref++;
2610 p->mmr_nextp = *pp;
2611 *pp = p;
2612
2613 done:
2614 rw_exit(&(mip->mi_rw_lock));
2615 return (err);
2616 }
2617
2618 /*
2619 * The mac client requests to cancel its previous mac_margin_add() request.
2620 * We remove the requested margin size from the list.
2621 */
2622 int
2623 mac_margin_remove(mac_handle_t mh, uint32_t margin)
2624 {
2625 mac_impl_t *mip = (mac_impl_t *)mh;
2626 mac_margin_req_t **pp, *p;
2627 int err = 0;
2628
2629 rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2630 /*
2631 * Find the entry in the list for the given margin.
2632 */
2633 for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
2634 if (p->mmr_margin == margin) {
2635 if (--p->mmr_ref == 0)
2636 break;
2637
2638 /*
2639 * There is still a reference to this address so
2640 * there's nothing more to do.
2641 */
2642 goto done;
2643 }
2644 }
2645
2646 /*
2647 * We did not find an entry for the given margin.
2648 */
2649 if (p == NULL) {
2650 err = ENOENT;
2651 goto done;
2652 }
2653
2654 ASSERT(p->mmr_ref == 0);
2655
2656 /*
2657 * Remove it from the list.
2658 */
2659 *pp = p->mmr_nextp;
2660 kmem_free(p, sizeof (mac_margin_req_t));
2661 done:
2662 rw_exit(&(mip->mi_rw_lock));
2663 return (err);
2664 }
2665
2666 boolean_t
2667 mac_margin_update(mac_handle_t mh, uint32_t margin)
2668 {
2669 mac_impl_t *mip = (mac_impl_t *)mh;
2670 uint32_t margin_needed = 0;
2671
2672 rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2673
2674 if (mip->mi_mmrp != NULL)
2675 margin_needed = mip->mi_mmrp->mmr_margin;
2676
2677 if (margin_needed <= margin)
2678 mip->mi_margin = margin;
2679
2680 rw_exit(&(mip->mi_rw_lock));
2681
2682 if (margin_needed <= margin)
2683 i_mac_notify(mip, MAC_NOTE_MARGIN);
2684
2685 return (margin_needed <= margin);
2686 }
2687
2688 /*
2689 * MAC clients use this interface to request that a MAC device not change its
2690 * MTU below the specified amount. At this time, that amount must be within the
2691 * range of the device's current minimum and the device's current maximum. eg. a
2692 * client cannot request a 3000 byte MTU when the device's MTU is currently
2693 * 2000.
2694 *
2695 * If "current" is set to B_TRUE, then the request is to simply to reserve the
2696 * current underlying mac's maximum for this mac client and return it in mtup.
2697 */
2698 int
2699 mac_mtu_add(mac_handle_t mh, uint32_t *mtup, boolean_t current)
2700 {
2701 mac_impl_t *mip = (mac_impl_t *)mh;
2702 mac_mtu_req_t *prev, *cur;
2703 mac_propval_range_t mpr;
2704 int err;
2705
2706 i_mac_perim_enter(mip);
2707 rw_enter(&mip->mi_rw_lock, RW_WRITER);
2708
2709 if (current == B_TRUE)
2710 *mtup = mip->mi_sdu_max;
2711 mpr.mpr_count = 1;
2712 err = mac_prop_info(mh, MAC_PROP_MTU, "mtu", NULL, 0, &mpr, NULL);
2713 if (err != 0) {
2714 rw_exit(&mip->mi_rw_lock);
2715 i_mac_perim_exit(mip);
2716 return (err);
2717 }
2718
2719 if (*mtup > mip->mi_sdu_max ||
2720 *mtup < mpr.mpr_range_uint32[0].mpur_min) {
2721 rw_exit(&mip->mi_rw_lock);
2722 i_mac_perim_exit(mip);
2723 return (ENOTSUP);
2724 }
2725
2726 prev = NULL;
2727 for (cur = mip->mi_mtrp; cur != NULL; cur = cur->mtr_nextp) {
2728 if (*mtup == cur->mtr_mtu) {
2729 cur->mtr_ref++;
2730 rw_exit(&mip->mi_rw_lock);
2731 i_mac_perim_exit(mip);
2732 return (0);
2733 }
2734
2735 if (*mtup > cur->mtr_mtu)
2736 break;
2737
2738 prev = cur;
2739 }
2740
2741 cur = kmem_alloc(sizeof (mac_mtu_req_t), KM_SLEEP);
2742 cur->mtr_mtu = *mtup;
2743 cur->mtr_ref = 1;
2744 if (prev != NULL) {
2745 cur->mtr_nextp = prev->mtr_nextp;
2746 prev->mtr_nextp = cur;
2747 } else {
2748 cur->mtr_nextp = mip->mi_mtrp;
2749 mip->mi_mtrp = cur;
2750 }
2751
2752 rw_exit(&mip->mi_rw_lock);
2753 i_mac_perim_exit(mip);
2754 return (0);
2755 }
2756
2757 int
2758 mac_mtu_remove(mac_handle_t mh, uint32_t mtu)
2759 {
2760 mac_impl_t *mip = (mac_impl_t *)mh;
2761 mac_mtu_req_t *cur, *prev;
2762
2763 i_mac_perim_enter(mip);
2764 rw_enter(&mip->mi_rw_lock, RW_WRITER);
2765
2766 prev = NULL;
2767 for (cur = mip->mi_mtrp; cur != NULL; cur = cur->mtr_nextp) {
2768 if (cur->mtr_mtu == mtu) {
2769 ASSERT(cur->mtr_ref > 0);
2770 cur->mtr_ref--;
2771 if (cur->mtr_ref == 0) {
2772 if (prev == NULL) {
2773 mip->mi_mtrp = cur->mtr_nextp;
2774 } else {
2775 prev->mtr_nextp = cur->mtr_nextp;
2776 }
2777 kmem_free(cur, sizeof (mac_mtu_req_t));
2778 }
2779 rw_exit(&mip->mi_rw_lock);
2780 i_mac_perim_exit(mip);
2781 return (0);
2782 }
2783
2784 prev = cur;
2785 }
2786
2787 rw_exit(&mip->mi_rw_lock);
2788 i_mac_perim_exit(mip);
2789 return (ENOENT);
2790 }
2791
2792 /*
2793 * MAC Type Plugin functions.
2794 */
2795
/*
 * Look up a MAC type plugin by name, loading its module on first use.
 * On success, returns the plugin with its reference count bumped;
 * returns NULL if the plugin cannot be found or loaded.
 */
mactype_t *
mactype_getplugin(const char *pname)
{
	mactype_t	*mtype = NULL;
	boolean_t	tried_modload = B_FALSE;

	mutex_enter(&i_mactype_lock);

find_registered_mactype:
	if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
	    (mod_hash_val_t *)&mtype) != 0) {
		if (!tried_modload) {
			/*
			 * If the plugin has not yet been loaded, then
			 * attempt to load it now. If modload() succeeds,
			 * the plugin should have registered using
			 * mactype_register(), in which case we can go back
			 * and attempt to find it again.
			 */
			if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
				tried_modload = B_TRUE;
				goto find_registered_mactype;
			}
		}
	} else {
		/*
		 * Note that there's no danger that the plugin we've loaded
		 * could be unloaded between the modload() step and the
		 * reference count bump here, as we're holding
		 * i_mactype_lock, which mactype_unregister() also holds.
		 */
		atomic_inc_32(&mtype->mt_ref);
	}

	mutex_exit(&i_mactype_lock);
	return (mtype);
}
2833
2834 mactype_register_t *
2835 mactype_alloc(uint_t mactype_version)
2836 {
2837 mactype_register_t *mtrp;
2838
2839 /*
2840 * Make sure there isn't a version mismatch between the plugin and
2841 * the framework. In the future, if multiple versions are
2842 * supported, this check could become more sophisticated.
2843 */
2844 if (mactype_version != MACTYPE_VERSION)
2845 return (NULL);
2846
2847 mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
2848 mtrp->mtr_version = mactype_version;
2849 return (mtrp);
2850 }
2851
/*
 * Free a registration descriptor obtained from mactype_alloc().
 */
void
mactype_free(mactype_register_t *mtrp)
{
	kmem_free(mtrp, sizeof (mactype_register_t));
}
2857
2858 int
2859 mactype_register(mactype_register_t *mtrp)
2860 {
2861 mactype_t *mtp;
2862 mactype_ops_t *ops = mtrp->mtr_ops;
2863
2864 /* Do some sanity checking before we register this MAC type. */
2865 if (mtrp->mtr_ident == NULL || ops == NULL)
2866 return (EINVAL);
2867
2868 /*
2869 * Verify that all mandatory callbacks are set in the ops
2870 * vector.
2871 */
2872 if (ops->mtops_unicst_verify == NULL ||
2873 ops->mtops_multicst_verify == NULL ||
2874 ops->mtops_sap_verify == NULL ||
2875 ops->mtops_header == NULL ||
2876 ops->mtops_header_info == NULL) {
2877 return (EINVAL);
2878 }
2879
2880 mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP);
2881 mtp->mt_ident = mtrp->mtr_ident;
2882 mtp->mt_ops = *ops;
2883 mtp->mt_type = mtrp->mtr_mactype;
2884 mtp->mt_nativetype = mtrp->mtr_nativetype;
2885 mtp->mt_addr_length = mtrp->mtr_addrlen;
2886 if (mtrp->mtr_brdcst_addr != NULL) {
2887 mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP);
2888 bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr,
2889 mtrp->mtr_addrlen);
2890 }
2891
2892 mtp->mt_stats = mtrp->mtr_stats;
2893 mtp->mt_statcount = mtrp->mtr_statcount;
2894
2895 mtp->mt_mapping = mtrp->mtr_mapping;
2896 mtp->mt_mappingcount = mtrp->mtr_mappingcount;
2897
2898 if (mod_hash_insert(i_mactype_hash,
2899 (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) {
2900 kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2901 kmem_free(mtp, sizeof (*mtp));
2902 return (EEXIST);
2903 }
2904 return (0);
2905 }
2906
/*
 * Unregister a MAC type plugin by its ident.  Returns 0 on success,
 * ENXIO if no such plugin is registered, EBUSY if it still has users.
 */
int
mactype_unregister(const char *ident)
{
	mactype_t	*mtp;
	mod_hash_val_t	val;
	int 		err;

	/*
	 * Let's not allow MAC drivers to use this plugin while we're
	 * trying to unregister it. Holding i_mactype_lock also prevents a
	 * plugin from unregistering while a MAC driver is attempting to
	 * hold a reference to it in i_mactype_getplugin().
	 */
	mutex_enter(&i_mactype_lock);

	if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident,
	    (mod_hash_val_t *)&mtp)) != 0) {
		/* A plugin is trying to unregister, but it never registered. */
		err = ENXIO;
		goto done;
	}

	if (mtp->mt_ref != 0) {
		err = EBUSY;
		goto done;
	}

	err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val);
	ASSERT(err == 0);
	if (err != 0) {
		/* This should never happen, thus the ASSERT() above. */
		err = EINVAL;
		goto done;
	}
	ASSERT(mtp == (mactype_t *)val);

	/* Broadcast address is optional; only free it if present */
	if (mtp->mt_brdcst_addr != NULL)
		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
	kmem_free(mtp, sizeof (mactype_t));
done:
	mutex_exit(&i_mactype_lock);
	return (err);
}
2950
2951 /*
2952 * Checks the size of the value size specified for a property as
2953 * part of a property operation. Returns B_TRUE if the size is
2954 * correct, B_FALSE otherwise.
2955 */
2956 boolean_t
2957 mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range)
2958 {
2959 uint_t minsize = 0;
2960
2961 if (is_range)
2962 return (valsize >= sizeof (mac_propval_range_t));
2963
2964 switch (id) {
2965 case MAC_PROP_ZONE:
2966 minsize = sizeof (dld_ioc_zid_t);
2967 break;
2968 case MAC_PROP_AUTOPUSH:
2969 if (valsize != 0)
2970 minsize = sizeof (struct dlautopush);
2971 break;
2972 case MAC_PROP_TAGMODE:
2973 minsize = sizeof (link_tagmode_t);
2974 break;
2975 case MAC_PROP_RESOURCE:
2976 case MAC_PROP_RESOURCE_EFF:
2977 minsize = sizeof (mac_resource_props_t);
2978 break;
2979 case MAC_PROP_DUPLEX:
2980 minsize = sizeof (link_duplex_t);
2981 break;
2982 case MAC_PROP_SPEED:
2983 minsize = sizeof (uint64_t);
2984 break;
2985 case MAC_PROP_STATUS:
2986 minsize = sizeof (link_state_t);
2987 break;
2988 case MAC_PROP_AUTONEG:
2989 case MAC_PROP_EN_AUTONEG:
2990 minsize = sizeof (uint8_t);
2991 break;
2992 case MAC_PROP_MTU:
2993 case MAC_PROP_LLIMIT:
2994 case MAC_PROP_LDECAY:
2995 minsize = sizeof (uint32_t);
2996 break;
2997 case MAC_PROP_FLOWCTRL:
2998 minsize = sizeof (link_flowctrl_t);
2999 break;
3000 case MAC_PROP_ADV_10GFDX_CAP:
3001 case MAC_PROP_EN_10GFDX_CAP:
3002 case MAC_PROP_ADV_1000HDX_CAP:
3003 case MAC_PROP_EN_1000HDX_CAP:
3004 case MAC_PROP_ADV_100FDX_CAP:
3005 case MAC_PROP_EN_100FDX_CAP:
3006 case MAC_PROP_ADV_100HDX_CAP:
3007 case MAC_PROP_EN_100HDX_CAP:
3008 case MAC_PROP_ADV_10FDX_CAP:
3009 case MAC_PROP_EN_10FDX_CAP:
3010 case MAC_PROP_ADV_10HDX_CAP:
3011 case MAC_PROP_EN_10HDX_CAP:
3012 case MAC_PROP_ADV_100T4_CAP:
3013 case MAC_PROP_EN_100T4_CAP:
3014 minsize = sizeof (uint8_t);
3015 break;
3016 case MAC_PROP_PVID:
3017 minsize = sizeof (uint16_t);
3018 break;
3019 case MAC_PROP_IPTUN_HOPLIMIT:
3020 minsize = sizeof (uint32_t);
3021 break;
3022 case MAC_PROP_IPTUN_ENCAPLIMIT:
3023 minsize = sizeof (uint32_t);
3024 break;
3025 case MAC_PROP_MAX_TX_RINGS_AVAIL:
3026 case MAC_PROP_MAX_RX_RINGS_AVAIL:
3027 case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3028 case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3029 minsize = sizeof (uint_t);
3030 break;
3031 case MAC_PROP_WL_ESSID:
3032 minsize = sizeof (wl_linkstatus_t);
3033 break;
3034 case MAC_PROP_WL_BSSID:
3035 minsize = sizeof (wl_bssid_t);
3036 break;
3037 case MAC_PROP_WL_BSSTYPE:
3038 minsize = sizeof (wl_bss_type_t);
3039 break;
3040 case MAC_PROP_WL_LINKSTATUS:
3041 minsize = sizeof (wl_linkstatus_t);
3042 break;
3043 case MAC_PROP_WL_DESIRED_RATES:
3044 minsize = sizeof (wl_rates_t);
3045 break;
3046 case MAC_PROP_WL_SUPPORTED_RATES:
3047 minsize = sizeof (wl_rates_t);
3048 break;
3049 case MAC_PROP_WL_AUTH_MODE:
3050 minsize = sizeof (wl_authmode_t);
3051 break;
3052 case MAC_PROP_WL_ENCRYPTION:
3053 minsize = sizeof (wl_encryption_t);
3054 break;
3055 case MAC_PROP_WL_RSSI:
3056 minsize = sizeof (wl_rssi_t);
3057 break;
3058 case MAC_PROP_WL_PHY_CONFIG:
3059 minsize = sizeof (wl_phy_conf_t);
3060 break;
3061 case MAC_PROP_WL_CAPABILITY:
3062 minsize = sizeof (wl_capability_t);
3063 break;
3064 case MAC_PROP_WL_WPA:
3065 minsize = sizeof (wl_wpa_t);
3066 break;
3067 case MAC_PROP_WL_SCANRESULTS:
3068 minsize = sizeof (wl_wpa_ess_t);
3069 break;
3070 case MAC_PROP_WL_POWER_MODE:
3071 minsize = sizeof (wl_ps_mode_t);
3072 break;
3073 case MAC_PROP_WL_RADIO:
3074 minsize = sizeof (wl_radio_t);
3075 break;
3076 case MAC_PROP_WL_ESS_LIST:
3077 minsize = sizeof (wl_ess_list_t);
3078 break;
3079 case MAC_PROP_WL_KEY_TAB:
3080 minsize = sizeof (wl_wep_key_tab_t);
3081 break;
3082 case MAC_PROP_WL_CREATE_IBSS:
3083 minsize = sizeof (wl_create_ibss_t);
3084 break;
3085 case MAC_PROP_WL_SETOPTIE:
3086 minsize = sizeof (wl_wpa_ie_t);
3087 break;
3088 case MAC_PROP_WL_DELKEY:
3089 minsize = sizeof (wl_del_key_t);
3090 break;
3091 case MAC_PROP_WL_KEY:
3092 minsize = sizeof (wl_key_t);
3093 break;
3094 case MAC_PROP_WL_MLME:
3095 minsize = sizeof (wl_mlme_t);
3096 break;
3097 }
3098
3099 return (valsize >= minsize);
3100 }
3101
3102 /*
3103 * mac_set_prop() sets MAC or hardware driver properties:
3104 *
3105 * - MAC-managed properties such as resource properties include maxbw,
3106 * priority, and cpu binding list, as well as the default port VID
3107 * used by bridging. These properties are consumed by the MAC layer
3108 * itself and not passed down to the driver. For resource control
3109 * properties, this function invokes mac_set_resources() which will
3110 * cache the property value in mac_impl_t and may call
3111 * mac_client_set_resource() to update property value of the primary
3112 * mac client, if it exists.
3113 *
3114 * - Properties which act on the hardware and must be passed to the
3115 * driver, such as MTU, through the driver's mc_setprop() entry point.
3116 */
3117 int
3118 mac_set_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
3119 uint_t valsize)
3120 {
3121 int err = ENOTSUP;
3122 mac_impl_t *mip = (mac_impl_t *)mh;
3123
3124 ASSERT(MAC_PERIM_HELD(mh));
3125
3126 switch (id) {
3127 case MAC_PROP_RESOURCE: {
3128 mac_resource_props_t *mrp;
3129
3130 /* call mac_set_resources() for MAC properties */
3131 ASSERT(valsize >= sizeof (mac_resource_props_t));
3132 mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3133 bcopy(val, mrp, sizeof (*mrp));
3134 err = mac_set_resources(mh, mrp);
3135 kmem_free(mrp, sizeof (*mrp));
3136 break;
3137 }
3138
3139 case MAC_PROP_PVID:
3140 ASSERT(valsize >= sizeof (uint16_t));
3141 if (mip->mi_state_flags & MIS_IS_VNIC)
3142 return (EINVAL);
3143 err = mac_set_pvid(mh, *(uint16_t *)val);
3144 break;
3145
3146 case MAC_PROP_MTU: {
3147 uint32_t mtu;
3148
3149 ASSERT(valsize >= sizeof (uint32_t));
3150 bcopy(val, &mtu, sizeof (mtu));
3151 err = mac_set_mtu(mh, mtu, NULL);
3152 break;
3153 }
3154
3155 case MAC_PROP_LLIMIT:
3156 case MAC_PROP_LDECAY: {
3157 uint32_t learnval;
3158
3159 if (valsize < sizeof (learnval) ||
3160 (mip->mi_state_flags & MIS_IS_VNIC))
3161 return (EINVAL);
3162 bcopy(val, &learnval, sizeof (learnval));
3163 if (learnval == 0 && id == MAC_PROP_LDECAY)
3164 return (EINVAL);
3165 if (id == MAC_PROP_LLIMIT)
3166 mip->mi_llimit = learnval;
3167 else
3168 mip->mi_ldecay = learnval;
3169 err = 0;
3170 break;
3171 }
3172
3173 default:
3174 /* For other driver properties, call driver's callback */
3175 if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
3176 err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
3177 name, id, valsize, val);
3178 }
3179 }
3180 return (err);
3181 }
3182
3183 /*
3184 * mac_get_prop() gets MAC or device driver properties.
3185 *
3186 * If the property is a driver property, mac_get_prop() calls driver's callback
3187 * entry point to get it.
3188 * If the property is a MAC property, mac_get_prop() invokes mac_get_resources()
3189 * which returns the cached value in mac_impl_t.
3190 */
int
mac_get_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
    uint_t valsize)
{
	int err = ENOTSUP;
	mac_impl_t *mip = (mac_impl_t *)mh;
	uint_t rings;
	uint_t vlinks;

	/* Start from a zeroed buffer so short values don't leak old bytes. */
	bzero(val, valsize);

	switch (id) {
	case MAC_PROP_RESOURCE: {
		mac_resource_props_t *mrp;

		/* If mac property, read from cache */
		ASSERT(valsize >= sizeof (mac_resource_props_t));
		mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
		mac_get_resources(mh, mrp);
		bcopy(mrp, val, sizeof (*mrp));
		kmem_free(mrp, sizeof (*mrp));
		return (0);
	}
	case MAC_PROP_RESOURCE_EFF: {
		mac_resource_props_t *mrp;

		/* If mac effective property, read from client */
		ASSERT(valsize >= sizeof (mac_resource_props_t));
		mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
		mac_get_effective_resources(mh, mrp);
		bcopy(mrp, val, sizeof (*mrp));
		kmem_free(mrp, sizeof (*mrp));
		return (0);
	}

	case MAC_PROP_PVID:
		ASSERT(valsize >= sizeof (uint16_t));
		/* A VNIC has no port VID of its own. */
		if (mip->mi_state_flags & MIS_IS_VNIC)
			return (EINVAL);
		*(uint16_t *)val = mac_get_pvid(mh);
		return (0);

	case MAC_PROP_LLIMIT:
	case MAC_PROP_LDECAY:
		/* Bridging learn limit/decay, cached in the mac_impl_t. */
		ASSERT(valsize >= sizeof (uint32_t));
		if (mip->mi_state_flags & MIS_IS_VNIC)
			return (EINVAL);
		if (id == MAC_PROP_LLIMIT)
			bcopy(&mip->mi_llimit, val, sizeof (mip->mi_llimit));
		else
			bcopy(&mip->mi_ldecay, val, sizeof (mip->mi_ldecay));
		return (0);

	case MAC_PROP_MTU: {
		uint32_t sdu;

		ASSERT(valsize >= sizeof (uint32_t));
		/* Current (not max/min) SDU is the reported MTU. */
		mac_sdu_get2(mh, NULL, &sdu, NULL);
		bcopy(&sdu, val, sizeof (sdu));

		return (0);
	}
	case MAC_PROP_STATUS: {
		link_state_t link_state;

		if (valsize < sizeof (link_state))
			return (EINVAL);
		link_state = mac_link_get(mh);
		bcopy(&link_state, val, sizeof (link_state));

		return (0);
	}

	case MAC_PROP_MAX_RX_RINGS_AVAIL:
	case MAC_PROP_MAX_TX_RINGS_AVAIL:
		ASSERT(valsize >= sizeof (uint_t));
		rings = id == MAC_PROP_MAX_RX_RINGS_AVAIL ?
		    mac_rxavail_get(mh) : mac_txavail_get(mh);
		bcopy(&rings, val, sizeof (uint_t));
		return (0);

	case MAC_PROP_MAX_RXHWCLNT_AVAIL:
	case MAC_PROP_MAX_TXHWCLNT_AVAIL:
		ASSERT(valsize >= sizeof (uint_t));
		vlinks = id == MAC_PROP_MAX_RXHWCLNT_AVAIL ?
		    mac_rxhwlnksavail_get(mh) : mac_txhwlnksavail_get(mh);
		bcopy(&vlinks, val, sizeof (uint_t));
		return (0);

	case MAC_PROP_RXRINGSRANGE:
	case MAC_PROP_TXRINGSRANGE:
		/*
		 * The value for these properties are returned through
		 * the MAC_PROP_RESOURCE property.
		 */
		return (0);

	default:
		break;

	}

	/* If driver property, request from driver */
	if (mip->mi_callbacks->mc_callbacks & MC_GETPROP) {
		err = mip->mi_callbacks->mc_getprop(mip->mi_driver, name, id,
		    valsize, val);
	}

	return (err);
}
3301
3302 /*
3303 * Helper function to initialize the range structure for use in
3304 * mac_get_prop. If the type can be other than uint32, we can
3305 * pass that as an arg.
3306 */
3307 static void
3308 _mac_set_range(mac_propval_range_t *range, uint32_t min, uint32_t max)
3309 {
3310 range->mpr_count = 1;
3311 range->mpr_type = MAC_PROPVAL_UINT32;
3312 range->mpr_range_uint32[0].mpur_min = min;
3313 range->mpr_range_uint32[0].mpur_max = max;
3314 }
3315
3316 /*
3317 * Returns information about the specified property, such as default
3318 * values or permissions.
3319 */
int
mac_prop_info(mac_handle_t mh, mac_prop_id_t id, char *name,
    void *default_val, uint_t default_size, mac_propval_range_t *range,
    uint_t *perm)
{
	mac_prop_info_state_t state;
	mac_impl_t *mip = (mac_impl_t *)mh;
	uint_t max;

	/*
	 * A property is read/write by default unless the driver says
	 * otherwise.
	 */
	if (perm != NULL)
		*perm = MAC_PROP_PERM_RW;

	if (default_val != NULL)
		bzero(default_val, default_size);

	/*
	 * First, handle framework properties for which we don't need to
	 * involve the driver.
	 */
	switch (id) {
	case MAC_PROP_RESOURCE:
	case MAC_PROP_PVID:
	case MAC_PROP_LLIMIT:
	case MAC_PROP_LDECAY:
		return (0);

	case MAC_PROP_MAX_RX_RINGS_AVAIL:
	case MAC_PROP_MAX_TX_RINGS_AVAIL:
	case MAC_PROP_MAX_RXHWCLNT_AVAIL:
	case MAC_PROP_MAX_TXHWCLNT_AVAIL:
		/* Computed by the MAC layer; read-only to callers. */
		if (perm != NULL)
			*perm = MAC_PROP_PERM_READ;
		return (0);

	case MAC_PROP_RXRINGSRANGE:
	case MAC_PROP_TXRINGSRANGE:
		/*
		 * Currently, we support range for RX and TX rings properties.
		 * When we extend this support to maxbw, cpus and priority,
		 * we should move this to mac_get_resources.
		 * There is no default value for RX or TX rings.
		 */
		if ((mip->mi_state_flags & MIS_IS_VNIC) &&
		    mac_is_vnic_primary(mh)) {
			/*
			 * We don't support setting rings for a VLAN
			 * data link because it shares its ring with the
			 * primary MAC client.
			 */
			if (perm != NULL)
				*perm = MAC_PROP_PERM_READ;
			if (range != NULL)
				range->mpr_count = 0;
		} else if (range != NULL) {
			/* For a VNIC, answer from the underlying MAC. */
			if (mip->mi_state_flags & MIS_IS_VNIC)
				mh = mac_get_lower_mac_handle(mh);
			mip = (mac_impl_t *)mh;
			if ((id == MAC_PROP_RXRINGSRANGE &&
			    mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) ||
			    (id == MAC_PROP_TXRINGSRANGE &&
			    mip->mi_tx_group_type == MAC_GROUP_TYPE_STATIC)) {
				if (id == MAC_PROP_RXRINGSRANGE) {
					if ((mac_rxhwlnksavail_get(mh) +
					    mac_rxhwlnksrsvd_get(mh)) <= 1) {
						/*
						 * doesn't support groups or
						 * rings
						 */
						range->mpr_count = 0;
					} else {
						/*
						 * supports specifying groups,
						 * but not rings
						 */
						_mac_set_range(range, 0, 0);
					}
				} else {
					if ((mac_txhwlnksavail_get(mh) +
					    mac_txhwlnksrsvd_get(mh)) <= 1) {
						/*
						 * doesn't support groups or
						 * rings
						 */
						range->mpr_count = 0;
					} else {
						/*
						 * supports specifying groups,
						 * but not rings
						 */
						_mac_set_range(range, 0, 0);
					}
				}
			} else {
				max = id == MAC_PROP_RXRINGSRANGE ?
				    mac_rxavail_get(mh) + mac_rxrsvd_get(mh) :
				    mac_txavail_get(mh) + mac_txrsvd_get(mh);
				if (max <= 1) {
					/*
					 * doesn't support groups or
					 * rings
					 */
					range->mpr_count = 0;
				} else {
					/*
					 * -1 because we have to leave out the
					 * default ring.
					 */
					_mac_set_range(range, 1, max - 1);
				}
			}
		}
		return (0);

	case MAC_PROP_STATUS:
		/* Link state is inherently read-only. */
		if (perm != NULL)
			*perm = MAC_PROP_PERM_READ;
		return (0);
	}

	/*
	 * Get the property info from the driver if it implements the
	 * property info entry point.
	 */
	bzero(&state, sizeof (state));

	if (mip->mi_callbacks->mc_callbacks & MC_PROPINFO) {
		state.pr_default = default_val;
		state.pr_default_size = default_size;

		/*
		 * The caller specifies the maximum number of ranges
		 * it can accommodate using mpr_count. We don't touch
		 * this value until the driver returns from its
		 * mc_propinfo() callback, and ensure we don't exceed
		 * this number of range as the driver defines
		 * supported range from its mc_propinfo().
		 *
		 * pr_range_cur_count keeps track of how many ranges
		 * were defined by the driver from its mc_propinfo()
		 * entry point.
		 *
		 * On exit, the user-specified range mpr_count returns
		 * the number of ranges specified by the driver on
		 * success, or the number of ranges it wanted to
		 * define if that number of ranges could not be
		 * accommodated by the specified range structure. In
		 * the latter case, the caller will be able to
		 * allocate a larger range structure, and query the
		 * property again.
		 */
		state.pr_range_cur_count = 0;
		state.pr_range = range;

		mip->mi_callbacks->mc_propinfo(mip->mi_driver, name, id,
		    (mac_prop_info_handle_t)&state);

		if (state.pr_flags & MAC_PROP_INFO_RANGE)
			range->mpr_count = state.pr_range_cur_count;

		/*
		 * The operation could fail if the buffer supplied by
		 * the user was too small for the range or default
		 * value of the property.
		 */
		if (state.pr_errno != 0)
			return (state.pr_errno);

		if (perm != NULL && state.pr_flags & MAC_PROP_INFO_PERM)
			*perm = state.pr_perm;
	}

	/*
	 * The MAC layer may want to provide default values or allowed
	 * ranges for properties if the driver does not provide a
	 * property info entry point, or that entry point exists, but
	 * it did not provide a default value or allowed ranges for
	 * that property.
	 */
	switch (id) {
	case MAC_PROP_MTU: {
		uint32_t sdu;

		mac_sdu_get2(mh, NULL, &sdu, NULL);

		if (range != NULL && !(state.pr_flags &
		    MAC_PROP_INFO_RANGE)) {
			/* MTU range */
			_mac_set_range(range, sdu, sdu);
		}

		if (default_val != NULL && !(state.pr_flags &
		    MAC_PROP_INFO_DEFAULT)) {
			if (mip->mi_info.mi_media == DL_ETHER)
				sdu = ETHERMTU;
			/* default MTU value */
			bcopy(&sdu, default_val, sizeof (sdu));
		}
	}
	}

	return (0);
}
3526
3527 int
3528 mac_fastpath_disable(mac_handle_t mh)
3529 {
3530 mac_impl_t *mip = (mac_impl_t *)mh;
3531
3532 if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3533 return (0);
3534
3535 return (mip->mi_capab_legacy.ml_fastpath_disable(mip->mi_driver));
3536 }
3537
3538 void
3539 mac_fastpath_enable(mac_handle_t mh)
3540 {
3541 mac_impl_t *mip = (mac_impl_t *)mh;
3542
3543 if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3544 return;
3545
3546 mip->mi_capab_legacy.ml_fastpath_enable(mip->mi_driver);
3547 }
3548
3549 void
3550 mac_register_priv_prop(mac_impl_t *mip, char **priv_props)
3551 {
3552 uint_t nprops, i;
3553
3554 if (priv_props == NULL)
3555 return;
3556
3557 nprops = 0;
3558 while (priv_props[nprops] != NULL)
3559 nprops++;
3560 if (nprops == 0)
3561 return;
3562
3563
3564 mip->mi_priv_prop = kmem_zalloc(nprops * sizeof (char *), KM_SLEEP);
3565
3566 for (i = 0; i < nprops; i++) {
3567 mip->mi_priv_prop[i] = kmem_zalloc(MAXLINKPROPNAME, KM_SLEEP);
3568 (void) strlcpy(mip->mi_priv_prop[i], priv_props[i],
3569 MAXLINKPROPNAME);
3570 }
3571
3572 mip->mi_priv_prop_count = nprops;
3573 }
3574
3575 void
3576 mac_unregister_priv_prop(mac_impl_t *mip)
3577 {
3578 uint_t i;
3579
3580 if (mip->mi_priv_prop_count == 0) {
3581 ASSERT(mip->mi_priv_prop == NULL);
3582 return;
3583 }
3584
3585 for (i = 0; i < mip->mi_priv_prop_count; i++)
3586 kmem_free(mip->mi_priv_prop[i], MAXLINKPROPNAME);
3587 kmem_free(mip->mi_priv_prop, mip->mi_priv_prop_count *
3588 sizeof (char *));
3589
3590 mip->mi_priv_prop = NULL;
3591 mip->mi_priv_prop_count = 0;
3592 }
3593
3594 /*
3595 * mac_ring_t 'mr' macros. Some rogue drivers may access ring structure
3596 * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
3597 * cases if MAC free's the ring structure after mac_stop_ring(), any
3598 * illegal access to the ring structure coming from the driver will panic
3599 * the system. In order to protect the system from such inadverent access,
3600 * we maintain a cache of rings in the mac_impl_t after they get free'd up.
3601 * When packets are received on free'd up rings, MAC (through the generation
3602 * count mechanism) will drop such packets.
3603 */
3604 static mac_ring_t *
3605 mac_ring_alloc(mac_impl_t *mip)
3606 {
3607 mac_ring_t *ring;
3608
3609 mutex_enter(&mip->mi_ring_lock);
3610 if (mip->mi_ring_freelist != NULL) {
3611 ring = mip->mi_ring_freelist;
3612 mip->mi_ring_freelist = ring->mr_next;
3613 bzero(ring, sizeof (mac_ring_t));
3614 mutex_exit(&mip->mi_ring_lock);
3615 } else {
3616 mutex_exit(&mip->mi_ring_lock);
3617 ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
3618 }
3619 ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
3620 return (ring);
3621 }
3622
/*
 * Return a ring to the per-mac freelist rather than to the kmem cache,
 * so that a rogue driver touching the ring after mac_stop_ring() hits
 * valid (cached) memory instead of freed memory; see the block comment
 * above mac_ring_alloc().
 */
static void
mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
{
	ASSERT(ring->mr_state == MR_FREE);

	mutex_enter(&mip->mi_ring_lock);
	/* Reset state and detach the ring before caching it. */
	ring->mr_state = MR_FREE;
	ring->mr_flag = 0;
	ring->mr_next = mip->mi_ring_freelist;
	ring->mr_mip = NULL;
	mip->mi_ring_freelist = ring;
	/* Per-ring kstats are torn down under the same lock. */
	mac_ring_stat_delete(ring);
	mutex_exit(&mip->mi_ring_lock);
}
3637
3638 static void
3639 mac_ring_freeall(mac_impl_t *mip)
3640 {
3641 mac_ring_t *ring_next;
3642 mutex_enter(&mip->mi_ring_lock);
3643 mac_ring_t *ring = mip->mi_ring_freelist;
3644 while (ring != NULL) {
3645 ring_next = ring->mr_next;
3646 kmem_cache_free(mac_ring_cache, ring);
3647 ring = ring_next;
3648 }
3649 mip->mi_ring_freelist = NULL;
3650 mutex_exit(&mip->mi_ring_lock);
3651 }
3652
3653 int
3654 mac_start_ring(mac_ring_t *ring)
3655 {
3656 int rv = 0;
3657
3658 ASSERT(ring->mr_state == MR_FREE);
3659
3660 if (ring->mr_start != NULL) {
3661 rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
3662 if (rv != 0)
3663 return (rv);
3664 }
3665
3666 ring->mr_state = MR_INUSE;
3667 return (rv);
3668 }
3669
3670 void
3671 mac_stop_ring(mac_ring_t *ring)
3672 {
3673 ASSERT(ring->mr_state == MR_INUSE);
3674
3675 if (ring->mr_stop != NULL)
3676 ring->mr_stop(ring->mr_driver);
3677
3678 ring->mr_state = MR_FREE;
3679
3680 /*
3681 * Increment the ring generation number for this ring.
3682 */
3683 ring->mr_gen_num++;
3684 }
3685
3686 int
3687 mac_start_group(mac_group_t *group)
3688 {
3689 int rv = 0;
3690
3691 if (group->mrg_start != NULL)
3692 rv = group->mrg_start(group->mrg_driver);
3693
3694 return (rv);
3695 }
3696
3697 void
3698 mac_stop_group(mac_group_t *group)
3699 {
3700 if (group->mrg_stop != NULL)
3701 group->mrg_stop(group->mrg_driver);
3702 }
3703
3704 /*
3705 * Called from mac_start() on the default Rx group. Broadcast and multicast
3706 * packets are received only on the default group. Hence the default group
3707 * needs to be up even if the primary client is not up, for the other groups
3708 * to be functional. We do this by calling this function at mac_start time
3709 * itself. However the broadcast packets that are received can't make their
3710 * way beyond mac_rx until a mac client creates a broadcast flow.
3711 */
static int
mac_start_group_and_rings(mac_group_t *group)
{
	mac_ring_t *ring;
	int rv = 0;

	ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
	if ((rv = mac_start_group(group)) != 0)
		return (rv);

	/* Start every ring; on failure, unwind everything started so far. */
	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
		ASSERT(ring->mr_state == MR_FREE);
		if ((rv = mac_start_ring(ring)) != 0)
			goto error;
		/*
		 * Default-group rings receive traffic for many clients,
		 * so packets must be software-classified.
		 */
		ring->mr_classify_type = MAC_SW_CLASSIFIER;
	}
	return (0);

error:
	/* Safe on a partial start: it skips rings still in MR_FREE. */
	mac_stop_group_and_rings(group);
	return (rv);
}
3734
3735 /* Called from mac_stop on the default Rx group */
3736 static void
3737 mac_stop_group_and_rings(mac_group_t *group)
3738 {
3739 mac_ring_t *ring;
3740
3741 for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3742 if (ring->mr_state != MR_FREE) {
3743 mac_stop_ring(ring);
3744 ring->mr_flag = 0;
3745 ring->mr_classify_type = MAC_NO_CLASSIFIER;
3746 }
3747 }
3748 mac_stop_group(group);
3749 }
3750
3751
/*
 * Allocate and initialize one ring of the given group: assign its index,
 * link it into the group, query the driver for its ring information, and
 * resolve shared-interrupt bookkeeping.  Returns the new ring.
 */
static mac_ring_t *
mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
    mac_capab_rings_t *cap_rings)
{
	mac_ring_t *ring, *rnext;
	mac_ring_info_t ring_info;
	ddi_intr_handle_t ddi_handle;

	ring = mac_ring_alloc(mip);

	/* Prepare basic information of ring */

	/*
	 * Ring index is numbered to be unique across a particular device.
	 * Ring index computation makes following assumptions:
	 *	- For drivers with static grouping (e.g. ixgbe, bge),
	 *	  ring index exchanged with the driver (e.g. during mr_rget)
	 *	  is unique only across the group the ring belongs to.
	 *	- Drivers with dynamic grouping (e.g. nxge), start
	 *	  with single group (mrg_index = 0).
	 */
	ring->mr_index = group->mrg_index * group->mrg_info.mgi_count + index;
	ring->mr_type = group->mrg_type;
	ring->mr_gh = (mac_group_handle_t)group;

	/* Insert the new ring to the list. */
	ring->mr_next = group->mrg_rings;
	group->mrg_rings = ring;

	/* Zero to reuse the info data structure */
	bzero(&ring_info, sizeof (ring_info));

	/* Query ring information from driver */
	cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
	    index, &ring_info, (mac_ring_handle_t)ring);

	ring->mr_info = ring_info;

	/*
	 * The interrupt handle could be shared among multiple rings.
	 * Thus if there is a bunch of rings that are sharing an
	 * interrupt, then only one ring among the bunch will be made
	 * available for interrupt re-targeting; the rest will have
	 * ddi_shared flag set to TRUE and would not be available for
	 * be interrupt re-targeting.
	 */
	if ((ddi_handle = ring_info.mri_intr.mi_ddi_handle) != NULL) {
		/* Scan the rings already in this group for the same handle. */
		rnext = ring->mr_next;
		while (rnext != NULL) {
			if (rnext->mr_info.mri_intr.mi_ddi_handle ==
			    ddi_handle) {
				/*
				 * If default ring (mr_index == 0) is part
				 * of a group of rings sharing an
				 * interrupt, then set ddi_shared flag for
				 * the default ring and give another ring
				 * the chance to be re-targeted.
				 */
				if (rnext->mr_index == 0 &&
				    !rnext->mr_info.mri_intr.mi_ddi_shared) {
					rnext->mr_info.mri_intr.mi_ddi_shared =
					    B_TRUE;
				} else {
					ring->mr_info.mri_intr.mi_ddi_shared =
					    B_TRUE;
				}
				break;
			}
			rnext = rnext->mr_next;
		}
		/*
		 * If rnext is NULL, then no matching ddi_handle was found.
		 * Rx rings get registered first. So if this is a Tx ring,
		 * then go through all the Rx rings and see if there is a
		 * matching ddi handle.
		 */
		if (rnext == NULL && ring->mr_type == MAC_RING_TYPE_TX) {
			mac_compare_ddi_handle(mip->mi_rx_groups,
			    mip->mi_rx_group_count, ring);
		}
	}

	/* Update ring's status */
	ring->mr_state = MR_FREE;
	ring->mr_flag = 0;

	/* Update the ring count of the group */
	group->mrg_cur_count++;

	/* Create per ring kstats */
	if (ring->mr_stat != NULL) {
		ring->mr_mip = mip;
		mac_ring_stat_create(ring);
	}

	return (ring);
}
3849
3850 /*
3851 * Rings are chained together for easy regrouping.
3852 */
3853 static void
3854 mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
3855 mac_capab_rings_t *cap_rings)
3856 {
3857 int index;
3858
3859 /*
3860 * Initialize all ring members of this group. Size of zero will not
3861 * enter the loop, so it's safe for initializing an empty group.
3862 */
3863 for (index = size - 1; index >= 0; index--)
3864 (void) mac_init_ring(mip, group, index, cap_rings);
3865 }
3866
3867 int
3868 mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3869 {
3870 mac_capab_rings_t *cap_rings;
3871 mac_group_t *group;
3872 mac_group_t *groups;
3873 mac_group_info_t group_info;
3874 uint_t group_free = 0;
3875 uint_t ring_left;
3876 mac_ring_t *ring;
3877 int g;
3878 int err = 0;
3879 uint_t grpcnt;
3880 boolean_t pseudo_txgrp = B_FALSE;
3881
3882 switch (rtype) {
3883 case MAC_RING_TYPE_RX:
3884 ASSERT(mip->mi_rx_groups == NULL);
3885
3886 cap_rings = &mip->mi_rx_rings_cap;
3887 cap_rings->mr_type = MAC_RING_TYPE_RX;
3888 break;
3889 case MAC_RING_TYPE_TX:
3890 ASSERT(mip->mi_tx_groups == NULL);
3891
3892 cap_rings = &mip->mi_tx_rings_cap;
3893 cap_rings->mr_type = MAC_RING_TYPE_TX;
3894 break;
3895 default:
3896 ASSERT(B_FALSE);
3897 }
3898
3899 if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS, cap_rings))
3900 return (0);
3901 grpcnt = cap_rings->mr_gnum;
3902
3903 /*
3904 * If we have multiple TX rings, but only one TX group, we can
3905 * create pseudo TX groups (one per TX ring) in the MAC layer,
3906 * except for an aggr. For an aggr currently we maintain only
3907 * one group with all the rings (for all its ports), going
3908 * forwards we might change this.
3909 */
3910 if (rtype == MAC_RING_TYPE_TX &&
3911 cap_rings->mr_gnum == 0 && cap_rings->mr_rnum > 0 &&
3912 (mip->mi_state_flags & MIS_IS_AGGR) == 0) {
3913 /*
3914 * The -1 here is because we create a default TX group
3915 * with all the rings in it.
3916 */
3917 grpcnt = cap_rings->mr_rnum - 1;
3918 pseudo_txgrp = B_TRUE;
3919 }
3920
3921 /*
3922 * Allocate a contiguous buffer for all groups.
3923 */
3924 groups = kmem_zalloc(sizeof (mac_group_t) * (grpcnt+ 1), KM_SLEEP);
3925
3926 ring_left = cap_rings->mr_rnum;
3927
3928 /*
3929 * Get all ring groups if any, and get their ring members
3930 * if any.
3931 */
3932 for (g = 0; g < grpcnt; g++) {
3933 group = groups + g;
3934
3935 /* Prepare basic information of the group */
3936 group->mrg_index = g;
3937 group->mrg_type = rtype;
3938 group->mrg_state = MAC_GROUP_STATE_UNINIT;
3939 group->mrg_mh = (mac_handle_t)mip;
3940 group->mrg_next = group + 1;
3941
3942 /* Zero to reuse the info data structure */
3943 bzero(&group_info, sizeof (group_info));
3944
3945 if (pseudo_txgrp) {
3946 /*
3947 * This is a pseudo group that we created, apart
3948 * from setting the state there is nothing to be
3949 * done.
3950 */
3951 group->mrg_state = MAC_GROUP_STATE_REGISTERED;
3952 group_free++;
3953 continue;
3954 }
3955 /* Query group information from driver */
3956 cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
3957 (mac_group_handle_t)group);
3958
3959 switch (cap_rings->mr_group_type) {
3960 case MAC_GROUP_TYPE_DYNAMIC:
3961 if (cap_rings->mr_gaddring == NULL ||
3962 cap_rings->mr_gremring == NULL) {
3963 DTRACE_PROBE3(
3964 mac__init__rings_no_addremring,
3965 char *, mip->mi_name,
3966 mac_group_add_ring_t,
3967 cap_rings->mr_gaddring,
3968 mac_group_add_ring_t,
3969 cap_rings->mr_gremring);
3970 err = EINVAL;
3971 goto bail;
3972 }
3973
3974 switch (rtype) {
3975 case MAC_RING_TYPE_RX:
3976 /*
3977 * The first RX group must have non-zero
3978 * rings, and the following groups must
3979 * have zero rings.
3980 */
3981 if (g == 0 && group_info.mgi_count == 0) {
3982 DTRACE_PROBE1(
3983 mac__init__rings__rx__def__zero,
3984 char *, mip->mi_name);
3985 err = EINVAL;
3986 goto bail;
3987 }
3988 if (g > 0 && group_info.mgi_count != 0) {
3989 DTRACE_PROBE3(
3990 mac__init__rings__rx__nonzero,
3991 char *, mip->mi_name,
3992 int, g, int, group_info.mgi_count);
3993 err = EINVAL;
3994 goto bail;
3995 }
3996 break;
3997 case MAC_RING_TYPE_TX:
3998 /*
3999 * All TX ring groups must have zero rings.
4000 */
4001 if (group_info.mgi_count != 0) {
4002 DTRACE_PROBE3(
4003 mac__init__rings__tx__nonzero,
4004 char *, mip->mi_name,
4005 int, g, int, group_info.mgi_count);
4006 err = EINVAL;
4007 goto bail;
4008 }
4009 break;
4010 }
4011 break;
4012 case MAC_GROUP_TYPE_STATIC:
4013 /*
4014 * Note that an empty group is allowed, e.g., an aggr
4015 * would start with an empty group.
4016 */
4017 break;
4018 default:
4019 /* unknown group type */
4020 DTRACE_PROBE2(mac__init__rings__unknown__type,
4021 char *, mip->mi_name,
4022 int, cap_rings->mr_group_type);
4023 err = EINVAL;
4024 goto bail;
4025 }
4026
4027
4028 /*
4029 * Driver must register group->mgi_addmac/remmac() for rx groups
4030 * to support multiple MAC addresses.
4031 */
4032 if (rtype == MAC_RING_TYPE_RX) {
4033 if ((group_info.mgi_addmac == NULL) ||
4034 (group_info.mgi_addmac == NULL)) {
4035 goto bail;
4036 }
4037 }
4038
4039 /* Cache driver-supplied information */
4040 group->mrg_info = group_info;
4041
4042 /* Update the group's status and group count. */
4043 mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
4044 group_free++;
4045
4046 group->mrg_rings = NULL;
4047 group->mrg_cur_count = 0;
4048 mac_init_group(mip, group, group_info.mgi_count, cap_rings);
4049 ring_left -= group_info.mgi_count;
4050
4051 /* The current group size should be equal to default value */
4052 ASSERT(group->mrg_cur_count == group_info.mgi_count);
4053 }
4054
4055 /* Build up a dummy group for free resources as a pool */
4056 group = groups + grpcnt;
4057
4058 /* Prepare basic information of the group */
4059 group->mrg_index = -1;
4060 group->mrg_type = rtype;
4061 group->mrg_state = MAC_GROUP_STATE_UNINIT;
4062 group->mrg_mh = (mac_handle_t)mip;
4063 group->mrg_next = NULL;
4064
4065 /*
4066 * If there are ungrouped rings, allocate a continuous buffer for
4067 * remaining resources.
4068 */
4069 if (ring_left != 0) {
4070 group->mrg_rings = NULL;
4071 group->mrg_cur_count = 0;
4072 mac_init_group(mip, group, ring_left, cap_rings);
4073
4074 /* The current group size should be equal to ring_left */
4075 ASSERT(group->mrg_cur_count == ring_left);
4076
4077 ring_left = 0;
4078
4079 /* Update this group's status */
4080 mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
4081 } else
4082 group->mrg_rings = NULL;
4083
4084 ASSERT(ring_left == 0);
4085
4086 bail:
4087
4088 /* Cache other important information to finalize the initialization */
4089 switch (rtype) {
4090 case MAC_RING_TYPE_RX:
4091 mip->mi_rx_group_type = cap_rings->mr_group_type;
4092 mip->mi_rx_group_count = cap_rings->mr_gnum;
4093 mip->mi_rx_groups = groups;
4094 mip->mi_rx_donor_grp = groups;
4095 if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
4096 /*
4097 * The default ring is reserved since it is
4098 * used for sending the broadcast etc. packets.
4099 */
4100 mip->mi_rxrings_avail =
4101 mip->mi_rx_groups->mrg_cur_count - 1;
4102 mip->mi_rxrings_rsvd = 1;
4103 }
4104 /*
4105 * The default group cannot be reserved. It is used by
4106 * all the clients that do not have an exclusive group.
4107 */
4108 mip->mi_rxhwclnt_avail = mip->mi_rx_group_count - 1;
4109 mip->mi_rxhwclnt_used = 1;
4110 break;
4111 case MAC_RING_TYPE_TX:
4112 mip->mi_tx_group_type = pseudo_txgrp ? MAC_GROUP_TYPE_DYNAMIC :
4113 cap_rings->mr_group_type;
4114 mip->mi_tx_group_count = grpcnt;
4115 mip->mi_tx_group_free = group_free;
4116 mip->mi_tx_groups = groups;
4117
4118 group = groups + grpcnt;
4119 ring = group->mrg_rings;
4120 /*
4121 * The ring can be NULL in the case of aggr. Aggr will
4122 * have an empty Tx group which will get populated
4123 * later when pseudo Tx rings are added after
4124 * mac_register() is done.
4125 */
4126 if (ring == NULL) {
4127 ASSERT(mip->mi_state_flags & MIS_IS_AGGR);
4128 /*
4129 * pass the group to aggr so it can add Tx
4130 * rings to the group later.
4131 */
4132 cap_rings->mr_gget(mip->mi_driver, rtype, 0, NULL,
4133 (mac_group_handle_t)group);
4134 /*
4135 * Even though there are no rings at this time
4136 * (rings will come later), set the group
4137 * state to registered.
4138 */
4139 group->mrg_state = MAC_GROUP_STATE_REGISTERED;
4140 } else {
4141 /*
4142 * Ring 0 is used as the default one and it could be
4143 * assigned to a client as well.
4144 */
4145 while ((ring->mr_index != 0) && (ring->mr_next != NULL))
4146 ring = ring->mr_next;
4147 ASSERT(ring->mr_index == 0);
4148 mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
4149 }
4150 if (mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC)
4151 mip->mi_txrings_avail = group->mrg_cur_count - 1;
4152 /*
4153 * The default ring cannot be reserved.
4154 */
4155 mip->mi_txrings_rsvd = 1;
4156 /*
4157 * The default group cannot be reserved. It will be shared
4158 * by clients that do not have an exclusive group.
4159 */
4160 mip->mi_txhwclnt_avail = mip->mi_tx_group_count;
4161 mip->mi_txhwclnt_used = 1;
4162 break;
4163 default:
4164 ASSERT(B_FALSE);
4165 }
4166
4167 if (err != 0)
4168 mac_free_rings(mip, rtype);
4169
4170 return (err);
4171 }
4172
4173 /*
4174 * The ddi interrupt handle could be shared amoung rings. If so, compare
4175 * the new ring's ddi handle with the existing ones and set ddi_shared
4176 * flag.
4177 */
void
mac_compare_ddi_handle(mac_group_t *groups, uint_t grpcnt, mac_ring_t *cring)
{
	mac_group_t *group;
	mac_ring_t *ring;
	ddi_intr_handle_t ddi_handle;
	int g;

	ddi_handle = cring->mr_info.mri_intr.mi_ddi_handle;
	/* Search every ring of every group for a matching handle. */
	for (g = 0; g < grpcnt; g++) {
		group = groups + g;
		for (ring = group->mrg_rings; ring != NULL;
		    ring = ring->mr_next) {
			if (ring == cring)
				continue;
			if (ring->mr_info.mri_intr.mi_ddi_handle ==
			    ddi_handle) {
				/*
				 * Prefer marking the default RX ring
				 * (mr_index == 0) as shared so a
				 * non-default ring stays eligible for
				 * interrupt re-targeting; otherwise
				 * mark the new ring as shared.  Same
				 * policy as in mac_init_ring().
				 */
				if (cring->mr_type == MAC_RING_TYPE_RX &&
				    ring->mr_index == 0 &&
				    !ring->mr_info.mri_intr.mi_ddi_shared) {
					ring->mr_info.mri_intr.mi_ddi_shared =
					    B_TRUE;
				} else {
					cring->mr_info.mri_intr.mi_ddi_shared =
					    B_TRUE;
				}
				return;
			}
		}
	}
}
4209
4210 /*
4211 * Called to free all groups of particular type (RX or TX). It's assumed that
4212 * no clients are using these groups.
4213 */
void
mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
{
	mac_group_t *group, *groups;
	uint_t group_count;

	/* Detach the group array from the mac_impl_t before freeing. */
	switch (rtype) {
	case MAC_RING_TYPE_RX:
		if (mip->mi_rx_groups == NULL)
			return;

		groups = mip->mi_rx_groups;
		group_count = mip->mi_rx_group_count;

		mip->mi_rx_groups = NULL;
		mip->mi_rx_donor_grp = NULL;
		mip->mi_rx_group_count = 0;
		break;
	case MAC_RING_TYPE_TX:
		ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);

		if (mip->mi_tx_groups == NULL)
			return;

		groups = mip->mi_tx_groups;
		group_count = mip->mi_tx_group_count;

		mip->mi_tx_groups = NULL;
		mip->mi_tx_group_count = 0;
		mip->mi_tx_group_free = 0;
		mip->mi_default_tx_ring = NULL;
		break;
	default:
		ASSERT(B_FALSE);
	}

	/* Return each group's rings to the per-mac ring cache. */
	for (group = groups; group != NULL; group = group->mrg_next) {
		mac_ring_t *ring;

		if (group->mrg_cur_count == 0)
			continue;

		ASSERT(group->mrg_rings != NULL);

		while ((ring = group->mrg_rings) != NULL) {
			group->mrg_rings = ring->mr_next;
			mac_ring_free(mip, ring);
		}
	}

	/* Free all the cached rings */
	mac_ring_freeall(mip);
	/* Free the block of group data structures (incl. the dummy group) */
	kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
}
4269
4270 /*
4271 * Associate a MAC address with a receive group.
4272 *
4273 * The return value of this function should always be checked properly, because
4274 * any type of failure could cause unexpected results. A group can be added
4275 * or removed with a MAC address only after it has been reserved. Ideally,
4276 * a successful reservation always leads to calling mac_group_addmac() to
4277 * steer desired traffic. Failure of adding an unicast MAC address doesn't
4278 * always imply that the group is functioning abnormally.
4279 *
4280 * Currently this function is called everywhere, and it reflects assumptions
4281 * about MAC addresses in the implementation. CR 6735196.
4282 */
4283 int
4284 mac_group_addmac(mac_group_t *group, const uint8_t *addr)
4285 {
4286 ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
4287 ASSERT(group->mrg_info.mgi_addmac != NULL);
4288
4289 return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
4290 }
4291
4292 /*
4293 * Remove the association between MAC address and receive group.
4294 */
4295 int
4296 mac_group_remmac(mac_group_t *group, const uint8_t *addr)
4297 {
4298 ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
4299 ASSERT(group->mrg_info.mgi_remmac != NULL);
4300
4301 return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
4302 }
4303
4304 /*
4305 * This is the entry point for packets transmitted through the bridging code.
4306 * If no bridge is in place, MAC_RING_TX transmits using tx ring. The 'rh'
4307 * pointer may be NULL to select the default ring.
4308 */
4309 mblk_t *
4310 mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp)
4311 {
4312 mac_handle_t mh;
4313
4314 /*
4315 * Once we take a reference on the bridge link, the bridge
4316 * module itself can't unload, so the callback pointers are
4317 * stable.
4318 */
4319 mutex_enter(&mip->mi_bridge_lock);
4320 if ((mh = mip->mi_bridge_link) != NULL)
4321 mac_bridge_ref_cb(mh, B_TRUE);
4322 mutex_exit(&mip->mi_bridge_lock);
4323 if (mh == NULL) {
4324 MAC_RING_TX(mip, rh, mp, mp);
4325 } else {
4326 mp = mac_bridge_tx_cb(mh, rh, mp);
4327 mac_bridge_ref_cb(mh, B_FALSE);
4328 }
4329
4330 return (mp);
4331 }
4332
4333 /*
4334 * Find a ring from its index.
4335 */
4336 mac_ring_handle_t
4337 mac_find_ring(mac_group_handle_t gh, int index)
4338 {
4339 mac_group_t *group = (mac_group_t *)gh;
4340 mac_ring_t *ring = group->mrg_rings;
4341
4342 for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
4343 if (ring->mr_index == index)
4344 break;
4345
4346 return ((mac_ring_handle_t)ring);
4347 }
4348 /*
4349 * Add a ring to an existing group.
4350 *
4351 * The ring must be either passed directly (for example if the ring
4352 * movement is initiated by the framework), or specified through a driver
4353 * index (for example when the ring is added by the driver.
4354 *
4355 * The caller needs to call mac_perim_enter() before calling this function.
4356 */
4357 int
4358 i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
4359 {
4360 mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
4361 mac_capab_rings_t *cap_rings;
4362 boolean_t driver_call = (ring == NULL);
4363 mac_group_type_t group_type;
4364 int ret = 0;
4365 flow_entry_t *flent;
4366
4367 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4368
4369 switch (group->mrg_type) {
4370 case MAC_RING_TYPE_RX:
4371 cap_rings = &mip->mi_rx_rings_cap;
4372 group_type = mip->mi_rx_group_type;
4373 break;
4374 case MAC_RING_TYPE_TX:
4375 cap_rings = &mip->mi_tx_rings_cap;
4376 group_type = mip->mi_tx_group_type;
4377 break;
4378 default:
4379 ASSERT(B_FALSE);
4380 }
4381
4382 /*
4383 * There should be no ring with the same ring index in the target
4384 * group.
4385 */
4386 ASSERT(mac_find_ring((mac_group_handle_t)group,
4387 driver_call ? index : ring->mr_index) == NULL);
4388
4389 if (driver_call) {
4390 /*
4391 * The function is called as a result of a request from
4392 * a driver to add a ring to an existing group, for example
4393 * from the aggregation driver. Allocate a new mac_ring_t
4394 * for that ring.
4395 */
4396 ring = mac_init_ring(mip, group, index, cap_rings);
4397 ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
4398 } else {
4399 /*
4400 * The function is called as a result of a MAC layer request
4401 * to add a ring to an existing group. In this case the
4402 * ring is being moved between groups, which requires
4403 * the underlying driver to support dynamic grouping,
4404 * and the mac_ring_t already exists.
4405 */
4406 ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
4407 ASSERT(group->mrg_driver == NULL ||
4408 cap_rings->mr_gaddring != NULL);
4409 ASSERT(ring->mr_gh == NULL);
4410 }
4411
4412 /*
4413 * At this point the ring should not be in use, and it should be
4414 * of the right for the target group.
4415 */
4416 ASSERT(ring->mr_state < MR_INUSE);
4417 ASSERT(ring->mr_srs == NULL);
4418 ASSERT(ring->mr_type == group->mrg_type);
4419
4420 if (!driver_call) {
4421 /*
4422 * Add the driver level hardware ring if the process was not
4423 * initiated by the driver, and the target group is not the
4424 * group.
4425 */
4426 if (group->mrg_driver != NULL) {
4427 cap_rings->mr_gaddring(group->mrg_driver,
4428 ring->mr_driver, ring->mr_type);
4429 }
4430
4431 /*
4432 * Insert the ring ahead existing rings.
4433 */
4434 ring->mr_next = group->mrg_rings;
4435 group->mrg_rings = ring;
4436 ring->mr_gh = (mac_group_handle_t)group;
4437 group->mrg_cur_count++;
4438 }
4439
4440 /*
4441 * If the group has not been actively used, we're done.
4442 */
4443 if (group->mrg_index != -1 &&
4444 group->mrg_state < MAC_GROUP_STATE_RESERVED)
4445 return (0);
4446
4447 /*
4448 * Start the ring if needed. Failure causes to undo the grouping action.
4449 */
4450 if (ring->mr_state != MR_INUSE) {
4451 if ((ret = mac_start_ring(ring)) != 0) {
4452 if (!driver_call) {
4453 cap_rings->mr_gremring(group->mrg_driver,
4454 ring->mr_driver, ring->mr_type);
4455 }
4456 group->mrg_cur_count--;
4457 group->mrg_rings = ring->mr_next;
4458
4459 ring->mr_gh = NULL;
4460
4461 if (driver_call)
4462 mac_ring_free(mip, ring);
4463
4464 return (ret);
4465 }
4466 }
4467
4468 /*
4469 * Set up SRS/SR according to the ring type.
4470 */
4471 switch (ring->mr_type) {
4472 case MAC_RING_TYPE_RX:
4473 /*
4474 * Setup SRS on top of the new ring if the group is
4475 * reserved for someones exclusive use.
4476 */
4477 if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
4478 mac_client_impl_t *mcip;
4479
4480 mcip = MAC_GROUP_ONLY_CLIENT(group);
4481 /*
4482 * Even though this group is reserved we migth still
4483 * have multiple clients, i.e a VLAN shares the
4484 * group with the primary mac client.
4485 */
4486 if (mcip != NULL) {
4487 flent = mcip->mci_flent;
4488 ASSERT(flent->fe_rx_srs_cnt > 0);
4489 mac_rx_srs_group_setup(mcip, flent, SRST_LINK);
4490 mac_fanout_setup(mcip, flent,
4491 MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver,
4492 mcip, NULL, NULL);
4493 } else {
4494 ring->mr_classify_type = MAC_SW_CLASSIFIER;
4495 }
4496 }
4497 break;
4498 case MAC_RING_TYPE_TX:
4499 {
4500 mac_grp_client_t *mgcp = group->mrg_clients;
4501 mac_client_impl_t *mcip;
4502 mac_soft_ring_set_t *mac_srs;
4503 mac_srs_tx_t *tx;
4504
4505 if (MAC_GROUP_NO_CLIENT(group)) {
4506 if (ring->mr_state == MR_INUSE)
4507 mac_stop_ring(ring);
4508 ring->mr_flag = 0;
4509 break;
4510 }
4511 /*
4512 * If the rings are being moved to a group that has
4513 * clients using it, then add the new rings to the
4514 * clients SRS.
4515 */
4516 while (mgcp != NULL) {
4517 boolean_t is_aggr;
4518
4519 mcip = mgcp->mgc_client;
4520 flent = mcip->mci_flent;
4521 is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR);
4522 mac_srs = MCIP_TX_SRS(mcip);
4523 tx = &mac_srs->srs_tx;
4524 mac_tx_client_quiesce((mac_client_handle_t)mcip);
4525 /*
4526 * If we are growing from 1 to multiple rings.
4527 */
4528 if (tx->st_mode == SRS_TX_BW ||
4529 tx->st_mode == SRS_TX_SERIALIZE ||
4530 tx->st_mode == SRS_TX_DEFAULT) {
4531 mac_ring_t *tx_ring = tx->st_arg2;
4532
4533 tx->st_arg2 = NULL;
4534 mac_tx_srs_stat_recreate(mac_srs, B_TRUE);
4535 mac_tx_srs_add_ring(mac_srs, tx_ring);
4536 if (mac_srs->srs_type & SRST_BW_CONTROL) {
4537 tx->st_mode = is_aggr ? SRS_TX_BW_AGGR :
4538 SRS_TX_BW_FANOUT;
4539 } else {
4540 tx->st_mode = is_aggr ? SRS_TX_AGGR :
4541 SRS_TX_FANOUT;
4542 }
4543 tx->st_func = mac_tx_get_func(tx->st_mode);
4544 }
4545 mac_tx_srs_add_ring(mac_srs, ring);
4546 mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
4547 mac_rx_deliver, mcip, NULL, NULL);
4548 mac_tx_client_restart((mac_client_handle_t)mcip);
4549 mgcp = mgcp->mgc_next;
4550 }
4551 break;
4552 }
4553 default:
4554 ASSERT(B_FALSE);
4555 }
4556 /*
4557 * For aggr, the default ring will be NULL to begin with. If it
4558 * is NULL, then pick the first ring that gets added as the
4559 * default ring. Any ring in an aggregation can be removed at
4560 * any time (by the user action of removing a link) and if the
4561 * current default ring gets removed, then a new one gets
4562 * picked (see i_mac_group_rem_ring()).
4563 */
4564 if (mip->mi_state_flags & MIS_IS_AGGR &&
4565 mip->mi_default_tx_ring == NULL &&
4566 ring->mr_type == MAC_RING_TYPE_TX) {
4567 mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
4568 }
4569
4570 MAC_RING_UNMARK(ring, MR_INCIPIENT);
4571 return (0);
4572 }
4573
4574 /*
4575 * Remove a ring from it's current group. MAC internal function for dynamic
4576 * grouping.
4577 *
4578 * The caller needs to call mac_perim_enter() before calling this function.
4579 */
4580 void
4581 i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
4582 boolean_t driver_call)
4583 {
4584 mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
4585 mac_capab_rings_t *cap_rings = NULL;
4586 mac_group_type_t group_type;
4587
4588 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4589
4590 ASSERT(mac_find_ring((mac_group_handle_t)group,
4591 ring->mr_index) == (mac_ring_handle_t)ring);
4592 ASSERT((mac_group_t *)ring->mr_gh == group);
4593 ASSERT(ring->mr_type == group->mrg_type);
4594
4595 if (ring->mr_state == MR_INUSE)
4596 mac_stop_ring(ring);
4597 switch (ring->mr_type) {
4598 case MAC_RING_TYPE_RX:
4599 group_type = mip->mi_rx_group_type;
4600 cap_rings = &mip->mi_rx_rings_cap;
4601
4602 /*
4603 * Only hardware classified packets hold a reference to the
4604 * ring all the way up the Rx path. mac_rx_srs_remove()
4605 * will take care of quiescing the Rx path and removing the
4606 * SRS. The software classified path neither holds a reference
4607 * nor any association with the ring in mac_rx.
4608 */
4609 if (ring->mr_srs != NULL) {
4610 mac_rx_srs_remove(ring->mr_srs);
4611 ring->mr_srs = NULL;
4612 }
4613
4614 break;
4615 case MAC_RING_TYPE_TX:
4616 {
4617 mac_grp_client_t *mgcp;
4618 mac_client_impl_t *mcip;
4619 mac_soft_ring_set_t *mac_srs;
4620 mac_srs_tx_t *tx;
4621 mac_ring_t *rem_ring;
4622 mac_group_t *defgrp;
4623 uint_t ring_info = 0;
4624
4625 /*
4626 * For TX this function is invoked in three
4627 * cases:
4628 *
4629 * 1) In the case of a failure during the
4630 * initial creation of a group when a share is
4631 * associated with a MAC client. So the SRS is not
4632 * yet setup, and will be setup later after the
4633 * group has been reserved and populated.
4634 *
4635 * 2) From mac_release_tx_group() when freeing
4636 * a TX SRS.
4637 *
4638 * 3) In the case of aggr, when a port gets removed,
4639 * the pseudo Tx rings that it exposed gets removed.
4640 *
4641 * In the first two cases the SRS and its soft
4642 * rings are already quiesced.
4643 */
4644 if (driver_call) {
4645 mac_client_impl_t *mcip;
4646 mac_soft_ring_set_t *mac_srs;
4647 mac_soft_ring_t *sringp;
4648 mac_srs_tx_t *srs_tx;
4649
4650 if (mip->mi_state_flags & MIS_IS_AGGR &&
4651 mip->mi_default_tx_ring ==
4652 (mac_ring_handle_t)ring) {
4653 /* pick a new default Tx ring */
4654 mip->mi_default_tx_ring =
4655 (group->mrg_rings != ring) ?
4656 (mac_ring_handle_t)group->mrg_rings :
4657 (mac_ring_handle_t)(ring->mr_next);
4658 }
4659 /* Presently only aggr case comes here */
4660 if (group->mrg_state != MAC_GROUP_STATE_RESERVED)
4661 break;
4662
4663 mcip = MAC_GROUP_ONLY_CLIENT(group);
4664 ASSERT(mcip != NULL);
4665 ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR);
4666 mac_srs = MCIP_TX_SRS(mcip);
4667 ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
4668 mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
4669 srs_tx = &mac_srs->srs_tx;
4670 /*
4671 * Wakeup any callers blocked on this
4672 * Tx ring due to flow control.
4673 */
4674 sringp = srs_tx->st_soft_rings[ring->mr_index];
4675 ASSERT(sringp != NULL);
4676 mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)sringp);
4677 mac_tx_client_quiesce((mac_client_handle_t)mcip);
4678 mac_tx_srs_del_ring(mac_srs, ring);
4679 mac_tx_client_restart((mac_client_handle_t)mcip);
4680 break;
4681 }
4682 ASSERT(ring != (mac_ring_t *)mip->mi_default_tx_ring);
4683 group_type = mip->mi_tx_group_type;
4684 cap_rings = &mip->mi_tx_rings_cap;
4685 /*
4686 * See if we need to take it out of the MAC clients using
4687 * this group
4688 */
4689 if (MAC_GROUP_NO_CLIENT(group))
4690 break;
4691 mgcp = group->mrg_clients;
4692 defgrp = MAC_DEFAULT_TX_GROUP(mip);
4693 while (mgcp != NULL) {
4694 mcip = mgcp->mgc_client;
4695 mac_srs = MCIP_TX_SRS(mcip);
4696 tx = &mac_srs->srs_tx;
4697 mac_tx_client_quiesce((mac_client_handle_t)mcip);
4698 /*
4699 * If we are here when removing rings from the
4700 * defgroup, mac_reserve_tx_ring would have
4701 * already deleted the ring from the MAC
4702 * clients in the group.
4703 */
4704 if (group != defgrp) {
4705 mac_tx_invoke_callbacks(mcip,
4706 (mac_tx_cookie_t)
4707 mac_tx_srs_get_soft_ring(mac_srs, ring));
4708 mac_tx_srs_del_ring(mac_srs, ring);
4709 }
4710 /*
4711 * Additionally, if we are left with only
4712 * one ring in the group after this, we need
4713 * to modify the mode etc. to. (We haven't
4714 * yet taken the ring out, so we check with 2).
4715 */
4716 if (group->mrg_cur_count == 2) {
4717 if (ring->mr_next == NULL)
4718 rem_ring = group->mrg_rings;
4719 else
4720 rem_ring = ring->mr_next;
4721 mac_tx_invoke_callbacks(mcip,
4722 (mac_tx_cookie_t)
4723 mac_tx_srs_get_soft_ring(mac_srs,
4724 rem_ring));
4725 mac_tx_srs_del_ring(mac_srs, rem_ring);
4726 if (rem_ring->mr_state != MR_INUSE) {
4727 (void) mac_start_ring(rem_ring);
4728 }
4729 tx->st_arg2 = (void *)rem_ring;
4730 mac_tx_srs_stat_recreate(mac_srs, B_FALSE);
4731 ring_info = mac_hwring_getinfo(
4732 (mac_ring_handle_t)rem_ring);
4733 /*
4734 * We are shrinking from multiple
4735 * to 1 ring.
4736 */
4737 if (mac_srs->srs_type & SRST_BW_CONTROL) {
4738 tx->st_mode = SRS_TX_BW;
4739 } else if (mac_tx_serialize ||
4740 (ring_info & MAC_RING_TX_SERIALIZE)) {
4741 tx->st_mode = SRS_TX_SERIALIZE;
4742 } else {
4743 tx->st_mode = SRS_TX_DEFAULT;
4744 }
4745 tx->st_func = mac_tx_get_func(tx->st_mode);
4746 }
4747 mac_tx_client_restart((mac_client_handle_t)mcip);
4748 mgcp = mgcp->mgc_next;
4749 }
4750 break;
4751 }
4752 default:
4753 ASSERT(B_FALSE);
4754 }
4755
4756 /*
4757 * Remove the ring from the group.
4758 */
4759 if (ring == group->mrg_rings)
4760 group->mrg_rings = ring->mr_next;
4761 else {
4762 mac_ring_t *pre;
4763
4764 pre = group->mrg_rings;
4765 while (pre->mr_next != ring)
4766 pre = pre->mr_next;
4767 pre->mr_next = ring->mr_next;
4768 }
4769 group->mrg_cur_count--;
4770
4771 if (!driver_call) {
4772 ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
4773 ASSERT(group->mrg_driver == NULL ||
4774 cap_rings->mr_gremring != NULL);
4775
4776 /*
4777 * Remove the driver level hardware ring.
4778 */
4779 if (group->mrg_driver != NULL) {
4780 cap_rings->mr_gremring(group->mrg_driver,
4781 ring->mr_driver, ring->mr_type);
4782 }
4783 }
4784
4785 ring->mr_gh = NULL;
4786 if (driver_call)
4787 mac_ring_free(mip, ring);
4788 else
4789 ring->mr_flag = 0;
4790 }
4791
4792 /*
4793 * Move a ring to the target group. If needed, remove the ring from the group
4794 * that it currently belongs to.
4795 *
4796 * The caller need to enter MAC's perimeter by calling mac_perim_enter().
4797 */
4798 static int
4799 mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
4800 {
4801 mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
4802 int rv;
4803
4804 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4805 ASSERT(d_group != NULL);
4806 ASSERT(s_group->mrg_mh == d_group->mrg_mh);
4807
4808 if (s_group == d_group)
4809 return (0);
4810
4811 /*
4812 * Remove it from current group first.
4813 */
4814 if (s_group != NULL)
4815 i_mac_group_rem_ring(s_group, ring, B_FALSE);
4816
4817 /*
4818 * Add it to the new group.
4819 */
4820 rv = i_mac_group_add_ring(d_group, ring, 0);
4821 if (rv != 0) {
4822 /*
4823 * Failed to add ring back to source group. If
4824 * that fails, the ring is stuck in limbo, log message.
4825 */
4826 if (i_mac_group_add_ring(s_group, ring, 0)) {
4827 cmn_err(CE_WARN, "%s: failed to move ring %p\n",
4828 mip->mi_name, (void *)ring);
4829 }
4830 }
4831
4832 return (rv);
4833 }
4834
4835 /*
4836 * Find a MAC address according to its value.
4837 */
4838 mac_address_t *
4839 mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
4840 {
4841 mac_address_t *map;
4842
4843 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4844
4845 for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
4846 if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
4847 break;
4848 }
4849
4850 return (map);
4851 }
4852
4853 /*
4854 * Check whether the MAC address is shared by multiple clients.
4855 */
4856 boolean_t
4857 mac_check_macaddr_shared(mac_address_t *map)
4858 {
4859 ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
4860
4861 return (map->ma_nusers > 1);
4862 }
4863
4864 /*
4865 * Remove the specified MAC address from the MAC address list and free it.
4866 */
4867 static void
4868 mac_free_macaddr(mac_address_t *map)
4869 {
4870 mac_impl_t *mip = map->ma_mip;
4871
4872 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4873 ASSERT(mip->mi_addresses != NULL);
4874
4875 map = mac_find_macaddr(mip, map->ma_addr);
4876
4877 ASSERT(map != NULL);
4878 ASSERT(map->ma_nusers == 0);
4879
4880 if (map == mip->mi_addresses) {
4881 mip->mi_addresses = map->ma_next;
4882 } else {
4883 mac_address_t *pre;
4884
4885 pre = mip->mi_addresses;
4886 while (pre->ma_next != map)
4887 pre = pre->ma_next;
4888 pre->ma_next = map->ma_next;
4889 }
4890
4891 kmem_free(map, sizeof (mac_address_t));
4892 }
4893
4894 /*
4895 * Add a MAC address reference for a client. If the desired MAC address
4896 * exists, add a reference to it. Otherwise, add the new address by adding
4897 * it to a reserved group or setting promiscuous mode. Won't try different
4898 * group is the group is non-NULL, so the caller must explictly share
4899 * default group when needed.
4900 *
4901 * Note, the primary MAC address is initialized at registration time, so
4902 * to add it to default group only need to activate it if its reference
4903 * count is still zero. Also, some drivers may not have advertised RINGS
4904 * capability.
4905 */
4906 int
4907 mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
4908 boolean_t use_hw)
4909 {
4910 mac_address_t *map;
4911 int err = 0;
4912 boolean_t allocated_map = B_FALSE;
4913
4914 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4915
4916 map = mac_find_macaddr(mip, mac_addr);
4917
4918 /*
4919 * If the new MAC address has not been added. Allocate a new one
4920 * and set it up.
4921 */
4922 if (map == NULL) {
4923 map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
4924 map->ma_len = mip->mi_type->mt_addr_length;
4925 bcopy(mac_addr, map->ma_addr, map->ma_len);
4926 map->ma_nusers = 0;
4927 map->ma_group = group;
4928 map->ma_mip = mip;
4929
4930 /* add the new MAC address to the head of the address list */
4931 map->ma_next = mip->mi_addresses;
4932 mip->mi_addresses = map;
4933
4934 allocated_map = B_TRUE;
4935 }
4936
4937 ASSERT(map->ma_group == NULL || map->ma_group == group);
4938 if (map->ma_group == NULL)
4939 map->ma_group = group;
4940
4941 /*
4942 * If the MAC address is already in use, simply account for the
4943 * new client.
4944 */
4945 if (map->ma_nusers++ > 0)
4946 return (0);
4947
4948 /*
4949 * Activate this MAC address by adding it to the reserved group.
4950 */
4951 if (group != NULL) {
4952 err = mac_group_addmac(group, (const uint8_t *)mac_addr);
4953 if (err == 0) {
4954 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4955 return (0);
4956 }
4957 }
4958
4959 /*
4960 * The MAC address addition failed. If the client requires a
4961 * hardware classified MAC address, fail the operation.
4962 */
4963 if (use_hw) {
4964 err = ENOSPC;
4965 goto bail;
4966 }
4967
4968 /*
4969 * Try promiscuous mode.
4970 *
4971 * For drivers that don't advertise RINGS capability, do
4972 * nothing for the primary address.
4973 */
4974 if ((group == NULL) &&
4975 (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
4976 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4977 return (0);
4978 }
4979
4980 /*
4981 * Enable promiscuous mode in order to receive traffic
4982 * to the new MAC address.
4983 */
4984 if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) {
4985 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
4986 return (0);
4987 }
4988
4989 /*
4990 * Free the MAC address that could not be added. Don't free
4991 * a pre-existing address, it could have been the entry
4992 * for the primary MAC address which was pre-allocated by
4993 * mac_init_macaddr(), and which must remain on the list.
4994 */
4995 bail:
4996 map->ma_nusers--;
4997 if (allocated_map)
4998 mac_free_macaddr(map);
4999 return (err);
5000 }
5001
5002 /*
5003 * Remove a reference to a MAC address. This may cause to remove the MAC
5004 * address from an associated group or to turn off promiscuous mode.
5005 * The caller needs to handle the failure properly.
5006 */
5007 int
5008 mac_remove_macaddr(mac_address_t *map)
5009 {
5010 mac_impl_t *mip = map->ma_mip;
5011 int err = 0;
5012
5013 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5014
5015 ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
5016
5017 /*
5018 * If it's not the last client using this MAC address, only update
5019 * the MAC clients count.
5020 */
5021 if (--map->ma_nusers > 0)
5022 return (0);
5023
5024 /*
5025 * The MAC address is no longer used by any MAC client, so remove
5026 * it from its associated group, or turn off promiscuous mode
5027 * if it was enabled for the MAC address.
5028 */
5029 switch (map->ma_type) {
5030 case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
5031 /*
5032 * Don't free the preset primary address for drivers that
5033 * don't advertise RINGS capability.
5034 */
5035 if (map->ma_group == NULL)
5036 return (0);
5037
5038 err = mac_group_remmac(map->ma_group, map->ma_addr);
5039 if (err == 0)
5040 map->ma_group = NULL;
5041 break;
5042 case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
5043 err = i_mac_promisc_set(mip, B_FALSE);
5044 break;
5045 default:
5046 ASSERT(B_FALSE);
5047 }
5048
5049 if (err != 0)
5050 return (err);
5051
5052 /*
5053 * We created MAC address for the primary one at registration, so we
5054 * won't free it here. mac_fini_macaddr() will take care of it.
5055 */
5056 if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
5057 mac_free_macaddr(map);
5058
5059 return (0);
5060 }
5061
5062 /*
5063 * Update an existing MAC address. The caller need to make sure that the new
5064 * value has not been used.
5065 */
5066 int
5067 mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
5068 {
5069 mac_impl_t *mip = map->ma_mip;
5070 int err = 0;
5071
5072 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5073 ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
5074
5075 switch (map->ma_type) {
5076 case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
5077 /*
5078 * Update the primary address for drivers that are not
5079 * RINGS capable.
5080 */
5081 if (mip->mi_rx_groups == NULL) {
5082 err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
5083 mac_addr);
5084 if (err != 0)
5085 return (err);
5086 break;
5087 }
5088
5089 /*
5090 * If this MAC address is not currently in use,
5091 * simply break out and update the value.
5092 */
5093 if (map->ma_nusers == 0)
5094 break;
5095
5096 /*
5097 * Need to replace the MAC address associated with a group.
5098 */
5099 err = mac_group_remmac(map->ma_group, map->ma_addr);
5100 if (err != 0)
5101 return (err);
5102
5103 err = mac_group_addmac(map->ma_group, mac_addr);
5104
5105 /*
5106 * Failure hints hardware error. The MAC layer needs to
5107 * have error notification facility to handle this.
5108 * Now, simply try to restore the value.
5109 */
5110 if (err != 0)
5111 (void) mac_group_addmac(map->ma_group, map->ma_addr);
5112
5113 break;
5114 case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
5115 /*
5116 * Need to do nothing more if in promiscuous mode.
5117 */
5118 break;
5119 default:
5120 ASSERT(B_FALSE);
5121 }
5122
5123 /*
5124 * Successfully replaced the MAC address.
5125 */
5126 if (err == 0)
5127 bcopy(mac_addr, map->ma_addr, map->ma_len);
5128
5129 return (err);
5130 }
5131
5132 /*
5133 * Freshen the MAC address with new value. Its caller must have updated the
5134 * hardware MAC address before calling this function.
5135 * This funcitons is supposed to be used to handle the MAC address change
5136 * notification from underlying drivers.
5137 */
5138 void
5139 mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
5140 {
5141 mac_impl_t *mip = map->ma_mip;
5142
5143 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5144 ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
5145
5146 /*
5147 * Freshen the MAC address with new value.
5148 */
5149 bcopy(mac_addr, map->ma_addr, map->ma_len);
5150 bcopy(mac_addr, mip->mi_addr, map->ma_len);
5151
5152 /*
5153 * Update all MAC clients that share this MAC address.
5154 */
5155 mac_unicast_update_clients(mip, map);
5156 }
5157
5158 /*
5159 * Set up the primary MAC address.
5160 */
5161 void
5162 mac_init_macaddr(mac_impl_t *mip)
5163 {
5164 mac_address_t *map;
5165
5166 /*
5167 * The reference count is initialized to zero, until it's really
5168 * activated.
5169 */
5170 map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
5171 map->ma_len = mip->mi_type->mt_addr_length;
5172 bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
5173
5174 /*
5175 * If driver advertises RINGS capability, it shouldn't have initialized
5176 * its primary MAC address. For other drivers, including VNIC, the
5177 * primary address must work after registration.
5178 */
5179 if (mip->mi_rx_groups == NULL)
5180 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
5181
5182 map->ma_mip = mip;
5183
5184 mip->mi_addresses = map;
5185 }
5186
5187 /*
5188 * Clean up the primary MAC address. Note, only one primary MAC address
5189 * is allowed. All other MAC addresses must have been freed appropriately.
5190 */
5191 void
5192 mac_fini_macaddr(mac_impl_t *mip)
5193 {
5194 mac_address_t *map = mip->mi_addresses;
5195
5196 if (map == NULL)
5197 return;
5198
5199 /*
5200 * If mi_addresses is initialized, there should be exactly one
5201 * entry left on the list with no users.
5202 */
5203 ASSERT(map->ma_nusers == 0);
5204 ASSERT(map->ma_next == NULL);
5205
5206 kmem_free(map, sizeof (mac_address_t));
5207 mip->mi_addresses = NULL;
5208 }
5209
5210 /*
5211 * Logging related functions.
5212 *
5213 * Note that Kernel statistics have been extended to maintain fine
5214 * granularity of statistics viz. hardware lane, software lane, fanout
5215 * stats etc. However, extended accounting continues to support only
5216 * aggregate statistics like before.
5217 */
5218
/*
 * Write the flow description to a netinfo_t record.
 *
 * Allocates (KM_NOSLEEP) a netinfo_t wrapping a net_desc_t populated from
 * the flow entry's descriptor and resource properties. Returns NULL on
 * allocation failure. The caller owns the returned record.
 */
static netinfo_t *
mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
{
	netinfo_t		*ninfo;
	net_desc_t		*ndesc;
	flow_desc_t		*fdesc;
	mac_resource_props_t	*mrp;

	ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
	if (ninfo == NULL)
		return (NULL);
	ndesc = kmem_zalloc(sizeof (net_desc_t), KM_NOSLEEP);
	if (ndesc == NULL) {
		kmem_free(ninfo, sizeof (netinfo_t));
		return (NULL);
	}

	/*
	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
	 * Updates to the fe_flow_desc are done under the fe_lock
	 */
	mutex_enter(&flent->fe_lock);
	fdesc = &flent->fe_flow_desc;
	mrp = &flent->fe_resource_props;

	ndesc->nd_name = flent->fe_flow_name;
	ndesc->nd_devname = mcip->mci_name;
	bcopy(fdesc->fd_src_mac, ndesc->nd_ehost, ETHERADDRL);
	bcopy(fdesc->fd_dst_mac, ndesc->nd_edest, ETHERADDRL);
	ndesc->nd_sap = htonl(fdesc->fd_sap);
	/*
	 * Note: the cast binds to fd_ipversion (cast > == in precedence),
	 * so this assigns the 0/1 comparison result — presumably the
	 * intent; TODO confirm nd_isv4 is a boolean-valued field.
	 */
	ndesc->nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
	ndesc->nd_bw_limit = mrp->mrp_maxbw;
	if (ndesc->nd_isv4) {
		/* v4 addresses live in the last word of the v6 storage */
		ndesc->nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
		ndesc->nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
	} else {
		bcopy(&fdesc->fd_local_addr, ndesc->nd_saddr, IPV6_ADDR_LEN);
		bcopy(&fdesc->fd_remote_addr, ndesc->nd_daddr, IPV6_ADDR_LEN);
	}
	ndesc->nd_sport = htons(fdesc->fd_local_port);
	ndesc->nd_dport = htons(fdesc->fd_remote_port);
	ndesc->nd_protocol = (uint8_t)fdesc->fd_protocol;
	mutex_exit(&flent->fe_lock);

	ninfo->ni_record = ndesc;
	ninfo->ni_size = sizeof (net_desc_t);
	ninfo->ni_type = EX_NET_FLDESC_REC;

	return (ninfo);
}
5270
5271 /* Write the flow statistics to a netinfo_t record */
5272 static netinfo_t *
5273 mac_write_flow_stats(flow_entry_t *flent)
5274 {
5275 netinfo_t *ninfo;
5276 net_stat_t *nstat;
5277 mac_soft_ring_set_t *mac_srs;
5278 mac_rx_stats_t *mac_rx_stat;
5279 mac_tx_stats_t *mac_tx_stat;
5280 int i;
5281
5282 ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5283 if (ninfo == NULL)
5284 return (NULL);
5285 nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
5286 if (nstat == NULL) {
5287 kmem_free(ninfo, sizeof (netinfo_t));
5288 return (NULL);
5289 }
5290
5291 nstat->ns_name = flent->fe_flow_name;
5292 for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
5293 mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
5294 mac_rx_stat = &mac_srs->srs_rx.sr_stat;
5295
5296 nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
5297 mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;
5298 nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
5299 mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
5300 nstat->ns_oerrors += mac_rx_stat->mrs_ierrors;
5301 }
5302
5303 mac_srs = (mac_soft_ring_set_t *)(flent->fe_tx_srs);
5304 if (mac_srs != NULL) {
5305 mac_tx_stat = &mac_srs->srs_tx.st_stat;
5306
5307 nstat->ns_obytes = mac_tx_stat->mts_obytes;
5308 nstat->ns_opackets = mac_tx_stat->mts_opackets;
5309 nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
5310 }
5311
5312 ninfo->ni_record = nstat;
5313 ninfo->ni_size = sizeof (net_stat_t);
5314 ninfo->ni_type = EX_NET_FLSTAT_REC;
5315
5316 return (ninfo);
5317 }
5318
/*
 * Write the link description to a netinfo_t record.
 *
 * Allocates (KM_NOSLEEP) a netinfo_t wrapping a net_desc_t describing the
 * MAC client's link (name and source MAC only). Returns NULL on allocation
 * failure. The caller owns the returned record.
 */
static netinfo_t *
mac_write_link_desc(mac_client_impl_t *mcip)
{
	netinfo_t		*ninfo;
	net_desc_t		*ndesc;
	flow_entry_t		*flent = mcip->mci_flent;

	ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
	if (ninfo == NULL)
		return (NULL);
	ndesc = kmem_zalloc(sizeof (net_desc_t), KM_NOSLEEP);
	if (ndesc == NULL) {
		kmem_free(ninfo, sizeof (netinfo_t));
		return (NULL);
	}

	ndesc->nd_name = mcip->mci_name;
	ndesc->nd_devname = mcip->mci_name;
	/* links are unconditionally tagged v4 here; only the MAC is logged */
	ndesc->nd_isv4 = B_TRUE;
	/*
	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
	 * Updates to the fe_flow_desc are done under the fe_lock
	 * after removing the flent from the flow table.
	 */
	mutex_enter(&flent->fe_lock);
	bcopy(flent->fe_flow_desc.fd_src_mac, ndesc->nd_ehost, ETHERADDRL);
	mutex_exit(&flent->fe_lock);

	ninfo->ni_record = ndesc;
	ninfo->ni_size = sizeof (net_desc_t);
	ninfo->ni_type = EX_NET_LNDESC_REC;

	return (ninfo);
}
5354
5355 /* Write the link statistics to a netinfo_t record */
5356 static netinfo_t *
5357 mac_write_link_stats(mac_client_impl_t *mcip)
5358 {
5359 netinfo_t *ninfo;
5360 net_stat_t *nstat;
5361 flow_entry_t *flent;
5362 mac_soft_ring_set_t *mac_srs;
5363 mac_rx_stats_t *mac_rx_stat;
5364 mac_tx_stats_t *mac_tx_stat;
5365 int i;
5366
5367 ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5368 if (ninfo == NULL)
5369 return (NULL);
5370 nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
5371 if (nstat == NULL) {
5372 kmem_free(ninfo, sizeof (netinfo_t));
5373 return (NULL);
5374 }
5375
5376 nstat->ns_name = mcip->mci_name;
5377 flent = mcip->mci_flent;
5378 if (flent != NULL) {
5379 for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
5380 mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
5381 mac_rx_stat = &mac_srs->srs_rx.sr_stat;
5382
5383 nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
5384 mac_rx_stat->mrs_pollbytes +
5385 mac_rx_stat->mrs_lclbytes;
5386 nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
5387 mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
5388 nstat->ns_oerrors += mac_rx_stat->mrs_ierrors;
5389 }
5390 }
5391
5392 mac_srs = (mac_soft_ring_set_t *)(mcip->mci_flent->fe_tx_srs);
5393 if (mac_srs != NULL) {
5394 mac_tx_stat = &mac_srs->srs_tx.st_stat;
5395
5396 nstat->ns_obytes = mac_tx_stat->mts_obytes;
5397 nstat->ns_opackets = mac_tx_stat->mts_opackets;
5398 nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
5399 }
5400
5401 ninfo->ni_record = nstat;
5402 ninfo->ni_size = sizeof (net_stat_t);
5403 ninfo->ni_type = EX_NET_LNSTAT_REC;
5404
5405 return (ninfo);
5406 }
5407
/* State shared by the logging walkers (i_mac_impl_log(), mac_log_flowinfo()) */
typedef struct i_mac_log_state_s {
	boolean_t	mi_last;	/* last walk: clear *_DESC_LOGGED */
	int		mi_fenable;	/* flow logging is enabled */
	int		mi_lenable;	/* link logging is enabled */
	list_t		*mi_list;	/* accumulates netinfo_t records */
} i_mac_log_state_t;
5414
5415 /*
5416 * For a given flow, if the description has not been logged before, do it now.
5417 * If it is a VNIC, then we have collected information about it from the MAC
5418 * table, so skip it.
5419 *
5420 * Called through mac_flow_walk_nolock()
5421 *
5422 * Return 0 if successful.
5423 */
5424 static int
5425 mac_log_flowinfo(flow_entry_t *flent, void *arg)
5426 {
5427 mac_client_impl_t *mcip = flent->fe_mcip;
5428 i_mac_log_state_t *lstate = arg;
5429 netinfo_t *ninfo;
5430
5431 if (mcip == NULL)
5432 return (0);
5433
5434 /*
5435 * If the name starts with "vnic", and fe_user_generated is true (to
5436 * exclude the mcast and active flow entries created implicitly for
5437 * a vnic, it is a VNIC flow. i.e. vnic1 is a vnic flow,
5438 * vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
5439 */
5440 if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
5441 (flent->fe_type & FLOW_USER) != 0) {
5442 return (0);
5443 }
5444
5445 if (!flent->fe_desc_logged) {
5446 /*
5447 * We don't return error because we want to continue the
5448 * walk in case this is the last walk which means we
5449 * need to reset fe_desc_logged in all the flows.
5450 */
5451 if ((ninfo = mac_write_flow_desc(flent, mcip)) == NULL)
5452 return (0);
5453 list_insert_tail(lstate->mi_list, ninfo);
5454 flent->fe_desc_logged = B_TRUE;
5455 }
5456
5457 /*
5458 * Regardless of the error, we want to proceed in case we have to
5459 * reset fe_desc_logged.
5460 */
5461 ninfo = mac_write_flow_stats(flent);
5462 if (ninfo == NULL)
5463 return (-1);
5464
5465 list_insert_tail(lstate->mi_list, ninfo);
5466
5467 if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
5468 flent->fe_desc_logged = B_FALSE;
5469
5470 return (0);
5471 }
5472
5473 /*
5474 * Log the description for each mac client of this mac_impl_t, if it
5475 * hasn't already been done. Additionally, log statistics for the link as
5476 * well. Walk the flow table and log information for each flow as well.
5477 * If it is the last walk (mci_last), then we turn off mci_desc_logged (and
5478 * also fe_desc_logged, if flow logging is on) since we want to log the
5479 * description if and when logging is restarted.
5480 *
5481 * Return 0 upon success or -1 upon failure
5482 */
5483 static int
5484 i_mac_impl_log(mac_impl_t *mip, i_mac_log_state_t *lstate)
5485 {
5486 mac_client_impl_t *mcip;
5487 netinfo_t *ninfo;
5488
5489 i_mac_perim_enter(mip);
5490 /*
5491 * Only walk the client list for NIC and etherstub
5492 */
5493 if ((mip->mi_state_flags & MIS_DISABLED) ||
5494 ((mip->mi_state_flags & MIS_IS_VNIC) &&
5495 (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL))) {
5496 i_mac_perim_exit(mip);
5497 return (0);
5498 }
5499
5500 for (mcip = mip->mi_clients_list; mcip != NULL;
5501 mcip = mcip->mci_client_next) {
5502 if (!MCIP_DATAPATH_SETUP(mcip))
5503 continue;
5504 if (lstate->mi_lenable) {
5505 if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
5506 ninfo = mac_write_link_desc(mcip);
5507 if (ninfo == NULL) {
5508 /*
5509 * We can't terminate it if this is the last
5510 * walk, else there might be some links with
5511 * mi_desc_logged set to true, which means
5512 * their description won't be logged the next
5513 * time logging is started (similarly for the
5514 * flows within such links). We can continue
5515 * without walking the flow table (i.e. to
5516 * set fe_desc_logged to false) because we
5517 * won't have written any flow stuff for this
5518 * link as we haven't logged the link itself.
5519 */
5520 i_mac_perim_exit(mip);
5521 if (lstate->mi_last)
5522 return (0);
5523 else
5524 return (-1);
5525 }
5526 mcip->mci_state_flags |= MCIS_DESC_LOGGED;
5527 list_insert_tail(lstate->mi_list, ninfo);
5528 }
5529 }
5530
5531 ninfo = mac_write_link_stats(mcip);
5532 if (ninfo == NULL && !lstate->mi_last) {
5533 i_mac_perim_exit(mip);
5534 return (-1);
5535 }
5536 list_insert_tail(lstate->mi_list, ninfo);
5537
5538 if (lstate->mi_last)
5539 mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
5540
5541 if (lstate->mi_fenable) {
5542 if (mcip->mci_subflow_tab != NULL) {
5543 (void) mac_flow_walk_nolock(
5544 mcip->mci_subflow_tab, mac_log_flowinfo,
5545 lstate);
5546 }
5547 }
5548 }
5549 i_mac_perim_exit(mip);
5550 return (0);
5551 }
5552
5553 /*
5554 * modhash walker function to add a mac_impl_t to a list
5555 */
5556 /*ARGSUSED*/
5557 static uint_t
5558 i_mac_impl_list_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
5559 {
5560 list_t *list = (list_t *)arg;
5561 mac_impl_t *mip = (mac_impl_t *)val;
5562
5563 if ((mip->mi_state_flags & MIS_DISABLED) == 0) {
5564 list_insert_tail(list, mip);
5565 mip->mi_ref++;
5566 }
5567
5568 return (MH_WALK_CONTINUE);
5569 }
5570
/*
 * Gather a reference-held snapshot of all enabled mac_impls, generate
 * link/flow log records for each, then commit every collected record to
 * extended accounting and free it.
 *
 * Must be entered with i_mac_impl_lock held (asserted below); the lock is
 * dropped after the hash walk and this function returns with it released.
 * net_log_list is destroyed before returning.
 */
void
i_mac_log_info(list_t *net_log_list, i_mac_log_state_t *lstate)
{
	list_t			mac_impl_list;
	mac_impl_t		*mip;
	netinfo_t		*ninfo;

	/* Create list of mac_impls */
	ASSERT(RW_LOCK_HELD(&i_mac_impl_lock));
	list_create(&mac_impl_list, sizeof (mac_impl_t), offsetof(mac_impl_t,
	    mi_node));
	/* The walker takes a reference on each mac_impl it inserts. */
	mod_hash_walk(i_mac_impl_hash, i_mac_impl_list_walker, &mac_impl_list);
	rw_exit(&i_mac_impl_lock);

	/* Create log entries for each mac_impl */
	for (mip = list_head(&mac_impl_list); mip != NULL;
	    mip = list_next(&mac_impl_list, mip)) {
		if (i_mac_impl_log(mip, lstate) != 0)
			continue;
	}

	/* Remove elements and destroy list of mac_impls */
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	while ((mip = list_remove_tail(&mac_impl_list)) != NULL) {
		/* Drop the reference taken by i_mac_impl_list_walker(). */
		mip->mi_ref--;
	}
	rw_exit(&i_mac_impl_lock);
	list_destroy(&mac_impl_list);

	/*
	 * Write log entries to files outside of locks, free associated
	 * structures, and remove entries from the list.
	 */
	while ((ninfo = list_head(net_log_list)) != NULL) {
		(void) exacct_commit_netinfo(ninfo->ni_record, ninfo->ni_type);
		list_remove(net_log_list, ninfo);
		kmem_free(ninfo->ni_record, ninfo->ni_size);
		kmem_free(ninfo, sizeof (*ninfo));
	}
	list_destroy(net_log_list);
}
5612
5613 /*
5614 * The timer thread that runs every mac_logging_interval seconds and logs
5615 * link and/or flow information.
5616 */
5617 /* ARGSUSED */
5618 void
5619 mac_log_linkinfo(void *arg)
5620 {
5621 i_mac_log_state_t lstate;
5622 list_t net_log_list;
5623
5624 list_create(&net_log_list, sizeof (netinfo_t),
5625 offsetof(netinfo_t, ni_link));
5626
5627 rw_enter(&i_mac_impl_lock, RW_READER);
5628 if (!mac_flow_log_enable && !mac_link_log_enable) {
5629 rw_exit(&i_mac_impl_lock);
5630 return;
5631 }
5632 lstate.mi_fenable = mac_flow_log_enable;
5633 lstate.mi_lenable = mac_link_log_enable;
5634 lstate.mi_last = B_FALSE;
5635 lstate.mi_list = &net_log_list;
5636
5637 /* Write log entries for each mac_impl in the list */
5638 i_mac_log_info(&net_log_list, &lstate);
5639
5640 if (mac_flow_log_enable || mac_link_log_enable) {
5641 mac_logging_timer = timeout(mac_log_linkinfo, NULL,
5642 SEC_TO_TICK(mac_logging_interval));
5643 }
5644 }
5645
/* Argument for i_mac_fastpath_walker() */
typedef struct i_mac_fastpath_state_s {
	boolean_t	mf_disable;	/* B_TRUE: disable, B_FALSE: enable */
	int		mf_err;		/* error from mac_fastpath_disable() */
} i_mac_fastpath_state_t;
5650
5651 /* modhash walker function to enable or disable fastpath */
5652 /*ARGSUSED*/
5653 static uint_t
5654 i_mac_fastpath_walker(mod_hash_key_t key, mod_hash_val_t *val,
5655 void *arg)
5656 {
5657 i_mac_fastpath_state_t *state = arg;
5658 mac_handle_t mh = (mac_handle_t)val;
5659
5660 if (state->mf_disable)
5661 state->mf_err = mac_fastpath_disable(mh);
5662 else
5663 mac_fastpath_enable(mh);
5664
5665 return (state->mf_err == 0 ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
5666 }
5667
5668 /*
5669 * Start the logging timer.
5670 */
5671 int
5672 mac_start_logusage(mac_logtype_t type, uint_t interval)
5673 {
5674 i_mac_fastpath_state_t dstate = {B_TRUE, 0};
5675 i_mac_fastpath_state_t estate = {B_FALSE, 0};
5676 int err;
5677
5678 rw_enter(&i_mac_impl_lock, RW_WRITER);
5679 switch (type) {
5680 case MAC_LOGTYPE_FLOW:
5681 if (mac_flow_log_enable) {
5682 rw_exit(&i_mac_impl_lock);
5683 return (0);
5684 }
5685 /* FALLTHRU */
5686 case MAC_LOGTYPE_LINK:
5687 if (mac_link_log_enable) {
5688 rw_exit(&i_mac_impl_lock);
5689 return (0);
5690 }
5691 break;
5692 default:
5693 ASSERT(0);
5694 }
5695
5696 /* Disable fastpath */
5697 mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &dstate);
5698 if ((err = dstate.mf_err) != 0) {
5699 /* Reenable fastpath */
5700 mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
5701 rw_exit(&i_mac_impl_lock);
5702 return (err);
5703 }
5704
5705 switch (type) {
5706 case MAC_LOGTYPE_FLOW:
5707 mac_flow_log_enable = B_TRUE;
5708 /* FALLTHRU */
5709 case MAC_LOGTYPE_LINK:
5710 mac_link_log_enable = B_TRUE;
5711 break;
5712 }
5713
5714 mac_logging_interval = interval;
5715 rw_exit(&i_mac_impl_lock);
5716 mac_log_linkinfo(NULL);
5717 return (0);
5718 }
5719
5720 /*
5721 * Stop the logging timer if both link and flow logging are turned off.
5722 */
5723 void
5724 mac_stop_logusage(mac_logtype_t type)
5725 {
5726 i_mac_log_state_t lstate;
5727 i_mac_fastpath_state_t estate = {B_FALSE, 0};
5728 list_t net_log_list;
5729
5730 list_create(&net_log_list, sizeof (netinfo_t),
5731 offsetof(netinfo_t, ni_link));
5732
5733 rw_enter(&i_mac_impl_lock, RW_WRITER);
5734
5735 lstate.mi_fenable = mac_flow_log_enable;
5736 lstate.mi_lenable = mac_link_log_enable;
5737 lstate.mi_list = &net_log_list;
5738
5739 /* Last walk */
5740 lstate.mi_last = B_TRUE;
5741
5742 switch (type) {
5743 case MAC_LOGTYPE_FLOW:
5744 if (lstate.mi_fenable) {
5745 ASSERT(mac_link_log_enable);
5746 mac_flow_log_enable = B_FALSE;
5747 mac_link_log_enable = B_FALSE;
5748 break;
5749 }
5750 /* FALLTHRU */
5751 case MAC_LOGTYPE_LINK:
5752 if (!lstate.mi_lenable || mac_flow_log_enable) {
5753 rw_exit(&i_mac_impl_lock);
5754 return;
5755 }
5756 mac_link_log_enable = B_FALSE;
5757 break;
5758 default:
5759 ASSERT(0);
5760 }
5761
5762 /* Reenable fastpath */
5763 mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
5764
5765 (void) untimeout(mac_logging_timer);
5766 mac_logging_timer = 0;
5767
5768 /* Write log entries for each mac_impl in the list */
5769 i_mac_log_info(&net_log_list, &lstate);
5770 }
5771
5772 /*
5773 * Walk the rx and tx SRS/SRs for a flow and update the priority value.
5774 */
5775 void
5776 mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
5777 {
5778 pri_t pri;
5779 int count;
5780 mac_soft_ring_set_t *mac_srs;
5781
5782 if (flent->fe_rx_srs_cnt <= 0)
5783 return;
5784
5785 if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
5786 SRST_FLOW) {
5787 pri = FLOW_PRIORITY(mcip->mci_min_pri,
5788 mcip->mci_max_pri,
5789 flent->fe_resource_props.mrp_priority);
5790 } else {
5791 pri = mcip->mci_max_pri;
5792 }
5793
5794 for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
5795 mac_srs = flent->fe_rx_srs[count];
5796 mac_update_srs_priority(mac_srs, pri);
5797 }
5798 /*
5799 * If we have a Tx SRS, we need to modify all the threads associated
5800 * with it.
5801 */
5802 if (flent->fe_tx_srs != NULL)
5803 mac_update_srs_priority(flent->fe_tx_srs, pri);
5804 }
5805
5806 /*
5807 * RX and TX rings are reserved according to different semantics depending
5808 * on the requests from the MAC clients and type of rings:
5809 *
5810 * On the Tx side, by default we reserve individual rings, independently from
5811 * the groups.
5812 *
5813 * On the Rx side, the reservation is at the granularity of the group
5814 * of rings, and used for v12n level 1 only. It has a special case for the
5815 * primary client.
5816 *
5817 * If a share is allocated to a MAC client, we allocate a TX group and an
5818 * RX group to the client, and assign TX rings and RX rings to these
5819 * groups according to information gathered from the driver through
5820 * the share capability.
5821 *
 * The foreseeable evolution of Rx rings will handle v12n level 2 and higher
5823 * to allocate individual rings out of a group and program the hw classifier
5824 * based on IP address or higher level criteria.
5825 */
5826
5827 /*
5828 * mac_reserve_tx_ring()
 * Reserve an unused ring by marking it with MR_INUSE state.
5830 * As reserved, the ring is ready to function.
5831 *
5832 * Notes for Hybrid I/O:
5833 *
5834 * If a specific ring is needed, it is specified through the desired_ring
5835 * argument. Otherwise that argument is set to NULL.
 * If the desired ring was previously allocated to another client, this
5837 * function swaps it with a new ring from the group of unassigned rings.
5838 */
mac_ring_t *
mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
{
	mac_group_t		*group;
	mac_grp_client_t	*mgcp;
	mac_client_impl_t	*mcip;
	mac_soft_ring_set_t	*srs;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * NOTE(review): desired_ring is dereferenced unconditionally below,
	 * so despite the header comment's NULL case, callers must not pass
	 * NULL -- confirm against all callers.
	 */

	/*
	 * Find an available ring and start it before changing its status.
	 * The unassigned rings are at the end of the mi_tx_groups
	 * array.
	 */
	group = MAC_DEFAULT_TX_GROUP(mip);

	/* Can't take the default ring out of the default group */
	ASSERT(desired_ring != (mac_ring_t *)mip->mi_default_tx_ring);

	/* A free ring can be handed over immediately once started. */
	if (desired_ring->mr_state == MR_FREE) {
		ASSERT(MAC_GROUP_NO_CLIENT(group));
		if (mac_start_ring(desired_ring) != 0)
			return (NULL);
		return (desired_ring);
	}
	/*
	 * There are clients using this ring, so let's move the clients
	 * away from using this ring.
	 */
	for (mgcp = group->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
		mcip = mgcp->mgc_client;
		/* Quiesce the client's Tx before detaching its soft ring. */
		mac_tx_client_quiesce((mac_client_handle_t)mcip);
		srs = MCIP_TX_SRS(mcip);
		ASSERT(mac_tx_srs_ring_present(srs, desired_ring));
		mac_tx_invoke_callbacks(mcip,
		    (mac_tx_cookie_t)mac_tx_srs_get_soft_ring(srs,
		    desired_ring));
		mac_tx_srs_del_ring(srs, desired_ring);
		mac_tx_client_restart((mac_client_handle_t)mcip);
	}
	return (desired_ring);
}
5882
5883 /*
5884 * For a reserved group with multiple clients, return the primary client.
5885 */
5886 static mac_client_impl_t *
5887 mac_get_grp_primary(mac_group_t *grp)
5888 {
5889 mac_grp_client_t *mgcp = grp->mrg_clients;
5890 mac_client_impl_t *mcip;
5891
5892 while (mgcp != NULL) {
5893 mcip = mgcp->mgc_client;
5894 if (mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC)
5895 return (mcip);
5896 mgcp = mgcp->mgc_next;
5897 }
5898 return (NULL);
5899 }
5900
5901 /*
5902 * Hybrid I/O specifies the ring that should be given to a share.
5903 * If the ring is already used by clients, then we need to release
5904 * the ring back to the default group so that we can give it to
5905 * the share. This means the clients using this ring now get a
5906 * replacement ring. If there aren't any replacement rings, this
5907 * function returns a failure.
5908 */
static int
mac_reclaim_ring_from_grp(mac_impl_t *mip, mac_ring_type_t ring_type,
    mac_ring_t *ring, mac_ring_t **rings, int nrings)
{
	mac_group_t		*group = (mac_group_t *)ring->mr_gh;
	mac_resource_props_t	*mrp;
	mac_client_impl_t	*mcip;
	mac_group_t		*defgrp;
	mac_ring_t		*tring;
	mac_group_t		*tgrp;
	int			i;
	int			j;

	/* Find a client using the group; shares never own these groups. */
	mcip = MAC_GROUP_ONLY_CLIENT(group);
	if (mcip == NULL)
		mcip = mac_get_grp_primary(group);
	ASSERT(mcip != NULL);
	ASSERT(mcip->mci_share == NULL);

	mrp = MCIP_RESOURCE_PROPS(mcip);
	if (ring_type == MAC_RING_TYPE_RX) {
		defgrp = mip->mi_rx_donor_grp;
		if ((mrp->mrp_mask & MRP_RX_RINGS) == 0) {
			/* Need to put this mac client in the default group */
			if (mac_rx_switch_group(mcip, group, defgrp) != 0)
				return (ENOSPC);
		} else {
			/*
			 * Switch this ring with some other ring from
			 * the default group.
			 *
			 * Candidate rings: not ring 0 (it carries the
			 * multicast traffic) and not among the rings the
			 * share asked for (rings[]).
			 */
			for (tring = defgrp->mrg_rings; tring != NULL;
			    tring = tring->mr_next) {
				if (tring->mr_index == 0)
					continue;
				for (j = 0; j < nrings; j++) {
					if (rings[j] == tring)
						break;
				}
				if (j >= nrings)
					break;
			}
			if (tring == NULL)
				return (ENOSPC);
			if (mac_group_mov_ring(mip, group, tring) != 0)
				return (ENOSPC);
			if (mac_group_mov_ring(mip, defgrp, ring) != 0) {
				/* Undo the first move on failure. */
				(void) mac_group_mov_ring(mip, defgrp, tring);
				return (ENOSPC);
			}
		}
		ASSERT(ring->mr_gh == (mac_group_handle_t)defgrp);
		return (0);
	}

	defgrp = MAC_DEFAULT_TX_GROUP(mip);
	if (ring == (mac_ring_t *)mip->mi_default_tx_ring) {
		/*
		 * See if we can get a spare ring to replace the default
		 * ring.
		 */
		if (defgrp->mrg_cur_count == 1) {
			/*
			 * Need to get a ring from another client, see if
			 * there are any clients that can be moved to
			 * the default group, thereby freeing some rings.
			 */
			for (i = 0; i < mip->mi_tx_group_count; i++) {
				tgrp = &mip->mi_tx_groups[i];
				if (tgrp->mrg_state ==
				    MAC_GROUP_STATE_REGISTERED) {
					continue;
				}
				mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
				if (mcip == NULL)
					mcip = mac_get_grp_primary(tgrp);
				ASSERT(mcip != NULL);
				mrp = MCIP_RESOURCE_PROPS(mcip);
				if ((mrp->mrp_mask & MRP_TX_RINGS) == 0) {
					ASSERT(tgrp->mrg_cur_count == 1);
					/*
					 * If this ring is part of the
					 * rings asked by the share we cannot
					 * use it as the default ring.
					 */
					for (j = 0; j < nrings; j++) {
						if (rings[j] == tgrp->mrg_rings)
							break;
					}
					if (j < nrings)
						continue;
					mac_tx_client_quiesce(
					    (mac_client_handle_t)mcip);
					mac_tx_switch_group(mcip, tgrp,
					    defgrp);
					mac_tx_client_restart(
					    (mac_client_handle_t)mcip);
					break;
				}
			}
			/*
			 * All the rings are reserved, can't give up the
			 * default ring.
			 */
			if (defgrp->mrg_cur_count <= 1)
				return (ENOSPC);
		}
		/*
		 * Swap the default ring with another.
		 */
		for (tring = defgrp->mrg_rings; tring != NULL;
		    tring = tring->mr_next) {
			/*
			 * If this ring is part of the rings asked by the
			 * share we cannot use it as the default ring.
			 */
			for (j = 0; j < nrings; j++) {
				if (rings[j] == tring)
					break;
			}
			if (j >= nrings)
				break;
		}
		ASSERT(tring != NULL);
		mip->mi_default_tx_ring = (mac_ring_handle_t)tring;
		return (0);
	}
	/*
	 * The Tx ring is with a group reserved by a MAC client. See if
	 * we can swap it.
	 */
	ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
	mcip = MAC_GROUP_ONLY_CLIENT(group);
	if (mcip == NULL)
		mcip = mac_get_grp_primary(group);
	ASSERT(mcip != NULL);
	mrp = MCIP_RESOURCE_PROPS(mcip);
	/* Quiesce the client's Tx while its rings are rearranged. */
	mac_tx_client_quiesce((mac_client_handle_t)mcip);
	if ((mrp->mrp_mask & MRP_TX_RINGS) == 0) {
		ASSERT(group->mrg_cur_count == 1);
		/* Put this mac client in the default group */
		mac_tx_switch_group(mcip, group, defgrp);
	} else {
		/*
		 * Switch this ring with some other ring from
		 * the default group.
		 */
		for (tring = defgrp->mrg_rings; tring != NULL;
		    tring = tring->mr_next) {
			if (tring == (mac_ring_t *)mip->mi_default_tx_ring)
				continue;
			/*
			 * If this ring is part of the rings asked by the
			 * share we cannot use it for swapping.
			 */
			for (j = 0; j < nrings; j++) {
				if (rings[j] == tring)
					break;
			}
			if (j >= nrings)
				break;
		}
		if (tring == NULL) {
			mac_tx_client_restart((mac_client_handle_t)mcip);
			return (ENOSPC);
		}
		if (mac_group_mov_ring(mip, group, tring) != 0) {
			mac_tx_client_restart((mac_client_handle_t)mcip);
			return (ENOSPC);
		}
		if (mac_group_mov_ring(mip, defgrp, ring) != 0) {
			/* Undo the first move on failure. */
			(void) mac_group_mov_ring(mip, defgrp, tring);
			mac_tx_client_restart((mac_client_handle_t)mcip);
			return (ENOSPC);
		}
	}
	mac_tx_client_restart((mac_client_handle_t)mcip);
	ASSERT(ring->mr_gh == (mac_group_handle_t)defgrp);
	return (0);
}
6089
6090 /*
6091 * Populate a zero-ring group with rings. If the share is non-NULL,
6092 * the rings are chosen according to that share.
6093 * Invoked after allocating a new RX or TX group through
6094 * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
6095 * Returns zero on success, an errno otherwise.
6096 */
6097 int
6098 i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
6099 mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share,
6100 uint32_t ringcnt)
6101 {
6102 mac_ring_t **rings, *ring;
6103 uint_t nrings;
6104 int rv = 0, i = 0, j;
6105
6106 ASSERT((ring_type == MAC_RING_TYPE_RX &&
6107 mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) ||
6108 (ring_type == MAC_RING_TYPE_TX &&
6109 mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC));
6110
6111 /*
6112 * First find the rings to allocate to the group.
6113 */
6114 if (share != NULL) {
6115 /* get rings through ms_squery() */
6116 mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
6117 ASSERT(nrings != 0);
6118 rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
6119 KM_SLEEP);
6120 mip->mi_share_capab.ms_squery(share, ring_type,
6121 (mac_ring_handle_t *)rings, &nrings);
6122 for (i = 0; i < nrings; i++) {
6123 /*
6124 * If we have given this ring to a non-default
6125 * group, we need to check if we can get this
6126 * ring.
6127 */
6128 ring = rings[i];
6129 if (ring->mr_gh != (mac_group_handle_t)src_group ||
6130 ring == (mac_ring_t *)mip->mi_default_tx_ring) {
6131 if (mac_reclaim_ring_from_grp(mip, ring_type,
6132 ring, rings, nrings) != 0) {
6133 rv = ENOSPC;
6134 goto bail;
6135 }
6136 }
6137 }
6138 } else {
6139 /*
6140 * Pick one ring from default group.
6141 *
6142 * for now pick the second ring which requires the first ring
6143 * at index 0 to stay in the default group, since it is the
6144 * ring which carries the multicast traffic.
6145 * We need a better way for a driver to indicate this,
6146 * for example a per-ring flag.
6147 */
6148 rings = kmem_alloc(ringcnt * sizeof (mac_ring_handle_t),
6149 KM_SLEEP);
6150 for (ring = src_group->mrg_rings; ring != NULL;
6151 ring = ring->mr_next) {
6152 if (ring_type == MAC_RING_TYPE_RX &&
6153 ring->mr_index == 0) {
6154 continue;
6155 }
6156 if (ring_type == MAC_RING_TYPE_TX &&
6157 ring == (mac_ring_t *)mip->mi_default_tx_ring) {
6158 continue;
6159 }
6160 rings[i++] = ring;
6161 if (i == ringcnt)
6162 break;
6163 }
6164 ASSERT(ring != NULL);
6165 nrings = i;
6166 /* Not enough rings as required */
6167 if (nrings != ringcnt) {
6168 rv = ENOSPC;
6169 goto bail;
6170 }
6171 }
6172
6173 switch (ring_type) {
6174 case MAC_RING_TYPE_RX:
6175 if (src_group->mrg_cur_count - nrings < 1) {
6176 /* we ran out of rings */
6177 rv = ENOSPC;
6178 goto bail;
6179 }
6180
6181 /* move receive rings to new group */
6182 for (i = 0; i < nrings; i++) {
6183 rv = mac_group_mov_ring(mip, new_group, rings[i]);
6184 if (rv != 0) {
6185 /* move rings back on failure */
6186 for (j = 0; j < i; j++) {
6187 (void) mac_group_mov_ring(mip,
6188 src_group, rings[j]);
6189 }
6190 goto bail;
6191 }
6192 }
6193 break;
6194
6195 case MAC_RING_TYPE_TX: {
6196 mac_ring_t *tmp_ring;
6197
6198 /* move the TX rings to the new group */
6199 for (i = 0; i < nrings; i++) {
6200 /* get the desired ring */
6201 tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
6202 if (tmp_ring == NULL) {
6203 rv = ENOSPC;
6204 goto bail;
6205 }
6206 ASSERT(tmp_ring == rings[i]);
6207 rv = mac_group_mov_ring(mip, new_group, rings[i]);
6208 if (rv != 0) {
6209 /* cleanup on failure */
6210 for (j = 0; j < i; j++) {
6211 (void) mac_group_mov_ring(mip,
6212 MAC_DEFAULT_TX_GROUP(mip),
6213 rings[j]);
6214 }
6215 goto bail;
6216 }
6217 }
6218 break;
6219 }
6220 }
6221
6222 /* add group to share */
6223 if (share != NULL)
6224 mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
6225
6226 bail:
6227 /* free temporary array of rings */
6228 kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
6229
6230 return (rv);
6231 }
6232
6233 void
6234 mac_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
6235 {
6236 mac_grp_client_t *mgcp;
6237
6238 for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
6239 if (mgcp->mgc_client == mcip)
6240 break;
6241 }
6242
6243 VERIFY(mgcp == NULL);
6244
6245 mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
6246 mgcp->mgc_client = mcip;
6247 mgcp->mgc_next = grp->mrg_clients;
6248 grp->mrg_clients = mgcp;
6249
6250 }
6251
6252 void
6253 mac_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
6254 {
6255 mac_grp_client_t *mgcp, **pprev;
6256
6257 for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
6258 pprev = &mgcp->mgc_next, mgcp = *pprev) {
6259 if (mgcp->mgc_client == mcip)
6260 break;
6261 }
6262
6263 ASSERT(mgcp != NULL);
6264
6265 *pprev = mgcp->mgc_next;
6266 kmem_free(mgcp, sizeof (mac_grp_client_t));
6267 }
6268
6269 /*
6270 * mac_reserve_rx_group()
6271 *
6272 * Finds an available group and exclusively reserves it for a client.
6273 * The group is chosen to suit the flow's resource controls (bandwidth and
6274 * fanout requirements) and the address type.
 * If the requestor is the primary MAC then return the group with the
6276 * largest number of rings, otherwise the default ring when available.
6277 */
6278 mac_group_t *
6279 mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
6280 {
6281 mac_share_handle_t share = mcip->mci_share;
6282 mac_impl_t *mip = mcip->mci_mip;
6283 mac_group_t *grp = NULL;
6284 int i;
6285 int err = 0;
6286 mac_address_t *map;
6287 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
6288 int nrings;
6289 int donor_grp_rcnt;
6290 boolean_t need_exclgrp = B_FALSE;
6291 int need_rings = 0;
6292 mac_group_t *candidate_grp = NULL;
6293 mac_client_impl_t *gclient;
6294 mac_resource_props_t *gmrp;
6295 mac_group_t *donorgrp = NULL;
6296 boolean_t rxhw = mrp->mrp_mask & MRP_RX_RINGS;
6297 boolean_t unspec = mrp->mrp_mask & MRP_RXRINGS_UNSPEC;
6298 boolean_t isprimary;
6299
6300 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
6301
6302 isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
6303
6304 /*
6305 * Check if a group already has this mac address (case of VLANs)
6306 * unless we are moving this MAC client from one group to another.
6307 */
6308 if (!move && (map = mac_find_macaddr(mip, mac_addr)) != NULL) {
6309 if (map->ma_group != NULL)
6310 return (map->ma_group);
6311 }
6312 if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0)
6313 return (NULL);
6314 /*
6315 * If exclusive open, return NULL which will enable the
6316 * caller to use the default group.
6317 */
6318 if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
6319 return (NULL);
6320
6321 /* For dynamic groups default unspecified to 1 */
6322 if (rxhw && unspec &&
6323 mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6324 mrp->mrp_nrxrings = 1;
6325 }
6326 /*
6327 * For static grouping we allow only specifying rings=0 and
6328 * unspecified
6329 */
6330 if (rxhw && mrp->mrp_nrxrings > 0 &&
6331 mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) {
6332 return (NULL);
6333 }
6334 if (rxhw) {
6335 /*
6336 * We have explicitly asked for a group (with nrxrings,
6337 * if unspec).
6338 */
6339 if (unspec || mrp->mrp_nrxrings > 0) {
6340 need_exclgrp = B_TRUE;
6341 need_rings = mrp->mrp_nrxrings;
6342 } else if (mrp->mrp_nrxrings == 0) {
6343 /*
6344 * We have asked for a software group.
6345 */
6346 return (NULL);
6347 }
6348 } else if (isprimary && mip->mi_nactiveclients == 1 &&
6349 mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6350 /*
6351 * If the primary is the only active client on this
6352 * mip and we have not asked for any rings, we give
6353 * it the default group so that the primary gets to
6354 * use all the rings.
6355 */
6356 return (NULL);
6357 }
6358
6359 /* The group that can donate rings */
6360 donorgrp = mip->mi_rx_donor_grp;
6361
6362 /*
6363 * The number of rings that the default group can donate.
6364 * We need to leave at least one ring.
6365 */
6366 donor_grp_rcnt = donorgrp->mrg_cur_count - 1;
6367
6368 /*
6369 * Try to exclusively reserve a RX group.
6370 *
6371 * For flows requiring HW_DEFAULT_RING (unicast flow of the primary
6372 * client), try to reserve the a non-default RX group and give
6373 * it all the rings from the donor group, except the default ring
6374 *
6375 * For flows requiring HW_RING (unicast flow of other clients), try
6376 * to reserve non-default RX group with the specified number of
6377 * rings, if available.
6378 *
6379 * For flows that have not asked for software or hardware ring,
6380 * try to reserve a non-default group with 1 ring, if available.
6381 */
6382 for (i = 1; i < mip->mi_rx_group_count; i++) {
6383 grp = &mip->mi_rx_groups[i];
6384
6385 DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
6386 int, grp->mrg_index, mac_group_state_t, grp->mrg_state);
6387
6388 /*
6389 * Check if this group could be a candidate group for
6390 * eviction if we need a group for this MAC client,
6391 * but there aren't any. A candidate group is one
6392 * that didn't ask for an exclusive group, but got
6393 * one and it has enough rings (combined with what
6394 * the donor group can donate) for the new MAC
6395 * client
6396 */
6397 if (grp->mrg_state >= MAC_GROUP_STATE_RESERVED) {
6398 /*
6399 * If the primary/donor group is not the default
6400 * group, don't bother looking for a candidate group.
6401 * If we don't have enough rings we will check
6402 * if the primary group can be vacated.
6403 */
6404 if (candidate_grp == NULL &&
6405 donorgrp == MAC_DEFAULT_RX_GROUP(mip)) {
6406 ASSERT(!MAC_GROUP_NO_CLIENT(grp));
6407 gclient = MAC_GROUP_ONLY_CLIENT(grp);
6408 if (gclient == NULL)
6409 gclient = mac_get_grp_primary(grp);
6410 ASSERT(gclient != NULL);
6411 gmrp = MCIP_RESOURCE_PROPS(gclient);
6412 if (gclient->mci_share == NULL &&
6413 (gmrp->mrp_mask & MRP_RX_RINGS) == 0 &&
6414 (unspec ||
6415 (grp->mrg_cur_count + donor_grp_rcnt >=
6416 need_rings))) {
6417 candidate_grp = grp;
6418 }
6419 }
6420 continue;
6421 }
6422 /*
6423 * This group could already be SHARED by other multicast
6424 * flows on this client. In that case, the group would
6425 * be shared and has already been started.
6426 */
6427 ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);
6428
6429 if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
6430 (mac_start_group(grp) != 0)) {
6431 continue;
6432 }
6433
6434 if (mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6435 break;
6436 ASSERT(grp->mrg_cur_count == 0);
6437
6438 /*
6439 * Populate the group. Rings should be taken
6440 * from the donor group.
6441 */
6442 nrings = rxhw ? need_rings : isprimary ? donor_grp_rcnt: 1;
6443
6444 /*
6445 * If the donor group can't donate, let's just walk and
6446 * see if someone can vacate a group, so that we have
6447 * enough rings for this, unless we already have
 * identified a candidate group.
6449 */
6450 if (nrings <= donor_grp_rcnt) {
6451 err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
6452 donorgrp, grp, share, nrings);
6453 if (err == 0) {
6454 /*
6455 * For a share i_mac_group_allocate_rings gets
6456 * the rings from the driver, let's populate
6457 * the property for the client now.
6458 */
6459 if (share != NULL) {
6460 mac_client_set_rings(
6461 (mac_client_handle_t)mcip,
6462 grp->mrg_cur_count, -1);
6463 }
6464 if (mac_is_primary_client(mcip) && !rxhw)
6465 mip->mi_rx_donor_grp = grp;
6466 break;
6467 }
6468 }
6469
6470 DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
6471 mip->mi_name, int, grp->mrg_index, int, err);
6472
6473 /*
6474 * It's a dynamic group but the grouping operation
6475 * failed.
6476 */
6477 mac_stop_group(grp);
6478 }
6479 /* We didn't find an exclusive group for this MAC client */
6480 if (i >= mip->mi_rx_group_count) {
6481
6482 if (!need_exclgrp)
6483 return (NULL);
6484
6485 /*
6486 * If we found a candidate group then we switch the
6487 * MAC client from the candidate_group to the default
6488 * group and give the group to this MAC client. If
6489 * we didn't find a candidate_group, check if the
6490 * primary is in its own group and if it can make way
6491 * for this MAC client.
6492 */
6493 if (candidate_grp == NULL &&
6494 donorgrp != MAC_DEFAULT_RX_GROUP(mip) &&
6495 donorgrp->mrg_cur_count >= need_rings) {
6496 candidate_grp = donorgrp;
6497 }
6498 if (candidate_grp != NULL) {
6499 boolean_t prim_grp = B_FALSE;
6500
6501 /*
6502 * Switch the MAC client from the candidate group
 * to the default group. If this group was the
6504 * donor group, then after the switch we need
6505 * to update the donor group too.
6506 */
6507 grp = candidate_grp;
6508 gclient = MAC_GROUP_ONLY_CLIENT(grp);
6509 if (gclient == NULL)
6510 gclient = mac_get_grp_primary(grp);
6511 if (grp == mip->mi_rx_donor_grp)
6512 prim_grp = B_TRUE;
6513 if (mac_rx_switch_group(gclient, grp,
6514 MAC_DEFAULT_RX_GROUP(mip)) != 0) {
6515 return (NULL);
6516 }
6517 if (prim_grp) {
6518 mip->mi_rx_donor_grp =
6519 MAC_DEFAULT_RX_GROUP(mip);
6520 donorgrp = MAC_DEFAULT_RX_GROUP(mip);
6521 }
6522
6523
6524 /*
6525 * Now give this group with the required rings
6526 * to this MAC client.
6527 */
6528 ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
6529 if (mac_start_group(grp) != 0)
6530 return (NULL);
6531
6532 if (mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6533 return (grp);
6534
6535 donor_grp_rcnt = donorgrp->mrg_cur_count - 1;
6536 ASSERT(grp->mrg_cur_count == 0);
6537 ASSERT(donor_grp_rcnt >= need_rings);
6538 err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
6539 donorgrp, grp, share, need_rings);
6540 if (err == 0) {
6541 /*
6542 * For a share i_mac_group_allocate_rings gets
6543 * the rings from the driver, let's populate
6544 * the property for the client now.
6545 */
6546 if (share != NULL) {
6547 mac_client_set_rings(
6548 (mac_client_handle_t)mcip,
6549 grp->mrg_cur_count, -1);
6550 }
6551 DTRACE_PROBE2(rx__group__reserved,
6552 char *, mip->mi_name, int, grp->mrg_index);
6553 return (grp);
6554 }
6555 DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
6556 mip->mi_name, int, grp->mrg_index, int, err);
6557 mac_stop_group(grp);
6558 }
6559 return (NULL);
6560 }
6561 ASSERT(grp != NULL);
6562
6563 DTRACE_PROBE2(rx__group__reserved,
6564 char *, mip->mi_name, int, grp->mrg_index);
6565 return (grp);
6566 }
6567
6568 /*
6569 * mac_rx_release_group()
6570 *
6571 * This is called when there are no clients left for the group.
6572 * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
6573 * and if it is a non default group, the shares are removed and
6574 * all rings are assigned back to default group.
6575 */
6576 void
6577 mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
6578 {
6579 mac_impl_t *mip = mcip->mci_mip;
6580 mac_ring_t *ring;
6581
6582 ASSERT(group != MAC_DEFAULT_RX_GROUP(mip));
6583
6584 if (mip->mi_rx_donor_grp == group)
6585 mip->mi_rx_donor_grp = MAC_DEFAULT_RX_GROUP(mip);
6586
6587 /*
6588 * This is the case where there are no clients left. Any
6589 * SRS etc on this group have also be quiesced.
6590 */
6591 for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
6592 if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
6593 ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
6594 /*
6595 * Remove the SRS associated with the HW ring.
6596 * As a result, polling will be disabled.
6597 */
6598 ring->mr_srs = NULL;
6599 }
6600 ASSERT(group->mrg_state < MAC_GROUP_STATE_RESERVED ||
6601 ring->mr_state == MR_INUSE);
6602 if (ring->mr_state == MR_INUSE) {
6603 mac_stop_ring(ring);
6604 ring->mr_flag = 0;
6605 }
6606 }
6607
6608 /* remove group from share */
6609 if (mcip->mci_share != NULL) {
6610 mip->mi_share_capab.ms_sremove(mcip->mci_share,
6611 group->mrg_driver);
6612 }
6613
6614 if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6615 mac_ring_t *ring;
6616
6617 /*
6618 * Rings were dynamically allocated to group.
6619 * Move rings back to default group.
6620 */
6621 while ((ring = group->mrg_rings) != NULL) {
6622 (void) mac_group_mov_ring(mip, mip->mi_rx_donor_grp,
6623 ring);
6624 }
6625 }
6626 mac_stop_group(group);
6627 /*
6628 * Possible improvement: See if we can assign the group just released
6629 * to a another client of the mip
6630 */
6631 }
6632
6633 /*
6634 * When we move the primary's mac address between groups, we need to also
6635 * take all the clients sharing the same mac address along with it (VLANs)
6636 * We remove the mac address for such clients from the group after quiescing
6637 * them. When we add the mac address we restart the client. Note that
6638 * the primary's mac address is removed from the group after all the
6639 * other clients sharing the address are removed. Similarly, the primary's
6640 * mac address is added before all the other client's mac address are
6641 * added. While grp is the group where the clients reside, tgrp is
6642 * the group where the addresses have to be added.
6643 */
6644 static void
6645 mac_rx_move_macaddr_prim(mac_client_impl_t *mcip, mac_group_t *grp,
6646 mac_group_t *tgrp, uint8_t *maddr, boolean_t add)
6647 {
6648 mac_impl_t *mip = mcip->mci_mip;
6649 mac_grp_client_t *mgcp = grp->mrg_clients;
6650 mac_client_impl_t *gmcip;
6651 boolean_t prim;
6652
6653 prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
6654
6655 /*
6656 * If the clients are in a non-default group, we just have to
6657 * walk the group's client list. If it is in the default group
6658 * (which will be shared by other clients as well, we need to
6659 * check if the unicast address matches mcip's unicast.
6660 */
6661 while (mgcp != NULL) {
6662 gmcip = mgcp->mgc_client;
6663 if (gmcip != mcip &&
6664 (grp != MAC_DEFAULT_RX_GROUP(mip) ||
6665 mcip->mci_unicast == gmcip->mci_unicast)) {
6666 if (!add) {
6667 mac_rx_client_quiesce(
6668 (mac_client_handle_t)gmcip);
6669 (void) mac_remove_macaddr(mcip->mci_unicast);
6670 } else {
6671 (void) mac_add_macaddr(mip, tgrp, maddr, prim);
6672 mac_rx_client_restart(
6673 (mac_client_handle_t)gmcip);
6674 }
6675 }
6676 mgcp = mgcp->mgc_next;
6677 }
6678 }
6679
6680
6681 /*
6682 * Move the MAC address from fgrp to tgrp. If this is the primary client,
6683 * we need to take any VLANs etc. together too.
6684 */
6685 static int
6686 mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp,
6687 mac_group_t *tgrp)
6688 {
6689 mac_impl_t *mip = mcip->mci_mip;
6690 uint8_t maddr[MAXMACADDRLEN];
6691 int err = 0;
6692 boolean_t prim;
6693 boolean_t multiclnt = B_FALSE;
6694
6695 mac_rx_client_quiesce((mac_client_handle_t)mcip);
6696 ASSERT(mcip->mci_unicast != NULL);
6697 bcopy(mcip->mci_unicast->ma_addr, maddr, mcip->mci_unicast->ma_len);
6698
6699 prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
6700 if (mcip->mci_unicast->ma_nusers > 1) {
6701 mac_rx_move_macaddr_prim(mcip, fgrp, NULL, maddr, B_FALSE);
6702 multiclnt = B_TRUE;
6703 }
6704 ASSERT(mcip->mci_unicast->ma_nusers == 1);
6705 err = mac_remove_macaddr(mcip->mci_unicast);
6706 if (err != 0) {
6707 mac_rx_client_restart((mac_client_handle_t)mcip);
6708 if (multiclnt) {
6709 mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
6710 B_TRUE);
6711 }
6712 return (err);
6713 }
6714 /*
6715 * Program the H/W Classifier first, if this fails we need
6716 * not proceed with the other stuff.
6717 */
6718 if ((err = mac_add_macaddr(mip, tgrp, maddr, prim)) != 0) {
6719 /* Revert back the H/W Classifier */
6720 if ((err = mac_add_macaddr(mip, fgrp, maddr, prim)) != 0) {
6721 /*
6722 * This should not fail now since it worked earlier,
6723 * should we panic?
6724 */
6725 cmn_err(CE_WARN,
6726 "mac_rx_switch_group: switching %p back"
6727 " to group %p failed!!", (void *)mcip,
6728 (void *)fgrp);
6729 }
6730 mac_rx_client_restart((mac_client_handle_t)mcip);
6731 if (multiclnt) {
6732 mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
6733 B_TRUE);
6734 }
6735 return (err);
6736 }
6737 mcip->mci_unicast = mac_find_macaddr(mip, maddr);
6738 mac_rx_client_restart((mac_client_handle_t)mcip);
6739 if (multiclnt)
6740 mac_rx_move_macaddr_prim(mcip, fgrp, tgrp, maddr, B_TRUE);
6741 return (err);
6742 }
6743
6744 /*
6745 * Switch the MAC client from one group to another. This means we need
6746 * to remove the MAC address from the group, remove the MAC client,
6747 * teardown the SRSs and revert the group state. Then, we add the client
6748 * to the destination group, set the SRSs, and add the MAC address to the
6749 * group.
6750 */
6751 int
6752 mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
6753 mac_group_t *tgrp)
6754 {
6755 int err;
6756 mac_group_state_t next_state;
6757 mac_client_impl_t *group_only_mcip;
6758 mac_client_impl_t *gmcip;
6759 mac_impl_t *mip = mcip->mci_mip;
6760 mac_grp_client_t *mgcp;
6761
6762 ASSERT(fgrp == mcip->mci_flent->fe_rx_ring_group);
6763
6764 if ((err = mac_rx_move_macaddr(mcip, fgrp, tgrp)) != 0)
6765 return (err);
6766
6767 /*
6768 * The group might be reserved, but SRSs may not be set up, e.g.
6769 * primary and its vlans using a reserved group.
6770 */
6771 if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED &&
6772 MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) {
6773 mac_rx_srs_group_teardown(mcip->mci_flent, B_TRUE);
6774 }
6775 if (fgrp != MAC_DEFAULT_RX_GROUP(mip)) {
6776 mgcp = fgrp->mrg_clients;
6777 while (mgcp != NULL) {
6778 gmcip = mgcp->mgc_client;
6779 mgcp = mgcp->mgc_next;
6780 mac_group_remove_client(fgrp, gmcip);
6781 mac_group_add_client(tgrp, gmcip);
6782 gmcip->mci_flent->fe_rx_ring_group = tgrp;
6783 }
6784 mac_release_rx_group(mcip, fgrp);
6785 ASSERT(MAC_GROUP_NO_CLIENT(fgrp));
6786 mac_set_group_state(fgrp, MAC_GROUP_STATE_REGISTERED);
6787 } else {
6788 mac_group_remove_client(fgrp, mcip);
6789 mac_group_add_client(tgrp, mcip);
6790 mcip->mci_flent->fe_rx_ring_group = tgrp;
6791 /*
6792 * If there are other clients (VLANs) sharing this address
6793 * we should be here only for the primary.
6794 */
6795 if (mcip->mci_unicast->ma_nusers > 1) {
6796 /*
6797 * We need to move all the clients that are using
6798 * this h/w address.
6799 */
6800 mgcp = fgrp->mrg_clients;
6801 while (mgcp != NULL) {
6802 gmcip = mgcp->mgc_client;
6803 mgcp = mgcp->mgc_next;
6804 if (mcip->mci_unicast == gmcip->mci_unicast) {
6805 mac_group_remove_client(fgrp, gmcip);
6806 mac_group_add_client(tgrp, gmcip);
6807 gmcip->mci_flent->fe_rx_ring_group =
6808 tgrp;
6809 }
6810 }
6811 }
6812 /*
6813 * The default group will still take the multicast,
6814 * broadcast traffic etc., so it won't go to
6815 * MAC_GROUP_STATE_REGISTERED.
6816 */
6817 if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED)
6818 mac_rx_group_unmark(fgrp, MR_CONDEMNED);
6819 mac_set_group_state(fgrp, MAC_GROUP_STATE_SHARED);
6820 }
6821 next_state = mac_group_next_state(tgrp, &group_only_mcip,
6822 MAC_DEFAULT_RX_GROUP(mip), B_TRUE);
6823 mac_set_group_state(tgrp, next_state);
6824 /*
6825 * If the destination group is reserved, setup the SRSs etc.
6826 */
6827 if (tgrp->mrg_state == MAC_GROUP_STATE_RESERVED) {
6828 mac_rx_srs_group_setup(mcip, mcip->mci_flent, SRST_LINK);
6829 mac_fanout_setup(mcip, mcip->mci_flent,
6830 MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver, mcip, NULL,
6831 NULL);
6832 mac_rx_group_unmark(tgrp, MR_INCIPIENT);
6833 } else {
6834 mac_rx_switch_grp_to_sw(tgrp);
6835 }
6836 return (0);
6837 }
6838
6839 /*
6840 * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
6841 * when a share was allocated to the client.
6842 */
6843 mac_group_t *
6844 mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move)
6845 {
6846 mac_impl_t *mip = mcip->mci_mip;
6847 mac_group_t *grp = NULL;
6848 int rv;
6849 int i;
6850 int err;
6851 mac_group_t *defgrp;
6852 mac_share_handle_t share = mcip->mci_share;
6853 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
6854 int nrings;
6855 int defnrings;
6856 boolean_t need_exclgrp = B_FALSE;
6857 int need_rings = 0;
6858 mac_group_t *candidate_grp = NULL;
6859 mac_client_impl_t *gclient;
6860 mac_resource_props_t *gmrp;
6861 boolean_t txhw = mrp->mrp_mask & MRP_TX_RINGS;
6862 boolean_t unspec = mrp->mrp_mask & MRP_TXRINGS_UNSPEC;
6863 boolean_t isprimary;
6864
6865 isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
6866 /*
6867 * When we come here for a VLAN on the primary (dladm create-vlan),
6868 * we need to pair it along with the primary (to keep it consistent
6869 * with the RX side). So, we check if the primary is already assigned
6870 * to a group and return the group if so. The other way is also
6871 * true, i.e. the VLAN is already created and now we are plumbing
6872 * the primary.
6873 */
6874 if (!move && isprimary) {
6875 for (gclient = mip->mi_clients_list; gclient != NULL;
6876 gclient = gclient->mci_client_next) {
6877 if (gclient->mci_flent->fe_type & FLOW_PRIMARY_MAC &&
6878 gclient->mci_flent->fe_tx_ring_group != NULL) {
6879 return (gclient->mci_flent->fe_tx_ring_group);
6880 }
6881 }
6882 }
6883
6884 if (mip->mi_tx_groups == NULL || mip->mi_tx_group_count == 0)
6885 return (NULL);
6886
6887 /* For dynamic groups, default unspec to 1 */
6888 if (txhw && unspec &&
6889 mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6890 mrp->mrp_ntxrings = 1;
6891 }
6892 /*
6893 * For static grouping we allow only specifying rings=0 and
6894 * unspecified
6895 */
6896 if (txhw && mrp->mrp_ntxrings > 0 &&
6897 mip->mi_tx_group_type == MAC_GROUP_TYPE_STATIC) {
6898 return (NULL);
6899 }
6900
6901 if (txhw) {
6902 /*
6903 * We have explicitly asked for a group (with ntxrings,
6904 * if unspec).
6905 */
6906 if (unspec || mrp->mrp_ntxrings > 0) {
6907 need_exclgrp = B_TRUE;
6908 need_rings = mrp->mrp_ntxrings;
6909 } else if (mrp->mrp_ntxrings == 0) {
6910 /*
6911 * We have asked for a software group.
6912 */
6913 return (NULL);
6914 }
6915 }
6916 defgrp = MAC_DEFAULT_TX_GROUP(mip);
6917 /*
6918 * The number of rings that the default group can donate.
6919 * We need to leave at least one ring - the default ring - in
6920 * this group.
6921 */
6922 defnrings = defgrp->mrg_cur_count - 1;
6923
6924 /*
6925 * Primary gets default group unless explicitly told not
6926 * to (i.e. rings > 0).
6927 */
6928 if (isprimary && !need_exclgrp)
6929 return (NULL);
6930
6931 nrings = (mrp->mrp_mask & MRP_TX_RINGS) != 0 ? mrp->mrp_ntxrings : 1;
6932 for (i = 0; i < mip->mi_tx_group_count; i++) {
6933 grp = &mip->mi_tx_groups[i];
6934 if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
6935 (grp->mrg_state == MAC_GROUP_STATE_UNINIT)) {
6936 /*
6937 * Select a candidate for replacement if we don't
6938 * get an exclusive group. A candidate group is one
6939 * that didn't ask for an exclusive group, but got
6940 * one and it has enough rings (combined with what
6941 * the default group can donate) for the new MAC
6942 * client.
6943 */
6944 if (grp->mrg_state == MAC_GROUP_STATE_RESERVED &&
6945 candidate_grp == NULL) {
6946 gclient = MAC_GROUP_ONLY_CLIENT(grp);
6947 if (gclient == NULL)
6948 gclient = mac_get_grp_primary(grp);
6949 gmrp = MCIP_RESOURCE_PROPS(gclient);
6950 if (gclient->mci_share == NULL &&
6951 (gmrp->mrp_mask & MRP_TX_RINGS) == 0 &&
6952 (unspec ||
6953 (grp->mrg_cur_count + defnrings) >=
6954 need_rings)) {
6955 candidate_grp = grp;
6956 }
6957 }
6958 continue;
6959 }
6960 /*
6961 * If the default can't donate let's just walk and
6962 * see if someone can vacate a group, so that we have
6963 * enough rings for this.
6964 */
6965 if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC ||
6966 nrings <= defnrings) {
6967 if (grp->mrg_state == MAC_GROUP_STATE_REGISTERED) {
6968 rv = mac_start_group(grp);
6969 ASSERT(rv == 0);
6970 }
6971 break;
6972 }
6973 }
6974
6975 /* The default group */
6976 if (i >= mip->mi_tx_group_count) {
6977 /*
6978 * If we need an exclusive group and have identified a
6979 * candidate group we switch the MAC client from the
6980 * candidate group to the default group and give the
6981 * candidate group to this client.
6982 */
6983 if (need_exclgrp && candidate_grp != NULL) {
6984 /*
6985 * Switch the MAC client from the candidate group
6986 * to the default group.
6987 */
6988 grp = candidate_grp;
6989 gclient = MAC_GROUP_ONLY_CLIENT(grp);
6990 if (gclient == NULL)
6991 gclient = mac_get_grp_primary(grp);
6992 mac_tx_client_quiesce((mac_client_handle_t)gclient);
6993 mac_tx_switch_group(gclient, grp, defgrp);
6994 mac_tx_client_restart((mac_client_handle_t)gclient);
6995
6996 /*
6997 * Give the candidate group with the specified number
6998 * of rings to this MAC client.
6999 */
7000 ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
7001 rv = mac_start_group(grp);
7002 ASSERT(rv == 0);
7003
7004 if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC)
7005 return (grp);
7006
7007 ASSERT(grp->mrg_cur_count == 0);
7008 ASSERT(defgrp->mrg_cur_count > need_rings);
7009
7010 err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX,
7011 defgrp, grp, share, need_rings);
7012 if (err == 0) {
7013 /*
7014 * For a share i_mac_group_allocate_rings gets
7015 * the rings from the driver, let's populate
7016 * the property for the client now.
7017 */
7018 if (share != NULL) {
7019 mac_client_set_rings(
7020 (mac_client_handle_t)mcip, -1,
7021 grp->mrg_cur_count);
7022 }
7023 mip->mi_tx_group_free--;
7024 return (grp);
7025 }
7026 DTRACE_PROBE3(tx__group__reserve__alloc__rings, char *,
7027 mip->mi_name, int, grp->mrg_index, int, err);
7028 mac_stop_group(grp);
7029 }
7030 return (NULL);
7031 }
7032 /*
7033 * We got an exclusive group, but it is not dynamic.
7034 */
7035 if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
7036 mip->mi_tx_group_free--;
7037 return (grp);
7038 }
7039
7040 rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, defgrp, grp,
7041 share, nrings);
7042 if (rv != 0) {
7043 DTRACE_PROBE3(tx__group__reserve__alloc__rings,
7044 char *, mip->mi_name, int, grp->mrg_index, int, rv);
7045 mac_stop_group(grp);
7046 return (NULL);
7047 }
7048 /*
7049 * For a share i_mac_group_allocate_rings gets the rings from the
7050 * driver, let's populate the property for the client now.
7051 */
7052 if (share != NULL) {
7053 mac_client_set_rings((mac_client_handle_t)mcip, -1,
7054 grp->mrg_cur_count);
7055 }
7056 mip->mi_tx_group_free--;
7057 return (grp);
7058 }
7059
7060 void
7061 mac_release_tx_group(mac_client_impl_t *mcip, mac_group_t *grp)
7062 {
7063 mac_impl_t *mip = mcip->mci_mip;
7064 mac_share_handle_t share = mcip->mci_share;
7065 mac_ring_t *ring;
7066 mac_soft_ring_set_t *srs = MCIP_TX_SRS(mcip);
7067 mac_group_t *defgrp;
7068
7069 defgrp = MAC_DEFAULT_TX_GROUP(mip);
7070 if (srs != NULL) {
7071 if (srs->srs_soft_ring_count > 0) {
7072 for (ring = grp->mrg_rings; ring != NULL;
7073 ring = ring->mr_next) {
7074 ASSERT(mac_tx_srs_ring_present(srs, ring));
7075 mac_tx_invoke_callbacks(mcip,
7076 (mac_tx_cookie_t)
7077 mac_tx_srs_get_soft_ring(srs, ring));
7078 mac_tx_srs_del_ring(srs, ring);
7079 }
7080 } else {
7081 ASSERT(srs->srs_tx.st_arg2 != NULL);
7082 srs->srs_tx.st_arg2 = NULL;
7083 mac_srs_stat_delete(srs);
7084 }
7085 }
7086 if (share != NULL)
7087 mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
7088
7089 /* move the ring back to the pool */
7090 if (mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
7091 while ((ring = grp->mrg_rings) != NULL)
7092 (void) mac_group_mov_ring(mip, defgrp, ring);
7093 }
7094 mac_stop_group(grp);
7095 mip->mi_tx_group_free++;
7096 }
7097
7098 /*
7099 * Disassociate a MAC client from a group, i.e go through the rings in the
7100 * group and delete all the soft rings tied to them.
7101 */
7102 static void
7103 mac_tx_dismantle_soft_rings(mac_group_t *fgrp, flow_entry_t *flent)
7104 {
7105 mac_client_impl_t *mcip = flent->fe_mcip;
7106 mac_soft_ring_set_t *tx_srs;
7107 mac_srs_tx_t *tx;
7108 mac_ring_t *ring;
7109
7110 tx_srs = flent->fe_tx_srs;
7111 tx = &tx_srs->srs_tx;
7112
7113 /* Single ring case we haven't created any soft rings */
7114 if (tx->st_mode == SRS_TX_BW || tx->st_mode == SRS_TX_SERIALIZE ||
7115 tx->st_mode == SRS_TX_DEFAULT) {
7116 tx->st_arg2 = NULL;
7117 mac_srs_stat_delete(tx_srs);
7118 /* Fanout case, where we have to dismantle the soft rings */
7119 } else {
7120 for (ring = fgrp->mrg_rings; ring != NULL;
7121 ring = ring->mr_next) {
7122 ASSERT(mac_tx_srs_ring_present(tx_srs, ring));
7123 mac_tx_invoke_callbacks(mcip,
7124 (mac_tx_cookie_t)mac_tx_srs_get_soft_ring(tx_srs,
7125 ring));
7126 mac_tx_srs_del_ring(tx_srs, ring);
7127 }
7128 ASSERT(tx->st_arg2 == NULL);
7129 }
7130 }
7131
7132 /*
7133 * Switch the MAC client from one group to another. This means we need
7134 * to remove the MAC client, teardown the SRSs and revert the group state.
7135 * Then, we add the client to the destination roup, set the SRSs etc.
7136 */
7137 void
7138 mac_tx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
7139 mac_group_t *tgrp)
7140 {
7141 mac_client_impl_t *group_only_mcip;
7142 mac_impl_t *mip = mcip->mci_mip;
7143 flow_entry_t *flent = mcip->mci_flent;
7144 mac_group_t *defgrp;
7145 mac_grp_client_t *mgcp;
7146 mac_client_impl_t *gmcip;
7147 flow_entry_t *gflent;
7148
7149 defgrp = MAC_DEFAULT_TX_GROUP(mip);
7150 ASSERT(fgrp == flent->fe_tx_ring_group);
7151
7152 if (fgrp == defgrp) {
7153 /*
7154 * If this is the primary we need to find any VLANs on
7155 * the primary and move them too.
7156 */
7157 mac_group_remove_client(fgrp, mcip);
7158 mac_tx_dismantle_soft_rings(fgrp, flent);
7159 if (mcip->mci_unicast->ma_nusers > 1) {
7160 mgcp = fgrp->mrg_clients;
7161 while (mgcp != NULL) {
7162 gmcip = mgcp->mgc_client;
7163 mgcp = mgcp->mgc_next;
7164 if (mcip->mci_unicast != gmcip->mci_unicast)
7165 continue;
7166 mac_tx_client_quiesce(
7167 (mac_client_handle_t)gmcip);
7168
7169 gflent = gmcip->mci_flent;
7170 mac_group_remove_client(fgrp, gmcip);
7171 mac_tx_dismantle_soft_rings(fgrp, gflent);
7172
7173 mac_group_add_client(tgrp, gmcip);
7174 gflent->fe_tx_ring_group = tgrp;
7175 /* We could directly set this to SHARED */
7176 tgrp->mrg_state = mac_group_next_state(tgrp,
7177 &group_only_mcip, defgrp, B_FALSE);
7178
7179 mac_tx_srs_group_setup(gmcip, gflent,
7180 SRST_LINK);
7181 mac_fanout_setup(gmcip, gflent,
7182 MCIP_RESOURCE_PROPS(gmcip), mac_rx_deliver,
7183 gmcip, NULL, NULL);
7184
7185 mac_tx_client_restart(
7186 (mac_client_handle_t)gmcip);
7187 }
7188 }
7189 if (MAC_GROUP_NO_CLIENT(fgrp)) {
7190 mac_ring_t *ring;
7191 int cnt;
7192 int ringcnt;
7193
7194 fgrp->mrg_state = MAC_GROUP_STATE_REGISTERED;
7195 /*
7196 * Additionally, we also need to stop all
7197 * the rings in the default group, except
7198 * the default ring. The reason being
7199 * this group won't be released since it is
7200 * the default group, so the rings won't
7201 * be stopped otherwise.
7202 */
7203 ringcnt = fgrp->mrg_cur_count;
7204 ring = fgrp->mrg_rings;
7205 for (cnt = 0; cnt < ringcnt; cnt++) {
7206 if (ring->mr_state == MR_INUSE &&
7207 ring !=
7208 (mac_ring_t *)mip->mi_default_tx_ring) {
7209 mac_stop_ring(ring);
7210 ring->mr_flag = 0;
7211 }
7212 ring = ring->mr_next;
7213 }
7214 } else if (MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) {
7215 fgrp->mrg_state = MAC_GROUP_STATE_RESERVED;
7216 } else {
7217 ASSERT(fgrp->mrg_state == MAC_GROUP_STATE_SHARED);
7218 }
7219 } else {
7220 /*
7221 * We could have VLANs sharing the non-default group with
7222 * the primary.
7223 */
7224 mgcp = fgrp->mrg_clients;
7225 while (mgcp != NULL) {
7226 gmcip = mgcp->mgc_client;
7227 mgcp = mgcp->mgc_next;
7228 if (gmcip == mcip)
7229 continue;
7230 mac_tx_client_quiesce((mac_client_handle_t)gmcip);
7231 gflent = gmcip->mci_flent;
7232
7233 mac_group_remove_client(fgrp, gmcip);
7234 mac_tx_dismantle_soft_rings(fgrp, gflent);
7235
7236 mac_group_add_client(tgrp, gmcip);
7237 gflent->fe_tx_ring_group = tgrp;
7238 /* We could directly set this to SHARED */
7239 tgrp->mrg_state = mac_group_next_state(tgrp,
7240 &group_only_mcip, defgrp, B_FALSE);
7241 mac_tx_srs_group_setup(gmcip, gflent, SRST_LINK);
7242 mac_fanout_setup(gmcip, gflent,
7243 MCIP_RESOURCE_PROPS(gmcip), mac_rx_deliver,
7244 gmcip, NULL, NULL);
7245
7246 mac_tx_client_restart((mac_client_handle_t)gmcip);
7247 }
7248 mac_group_remove_client(fgrp, mcip);
7249 mac_release_tx_group(mcip, fgrp);
7250 fgrp->mrg_state = MAC_GROUP_STATE_REGISTERED;
7251 }
7252
7253 /* Add it to the tgroup */
7254 mac_group_add_client(tgrp, mcip);
7255 flent->fe_tx_ring_group = tgrp;
7256 tgrp->mrg_state = mac_group_next_state(tgrp, &group_only_mcip,
7257 defgrp, B_FALSE);
7258
7259 mac_tx_srs_group_setup(mcip, flent, SRST_LINK);
7260 mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
7261 mac_rx_deliver, mcip, NULL, NULL);
7262 }
7263
7264 /*
7265 * This is a 1-time control path activity initiated by the client (IP).
7266 * The mac perimeter protects against other simultaneous control activities,
7267 * for example an ioctl that attempts to change the degree of fanout and
7268 * increase or decrease the number of softrings associated with this Tx SRS.
7269 */
7270 static mac_tx_notify_cb_t *
7271 mac_client_tx_notify_add(mac_client_impl_t *mcip,
7272 mac_tx_notify_t notify, void *arg)
7273 {
7274 mac_cb_info_t *mcbi;
7275 mac_tx_notify_cb_t *mtnfp;
7276
7277 ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
7278
7279 mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
7280 mtnfp->mtnf_fn = notify;
7281 mtnfp->mtnf_arg = arg;
7282 mtnfp->mtnf_link.mcb_objp = mtnfp;
7283 mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
7284 mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
7285
7286 mcbi = &mcip->mci_tx_notify_cb_info;
7287 mutex_enter(mcbi->mcbi_lockp);
7288 mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
7289 mutex_exit(mcbi->mcbi_lockp);
7290 return (mtnfp);
7291 }
7292
7293 static void
7294 mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
7295 {
7296 mac_cb_info_t *mcbi;
7297 mac_cb_t **cblist;
7298
7299 ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
7300
7301 if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
7302 &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
7303 cmn_err(CE_WARN,
7304 "mac_client_tx_notify_remove: callback not "
7305 "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
7306 return;
7307 }
7308
7309 mcbi = &mcip->mci_tx_notify_cb_info;
7310 cblist = &mcip->mci_tx_notify_cb_list;
7311 mutex_enter(mcbi->mcbi_lockp);
7312 if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
7313 kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
7314 else
7315 mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
7316 mutex_exit(mcbi->mcbi_lockp);
7317 }
7318
7319 /*
7320 * mac_client_tx_notify():
7321 * call to add and remove flow control callback routine.
7322 */
7323 mac_tx_notify_handle_t
7324 mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
7325 void *ptr)
7326 {
7327 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
7328 mac_tx_notify_cb_t *mtnfp = NULL;
7329
7330 i_mac_perim_enter(mcip->mci_mip);
7331
7332 if (callb_func != NULL) {
7333 /* Add a notify callback */
7334 mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
7335 } else {
7336 mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
7337 }
7338 i_mac_perim_exit(mcip->mci_mip);
7339
7340 return ((mac_tx_notify_handle_t)mtnfp);
7341 }
7342
7343 void
7344 mac_bridge_vectors(mac_bridge_tx_t txf, mac_bridge_rx_t rxf,
7345 mac_bridge_ref_t reff, mac_bridge_ls_t lsf)
7346 {
7347 mac_bridge_tx_cb = txf;
7348 mac_bridge_rx_cb = rxf;
7349 mac_bridge_ref_cb = reff;
7350 mac_bridge_ls_cb = lsf;
7351 }
7352
7353 int
7354 mac_bridge_set(mac_handle_t mh, mac_handle_t link)
7355 {
7356 mac_impl_t *mip = (mac_impl_t *)mh;
7357 int retv;
7358
7359 mutex_enter(&mip->mi_bridge_lock);
7360 if (mip->mi_bridge_link == NULL) {
7361 mip->mi_bridge_link = link;
7362 retv = 0;
7363 } else {
7364 retv = EBUSY;
7365 }
7366 mutex_exit(&mip->mi_bridge_lock);
7367 if (retv == 0) {
7368 mac_poll_state_change(mh, B_FALSE);
7369 mac_capab_update(mh);
7370 }
7371 return (retv);
7372 }
7373
7374 /*
7375 * Disable bridging on the indicated link.
7376 */
7377 void
7378 mac_bridge_clear(mac_handle_t mh, mac_handle_t link)
7379 {
7380 mac_impl_t *mip = (mac_impl_t *)mh;
7381
7382 mutex_enter(&mip->mi_bridge_lock);
7383 ASSERT(mip->mi_bridge_link == link);
7384 mip->mi_bridge_link = NULL;
7385 mutex_exit(&mip->mi_bridge_lock);
7386 mac_poll_state_change(mh, B_TRUE);
7387 mac_capab_update(mh);
7388 }
7389
7390 void
7391 mac_no_active(mac_handle_t mh)
7392 {
7393 mac_impl_t *mip = (mac_impl_t *)mh;
7394
7395 i_mac_perim_enter(mip);
7396 mip->mi_state_flags |= MIS_NO_ACTIVE;
7397 i_mac_perim_exit(mip);
7398 }
7399
7400 /*
7401 * Walk the primary VLAN clients whenever the primary's rings property
7402 * changes and update the mac_resource_props_t for the VLAN's client.
7403 * We need to do this since we don't support setting these properties
7404 * on the primary's VLAN clients, but the VLAN clients have to
 * follow the primary w.r.t. the rings property.
7406 */
7407 void
7408 mac_set_prim_vlan_rings(mac_impl_t *mip, mac_resource_props_t *mrp)
7409 {
7410 mac_client_impl_t *vmcip;
7411 mac_resource_props_t *vmrp;
7412
7413 for (vmcip = mip->mi_clients_list; vmcip != NULL;
7414 vmcip = vmcip->mci_client_next) {
7415 if (!(vmcip->mci_flent->fe_type & FLOW_PRIMARY_MAC) ||
7416 mac_client_vid((mac_client_handle_t)vmcip) ==
7417 VLAN_ID_NONE) {
7418 continue;
7419 }
7420 vmrp = MCIP_RESOURCE_PROPS(vmcip);
7421
7422 vmrp->mrp_nrxrings = mrp->mrp_nrxrings;
7423 if (mrp->mrp_mask & MRP_RX_RINGS)
7424 vmrp->mrp_mask |= MRP_RX_RINGS;
7425 else if (vmrp->mrp_mask & MRP_RX_RINGS)
7426 vmrp->mrp_mask &= ~MRP_RX_RINGS;
7427
7428 vmrp->mrp_ntxrings = mrp->mrp_ntxrings;
7429 if (mrp->mrp_mask & MRP_TX_RINGS)
7430 vmrp->mrp_mask |= MRP_TX_RINGS;
7431 else if (vmrp->mrp_mask & MRP_TX_RINGS)
7432 vmrp->mrp_mask &= ~MRP_TX_RINGS;
7433
7434 if (mrp->mrp_mask & MRP_RXRINGS_UNSPEC)
7435 vmrp->mrp_mask |= MRP_RXRINGS_UNSPEC;
7436 else
7437 vmrp->mrp_mask &= ~MRP_RXRINGS_UNSPEC;
7438
7439 if (mrp->mrp_mask & MRP_TXRINGS_UNSPEC)
7440 vmrp->mrp_mask |= MRP_TXRINGS_UNSPEC;
7441 else
7442 vmrp->mrp_mask &= ~MRP_TXRINGS_UNSPEC;
7443 }
7444 }
7445
7446 /*
7447 * We are adding or removing ring(s) from a group. The source for taking
7448 * rings is the default group. The destination for giving rings back is
7449 * the default group.
7450 */
int
mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group,
    mac_group_t *defgrp)
{
	mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
	uint_t modify;
	int count;
	mac_ring_t *ring;
	mac_ring_t *next;
	mac_impl_t *mip = mcip->mci_mip;
	mac_ring_t **rings;
	uint_t ringcnt;
	int i = 0;
	boolean_t rx_group = group->mrg_type == MAC_RING_TYPE_RX;
	int start;
	int end;
	mac_group_t *tgrp;
	int j;
	int rv = 0;

	/*
	 * If we are asked for just a group, we give 1 ring, else
	 * the specified number of rings.
	 */
	if (rx_group) {
		ringcnt = (mrp->mrp_mask & MRP_RXRINGS_UNSPEC) ? 1:
		    mrp->mrp_nrxrings;
	} else {
		ringcnt = (mrp->mrp_mask & MRP_TXRINGS_UNSPEC) ? 1:
		    mrp->mrp_ntxrings;
	}

	/* don't allow modifying rings for a share for now. */
	ASSERT(mcip->mci_share == NULL);

	/* Already at the requested count: nothing to do. */
	if (ringcnt == group->mrg_cur_count)
		return (0);

	if (group->mrg_cur_count > ringcnt) {
		/*
		 * Shrinking: move the excess rings back to the donor
		 * (default) group.
		 */
		modify = group->mrg_cur_count - ringcnt;
		if (rx_group) {
			if (mip->mi_rx_donor_grp == group) {
				/*
				 * This group was the donor; the default group
				 * takes over that role.
				 */
				ASSERT(mac_is_primary_client(mcip));
				mip->mi_rx_donor_grp = defgrp;
			} else {
				/* Rings always go back to the donor group. */
				defgrp = mip->mi_rx_donor_grp;
			}
		}
		ring = group->mrg_rings;
		/*
		 * Remember each ring we move so we can roll back if a later
		 * move fails.
		 */
		rings = kmem_alloc(modify * sizeof (mac_ring_handle_t),
		    KM_SLEEP);
		j = 0;
		for (count = 0; count < modify; count++) {
			next = ring->mr_next;
			rv = mac_group_mov_ring(mip, defgrp, ring);
			if (rv != 0) {
				/* cleanup on failure */
				for (j = 0; j < count; j++) {
					(void) mac_group_mov_ring(mip, group,
					    rings[j]);
				}
				break;
			}
			rings[j++] = ring;
			ring = next;
		}
		kmem_free(rings, modify * sizeof (mac_ring_handle_t));
		return (rv);
	}
	/* Growing beyond the per-group maximum is not allowed. */
	if (ringcnt >= MAX_RINGS_PER_GROUP)
		return (EINVAL);

	/* Growing: we need this many additional rings. */
	modify = ringcnt - group->mrg_cur_count;

	if (rx_group) {
		if (group != mip->mi_rx_donor_grp)
			defgrp = mip->mi_rx_donor_grp;
		else
			/*
			 * This is the donor group with all the remaining
			 * rings. Default group now gets to be the donor
			 */
			mip->mi_rx_donor_grp = defgrp;
		/* Skip the default RX group (index 0) in the scan below. */
		start = 1;
		end = mip->mi_rx_group_count;
	} else {
		/* For TX the default group is the last one. */
		start = 0;
		end = mip->mi_tx_group_count - 1;
	}
	/*
	 * If the default doesn't have any rings, lets see if we can
	 * take rings given to an h/w client that doesn't need it.
	 * For now, we just see if there is any one client that can donate
	 * all the required rings.
	 */
	if (defgrp->mrg_cur_count < (modify + 1)) {
		for (i = start; i < end; i++) {
			if (rx_group) {
				tgrp = &mip->mi_rx_groups[i];
				if (tgrp == group || tgrp->mrg_state <
				    MAC_GROUP_STATE_RESERVED) {
					continue;
				}
				mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
				if (mcip == NULL)
					mcip = mac_get_grp_primary(tgrp);
				ASSERT(mcip != NULL);
				mrp = MCIP_RESOURCE_PROPS(mcip);
				/*
				 * Don't rob a client that explicitly asked
				 * for its rings.
				 */
				if ((mrp->mrp_mask & MRP_RX_RINGS) != 0)
					continue;
				/*
				 * Only worth switching if this client's rings
				 * plus the default's cover the shortfall.
				 */
				if ((tgrp->mrg_cur_count +
				    defgrp->mrg_cur_count) < (modify + 1)) {
					continue;
				}
				if (mac_rx_switch_group(mcip, tgrp,
				    defgrp) != 0) {
					return (ENOSPC);
				}
			} else {
				tgrp = &mip->mi_tx_groups[i];
				if (tgrp == group || tgrp->mrg_state <
				    MAC_GROUP_STATE_RESERVED) {
					continue;
				}
				mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
				if (mcip == NULL)
					mcip = mac_get_grp_primary(tgrp);
				/*
				 * NOTE(review): unlike the RX branch above,
				 * mcip is not ASSERTed non-NULL before the
				 * dereference below -- confirm that
				 * mac_get_grp_primary() cannot return NULL
				 * for a reserved TX group.
				 */
				mrp = MCIP_RESOURCE_PROPS(mcip);
				if ((mrp->mrp_mask & MRP_TX_RINGS) != 0)
					continue;
				if ((tgrp->mrg_cur_count +
				    defgrp->mrg_cur_count) < (modify + 1)) {
					continue;
				}
				/* OK, we can switch this to s/w */
				mac_tx_client_quiesce(
				    (mac_client_handle_t)mcip);
				mac_tx_switch_group(mcip, tgrp, defgrp);
				mac_tx_client_restart(
				    (mac_client_handle_t)mcip);
			}
		}
		/* Still not enough rings available anywhere. */
		if (defgrp->mrg_cur_count < (modify + 1))
			return (ENOSPC);
	}
	/* Finally pull the additional rings out of the default group. */
	if ((rv = i_mac_group_allocate_rings(mip, group->mrg_type, defgrp,
	    group, mcip->mci_share, modify)) != 0) {
		return (rv);
	}
	return (0);
}
7602
7603 /*
7604 * Given the poolname in mac_resource_props, find the cpupart
7605 * that is associated with this pool. The cpupart will be used
7606 * later for finding the cpus to be bound to the networking threads.
7607 *
7608 * use_default is set B_TRUE if pools are enabled and pool_default
7609 * is returned. This avoids a 2nd lookup to set the poolname
7610 * for pool-effective.
7611 *
7612 * returns:
7613 *
7614 * NULL - pools are disabled or if the 'cpus' property is set.
7615 * cpupart of pool_default - pools are enabled and the pool
7616 * is not available or poolname is blank
7617 * cpupart of named pool - pools are enabled and the pool
7618 * is available.
7619 */
7620 cpupart_t *
7621 mac_pset_find(mac_resource_props_t *mrp, boolean_t *use_default)
7622 {
7623 pool_t *pool;
7624 cpupart_t *cpupart;
7625
7626 *use_default = B_FALSE;
7627
7628 /* CPUs property is set */
7629 if (mrp->mrp_mask & MRP_CPUS)
7630 return (NULL);
7631
7632 ASSERT(pool_lock_held());
7633
7634 /* Pools are disabled, no pset */
7635 if (pool_state == POOL_DISABLED)
7636 return (NULL);
7637
7638 /* Pools property is set */
7639 if (mrp->mrp_mask & MRP_POOL) {
7640 if ((pool = pool_lookup_pool_by_name(mrp->mrp_pool)) == NULL) {
7641 /* Pool not found */
7642 DTRACE_PROBE1(mac_pset_find_no_pool, char *,
7643 mrp->mrp_pool);
7644 *use_default = B_TRUE;
7645 pool = pool_default;
7646 }
7647 /* Pools property is not set */
7648 } else {
7649 *use_default = B_TRUE;
7650 pool = pool_default;
7651 }
7652
7653 /* Find the CPU pset that corresponds to the pool */
7654 mutex_enter(&cpu_lock);
7655 if ((cpupart = cpupart_find(pool->pool_pset->pset_id)) == NULL) {
7656 DTRACE_PROBE1(mac_find_pset_no_pset, psetid_t,
7657 pool->pool_pset->pset_id);
7658 }
7659 mutex_exit(&cpu_lock);
7660
7661 return (cpupart);
7662 }
7663
7664 void
7665 mac_set_pool_effective(boolean_t use_default, cpupart_t *cpupart,
7666 mac_resource_props_t *mrp, mac_resource_props_t *emrp)
7667 {
7668 ASSERT(pool_lock_held());
7669
7670 if (cpupart != NULL) {
7671 emrp->mrp_mask |= MRP_POOL;
7672 if (use_default) {
7673 (void) strcpy(emrp->mrp_pool,
7674 "pool_default");
7675 } else {
7676 ASSERT(strlen(mrp->mrp_pool) != 0);
7677 (void) strcpy(emrp->mrp_pool,
7678 mrp->mrp_pool);
7679 }
7680 } else {
7681 emrp->mrp_mask &= ~MRP_POOL;
7682 bzero(emrp->mrp_pool, MAXPATHLEN);
7683 }
7684 }
7685
/*
 * Argument handed from the pool event callback to the i_mac_impl_hash
 * walker: names the pool involved and the kind of pool event.
 */
struct mac_pool_arg {
	char		mpa_poolname[MAXPATHLEN];	/* affected pool name */
	pool_event_t	mpa_what;	/* POOL_E_ENABLE/DISABLE/CHANGE */
};
7690
/*
 * mod_hash walker invoked for every registered mac_impl_t when a pool
 * event occurs.  For each client whose CPU binding comes from a pool
 * (i.e. no explicit 'cpus' property), re-derive the pset, re-do fanout
 * setup, and refresh the effective pool properties.
 */
/*ARGSUSED*/
static uint_t
mac_pool_link_update(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
{
	struct mac_pool_arg *mpa = arg;
	mac_impl_t *mip = (mac_impl_t *)val;
	mac_client_impl_t *mcip;
	mac_resource_props_t *mrp, *emrp;
	boolean_t pool_update = B_FALSE;
	boolean_t pool_clear = B_FALSE;
	boolean_t use_default = B_FALSE;
	cpupart_t *cpupart = NULL;

	/* Scratch copy of each client's configured resource props. */
	mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
	i_mac_perim_enter(mip);
	for (mcip = mip->mi_clients_list; mcip != NULL;
	    mcip = mcip->mci_client_next) {
		/* Reset per-client decisions from the previous iteration. */
		pool_update = B_FALSE;
		pool_clear = B_FALSE;
		use_default = B_FALSE;
		mac_client_get_resources((mac_client_handle_t)mcip, mrp);
		emrp = MCIP_EFFECTIVE_PROPS(mcip);

		/*
		 * When pools are enabled
		 */
		if ((mpa->mpa_what == POOL_E_ENABLE) &&
		    ((mrp->mrp_mask & MRP_CPUS) == 0)) {
			mrp->mrp_mask |= MRP_POOL;
			pool_update = B_TRUE;
		}

		/*
		 * When pools are disabled
		 */
		if ((mpa->mpa_what == POOL_E_DISABLE) &&
		    ((mrp->mrp_mask & MRP_CPUS) == 0)) {
			mrp->mrp_mask |= MRP_POOL;
			pool_clear = B_TRUE;
		}

		/*
		 * Look for links with the pool property set and the poolname
		 * matching the one which is changing.
		 */
		if (strcmp(mrp->mrp_pool, mpa->mpa_poolname) == 0) {
			/*
			 * The pool associated with the link has changed.
			 */
			if (mpa->mpa_what == POOL_E_CHANGE) {
				mrp->mrp_mask |= MRP_POOL;
				pool_update = B_TRUE;
			}
		}

		/*
		 * This link is associated with pool_default and
		 * pool_default has changed.
		 */
		if ((mpa->mpa_what == POOL_E_CHANGE) &&
		    (strcmp(emrp->mrp_pool, "pool_default") == 0) &&
		    (strcmp(mpa->mpa_poolname, "pool_default") == 0)) {
			mrp->mrp_mask |= MRP_POOL;
			pool_update = B_TRUE;
		}

		/*
		 * Get new list of cpus for the pool, bind network
		 * threads to new list of cpus and update resources.
		 */
		if (pool_update) {
			if (MCIP_DATAPATH_SETUP(mcip)) {
				/* mac_pset_find() requires the pool lock. */
				pool_lock();
				cpupart = mac_pset_find(mrp, &use_default);
				mac_fanout_setup(mcip, mcip->mci_flent, mrp,
				    mac_rx_deliver, mcip, NULL, cpupart);
				mac_set_pool_effective(use_default, cpupart,
				    mrp, emrp);
				pool_unlock();
			}
			mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip),
			    B_FALSE);
		}

		/*
		 * Clear the effective pool and bind network threads
		 * to any available CPU.
		 */
		if (pool_clear) {
			if (MCIP_DATAPATH_SETUP(mcip)) {
				emrp->mrp_mask &= ~MRP_POOL;
				bzero(emrp->mrp_pool, MAXPATHLEN);
				/* NULL cpupart: fan out over any CPUs. */
				mac_fanout_setup(mcip, mcip->mci_flent, mrp,
				    mac_rx_deliver, mcip, NULL, NULL);
			}
			mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip),
			    B_FALSE);
		}
	}
	i_mac_perim_exit(mip);
	kmem_free(mrp, sizeof (*mrp));
	return (MH_WALK_CONTINUE);
}
7794
/*
 * Apply a pool event (arg is a struct mac_pool_arg) to every registered
 * MAC, then free the argument allocated by mac_pool_event_cb().
 */
static void
mac_pool_update(void *arg)
{
	mod_hash_walk(i_mac_impl_hash, mac_pool_link_update, arg);
	kmem_free(arg, sizeof (struct mac_pool_arg));
}
7801
7802 /*
7803 * Callback function to be executed when a noteworthy pool event
7804 * takes place.
7805 */
7806 /* ARGSUSED */
7807 static void
7808 mac_pool_event_cb(pool_event_t what, poolid_t id, void *arg)
7809 {
7810 pool_t *pool;
7811 char *poolname = NULL;
7812 struct mac_pool_arg *mpa;
7813
7814 pool_lock();
7815 mpa = kmem_zalloc(sizeof (struct mac_pool_arg), KM_SLEEP);
7816
7817 switch (what) {
7818 case POOL_E_ENABLE:
7819 case POOL_E_DISABLE:
7820 break;
7821
7822 case POOL_E_CHANGE:
7823 pool = pool_lookup_pool_by_id(id);
7824 if (pool == NULL) {
7825 kmem_free(mpa, sizeof (struct mac_pool_arg));
7826 pool_unlock();
7827 return;
7828 }
7829 pool_get_name(pool, &poolname);
7830 (void) strlcpy(mpa->mpa_poolname, poolname,
7831 sizeof (mpa->mpa_poolname));
7832 break;
7833
7834 default:
7835 kmem_free(mpa, sizeof (struct mac_pool_arg));
7836 pool_unlock();
7837 return;
7838 }
7839 pool_unlock();
7840
7841 mpa->mpa_what = what;
7842
7843 mac_pool_update(mpa);
7844 }
7845
7846 /*
7847 * Set effective rings property. This could be called from datapath_setup/
7848 * datapath_teardown or set-linkprop.
7849 * If the group is reserved we just go ahead and set the effective rings.
7850 * Additionally, for TX this could mean the default group has lost/gained
7851 * some rings, so if the default group is reserved, we need to adjust the
7852 * effective rings for the default group clients. For RX, if we are working
 * with the non-default group, we just need to reset the effective props
7854 * for the default group clients.
7855 */
7856 void
7857 mac_set_rings_effective(mac_client_impl_t *mcip)
7858 {
7859 mac_impl_t *mip = mcip->mci_mip;
7860 mac_group_t *grp;
7861 mac_group_t *defgrp;
7862 flow_entry_t *flent = mcip->mci_flent;
7863 mac_resource_props_t *emrp = MCIP_EFFECTIVE_PROPS(mcip);
7864 mac_grp_client_t *mgcp;
7865 mac_client_impl_t *gmcip;
7866
7867 grp = flent->fe_rx_ring_group;
7868 if (grp != NULL) {
7869 defgrp = MAC_DEFAULT_RX_GROUP(mip);
7870 /*
7871 * If we have reserved a group, set the effective rings
7872 * to the ring count in the group.
7873 */
7874 if (grp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7875 emrp->mrp_mask |= MRP_RX_RINGS;
7876 emrp->mrp_nrxrings = grp->mrg_cur_count;
7877 }
7878
7879 /*
7880 * We go through the clients in the shared group and
7881 * reset the effective properties. It is possible this
7882 * might have already been done for some client (i.e.
7883 * if some client is being moved to a group that is
7884 * already shared). The case where the default group is
7885 * RESERVED is taken care of above (note in the RX side if
7886 * there is a non-default group, the default group is always
7887 * SHARED).
7888 */
7889 if (grp != defgrp || grp->mrg_state == MAC_GROUP_STATE_SHARED) {
7890 if (grp->mrg_state == MAC_GROUP_STATE_SHARED)
7891 mgcp = grp->mrg_clients;
7892 else
7893 mgcp = defgrp->mrg_clients;
7894 while (mgcp != NULL) {
7895 gmcip = mgcp->mgc_client;
7896 emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7897 if (emrp->mrp_mask & MRP_RX_RINGS) {
7898 emrp->mrp_mask &= ~MRP_RX_RINGS;
7899 emrp->mrp_nrxrings = 0;
7900 }
7901 mgcp = mgcp->mgc_next;
7902 }
7903 }
7904 }
7905
7906 /* Now the TX side */
7907 grp = flent->fe_tx_ring_group;
7908 if (grp != NULL) {
7909 defgrp = MAC_DEFAULT_TX_GROUP(mip);
7910
7911 if (grp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7912 emrp->mrp_mask |= MRP_TX_RINGS;
7913 emrp->mrp_ntxrings = grp->mrg_cur_count;
7914 } else if (grp->mrg_state == MAC_GROUP_STATE_SHARED) {
7915 mgcp = grp->mrg_clients;
7916 while (mgcp != NULL) {
7917 gmcip = mgcp->mgc_client;
7918 emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7919 if (emrp->mrp_mask & MRP_TX_RINGS) {
7920 emrp->mrp_mask &= ~MRP_TX_RINGS;
7921 emrp->mrp_ntxrings = 0;
7922 }
7923 mgcp = mgcp->mgc_next;
7924 }
7925 }
7926
7927 /*
7928 * If the group is not the default group and the default
7929 * group is reserved, the ring count in the default group
7930 * might have changed, update it.
7931 */
7932 if (grp != defgrp &&
7933 defgrp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7934 gmcip = MAC_GROUP_ONLY_CLIENT(defgrp);
7935 emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7936 emrp->mrp_ntxrings = defgrp->mrg_cur_count;
7937 }
7938 }
7939 emrp = MCIP_EFFECTIVE_PROPS(mcip);
7940 }
7941
7942 /*
7943 * Check if the primary is in the default group. If so, see if we
 * can give it an exclusive group now that another client is
 * being configured. We take the primary out of the default group
 * because the multicast/broadcast packets for all the clients
 * will land in the default ring in the default group which means
 * any client in the default group, even if it is the only one in
7949 * the group, will lose exclusive access to the rings, hence
7950 * polling.
7951 */
7952 mac_client_impl_t *
7953 mac_check_primary_relocation(mac_client_impl_t *mcip, boolean_t rxhw)
7954 {
7955 mac_impl_t *mip = mcip->mci_mip;
7956 mac_group_t *defgrp = MAC_DEFAULT_RX_GROUP(mip);
7957 flow_entry_t *flent = mcip->mci_flent;
7958 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
7959 uint8_t *mac_addr;
7960 mac_group_t *ngrp;
7961
7962 /*
7963 * Check if the primary is in the default group, if not
7964 * or if it is explicitly configured to be in the default
7965 * group OR set the RX rings property, return.
7966 */
7967 if (flent->fe_rx_ring_group != defgrp || mrp->mrp_mask & MRP_RX_RINGS)
7968 return (NULL);
7969
7970 /*
7971 * If the new client needs an exclusive group and we
7972 * don't have another for the primary, return.
7973 */
7974 if (rxhw && mip->mi_rxhwclnt_avail < 2)
7975 return (NULL);
7976
7977 mac_addr = flent->fe_flow_desc.fd_dst_mac;
7978 /*
7979 * We call this when we are setting up the datapath for
7980 * the first non-primary.
7981 */
7982 ASSERT(mip->mi_nactiveclients == 2);
7983 /*
7984 * OK, now we have the primary that needs to be relocated.
7985 */
7986 ngrp = mac_reserve_rx_group(mcip, mac_addr, B_TRUE);
7987 if (ngrp == NULL)
7988 return (NULL);
7989 if (mac_rx_switch_group(mcip, defgrp, ngrp) != 0) {
7990 mac_stop_group(ngrp);
7991 return (NULL);
7992 }
7993 return (mcip);
7994 }