zfs-cherry New usr/src/uts/common/io/mac/mac.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015 Joyent, Inc.
  25  */
  26 
  27 /*
  28  * MAC Services Module
  29  *
  30  * The GLDv3 framework locking -  The MAC layer
  31  * --------------------------------------------
  32  *
  33  * The MAC layer is central to the GLD framework and can provide the locking
  34  * framework needed for itself and for the use of MAC clients. MAC end points
  35  * are fairly disjoint and don't share a lot of state. So a coarse grained
  36  * multi-threading scheme is to single thread all create/modify/delete or set
  37  * type of control operations on a per mac end point while allowing data threads
  38  * concurrently.
  39  *
  40  * Control operations (set) that modify a mac end point are always serialized on
  41  * a per mac end point basis. We have at most 1 such thread per mac end point
  42  * at a time.
  43  *
  44  * All other operations that are not serialized are essentially multi-threaded.
  45  * For example a control operation (get) like getting statistics which may not
  46  * care about reading values atomically or data threads sending or receiving
  47  * data. Mostly these type of operations don't modify the control state. Any
  48  * state these operations care about are protected using traditional locks.
  49  *
  50  * The perimeter only serializes serial operations. It does not imply there
  51  * aren't any other concurrent operations. However a serialized operation may
  52  * sometimes need to make sure it is the only thread. In this case it needs
  53  * to use reference counting mechanisms to cv_wait until any current data
  54  * threads are done.
  55  *
  56  * The mac layer itself does not hold any locks across a call to another layer.
  57  * The perimeter is however held across a down call to the driver to make the
  58  * whole control operation atomic with respect to other control operations.
  59  * Also the data path and get type control operations may proceed concurrently.
  60  * These operations synchronize with the single serial operation on a given mac
  61  * end point using regular locks. The perimeter ensures that conflicting
  62  * operations like say a mac_multicast_add and a mac_multicast_remove on the
  63  * same mac end point don't interfere with each other and also ensures that the
  64  * changes in the mac layer and the call to the underlying driver to say add a
  65  * multicast address are done atomically without interference from a thread
  66  * trying to delete the same address.
  67  *
  68  * For example, consider
  69  * mac_multicast_add()
  70  * {
  71  *      mac_perimeter_enter();  serialize all control operations
  72  *
  73  *      grab list lock          protect against access by data threads
  74  *      add to list
  75  *      drop list lock
  76  *
  77  *      call driver's mi_multicst
  78  *
  79  *      mac_perimeter_exit();
  80  * }
  81  *
  82  * To lessen the number of serialization locks and simplify the lock hierarchy,
  83  * we serialize all the control operations on a per mac end point by using a
  84  * single serialization lock called the perimeter. We allow recursive entry into
  85  * the perimeter to facilitate use of this mechanism by both the mac client and
  86  * the MAC layer itself.
  87  *
  88  * MAC client means an entity that does an operation on a mac handle
  89  * obtained from a mac_open/mac_client_open. Similarly MAC driver means
  90  * an entity that does an operation on a mac handle obtained from a
  91  * mac_register. An entity could be both client and driver but on different
  92  * handles eg. aggr. and should only make the corresponding mac interface calls
  93  * i.e. mac driver interface or mac client interface as appropriate for that
  94  * mac handle.
  95  *
  96  * General rules.
  97  * -------------
  98  *
  99  * R1. The lock order of upcall threads is naturally opposite to downcall
 100  * threads. Hence upcalls must not hold any locks across layers for fear of
 101  * recursive lock enter and lock order violation. This applies to all layers.
 102  *
 103  * R2. The perimeter is just another lock. Since it is held in the down
 104  * direction, acquiring the perimeter in an upcall is prohibited as it would
 105  * cause a deadlock. This applies to all layers.
 106  *
 107  * Note that upcalls that need to grab the mac perimeter (for example
 108  * mac_notify upcalls) can still achieve that by posting the request to a
 109  * thread, which can then grab all the required perimeters and locks in the
 110  * right global order. Note that in the above example the mac layer itself
 111  * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
 112  * to the client must do that. Please see the aggr code for an example.
 113  *
 114  * MAC client rules
 115  * ----------------
 116  *
 117  * R3. A MAC client may use the MAC provided perimeter facility to serialize
 118  * control operations on a per mac end point. It does this by acquiring
 119  * and holding the perimeter across a sequence of calls to the mac layer.
 120  * This ensures atomicity across the entire block of mac calls. In this
 121  * model the MAC client must not hold any client locks across the calls to
 122  * the mac layer. This model is the preferred solution.
 123  *
 124  * R4. However if a MAC client has a lot of global state across all mac end
 125  * points the per mac end point serialization may not be sufficient. In this
 126  * case the client may choose to use global locks or use its own serialization.
 127  * To avoid deadlocks, these client layer locks held across the mac calls
 128  * in the control path must never be acquired by the data path for the reason
 129  * mentioned below.
 130  *
 131  * (Assume that a control operation that holds a client lock blocks in the
 132  * mac layer waiting for upcall reference counts to drop to zero. If an upcall
 133  * data thread that holds this reference count, tries to acquire the same
 134  * client lock subsequently it will deadlock).
 135  *
 136  * A MAC client may follow either the R3 model or the R4 model, but can't
 137  * mix both. In the former, the hierarchy is Perim -> client locks, but in
 138  * the latter it is client locks -> Perim.
 139  *
 140  * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
 141  * context since they may block while trying to acquire the perimeter.
 142  * In addition some calls may block waiting for upcall refcnts to come down to
 143  * zero.
 144  *
 145  * R6. MAC clients must make sure that they are single threaded and all threads
 146  * from the top (in particular data threads) have finished before calling
 147  * mac_client_close. The MAC framework does not track the number of client
 148  * threads using the mac client handle. Also mac clients must make sure
 149  * they have undone all the control operations before calling mac_client_close.
 150  * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
 151  * mac_unicast_add/mac_multicast_add.
 152  *
 153  * MAC framework rules
 154  * -------------------
 155  *
 156  * R7. The mac layer itself must not hold any mac layer locks (except the mac
 157  * perimeter) across a call to any other layer from the mac layer. The call to
 158  * any other layer could be via mi_* entry points, classifier entry points into
 159  * the driver or via upcall pointers into layers above. The mac perimeter may
 160  * be acquired or held only in the down direction, for e.g. when calling into
 161  * a mi_* driver entry point to provide atomicity of the operation.
 162  *
 163  * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
 164  * mac driver interfaces, the MAC layer must provide a cut out for control
 165  * interfaces like upcall notifications and start them in a separate thread.
 166  *
 167  * R9. Note that locking order also implies a plumbing order. For example
 168  * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
 169  * to plumb in any other order must be failed at mac_open time, otherwise it
 170  * could lead to deadlocks due to inverse locking order.
 171  *
 172  * R10. MAC driver interfaces must not block since the driver could call them
 173  * in interrupt context.
 174  *
 175  * R11. Walkers must preferably not hold any locks while calling walker
 176  * callbacks. Instead these can operate on reference counts. In simple
 177  * callbacks it may be ok to hold a lock and call the callbacks, but this is
 178  * harder to maintain in the general case of arbitrary callbacks.
 179  *
 180  * R12. The MAC layer must protect upcall notification callbacks using reference
 181  * counts rather than holding locks across the callbacks.
 182  *
 183  * R13. Given the variety of drivers, it is preferable if the MAC layer can make
 184  * sure that any pointers (such as mac ring pointers) it passes to the driver
 185  * remain valid until mac unregister time. Currently the mac layer achieves
 186  * this by using generation numbers for rings and freeing the mac rings only
 187  * at unregister time.  The MAC layer must provide a layer of indirection and
 188  * must not expose underlying driver rings or driver data structures/pointers
 189  * directly to MAC clients.
 190  *
 191  * MAC driver rules
 192  * ----------------
 193  *
 194  * R14. It would be preferable if MAC drivers don't hold any locks across any
 195  * mac call. However at a minimum they must not hold any locks across data
 196  * upcalls. They must also make sure that all references to mac data structures
 197  * are cleaned up and that it is single threaded at mac_unregister time.
 198  *
 199  * R15. MAC driver interfaces don't block and so the action may be done
 200  * asynchronously in a separate thread as for example handling notifications.
 201  * The driver must not assume that the action is complete when the call
 202  * returns.
 203  *
 204  * R16. Drivers must maintain a generation number per Rx ring, and pass it
 205  * back to mac_rx_ring(); They are expected to increment the generation
 206  * number whenever the ring's stop routine is invoked.
 207  * See comments in mac_rx_ring();
 208  *
 209  * R17. Similarly mi_stop is another synchronization point and the driver must
 210  * ensure that all upcalls are done and there won't be any future upcall
 211  * before returning from mi_stop.
 212  *
 213  * R18. The driver may assume that all set/modify control operations via
 214  * the mi_* entry points are single threaded on a per mac end point.
 215  *
 216  * Lock and Perimeter hierarchy scenarios
 217  * ---------------------------------------
 218  *
 219  * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
 220  *
 221  * ft_lock -> fe_lock [mac_flow_lookup]
 222  *
 223  * mi_rw_lock -> fe_lock [mac_bcast_send]
 224  *
 225  * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
 226  *
 227  * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
 228  *
 229  * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
 230  *
 231  * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
 232  * client to driver. In the case of clients that explicitly use the mac provided
 233  * perimeter mechanism for its serialization, the hierarchy is
 234  * Perimeter -> mac layer locks, since the client never holds any locks across
 235  * the mac calls. In the case of clients that use its own locks the hierarchy
 236  * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
 237  * calls mac_perim_enter/exit in this case.
 238  *
 239  * Subflow creation rules
 240  * ---------------------------
 241  * o In case of a user specified cpulist present on underlying link and flows,
 242  * the flows cpulist must be a subset of the underlying link.
 243  * o In case of a user specified fanout mode present on link and flow, the
 244  * subflow fanout count has to be less than or equal to that of the
 245  * underlying link. The cpu-bindings for the subflows will be a subset of
 246  * the underlying link.
 247  * o In case if no cpulist specified on both underlying link and flow, the
 248  * underlying link relies on a  MAC tunable to provide out of box fanout.
 249  * The subflow will have no cpulist (the subflow will be unbound)
 250  * o In case if no cpulist is specified on the underlying link, a subflow can
 251  * carry  either a user-specified cpulist or fanout count. The cpu-bindings
 252  * for the subflow will not adhere to restriction that they need to be subset
 253  * of the underlying link.
 254  * o In case where the underlying link is carrying either a user specified
 255  * cpulist or fanout mode and for an unspecified subflow, the subflow will be
 256  * created unbound.
 257  * o While creating unbound subflows, bandwidth mode changes attempt to
 258  * figure a right fanout count. In such cases the fanout count will override
 259  * the unbound cpu-binding behavior.
 260  * o In addition to this, while cycling between flow and link properties, we
 261  * impose a restriction that if a link property has a subflow with
 262  * user-specified attributes, we will not allow changing the link property.
 263  * The administrator needs to reset all the user specified properties for the
 264  * subflows before attempting a link property change.
 265  * Some of the above rules can be overridden by specifying additional command
 266  * line options while creating or modifying link or subflow properties.
 267  */
 268 
 269 #include <sys/types.h>
 270 #include <sys/conf.h>
 271 #include <sys/id_space.h>
 272 #include <sys/esunddi.h>
 273 #include <sys/stat.h>
 274 #include <sys/mkdev.h>
 275 #include <sys/stream.h>
 276 #include <sys/strsun.h>
 277 #include <sys/strsubr.h>
 278 #include <sys/dlpi.h>
 279 #include <sys/list.h>
 280 #include <sys/modhash.h>
 281 #include <sys/mac_provider.h>
 282 #include <sys/mac_client_impl.h>
 283 #include <sys/mac_soft_ring.h>
 284 #include <sys/mac_stat.h>
 285 #include <sys/mac_impl.h>
 286 #include <sys/mac.h>
 287 #include <sys/dls.h>
 288 #include <sys/dld.h>
 289 #include <sys/modctl.h>
 290 #include <sys/fs/dv_node.h>
 291 #include <sys/thread.h>
 292 #include <sys/proc.h>
 293 #include <sys/callb.h>
 294 #include <sys/cpuvar.h>
 295 #include <sys/atomic.h>
 296 #include <sys/bitmap.h>
 297 #include <sys/sdt.h>
 298 #include <sys/mac_flow.h>
 299 #include <sys/ddi_intr_impl.h>
 300 #include <sys/disp.h>
 301 #include <sys/sdt.h>
 302 #include <sys/vnic.h>
 303 #include <sys/vnic_impl.h>
 304 #include <sys/vlan.h>
 305 #include <inet/ip.h>
 306 #include <inet/ip6.h>
 307 #include <sys/exacct.h>
 308 #include <sys/exacct_impl.h>
 309 #include <inet/nd.h>
 310 #include <sys/ethernet.h>
 311 #include <sys/pool.h>
 312 #include <sys/pool_pset.h>
 313 #include <sys/cpupart.h>
 314 #include <inet/wifi_ioctl.h>
 315 #include <net/wpa.h>
 316 
/* Bucket count handed to mod_hash_create_extended() for i_mac_impl_hash. */
#define IMPL_HASHSZ     67      /* prime */

/* kmem cache of mac_impl_t objects ("mac_impl_cache"), created in mac_init(). */
kmem_cache_t            *i_mac_impl_cachep;
/* String-keyed hash ("mac_impl_hash"), created in mac_init(). */
mod_hash_t              *i_mac_impl_hash;
/*
 * Reader/writer lock initialized in mac_init().
 * NOTE(review): presumably guards i_mac_impl_hash and i_mac_impl_count;
 * the users are not visible in this part of the file - confirm.
 */
krwlock_t               i_mac_impl_lock;
/* Zeroed in mac_init(); mac_fini() returns EBUSY while it is non-zero. */
uint_t                  i_mac_impl_count;
/* kmem cache of mac_ring_t objects ("mac_ring_cache"), created in mac_init(). */
static kmem_cache_t     *mac_ring_cache;
/* Id space ("mac_minor_ids") used to manage minor numbers; see mac_init(). */
static id_space_t       *minor_ids;
/* Zeroed in mac_init(); mac_fini() returns EBUSY while it is non-zero. */
static uint32_t         minor_count;
/* Pool event registration; mac_init() points it at mac_pool_event_cb(). */
static pool_event_cb_t  mac_pool_event_reg;
 327 
/*
 * Logging stuff. Perhaps mac_logging_interval could be broken into
 * mac_flow_log_interval and mac_link_log_interval if we want to be
 * able to schedule them differently.
 *
 * mac_init() establishes the defaults: an interval of 20 (seconds, per the
 * comment there), flow and link logging both disabled (B_FALSE), and no
 * timer outstanding (mac_logging_timer == 0).
 */
uint_t                  mac_logging_interval;
boolean_t               mac_flow_log_enable;
boolean_t               mac_link_log_enable;
timeout_id_t            mac_logging_timer;

/* for debugging, see MAC_DBG_PRT() in mac_impl.h */
int mac_dbg = 0;
 340 
/*
 * NOTE(review): presumably the kernel module subdirectory that MAC type
 * plugins are loaded from; its use is not visible in this part of the
 * file - confirm.
 */
#define MACTYPE_KMODDIR "mac"
/* Bucket count handed to mod_hash_create_extended() for i_mactype_hash. */
#define MACTYPE_HASHSZ  67
/* String-keyed hash ("mactype_hash"), created in mac_init(). */
static mod_hash_t       *i_mactype_hash;
/*
 * i_mactype_lock synchronizes threads that obtain references to mactype_t
 * structures through i_mactype_getplugin().
 */
static kmutex_t         i_mactype_lock;
 349 
/*
 * mac_tx_percpu_cnt
 *
 * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
 * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
 * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
 * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
 *
 * Note that mac_init() leaves mac_tx_percpu_cnt in the form 2**N - 1 (the
 * rounded-up power of two minus one), i.e. in the range
 * [0 .. mac_tx_percpu_cnt_max - 1], not as the raw count.
 */
int mac_tx_percpu_cnt;
int mac_tx_percpu_cnt_max = 128;
 360 
/*
 * Call back functions for the bridge module.  These are guaranteed to be valid
 * when holding a reference on a link or when holding mip->mi_bridge_lock and
 * mi_bridge_link is non-NULL.
 *
 * NOTE(review): the code that installs these pointers is not visible in this
 * part of the file.
 */
mac_bridge_tx_t mac_bridge_tx_cb;
mac_bridge_rx_t mac_bridge_rx_cb;
mac_bridge_ref_t mac_bridge_ref_cb;
mac_bridge_ls_t mac_bridge_ls_cb;
 370 
/*
 * Forward declarations. The i_mac_* constructor/destructor pairs are the kmem
 * cache callbacks handed to kmem_cache_create() in mac_init(), and
 * mac_pool_event_cb() is the callback registered there for pool events.
 * mac_tx_client_flush() and mac_tx_client_block() are the only non-static
 * entries in this list.
 */
static int i_mac_constructor(void *, void *, int);
static void i_mac_destructor(void *, void *);
static int i_mac_ring_ctor(void *, void *, int);
static void i_mac_ring_dtor(void *, void *);
static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
void mac_tx_client_flush(mac_client_impl_t *);
void mac_tx_client_block(mac_client_impl_t *);
static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
static int mac_start_group_and_rings(mac_group_t *);
static void mac_stop_group_and_rings(mac_group_t *);
static void mac_pool_event_cb(pool_event_t, int, void *);
 382 
/*
 * List element pairing an opaque record pointer with its size and type.
 * NOTE(review): presumably used to queue link/flow logging records; the
 * users are not visible in this part of the file - confirm.
 */
typedef struct netinfo_s {
        list_node_t     ni_link;        /* list linkage */
        void            *ni_record;     /* opaque record */
        int             ni_size;        /* presumably size of ni_record - confirm */
        int             ni_type;        /* record type; values not visible here */
} netinfo_t;
 389 
 390 /*
 391  * Module initialization functions.
 392  */
 393 
 394 void
 395 mac_init(void)
 396 {
 397         mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
 398             boot_max_ncpus);
 399 
 400         /* Upper bound is mac_tx_percpu_cnt_max */
 401         if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
 402                 mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;
 403 
 404         if (mac_tx_percpu_cnt < 1) {
 405                 /* Someone set max_tx_percpu_cnt_max to 0 or less */
 406                 mac_tx_percpu_cnt = 1;
 407         }
 408 
 409         ASSERT(mac_tx_percpu_cnt >= 1);
 410         mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
 411         /*
 412          * Make it of the form 2**N - 1 in the range
 413          * [0 .. mac_tx_percpu_cnt_max - 1]
 414          */
 415         mac_tx_percpu_cnt--;
 416 
 417         i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
 418             sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
 419             NULL, NULL, NULL, 0);
 420         ASSERT(i_mac_impl_cachep != NULL);
 421 
 422         mac_ring_cache = kmem_cache_create("mac_ring_cache",
 423             sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
 424             NULL, NULL, 0);
 425         ASSERT(mac_ring_cache != NULL);
 426 
 427         i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
 428             IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
 429             mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
 430         rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);
 431 
 432         mac_flow_init();
 433         mac_soft_ring_init();
 434         mac_bcast_init();
 435         mac_client_init();
 436 
 437         i_mac_impl_count = 0;
 438 
 439         i_mactype_hash = mod_hash_create_extended("mactype_hash",
 440             MACTYPE_HASHSZ,
 441             mod_hash_null_keydtor, mod_hash_null_valdtor,
 442             mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
 443 
 444         /*
 445          * Allocate an id space to manage minor numbers. The range of the
 446          * space will be from MAC_MAX_MINOR+1 to MAC_PRIVATE_MINOR-1.  This
 447          * leaves half of the 32-bit minors available for driver private use.
 448          */
 449         minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1,
 450             MAC_PRIVATE_MINOR-1);
 451         ASSERT(minor_ids != NULL);
 452         minor_count = 0;
 453 
 454         /* Let's default to 20 seconds */
 455         mac_logging_interval = 20;
 456         mac_flow_log_enable = B_FALSE;
 457         mac_link_log_enable = B_FALSE;
 458         mac_logging_timer = 0;
 459 
 460         /* Register to be notified of noteworthy pools events */
 461         mac_pool_event_reg.pec_func =  mac_pool_event_cb;
 462         mac_pool_event_reg.pec_arg = NULL;
 463         pool_event_cb_register(&mac_pool_event_reg);
 464 }
 465 
 466 int
 467 mac_fini(void)
 468 {
 469 
 470         if (i_mac_impl_count > 0 || minor_count > 0)
 471                 return (EBUSY);
 472 
 473         pool_event_cb_unregister(&mac_pool_event_reg);
 474 
 475         id_space_destroy(minor_ids);
 476         mac_flow_fini();
 477 
 478         mod_hash_destroy_hash(i_mac_impl_hash);
 479         rw_destroy(&i_mac_impl_lock);
 480 
 481         mac_client_fini();
 482         kmem_cache_destroy(mac_ring_cache);
 483 
 484         mod_hash_destroy_hash(i_mactype_hash);
 485         mac_soft_ring_finish();
 486 
 487 
 488         return (0);
 489 }
 490 
 491 /*
 492  * Initialize a GLDv3 driver's device ops.  A driver that manages its own ops
 493  * (e.g. softmac) may pass in a NULL ops argument.
 494  */
 495 void
 496 mac_init_ops(struct dev_ops *ops, const char *name)
 497 {
 498         major_t major = ddi_name_to_major((char *)name);
 499 
 500         /*
 501          * By returning on error below, we are not letting the driver continue
 502          * in an undefined context.  The mac_register() function will faill if
 503          * DN_GLDV3_DRIVER isn't set.
 504          */
 505         if (major == DDI_MAJOR_T_NONE)
 506                 return;
 507         LOCK_DEV_OPS(&devnamesp[major].dn_lock);
 508         devnamesp[major].dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER);
 509         UNLOCK_DEV_OPS(&devnamesp[major].dn_lock);
 510         if (ops != NULL)
 511                 dld_init_ops(ops, name);
 512 }
 513 
/*
 * Hand the driver's dev_ops to dld_fini_ops().  Unlike mac_init_ops(), ops is
 * passed through unconditionally; there is no NULL check here.
 */
void
mac_fini_ops(struct dev_ops *ops)
{
        dld_fini_ops(ops);
}
 519 
 520 /*ARGSUSED*/
 521 static int
 522 i_mac_constructor(void *buf, void *arg, int kmflag)
 523 {
 524         mac_impl_t      *mip = buf;
 525 
 526         bzero(buf, sizeof (mac_impl_t));
 527 
 528         mip->mi_linkstate = LINK_STATE_UNKNOWN;
 529 
 530         rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
 531         mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
 532         mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
 533         mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);
 534 
 535         mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
 536         cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
 537         mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
 538         cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
 539 
 540         mutex_init(&mip->mi_bridge_lock, NULL, MUTEX_DEFAULT, NULL);
 541 
 542         return (0);
 543 }
 544 
 545 /*ARGSUSED*/
 546 static void
 547 i_mac_destructor(void *buf, void *arg)
 548 {
 549         mac_impl_t      *mip = buf;
 550         mac_cb_info_t   *mcbi;
 551 
 552         ASSERT(mip->mi_ref == 0);
 553         ASSERT(mip->mi_active == 0);
 554         ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
 555         ASSERT(mip->mi_devpromisc == 0);
 556         ASSERT(mip->mi_ksp == NULL);
 557         ASSERT(mip->mi_kstat_count == 0);
 558         ASSERT(mip->mi_nclients == 0);
 559         ASSERT(mip->mi_nactiveclients == 0);
 560         ASSERT(mip->mi_single_active_client == NULL);
 561         ASSERT(mip->mi_state_flags == 0);
 562         ASSERT(mip->mi_factory_addr == NULL);
 563         ASSERT(mip->mi_factory_addr_num == 0);
 564         ASSERT(mip->mi_default_tx_ring == NULL);
 565 
 566         mcbi = &mip->mi_notify_cb_info;
 567         ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
 568         ASSERT(mip->mi_notify_bits == 0);
 569         ASSERT(mip->mi_notify_thread == NULL);
 570         ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
 571         mcbi->mcbi_lockp = NULL;
 572 
 573         mcbi = &mip->mi_promisc_cb_info;
 574         ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
 575         ASSERT(mip->mi_promisc_list == NULL);
 576         ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
 577         mcbi->mcbi_lockp = NULL;
 578 
 579         ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
 580         ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);
 581 
 582         rw_destroy(&mip->mi_rw_lock);
 583 
 584         mutex_destroy(&mip->mi_promisc_lock);
 585         cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
 586         mutex_destroy(&mip->mi_notify_lock);
 587         cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
 588         mutex_destroy(&mip->mi_ring_lock);
 589 
 590         ASSERT(mip->mi_bridge_link == NULL);
 591 }
 592 
 593 /* ARGSUSED */
 594 static int
 595 i_mac_ring_ctor(void *buf, void *arg, int kmflag)
 596 {
 597         mac_ring_t *ring = (mac_ring_t *)buf;
 598 
 599         bzero(ring, sizeof (mac_ring_t));
 600         cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
 601         mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
 602         ring->mr_state = MR_FREE;
 603         return (0);
 604 }
 605 
 606 /* ARGSUSED */
 607 static void
 608 i_mac_ring_dtor(void *buf, void *arg)
 609 {
 610         mac_ring_t *ring = (mac_ring_t *)buf;
 611 
 612         cv_destroy(&ring->mr_cv);
 613         mutex_destroy(&ring->mr_lock);
 614 }
 615 
 616 /*
 617  * Common functions to do mac callback addition and deletion. Currently this is
 618  * used by promisc callbacks and notify callbacks. List addition and deletion
 619  * need to take care of list walkers. List walkers in general, can't hold list
 620  * locks and make upcall callbacks due to potential lock order and recursive
 621  * reentry issues. Instead list walkers increment the list walker count to mark
 622  * the presence of a walker thread. Addition can be carefully done to ensure
 623  * that the list walker always sees either the old list or the new list.
 624  * However the deletion can't be done while the walker is active, instead the
 625  * deleting thread simply marks the entry as logically deleted. The last walker
 626  * physically deletes and frees up the logically deleted entries when the walk
 627  * is complete.
 628  */
 629 void
 630 mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
 631     mac_cb_t *mcb_elem)
 632 {
 633         mac_cb_t        *p;
 634         mac_cb_t        **pp;
 635 
 636         /* Verify it is not already in the list */
 637         for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
 638                 if (p == mcb_elem)
 639                         break;
 640         }
 641         VERIFY(p == NULL);
 642 
 643         /*
 644          * Add it to the head of the callback list. The membar ensures that
 645          * the following list pointer manipulations reach global visibility
 646          * in exactly the program order below.
 647          */
 648         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 649 
 650         mcb_elem->mcb_nextp = *mcb_head;
 651         membar_producer();
 652         *mcb_head = mcb_elem;
 653 }
 654 
 655 /*
 656  * Mark the entry as logically deleted. If there aren't any walkers unlink
 657  * from the list. In either case return the corresponding status.
 658  */
 659 boolean_t
 660 mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
 661     mac_cb_t *mcb_elem)
 662 {
 663         mac_cb_t        *p;
 664         mac_cb_t        **pp;
 665 
 666         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 667         /*
 668          * Search the callback list for the entry to be removed
 669          */
 670         for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
 671                 if (p == mcb_elem)
 672                         break;
 673         }
 674         VERIFY(p != NULL);
 675 
 676         /*
 677          * If there are walkers just mark it as deleted and the last walker
 678          * will remove from the list and free it.
 679          */
 680         if (mcbi->mcbi_walker_cnt != 0) {
 681                 p->mcb_flags |= MCB_CONDEMNED;
 682                 mcbi->mcbi_del_cnt++;
 683                 return (B_FALSE);
 684         }
 685 
 686         ASSERT(mcbi->mcbi_del_cnt == 0);
 687         *pp = p->mcb_nextp;
 688         p->mcb_nextp = NULL;
 689         return (B_TRUE);
 690 }
 691 
 692 /*
 693  * Wait for all pending callback removals to be completed
 694  */
 695 void
 696 mac_callback_remove_wait(mac_cb_info_t *mcbi)
 697 {
 698         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 699         while (mcbi->mcbi_del_cnt != 0) {
 700                 DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
 701                 cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
 702         }
 703 }
 704 
 705 /*
 706  * The last mac callback walker does the cleanup. Walk the list and unlik
 707  * all the logically deleted entries and construct a temporary list of
 708  * removed entries. Return the list of removed entries to the caller.
 709  */
 710 mac_cb_t *
 711 mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
 712 {
 713         mac_cb_t        *p;
 714         mac_cb_t        **pp;
 715         mac_cb_t        *rmlist = NULL;         /* List of removed elements */
 716         int     cnt = 0;
 717 
 718         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 719         ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
 720 
 721         pp = mcb_head;
 722         while (*pp != NULL) {
 723                 if ((*pp)->mcb_flags & MCB_CONDEMNED) {
 724                         p = *pp;
 725                         *pp = p->mcb_nextp;
 726                         p->mcb_nextp = rmlist;
 727                         rmlist = p;
 728                         cnt++;
 729                         continue;
 730                 }
 731                 pp = &(*pp)->mcb_nextp;
 732         }
 733 
 734         ASSERT(mcbi->mcbi_del_cnt == cnt);
 735         mcbi->mcbi_del_cnt = 0;
 736         return (rmlist);
 737 }
 738 
 739 boolean_t
 740 mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
 741 {
 742         mac_cb_t        *mcb;
 743 
 744         /* Verify it is not already in the list */
 745         for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
 746                 if (mcb == mcb_elem)
 747                         return (B_TRUE);
 748         }
 749 
 750         return (B_FALSE);
 751 }
 752 
 753 boolean_t
 754 mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
 755 {
 756         boolean_t       found;
 757 
 758         mutex_enter(mcbi->mcbi_lockp);
 759         found = mac_callback_lookup(mcb_headp, mcb_elem);
 760         mutex_exit(mcbi->mcbi_lockp);
 761 
 762         return (found);
 763 }
 764 
 765 /* Free the list of removed callbacks */
 766 void
 767 mac_callback_free(mac_cb_t *rmlist)
 768 {
 769         mac_cb_t        *mcb;
 770         mac_cb_t        *mcb_next;
 771 
 772         for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
 773                 mcb_next = mcb->mcb_nextp;
 774                 kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
 775         }
 776 }
 777 
 778 /*
 779  * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
 780  * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
 781  * is only a single shared total walker count, and an entry can't be physically
 782  * unlinked if a walker is active on either list. The last walker does this
 783  * cleanup of logically deleted entries.
 784  */
 785 void
 786 i_mac_promisc_walker_cleanup(mac_impl_t *mip)
 787 {
 788         mac_cb_t        *rmlist;
 789         mac_cb_t        *mcb;
 790         mac_cb_t        *mcb_next;
 791         mac_promisc_impl_t      *mpip;
 792 
 793         /*
 794          * Construct a temporary list of deleted callbacks by walking the
 795          * the mi_promisc_list. Then for each entry in the temporary list,
 796          * remove it from the mci_promisc_list and free the entry.
 797          */
 798         rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
 799             &mip->mi_promisc_list);
 800 
 801         for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
 802                 mcb_next = mcb->mcb_nextp;
 803                 mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
 804                 VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
 805                     &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
 806                 mcb->mcb_flags = 0;
 807                 mcb->mcb_nextp = NULL;
 808                 kmem_cache_free(mac_promisc_impl_cache, mpip);
 809         }
 810 }
 811 
 812 void
 813 i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
 814 {
 815         mac_cb_info_t   *mcbi;
 816 
 817         /*
 818          * Signal the notify thread even after mi_ref has become zero and
 819          * mi_disabled is set. The synchronization with the notify thread
 820          * happens in mac_unregister and that implies the driver must make
 821          * sure it is single-threaded (with respect to mac calls) and that
 822          * all pending mac calls have returned before it calls mac_unregister
 823          */
 824         rw_enter(&i_mac_impl_lock, RW_READER);
 825         if (mip->mi_state_flags & MIS_DISABLED)
 826                 goto exit;
 827 
 828         /*
 829          * Guard against incorrect notifications.  (Running a newer
 830          * mac client against an older implementation?)
 831          */
 832         if (type >= MAC_NNOTE)
 833                 goto exit;
 834 
 835         mcbi = &mip->mi_notify_cb_info;
 836         mutex_enter(mcbi->mcbi_lockp);
 837         mip->mi_notify_bits |= (1 << type);
 838         cv_broadcast(&mcbi->mcbi_cv);
 839         mutex_exit(mcbi->mcbi_lockp);
 840 
 841 exit:
 842         rw_exit(&i_mac_impl_lock);
 843 }
 844 
 845 /*
 846  * Mac serialization primitives. Please see the block comment at the
 847  * top of the file.
 848  */
 849 void
 850 i_mac_perim_enter(mac_impl_t *mip)
 851 {
 852         mac_client_impl_t       *mcip;
 853 
 854         if (mip->mi_state_flags & MIS_IS_VNIC) {
 855                 /*
 856                  * This is a VNIC. Return the lower mac since that is what
 857                  * we want to serialize on.
 858                  */
 859                 mcip = mac_vnic_lower(mip);
 860                 mip = mcip->mci_mip;
 861         }
 862 
 863         mutex_enter(&mip->mi_perim_lock);
 864         if (mip->mi_perim_owner == curthread) {
 865                 mip->mi_perim_ocnt++;
 866                 mutex_exit(&mip->mi_perim_lock);
 867                 return;
 868         }
 869 
 870         while (mip->mi_perim_owner != NULL)
 871                 cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);
 872 
 873         mip->mi_perim_owner = curthread;
 874         ASSERT(mip->mi_perim_ocnt == 0);
 875         mip->mi_perim_ocnt++;
 876 #ifdef DEBUG
 877         mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
 878             MAC_PERIM_STACK_DEPTH);
 879 #endif
 880         mutex_exit(&mip->mi_perim_lock);
 881 }
 882 
 883 int
 884 i_mac_perim_enter_nowait(mac_impl_t *mip)
 885 {
 886         /*
 887          * The vnic is a special case, since the serialization is done based
 888          * on the lower mac. If the lower mac is busy, it does not imply the
 889          * vnic can't be unregistered. But in the case of other drivers,
 890          * a busy perimeter or open mac handles implies that the mac is busy
 891          * and can't be unregistered.
 892          */
 893         if (mip->mi_state_flags & MIS_IS_VNIC) {
 894                 i_mac_perim_enter(mip);
 895                 return (0);
 896         }
 897 
 898         mutex_enter(&mip->mi_perim_lock);
 899         if (mip->mi_perim_owner != NULL) {
 900                 mutex_exit(&mip->mi_perim_lock);
 901                 return (EBUSY);
 902         }
 903         ASSERT(mip->mi_perim_ocnt == 0);
 904         mip->mi_perim_owner = curthread;
 905         mip->mi_perim_ocnt++;
 906         mutex_exit(&mip->mi_perim_lock);
 907 
 908         return (0);
 909 }
 910 
 911 void
 912 i_mac_perim_exit(mac_impl_t *mip)
 913 {
 914         mac_client_impl_t *mcip;
 915 
 916         if (mip->mi_state_flags & MIS_IS_VNIC) {
 917                 /*
 918                  * This is a VNIC. Return the lower mac since that is what
 919                  * we want to serialize on.
 920                  */
 921                 mcip = mac_vnic_lower(mip);
 922                 mip = mcip->mci_mip;
 923         }
 924 
 925         ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);
 926 
 927         mutex_enter(&mip->mi_perim_lock);
 928         if (--mip->mi_perim_ocnt == 0) {
 929                 mip->mi_perim_owner = NULL;
 930                 cv_signal(&mip->mi_perim_cv);
 931         }
 932         mutex_exit(&mip->mi_perim_lock);
 933 }
 934 
 935 /*
 936  * Returns whether the current thread holds the mac perimeter. Used in making
 937  * assertions.
 938  */
 939 boolean_t
 940 mac_perim_held(mac_handle_t mh)
 941 {
 942         mac_impl_t      *mip = (mac_impl_t *)mh;
 943         mac_client_impl_t *mcip;
 944 
 945         if (mip->mi_state_flags & MIS_IS_VNIC) {
 946                 /*
 947                  * This is a VNIC. Return the lower mac since that is what
 948                  * we want to serialize on.
 949                  */
 950                 mcip = mac_vnic_lower(mip);
 951                 mip = mcip->mci_mip;
 952         }
 953         return (mip->mi_perim_owner == curthread);
 954 }
 955 
 956 /*
 957  * mac client interfaces to enter the mac perimeter of a mac end point, given
 958  * its mac handle, or macname or linkid.
 959  */
 960 void
 961 mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
 962 {
 963         mac_impl_t      *mip = (mac_impl_t *)mh;
 964 
 965         i_mac_perim_enter(mip);
 966         /*
 967          * The mac_perim_handle_t returned encodes the 'mip' and whether a
 968          * mac_open has been done internally while entering the perimeter.
 969          * This information is used in mac_perim_exit
 970          */
 971         MAC_ENCODE_MPH(*mphp, mip, 0);
 972 }
 973 
 974 int
 975 mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
 976 {
 977         int     err;
 978         mac_handle_t    mh;
 979 
 980         if ((err = mac_open(name, &mh)) != 0)
 981                 return (err);
 982 
 983         mac_perim_enter_by_mh(mh, mphp);
 984         MAC_ENCODE_MPH(*mphp, mh, 1);
 985         return (0);
 986 }
 987 
 988 int
 989 mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
 990 {
 991         int     err;
 992         mac_handle_t    mh;
 993 
 994         if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
 995                 return (err);
 996 
 997         mac_perim_enter_by_mh(mh, mphp);
 998         MAC_ENCODE_MPH(*mphp, mh, 1);
 999         return (0);
1000 }
1001 
1002 void
1003 mac_perim_exit(mac_perim_handle_t mph)
1004 {
1005         mac_impl_t      *mip;
1006         boolean_t       need_close;
1007 
1008         MAC_DECODE_MPH(mph, mip, need_close);
1009         i_mac_perim_exit(mip);
1010         if (need_close)
1011                 mac_close((mac_handle_t)mip);
1012 }
1013 
1014 int
1015 mac_hold(const char *macname, mac_impl_t **pmip)
1016 {
1017         mac_impl_t      *mip;
1018         int             err;
1019 
1020         /*
1021          * Check the device name length to make sure it won't overflow our
1022          * buffer.
1023          */
1024         if (strlen(macname) >= MAXNAMELEN)
1025                 return (EINVAL);
1026 
1027         /*
1028          * Look up its entry in the global hash table.
1029          */
1030         rw_enter(&i_mac_impl_lock, RW_WRITER);
1031         err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
1032             (mod_hash_val_t *)&mip);
1033 
1034         if (err != 0) {
1035                 rw_exit(&i_mac_impl_lock);
1036                 return (ENOENT);
1037         }
1038 
1039         if (mip->mi_state_flags & MIS_DISABLED) {
1040                 rw_exit(&i_mac_impl_lock);
1041                 return (ENOENT);
1042         }
1043 
1044         if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
1045                 rw_exit(&i_mac_impl_lock);
1046                 return (EBUSY);
1047         }
1048 
1049         mip->mi_ref++;
1050         rw_exit(&i_mac_impl_lock);
1051 
1052         *pmip = mip;
1053         return (0);
1054 }
1055 
1056 void
1057 mac_rele(mac_impl_t *mip)
1058 {
1059         rw_enter(&i_mac_impl_lock, RW_WRITER);
1060         ASSERT(mip->mi_ref != 0);
1061         if (--mip->mi_ref == 0) {
1062                 ASSERT(mip->mi_nactiveclients == 0 &&
1063                     !(mip->mi_state_flags & MIS_EXCLUSIVE));
1064         }
1065         rw_exit(&i_mac_impl_lock);
1066 }
1067 
1068 /*
1069  * Private GLDv3 function to start a MAC instance.
1070  */
1071 int
1072 mac_start(mac_handle_t mh)
1073 {
1074         mac_impl_t      *mip = (mac_impl_t *)mh;
1075         int             err = 0;
1076         mac_group_t     *defgrp;
1077 
1078         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1079         ASSERT(mip->mi_start != NULL);
1080 
1081         /*
1082          * Check whether the device is already started.
1083          */
1084         if (mip->mi_active++ == 0) {
1085                 mac_ring_t *ring = NULL;
1086 
1087                 /*
1088                  * Start the device.
1089                  */
1090                 err = mip->mi_start(mip->mi_driver);
1091                 if (err != 0) {
1092                         mip->mi_active--;
1093                         return (err);
1094                 }
1095 
1096                 /*
1097                  * Start the default tx ring.
1098                  */
1099                 if (mip->mi_default_tx_ring != NULL) {
1100 
1101                         ring = (mac_ring_t *)mip->mi_default_tx_ring;
1102                         if (ring->mr_state != MR_INUSE) {
1103                                 err = mac_start_ring(ring);
1104                                 if (err != 0) {
1105                                         mip->mi_active--;
1106                                         return (err);
1107                                 }
1108                         }
1109                 }
1110 
1111                 if ((defgrp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
1112                         /*
1113                          * Start the default ring, since it will be needed
1114                          * to receive broadcast and multicast traffic for
1115                          * both primary and non-primary MAC clients.
1116                          */
1117                         ASSERT(defgrp->mrg_state == MAC_GROUP_STATE_REGISTERED);
1118                         err = mac_start_group_and_rings(defgrp);
1119                         if (err != 0) {
1120                                 mip->mi_active--;
1121                                 if ((ring != NULL) &&
1122                                     (ring->mr_state == MR_INUSE))
1123                                         mac_stop_ring(ring);
1124                                 return (err);
1125                         }
1126                         mac_set_group_state(defgrp, MAC_GROUP_STATE_SHARED);
1127                 }
1128         }
1129 
1130         return (err);
1131 }
1132 
1133 /*
1134  * Private GLDv3 function to stop a MAC instance.
1135  */
1136 void
1137 mac_stop(mac_handle_t mh)
1138 {
1139         mac_impl_t      *mip = (mac_impl_t *)mh;
1140         mac_group_t     *grp;
1141 
1142         ASSERT(mip->mi_stop != NULL);
1143         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1144 
1145         /*
1146          * Check whether the device is still needed.
1147          */
1148         ASSERT(mip->mi_active != 0);
1149         if (--mip->mi_active == 0) {
1150                 if ((grp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
1151                         /*
1152                          * There should be no more active clients since the
1153                          * MAC is being stopped. Stop the default RX group
1154                          * and transition it back to registered state.
1155                          *
1156                          * When clients are torn down, the groups
1157                          * are release via mac_release_rx_group which
1158                          * knows the the default group is always in
1159                          * started mode since broadcast uses it. So
1160                          * we can assert that their are no clients
1161                          * (since mac_bcast_add doesn't register itself
1162                          * as a client) and group is in SHARED state.
1163                          */
1164                         ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
1165                         ASSERT(MAC_GROUP_NO_CLIENT(grp) &&
1166                             mip->mi_nactiveclients == 0);
1167                         mac_stop_group_and_rings(grp);
1168                         mac_set_group_state(grp, MAC_GROUP_STATE_REGISTERED);
1169                 }
1170 
1171                 if (mip->mi_default_tx_ring != NULL) {
1172                         mac_ring_t *ring;
1173 
1174                         ring = (mac_ring_t *)mip->mi_default_tx_ring;
1175                         if (ring->mr_state == MR_INUSE) {
1176                                 mac_stop_ring(ring);
1177                                 ring->mr_flag = 0;
1178                         }
1179                 }
1180 
1181                 /*
1182                  * Stop the device.
1183                  */
1184                 mip->mi_stop(mip->mi_driver);
1185         }
1186 }
1187 
1188 int
1189 i_mac_promisc_set(mac_impl_t *mip, boolean_t on)
1190 {
1191         int             err = 0;
1192 
1193         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1194         ASSERT(mip->mi_setpromisc != NULL);
1195 
1196         if (on) {
1197                 /*
1198                  * Enable promiscuous mode on the device if not yet enabled.
1199                  */
1200                 if (mip->mi_devpromisc++ == 0) {
1201                         err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
1202                         if (err != 0) {
1203                                 mip->mi_devpromisc--;
1204                                 return (err);
1205                         }
1206                         i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1207                 }
1208         } else {
1209                 if (mip->mi_devpromisc == 0)
1210                         return (EPROTO);
1211 
1212                 /*
1213                  * Disable promiscuous mode on the device if this is the last
1214                  * enabling.
1215                  */
1216                 if (--mip->mi_devpromisc == 0) {
1217                         err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
1218                         if (err != 0) {
1219                                 mip->mi_devpromisc++;
1220                                 return (err);
1221                         }
1222                         i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1223                 }
1224         }
1225 
1226         return (0);
1227 }
1228 
1229 /*
1230  * The promiscuity state can change any time. If the caller needs to take
1231  * actions that are atomic with the promiscuity state, then the caller needs
1232  * to bracket the entire sequence with mac_perim_enter/exit
1233  */
1234 boolean_t
1235 mac_promisc_get(mac_handle_t mh)
1236 {
1237         mac_impl_t              *mip = (mac_impl_t *)mh;
1238 
1239         /*
1240          * Return the current promiscuity.
1241          */
1242         return (mip->mi_devpromisc != 0);
1243 }
1244 
1245 /*
1246  * Invoked at MAC instance attach time to initialize the list
1247  * of factory MAC addresses supported by a MAC instance. This function
1248  * builds a local cache in the mac_impl_t for the MAC addresses
1249  * supported by the underlying hardware. The MAC clients themselves
1250  * use the mac_addr_factory*() functions to query and reserve
1251  * factory MAC addresses.
1252  */
1253 void
1254 mac_addr_factory_init(mac_impl_t *mip)
1255 {
1256         mac_capab_multifactaddr_t capab;
1257         uint8_t *addr;
1258         int i;
1259 
1260         /*
1261          * First round to see how many factory MAC addresses are available.
1262          */
1263         bzero(&capab, sizeof (capab));
1264         if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
1265             &capab) || (capab.mcm_naddr == 0)) {
1266                 /*
1267                  * The MAC instance doesn't support multiple factory
1268                  * MAC addresses, we're done here.
1269                  */
1270                 return;
1271         }
1272 
1273         /*
1274          * Allocate the space and get all the factory addresses.
1275          */
1276         addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
1277         capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);
1278 
1279         mip->mi_factory_addr_num = capab.mcm_naddr;
1280         mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
1281             sizeof (mac_factory_addr_t), KM_SLEEP);
1282 
1283         for (i = 0; i < capab.mcm_naddr; i++) {
1284                 bcopy(addr + i * MAXMACADDRLEN,
1285                     mip->mi_factory_addr[i].mfa_addr,
1286                     mip->mi_type->mt_addr_length);
1287                 mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
1288         }
1289 
1290         kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
1291 }
1292 
1293 void
1294 mac_addr_factory_fini(mac_impl_t *mip)
1295 {
1296         if (mip->mi_factory_addr == NULL) {
1297                 ASSERT(mip->mi_factory_addr_num == 0);
1298                 return;
1299         }
1300 
1301         kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
1302             sizeof (mac_factory_addr_t));
1303 
1304         mip->mi_factory_addr = NULL;
1305         mip->mi_factory_addr_num = 0;
1306 }
1307 
1308 /*
1309  * Reserve a factory MAC address. If *slot is set to -1, the function
1310  * attempts to reserve any of the available factory MAC addresses and
1311  * returns the reserved slot id. If no slots are available, the function
1312  * returns ENOSPC. If *slot is not set to -1, the function reserves
1313  * the specified slot if it is available, or returns EBUSY is the slot
1314  * is already used. Returns ENOTSUP if the underlying MAC does not
1315  * support multiple factory addresses. If the slot number is not -1 but
1316  * is invalid, returns EINVAL.
1317  */
1318 int
1319 mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
1320 {
1321         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1322         mac_impl_t *mip = mcip->mci_mip;
1323         int i, ret = 0;
1324 
1325         i_mac_perim_enter(mip);
1326         /*
1327          * Protect against concurrent readers that may need a self-consistent
1328          * view of the factory addresses
1329          */
1330         rw_enter(&mip->mi_rw_lock, RW_WRITER);
1331 
1332         if (mip->mi_factory_addr_num == 0) {
1333                 ret = ENOTSUP;
1334                 goto bail;
1335         }
1336 
1337         if (*slot != -1) {
1338                 /* check the specified slot */
1339                 if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
1340                         ret = EINVAL;
1341                         goto bail;
1342                 }
1343                 if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
1344                         ret = EBUSY;
1345                         goto bail;
1346                 }
1347         } else {
1348                 /* pick the next available slot */
1349                 for (i = 0; i < mip->mi_factory_addr_num; i++) {
1350                         if (!mip->mi_factory_addr[i].mfa_in_use)
1351                                 break;
1352                 }
1353 
1354                 if (i == mip->mi_factory_addr_num) {
1355                         ret = ENOSPC;
1356                         goto bail;
1357                 }
1358                 *slot = i+1;
1359         }
1360 
1361         mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
1362         mip->mi_factory_addr[*slot-1].mfa_client = mcip;
1363 
1364 bail:
1365         rw_exit(&mip->mi_rw_lock);
1366         i_mac_perim_exit(mip);
1367         return (ret);
1368 }
1369 
1370 /*
1371  * Release the specified factory MAC address slot.
1372  */
1373 void
1374 mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
1375 {
1376         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1377         mac_impl_t *mip = mcip->mci_mip;
1378 
1379         i_mac_perim_enter(mip);
1380         /*
1381          * Protect against concurrent readers that may need a self-consistent
1382          * view of the factory addresses
1383          */
1384         rw_enter(&mip->mi_rw_lock, RW_WRITER);
1385 
1386         ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1387         ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);
1388 
1389         mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;
1390 
1391         rw_exit(&mip->mi_rw_lock);
1392         i_mac_perim_exit(mip);
1393 }
1394 
1395 /*
1396  * Stores in mac_addr the value of the specified MAC address. Returns
1397  * 0 on success, or EINVAL if the slot number is not valid for the MAC.
1398  * The caller must provide a string of at least MAXNAMELEN bytes.
1399  */
1400 void
1401 mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
1402     uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
1403 {
1404         mac_impl_t *mip = (mac_impl_t *)mh;
1405         boolean_t in_use;
1406 
1407         ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1408 
1409         /*
1410          * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
1411          * and mi_rw_lock
1412          */
1413         rw_enter(&mip->mi_rw_lock, RW_READER);
1414         bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
1415         *addr_len = mip->mi_type->mt_addr_length;
1416         in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
1417         if (in_use && client_name != NULL) {
1418                 bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
1419                     client_name, MAXNAMELEN);
1420         }
1421         if (in_use_arg != NULL)
1422                 *in_use_arg = in_use;
1423         rw_exit(&mip->mi_rw_lock);
1424 }
1425 
1426 /*
1427  * Returns the number of factory MAC addresses (in addition to the
1428  * primary MAC address), 0 if the underlying MAC doesn't support
1429  * that feature.
1430  */
1431 uint_t
1432 mac_addr_factory_num(mac_handle_t mh)
1433 {
1434         mac_impl_t *mip = (mac_impl_t *)mh;
1435 
1436         return (mip->mi_factory_addr_num);
1437 }
1438 
1439 
1440 void
1441 mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
1442 {
1443         mac_ring_t      *ring;
1444 
1445         for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
1446                 ring->mr_flag &= ~flag;
1447 }
1448 
1449 /*
1450  * The following mac_hwrings_xxx() functions are private mac client functions
1451  * used by the aggr driver to access and control the underlying HW Rx group
1452  * and rings. In this case, the aggr driver has exclusive control of the
1453  * underlying HW Rx group/rings, it calls the following functions to
1454  * start/stop the HW Rx rings, disable/enable polling, add/remove mac'
1455  * addresses, or set up the Rx callback.
1456  */
1457 /* ARGSUSED */
1458 static void
1459 mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
1460     mblk_t *mp_chain, boolean_t loopback)
1461 {
1462         mac_soft_ring_set_t     *mac_srs = (mac_soft_ring_set_t *)srs;
1463         mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
1464         mac_direct_rx_t         proc;
1465         void                    *arg1;
1466         mac_resource_handle_t   arg2;
1467 
1468         proc = srs_rx->sr_func;
1469         arg1 = srs_rx->sr_arg1;
1470         arg2 = mac_srs->srs_mrh;
1471 
1472         proc(arg1, arg2, mp_chain, NULL);
1473 }
1474 
1475 /*
1476  * This function is called to get the list of HW rings that are reserved by
1477  * an exclusive mac client.
1478  *
1479  * Return value: the number of HW rings.
1480  */
1481 int
1482 mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
1483     mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
1484 {
1485         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
1486         flow_entry_t            *flent = mcip->mci_flent;
1487         mac_group_t             *grp;
1488         mac_ring_t              *ring;
1489         int                     cnt = 0;
1490 
1491         if (rtype == MAC_RING_TYPE_RX) {
1492                 grp = flent->fe_rx_ring_group;
1493         } else if (rtype == MAC_RING_TYPE_TX) {
1494                 grp = flent->fe_tx_ring_group;
1495         } else {
1496                 ASSERT(B_FALSE);
1497                 return (-1);
1498         }
1499         /*
1500          * The mac client did not reserve any RX group, return directly.
1501          * This is probably because the underlying MAC does not support
1502          * any groups.
1503          */
1504         if (hwgh != NULL)
1505                 *hwgh = NULL;
1506         if (grp == NULL)
1507                 return (0);
1508         /*
1509          * This group must be reserved by this mac client.
1510          */
1511         ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
1512             (mcip == MAC_GROUP_ONLY_CLIENT(grp)));
1513 
1514         for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) {
1515                 ASSERT(cnt < MAX_RINGS_PER_GROUP);
1516                 hwrh[cnt] = (mac_ring_handle_t)ring;
1517         }
1518         if (hwgh != NULL)
1519                 *hwgh = (mac_group_handle_t)grp;
1520 
1521         return (cnt);
1522 }
1523 
1524 /*
1525  * This function is called to get info about Tx/Rx rings.
1526  *
1527  * Return value: returns uint_t which will have various bits set
1528  * that indicates different properties of the ring.
1529  */
1530 uint_t
1531 mac_hwring_getinfo(mac_ring_handle_t rh)
1532 {
1533         mac_ring_t *ring = (mac_ring_t *)rh;
1534         mac_ring_info_t *info = &ring->mr_info;
1535 
1536         return (info->mri_flags);
1537 }
1538 
1539 /*
1540  * Export ddi interrupt handles from the HW ring to the pseudo ring and
1541  * setup the RX callback of the mac client which exclusively controls
1542  * HW ring.
1543  */
1544 void
1545 mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh,
1546     mac_ring_handle_t pseudo_rh)
1547 {
1548         mac_ring_t              *hw_ring = (mac_ring_t *)hwrh;
1549         mac_ring_t              *pseudo_ring;
1550         mac_soft_ring_set_t     *mac_srs = hw_ring->mr_srs;
1551 
1552         if (pseudo_rh != NULL) {
1553                 pseudo_ring = (mac_ring_t *)pseudo_rh;
1554                 /* Export the ddi handles to pseudo ring */
1555                 pseudo_ring->mr_info.mri_intr.mi_ddi_handle =
1556                     hw_ring->mr_info.mri_intr.mi_ddi_handle;
1557                 pseudo_ring->mr_info.mri_intr.mi_ddi_shared =
1558                     hw_ring->mr_info.mri_intr.mi_ddi_shared;
1559                 /*
1560                  * Save a pointer to pseudo ring in the hw ring. If
1561                  * interrupt handle changes, the hw ring will be
1562                  * notified of the change (see mac_ring_intr_set())
1563                  * and the appropriate change has to be made to
1564                  * the pseudo ring that has exported the ddi handle.
1565                  */
1566                 hw_ring->mr_prh = pseudo_rh;
1567         }
1568 
1569         if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
1570                 ASSERT(!(mac_srs->srs_type & SRST_TX));
1571                 mac_srs->srs_mrh = prh;
1572                 mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
1573         }
1574 }
1575 
1576 void
1577 mac_hwring_teardown(mac_ring_handle_t hwrh)
1578 {
1579         mac_ring_t              *hw_ring = (mac_ring_t *)hwrh;
1580         mac_soft_ring_set_t     *mac_srs;
1581 
1582         if (hw_ring == NULL)
1583                 return;
1584         hw_ring->mr_prh = NULL;
1585         if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
1586                 mac_srs = hw_ring->mr_srs;
1587                 ASSERT(!(mac_srs->srs_type & SRST_TX));
1588                 mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
1589                 mac_srs->srs_mrh = NULL;
1590         }
1591 }
1592 
1593 int
1594 mac_hwring_disable_intr(mac_ring_handle_t rh)
1595 {
1596         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1597         mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1598 
1599         return (intr->mi_disable(intr->mi_handle));
1600 }
1601 
1602 int
1603 mac_hwring_enable_intr(mac_ring_handle_t rh)
1604 {
1605         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1606         mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1607 
1608         return (intr->mi_enable(intr->mi_handle));
1609 }
1610 
1611 int
1612 mac_hwring_start(mac_ring_handle_t rh)
1613 {
1614         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1615 
1616         MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
1617         return (0);
1618 }
1619 
1620 void
1621 mac_hwring_stop(mac_ring_handle_t rh)
1622 {
1623         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1624 
1625         mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
1626 }
1627 
1628 mblk_t *
1629 mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
1630 {
1631         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1632         mac_ring_info_t *info = &rr_ring->mr_info;
1633 
1634         return (info->mri_poll(info->mri_driver, bytes_to_pickup));
1635 }
1636 
1637 /*
1638  * Send packets through a selected tx ring.
1639  */
1640 mblk_t *
1641 mac_hwring_tx(mac_ring_handle_t rh, mblk_t *mp)
1642 {
1643         mac_ring_t *ring = (mac_ring_t *)rh;
1644         mac_ring_info_t *info = &ring->mr_info;
1645 
1646         ASSERT(ring->mr_type == MAC_RING_TYPE_TX &&
1647             ring->mr_state >= MR_INUSE);
1648         return (info->mri_tx(info->mri_driver, mp));
1649 }
1650 
1651 /*
1652  * Query stats for a particular rx/tx ring
1653  */
1654 int
1655 mac_hwring_getstat(mac_ring_handle_t rh, uint_t stat, uint64_t *val)
1656 {
1657         mac_ring_t      *ring = (mac_ring_t *)rh;
1658         mac_ring_info_t *info = &ring->mr_info;
1659 
1660         return (info->mri_stat(info->mri_driver, stat, val));
1661 }
1662 
1663 /*
1664  * Private function that is only used by aggr to send packets through
1665  * a port/Tx ring. Since aggr exposes a pseudo Tx ring even for ports
1666  * that does not expose Tx rings, aggr_ring_tx() entry point needs
1667  * access to mac_impl_t to send packets through m_tx() entry point.
1668  * It accomplishes this by calling mac_hwring_send_priv() function.
1669  */
1670 mblk_t *
1671 mac_hwring_send_priv(mac_client_handle_t mch, mac_ring_handle_t rh, mblk_t *mp)
1672 {
1673         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1674         mac_impl_t *mip = mcip->mci_mip;
1675 
1676         MAC_TX(mip, rh, mp, mcip);
1677         return (mp);
1678 }
1679 
1680 /*
1681  * Private function that is only used by aggr to update the default transmission
1682  * ring. Because aggr exposes a pseudo Tx ring even for ports that may
1683  * temporarily be down, it may need to update the default ring that is used by
1684  * MAC such that it refers to a link that can actively be used to send traffic.
1685  * Note that this is different from the case where the port has been removed
1686  * from the group. In those cases, all of the rings will be torn down because
1687  * the ring will no longer exist. It's important to give aggr a case where the
1688  * rings can still exist such that it may be able to continue to send LACP PDUs
1689  * to potentially restore the link.
1690  *
1691  * Finally, we explicitly don't do anything if the ring hasn't been enabled yet.
1692  * This is to help out aggr which doesn't really know the internal state that
1693  * MAC does about the rings and can't know that it's not quite ready for use
1694  * yet.
1695  */
1696 void
1697 mac_hwring_set_default(mac_handle_t mh, mac_ring_handle_t rh)
1698 {
1699         mac_impl_t *mip = (mac_impl_t *)mh;
1700         mac_ring_t *ring = (mac_ring_t *)rh;
1701 
1702         ASSERT(MAC_PERIM_HELD(mh));
1703         VERIFY(mip->mi_state_flags & MIS_IS_AGGR);
1704 
1705         if (ring->mr_state != MR_INUSE)
1706                 return;
1707 
1708         mip->mi_default_tx_ring = rh;
1709 }
1710 
1711 int
1712 mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
1713 {
1714         mac_group_t *group = (mac_group_t *)gh;
1715 
1716         return (mac_group_addmac(group, addr));
1717 }
1718 
1719 int
1720 mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
1721 {
1722         mac_group_t *group = (mac_group_t *)gh;
1723 
1724         return (mac_group_remmac(group, addr));
1725 }
1726 
1727 /*
1728  * Set the RX group to be shared/reserved. Note that the group must be
1729  * started/stopped outside of this function.
1730  */
1731 void
1732 mac_set_group_state(mac_group_t *grp, mac_group_state_t state)
1733 {
1734         /*
1735          * If there is no change in the group state, just return.
1736          */
1737         if (grp->mrg_state == state)
1738                 return;
1739 
1740         switch (state) {
1741         case MAC_GROUP_STATE_RESERVED:
1742                 /*
1743                  * Successfully reserved the group.
1744                  *
1745                  * Given that there is an exclusive client controlling this
1746                  * group, we enable the group level polling when available,
1747                  * so that SRSs get to turn on/off individual rings they's
1748                  * assigned to.
1749                  */
1750                 ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1751 
1752                 if (grp->mrg_type == MAC_RING_TYPE_RX &&
1753                     GROUP_INTR_DISABLE_FUNC(grp) != NULL) {
1754                         GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1755                 }
1756                 break;
1757 
1758         case MAC_GROUP_STATE_SHARED:
1759                 /*
1760                  * Set all rings of this group to software classified.
1761                  * If the group has an overriding interrupt, then re-enable it.
1762                  */
1763                 ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1764 
1765                 if (grp->mrg_type == MAC_RING_TYPE_RX &&
1766                     GROUP_INTR_ENABLE_FUNC(grp) != NULL) {
1767                         GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1768                 }
1769                 /* The ring is not available for reservations any more */
1770                 break;
1771 
1772         case MAC_GROUP_STATE_REGISTERED:
1773                 /* Also callable from mac_register, perim is not held */
1774                 break;
1775 
1776         default:
1777                 ASSERT(B_FALSE);
1778                 break;
1779         }
1780 
1781         grp->mrg_state = state;
1782 }
1783 
1784 /*
1785  * Quiesce future hardware classified packets for the specified Rx ring
1786  */
1787 static void
1788 mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
1789 {
1790         ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
1791         ASSERT(ring_flag == MR_CONDEMNED || ring_flag  == MR_QUIESCE);
1792 
1793         mutex_enter(&rx_ring->mr_lock);
1794         rx_ring->mr_flag |= ring_flag;
1795         while (rx_ring->mr_refcnt != 0)
1796                 cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
1797         mutex_exit(&rx_ring->mr_lock);
1798 }
1799 
1800 /*
1801  * Please see mac_tx for details about the per cpu locking scheme
1802  */
1803 static void
1804 mac_tx_lock_all(mac_client_impl_t *mcip)
1805 {
1806         int     i;
1807 
1808         for (i = 0; i <= mac_tx_percpu_cnt; i++)
1809                 mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1810 }
1811 
1812 static void
1813 mac_tx_unlock_all(mac_client_impl_t *mcip)
1814 {
1815         int     i;
1816 
1817         for (i = mac_tx_percpu_cnt; i >= 0; i--)
1818                 mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1819 }
1820 
1821 static void
1822 mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
1823 {
1824         int     i;
1825 
1826         for (i = mac_tx_percpu_cnt; i > 0; i--)
1827                 mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1828 }
1829 
1830 static int
1831 mac_tx_sum_refcnt(mac_client_impl_t *mcip)
1832 {
1833         int     i;
1834         int     refcnt = 0;
1835 
1836         for (i = 0; i <= mac_tx_percpu_cnt; i++)
1837                 refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
1838 
1839         return (refcnt);
1840 }
1841 
1842 /*
1843  * Stop future Tx packets coming down from the client in preparation for
1844  * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
1845  * of rings between clients
1846  */
1847 void
1848 mac_tx_client_block(mac_client_impl_t *mcip)
1849 {
1850         mac_tx_lock_all(mcip);
1851         mcip->mci_tx_flag |= MCI_TX_QUIESCE;
1852         while (mac_tx_sum_refcnt(mcip) != 0) {
1853                 mac_tx_unlock_allbutzero(mcip);
1854                 cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1855                 mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1856                 mac_tx_lock_all(mcip);
1857         }
1858         mac_tx_unlock_all(mcip);
1859 }
1860 
1861 void
1862 mac_tx_client_unblock(mac_client_impl_t *mcip)
1863 {
1864         mac_tx_lock_all(mcip);
1865         mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
1866         mac_tx_unlock_all(mcip);
1867         /*
1868          * We may fail to disable flow control for the last MAC_NOTE_TX
1869          * notification because the MAC client is quiesced. Send the
1870          * notification again.
1871          */
1872         i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
1873 }
1874 
1875 /*
1876  * Wait for an SRS to quiesce. The SRS worker will signal us when the
1877  * quiesce is done.
1878  */
1879 static void
1880 mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
1881 {
1882         mutex_enter(&srs->srs_lock);
1883         while (!(srs->srs_state & srs_flag))
1884                 cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
1885         mutex_exit(&srs->srs_lock);
1886 }
1887 
1888 /*
1889  * Quiescing an Rx SRS is achieved by the following sequence. The protocol
1890  * works bottom up by cutting off packet flow from the bottommost point in the
1891  * mac, then the SRS, and then the soft rings. There are 2 use cases of this
1892  * mechanism. One is a temporary quiesce of the SRS, such as say while changing
1893  * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
1894  * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
1895  * for the SRS and MR flags. In the former case the threads pause waiting for
1896  * a restart, while in the latter case the threads exit. The Tx SRS teardown
1897  * is also mostly similar to the above.
1898  *
1899  * 1. Stop future hardware classified packets at the lowest level in the mac.
1900  *    Remove any hardware classification rule (CONDEMNED case) and mark the
1901  *    rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
1902  *    from increasing. Upcalls from the driver that come through hardware
1903  *    classification will be dropped in mac_rx from now on. Then we wait for
1904  *    the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
1905  *    sure there aren't any upcall threads from the driver through hardware
1906  *    classification. In the case of SRS teardown we also remove the
1907  *    classification rule in the driver.
1908  *
1909  * 2. Stop future software classified packets by marking the flow entry with
1910  *    FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
1911  *    increasing. We also remove the flow entry from the table in the latter
1912  *    case. Then wait for the fe_refcnt to reach an appropriate quiescent value
1913  *    that indicates there aren't any active threads using that flow entry.
1914  *
1915  * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
1916  *    SRS worker thread, and the soft ring threads are quiesced in sequence
1917  *    with the SRS worker thread serving as a master controller. This
 *    mechanism is explained in mac_srs_worker_quiesce().
1919  *
1920  * The restart mechanism to reactivate the SRS and softrings is explained
1921  * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
1922  * restart sequence.
1923  */
1924 void
1925 mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
1926 {
1927         flow_entry_t    *flent = srs->srs_flent;
1928         uint_t  mr_flag, srs_done_flag;
1929 
1930         ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1931         ASSERT(!(srs->srs_type & SRST_TX));
1932 
1933         if (srs_quiesce_flag == SRS_CONDEMNED) {
1934                 mr_flag = MR_CONDEMNED;
1935                 srs_done_flag = SRS_CONDEMNED_DONE;
1936                 if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1937                         mac_srs_client_poll_disable(srs->srs_mcip, srs);
1938         } else {
1939                 ASSERT(srs_quiesce_flag == SRS_QUIESCE);
1940                 mr_flag = MR_QUIESCE;
1941                 srs_done_flag = SRS_QUIESCE_DONE;
1942                 if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1943                         mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
1944         }
1945 
1946         if (srs->srs_ring != NULL) {
1947                 mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
1948         } else {
1949                 /*
1950                  * SRS is driven by software classification. In case
1951                  * of CONDEMNED, the top level teardown functions will
1952                  * deal with flow removal.
1953                  */
1954                 if (srs_quiesce_flag != SRS_CONDEMNED) {
1955                         FLOW_MARK(flent, FE_QUIESCE);
1956                         mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1957                 }
1958         }
1959 
1960         /*
1961          * Signal the SRS to quiesce itself, and then cv_wait for the
1962          * SRS quiesce to complete. The SRS worker thread will wake us
1963          * up when the quiesce is complete
1964          */
1965         mac_srs_signal(srs, srs_quiesce_flag);
1966         mac_srs_quiesce_wait(srs, srs_done_flag);
1967 }
1968 
1969 /*
1970  * Remove an SRS.
1971  */
1972 void
1973 mac_rx_srs_remove(mac_soft_ring_set_t *srs)
1974 {
1975         flow_entry_t *flent = srs->srs_flent;
1976         int i;
1977 
1978         mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
1979         /*
1980          * Locate and remove our entry in the fe_rx_srs[] array, and
1981          * adjust the fe_rx_srs array entries and array count by
1982          * moving the last entry into the vacated spot.
1983          */
1984         mutex_enter(&flent->fe_lock);
1985         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1986                 if (flent->fe_rx_srs[i] == srs)
1987                         break;
1988         }
1989 
1990         ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
1991         if (i != flent->fe_rx_srs_cnt - 1) {
1992                 flent->fe_rx_srs[i] =
1993                     flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
1994                 i = flent->fe_rx_srs_cnt - 1;
1995         }
1996 
1997         flent->fe_rx_srs[i] = NULL;
1998         flent->fe_rx_srs_cnt--;
1999         mutex_exit(&flent->fe_lock);
2000 
2001         mac_srs_free(srs);
2002 }
2003 
2004 static void
2005 mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
2006 {
2007         mutex_enter(&srs->srs_lock);
2008         srs->srs_state &= ~flag;
2009         mutex_exit(&srs->srs_lock);
2010 }
2011 
2012 void
2013 mac_rx_srs_restart(mac_soft_ring_set_t *srs)
2014 {
2015         flow_entry_t    *flent = srs->srs_flent;
2016         mac_ring_t      *mr;
2017 
2018         ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
2019         ASSERT((srs->srs_type & SRST_TX) == 0);
2020 
2021         /*
2022          * This handles a change in the number of SRSs between the quiesce and
2023          * and restart operation of a flow.
2024          */
2025         if (!SRS_QUIESCED(srs))
2026                 return;
2027 
2028         /*
2029          * Signal the SRS to restart itself. Wait for the restart to complete
2030          * Note that we only restart the SRS if it is not marked as
2031          * permanently quiesced.
2032          */
2033         if (!SRS_QUIESCED_PERMANENT(srs)) {
2034                 mac_srs_signal(srs, SRS_RESTART);
2035                 mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2036                 mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2037 
2038                 mac_srs_client_poll_restart(srs->srs_mcip, srs);
2039         }
2040 
2041         /* Finally clear the flags to let the packets in */
2042         mr = srs->srs_ring;
2043         if (mr != NULL) {
2044                 MAC_RING_UNMARK(mr, MR_QUIESCE);
2045                 /* In case the ring was stopped, safely restart it */
2046                 if (mr->mr_state != MR_INUSE)
2047                         (void) mac_start_ring(mr);
2048         } else {
2049                 FLOW_UNMARK(flent, FE_QUIESCE);
2050         }
2051 }
2052 
2053 /*
2054  * Temporary quiesce of a flow and associated Rx SRS.
2055  * Please see block comment above mac_rx_classify_flow_rem.
2056  */
2057 /* ARGSUSED */
2058 int
2059 mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
2060 {
2061         int             i;
2062 
2063         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2064                 mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
2065                     SRS_QUIESCE);
2066         }
2067         return (0);
2068 }
2069 
2070 /*
2071  * Restart a flow and associated Rx SRS that has been quiesced temporarily
2072  * Please see block comment above mac_rx_classify_flow_rem
2073  */
2074 /* ARGSUSED */
2075 int
2076 mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
2077 {
2078         int             i;
2079 
2080         for (i = 0; i < flent->fe_rx_srs_cnt; i++)
2081                 mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
2082 
2083         return (0);
2084 }
2085 
2086 void
2087 mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
2088 {
2089         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2090         flow_entry_t            *flent = mcip->mci_flent;
2091         mac_impl_t              *mip = mcip->mci_mip;
2092         mac_soft_ring_set_t     *mac_srs;
2093         int                     i;
2094 
2095         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2096 
2097         if (flent == NULL)
2098                 return;
2099 
2100         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2101                 mac_srs = flent->fe_rx_srs[i];
2102                 mutex_enter(&mac_srs->srs_lock);
2103                 if (on)
2104                         mac_srs->srs_state |= SRS_QUIESCE_PERM;
2105                 else
2106                         mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
2107                 mutex_exit(&mac_srs->srs_lock);
2108         }
2109 }
2110 
2111 void
2112 mac_rx_client_quiesce(mac_client_handle_t mch)
2113 {
2114         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2115         mac_impl_t              *mip = mcip->mci_mip;
2116 
2117         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2118 
2119         if (MCIP_DATAPATH_SETUP(mcip)) {
2120                 (void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
2121                     NULL);
2122                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2123                     mac_rx_classify_flow_quiesce, NULL);
2124         }
2125 }
2126 
2127 void
2128 mac_rx_client_restart(mac_client_handle_t mch)
2129 {
2130         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2131         mac_impl_t              *mip = mcip->mci_mip;
2132 
2133         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2134 
2135         if (MCIP_DATAPATH_SETUP(mcip)) {
2136                 (void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
2137                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2138                     mac_rx_classify_flow_restart, NULL);
2139         }
2140 }
2141 
2142 /*
2143  * This function only quiesces the Tx SRS and softring worker threads. Callers
2144  * need to make sure that there aren't any mac client threads doing current or
2145  * future transmits in the mac before calling this function.
2146  */
2147 void
2148 mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
2149 {
2150         mac_client_impl_t       *mcip = srs->srs_mcip;
2151 
2152         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2153 
2154         ASSERT(srs->srs_type & SRST_TX);
2155         ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
2156             srs_quiesce_flag == SRS_QUIESCE);
2157 
2158         /*
2159          * Signal the SRS to quiesce itself, and then cv_wait for the
2160          * SRS quiesce to complete. The SRS worker thread will wake us
2161          * up when the quiesce is complete
2162          */
2163         mac_srs_signal(srs, srs_quiesce_flag);
2164         mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
2165             SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
2166 }
2167 
2168 void
2169 mac_tx_srs_restart(mac_soft_ring_set_t *srs)
2170 {
2171         /*
2172          * Resizing the fanout could result in creation of new SRSs.
2173          * They may not necessarily be in the quiesced state in which
2174          * case it need be restarted
2175          */
2176         if (!SRS_QUIESCED(srs))
2177                 return;
2178 
2179         mac_srs_signal(srs, SRS_RESTART);
2180         mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2181         mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2182 }
2183 
2184 /*
2185  * Temporary quiesce of a flow and associated Rx SRS.
2186  * Please see block comment above mac_rx_srs_quiesce
2187  */
2188 /* ARGSUSED */
2189 int
2190 mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
2191 {
2192         /*
2193          * The fe_tx_srs is null for a subflow on an interface that is
2194          * not plumbed
2195          */
2196         if (flent->fe_tx_srs != NULL)
2197                 mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
2198         return (0);
2199 }
2200 
2201 /* ARGSUSED */
2202 int
2203 mac_tx_flow_restart(flow_entry_t *flent, void *arg)
2204 {
2205         /*
2206          * The fe_tx_srs is null for a subflow on an interface that is
2207          * not plumbed
2208          */
2209         if (flent->fe_tx_srs != NULL)
2210                 mac_tx_srs_restart(flent->fe_tx_srs);
2211         return (0);
2212 }
2213 
2214 static void
2215 i_mac_tx_client_quiesce(mac_client_handle_t mch, uint_t srs_quiesce_flag)
2216 {
2217         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2218 
2219         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2220 
2221         mac_tx_client_block(mcip);
2222         if (MCIP_TX_SRS(mcip) != NULL) {
2223                 mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
2224                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2225                     mac_tx_flow_quiesce, NULL);
2226         }
2227 }
2228 
2229 void
2230 mac_tx_client_quiesce(mac_client_handle_t mch)
2231 {
2232         i_mac_tx_client_quiesce(mch, SRS_QUIESCE);
2233 }
2234 
2235 void
2236 mac_tx_client_condemn(mac_client_handle_t mch)
2237 {
2238         i_mac_tx_client_quiesce(mch, SRS_CONDEMNED);
2239 }
2240 
2241 void
2242 mac_tx_client_restart(mac_client_handle_t mch)
2243 {
2244         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
2245 
2246         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2247 
2248         mac_tx_client_unblock(mcip);
2249         if (MCIP_TX_SRS(mcip) != NULL) {
2250                 mac_tx_srs_restart(MCIP_TX_SRS(mcip));
2251                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2252                     mac_tx_flow_restart, NULL);
2253         }
2254 }
2255 
2256 void
2257 mac_tx_client_flush(mac_client_impl_t *mcip)
2258 {
2259         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2260 
2261         mac_tx_client_quiesce((mac_client_handle_t)mcip);
2262         mac_tx_client_restart((mac_client_handle_t)mcip);
2263 }
2264 
2265 void
2266 mac_client_quiesce(mac_client_impl_t *mcip)
2267 {
2268         mac_rx_client_quiesce((mac_client_handle_t)mcip);
2269         mac_tx_client_quiesce((mac_client_handle_t)mcip);
2270 }
2271 
2272 void
2273 mac_client_restart(mac_client_impl_t *mcip)
2274 {
2275         mac_rx_client_restart((mac_client_handle_t)mcip);
2276         mac_tx_client_restart((mac_client_handle_t)mcip);
2277 }
2278 
2279 /*
2280  * Allocate a minor number.
2281  */
2282 minor_t
2283 mac_minor_hold(boolean_t sleep)
2284 {
2285         minor_t minor;
2286 
2287         /*
2288          * Grab a value from the arena.
2289          */
2290         atomic_inc_32(&minor_count);
2291 
2292         if (sleep)
2293                 minor = (uint_t)id_alloc(minor_ids);
2294         else
2295                 minor = (uint_t)id_alloc_nosleep(minor_ids);
2296 
2297         if (minor == 0) {
2298                 atomic_dec_32(&minor_count);
2299                 return (0);
2300         }
2301 
2302         return (minor);
2303 }
2304 
2305 /*
2306  * Release a previously allocated minor number.
2307  */
2308 void
2309 mac_minor_rele(minor_t minor)
2310 {
2311         /*
2312          * Return the value to the arena.
2313          */
2314         id_free(minor_ids, minor);
2315         atomic_dec_32(&minor_count);
2316 }
2317 
2318 uint32_t
2319 mac_no_notification(mac_handle_t mh)
2320 {
2321         mac_impl_t *mip = (mac_impl_t *)mh;
2322 
2323         return (((mip->mi_state_flags & MIS_LEGACY) != 0) ?
2324             mip->mi_capab_legacy.ml_unsup_note : 0);
2325 }
2326 
2327 /*
2328  * Prevent any new opens of this mac in preparation for unregister
2329  */
2330 int
2331 i_mac_disable(mac_impl_t *mip)
2332 {
2333         mac_client_impl_t       *mcip;
2334 
2335         rw_enter(&i_mac_impl_lock, RW_WRITER);
2336         if (mip->mi_state_flags & MIS_DISABLED) {
2337                 /* Already disabled, return success */
2338                 rw_exit(&i_mac_impl_lock);
2339                 return (0);
2340         }
2341         /*
2342          * See if there are any other references to this mac_t (e.g., VLAN's).
2343          * If so return failure. If all the other checks below pass, then
2344          * set mi_disabled atomically under the i_mac_impl_lock to prevent
2345          * any new VLAN's from being created or new mac client opens of this
2346          * mac end point.
2347          */
2348         if (mip->mi_ref > 0) {
2349                 rw_exit(&i_mac_impl_lock);
2350                 return (EBUSY);
2351         }
2352 
2353         /*
2354          * mac clients must delete all multicast groups they join before
2355          * closing. bcast groups are reference counted, the last client
2356          * to delete the group will wait till the group is physically
2357          * deleted. Since all clients have closed this mac end point
2358          * mi_bcast_ngrps must be zero at this point
2359          */
2360         ASSERT(mip->mi_bcast_ngrps == 0);
2361 
2362         /*
2363          * Don't let go of this if it has some flows.
2364          * All other code guarantees no flows are added to a disabled
2365          * mac, therefore it is sufficient to check for the flow table
2366          * only here.
2367          */
2368         mcip = mac_primary_client_handle(mip);
2369         if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
2370                 rw_exit(&i_mac_impl_lock);
2371                 return (ENOTEMPTY);
2372         }
2373 
2374         mip->mi_state_flags |= MIS_DISABLED;
2375         rw_exit(&i_mac_impl_lock);
2376         return (0);
2377 }
2378 
2379 int
2380 mac_disable_nowait(mac_handle_t mh)
2381 {
2382         mac_impl_t      *mip = (mac_impl_t *)mh;
2383         int err;
2384 
2385         if ((err = i_mac_perim_enter_nowait(mip)) != 0)
2386                 return (err);
2387         err = i_mac_disable(mip);
2388         i_mac_perim_exit(mip);
2389         return (err);
2390 }
2391 
2392 int
2393 mac_disable(mac_handle_t mh)
2394 {
2395         mac_impl_t      *mip = (mac_impl_t *)mh;
2396         int err;
2397 
2398         i_mac_perim_enter(mip);
2399         err = i_mac_disable(mip);
2400         i_mac_perim_exit(mip);
2401 
2402         /*
2403          * Clean up notification thread and wait for it to exit.
2404          */
2405         if (err == 0)
2406                 i_mac_notify_exit(mip);
2407 
2408         return (err);
2409 }
2410 
2411 /*
2412  * Called when the MAC instance has a non empty flow table, to de-multiplex
2413  * incoming packets to the right flow.
2414  * The MAC's rw lock is assumed held as a READER.
2415  */
2416 /* ARGSUSED */
2417 static mblk_t *
2418 mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
2419 {
2420         flow_entry_t    *flent = NULL;
2421         uint_t          flags = FLOW_INBOUND;
2422         int             err;
2423 
2424         /*
2425          * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
2426          * to mac_flow_lookup() so that the VLAN packets can be successfully
2427          * passed to the non-VLAN aggregation flows.
2428          *
2429          * Note that there is possibly a race between this and
2430          * mac_unicast_remove/add() and VLAN packets could be incorrectly
2431          * classified to non-VLAN flows of non-aggregation mac clients. These
2432          * VLAN packets will be then filtered out by the mac module.
2433          */
2434         if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
2435                 flags |= FLOW_IGNORE_VLAN;
2436 
2437         err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
2438         if (err != 0) {
2439                 /* no registered receive function */
2440                 return (mp);
2441         } else {
2442                 mac_client_impl_t       *mcip;
2443 
2444                 /*
2445                  * This flent might just be an additional one on the MAC client,
2446                  * i.e. for classification purposes (different fdesc), however
2447                  * the resources, SRS et. al., are in the mci_flent, so if
2448                  * this isn't the mci_flent, we need to get it.
2449                  */
2450                 if ((mcip = flent->fe_mcip) != NULL &&
2451                     mcip->mci_flent != flent) {
2452                         FLOW_REFRELE(flent);
2453                         flent = mcip->mci_flent;
2454                         FLOW_TRY_REFHOLD(flent, err);
2455                         if (err != 0)
2456                                 return (mp);
2457                 }
2458                 (flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
2459                     B_FALSE);
2460                 FLOW_REFRELE(flent);
2461         }
2462         return (NULL);
2463 }
2464 
2465 mblk_t *
2466 mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
2467 {
2468         mac_impl_t      *mip = (mac_impl_t *)mh;
2469         mblk_t          *bp, *bp1, **bpp, *list = NULL;
2470 
2471         /*
2472          * We walk the chain and attempt to classify each packet.
2473          * The packets that couldn't be classified will be returned
2474          * back to the caller.
2475          */
2476         bp = mp_chain;
2477         bpp = &list;
2478         while (bp != NULL) {
2479                 bp1 = bp;
2480                 bp = bp->b_next;
2481                 bp1->b_next = NULL;
2482 
2483                 if (mac_rx_classify(mip, mrh, bp1) != NULL) {
2484                         *bpp = bp1;
2485                         bpp = &bp1->b_next;
2486                 }
2487         }
2488         return (list);
2489 }
2490 
2491 static int
2492 mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
2493 {
2494         mac_ring_handle_t ring = arg;
2495 
2496         if (flent->fe_tx_srs)
2497                 mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
2498         return (0);
2499 }
2500 
2501 void
2502 i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
2503 {
2504         mac_client_impl_t       *cclient;
2505         mac_soft_ring_set_t     *mac_srs;
2506 
2507         /*
2508          * After grabbing the mi_rw_lock, the list of clients can't change.
2509          * If there are any clients mi_disabled must be B_FALSE and can't
2510          * get set since there are clients. If there aren't any clients we
2511          * don't do anything. In any case the mip has to be valid. The driver
2512          * must make sure that it goes single threaded (with respect to mac
2513          * calls) and wait for all pending mac calls to finish before calling
2514          * mac_unregister.
2515          */
2516         rw_enter(&i_mac_impl_lock, RW_READER);
2517         if (mip->mi_state_flags & MIS_DISABLED) {
2518                 rw_exit(&i_mac_impl_lock);
2519                 return;
2520         }
2521 
2522         /*
2523          * Get MAC tx srs from walking mac_client_handle list.
2524          */
2525         rw_enter(&mip->mi_rw_lock, RW_READER);
2526         for (cclient = mip->mi_clients_list; cclient != NULL;
2527             cclient = cclient->mci_client_next) {
2528                 if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL) {
2529                         mac_tx_srs_wakeup(mac_srs, ring);
2530                 } else {
2531                         /*
2532                          * Aggr opens underlying ports in exclusive mode
2533                          * and registers flow control callbacks using
2534                          * mac_tx_client_notify(). When opened in
2535                          * exclusive mode, Tx SRS won't be created
2536                          * during mac_unicast_add().
2537                          */
2538                         if (cclient->mci_state_flags & MCIS_EXCLUSIVE) {
2539                                 mac_tx_invoke_callbacks(cclient,
2540                                     (mac_tx_cookie_t)ring);
2541                         }
2542                 }
2543                 (void) mac_flow_walk(cclient->mci_subflow_tab,
2544                     mac_tx_flow_srs_wakeup, ring);
2545         }
2546         rw_exit(&mip->mi_rw_lock);
2547         rw_exit(&i_mac_impl_lock);
2548 }
2549 
2550 /* ARGSUSED */
2551 void
2552 mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
2553     boolean_t add)
2554 {
2555         mac_impl_t *mip = (mac_impl_t *)mh;
2556 
2557         i_mac_perim_enter((mac_impl_t *)mh);
2558         /*
2559          * If no specific refresh function was given then default to the
2560          * driver's m_multicst entry point.
2561          */
2562         if (refresh == NULL) {
2563                 refresh = mip->mi_multicst;
2564                 arg = mip->mi_driver;
2565         }
2566 
2567         mac_bcast_refresh(mip, refresh, arg, add);
2568         i_mac_perim_exit((mac_impl_t *)mh);
2569 }
2570 
2571 void
2572 mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
2573 {
2574         mac_impl_t      *mip = (mac_impl_t *)mh;
2575 
2576         /*
2577          * If no specific refresh function was given then default to the
2578          * driver's m_promisc entry point.
2579          */
2580         if (refresh == NULL) {
2581                 refresh = mip->mi_setpromisc;
2582                 arg = mip->mi_driver;
2583         }
2584         ASSERT(refresh != NULL);
2585 
2586         /*
2587          * Call the refresh function with the current promiscuity.
2588          */
2589         refresh(arg, (mip->mi_devpromisc != 0));
2590 }
2591 
2592 /*
2593  * The mac client requests that the mac not to change its margin size to
2594  * be less than the specified value.  If "current" is B_TRUE, then the client
2595  * requests the mac not to change its margin size to be smaller than the
2596  * current size. Further, return the current margin size value in this case.
2597  *
2598  * We keep every requested size in an ordered list from largest to smallest.
2599  */
2600 int
2601 mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
2602 {
2603         mac_impl_t              *mip = (mac_impl_t *)mh;
2604         mac_margin_req_t        **pp, *p;
2605         int                     err = 0;
2606 
2607         rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2608         if (current)
2609                 *marginp = mip->mi_margin;
2610 
2611         /*
2612          * If the current margin value cannot satisfy the margin requested,
2613          * return ENOTSUP directly.
2614          */
2615         if (*marginp > mip->mi_margin) {
2616                 err = ENOTSUP;
2617                 goto done;
2618         }
2619 
2620         /*
2621          * Check whether the given margin is already in the list. If so,
2622          * bump the reference count.
2623          */
2624         for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
2625                 if (p->mmr_margin == *marginp) {
2626                         /*
2627                          * The margin requested is already in the list,
2628                          * so just bump the reference count.
2629                          */
2630                         p->mmr_ref++;
2631                         goto done;
2632                 }
2633                 if (p->mmr_margin < *marginp)
2634                         break;
2635         }
2636 
2637 
2638         p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
2639         p->mmr_margin = *marginp;
2640         p->mmr_ref++;
2641         p->mmr_nextp = *pp;
2642         *pp = p;
2643 
2644 done:
2645         rw_exit(&(mip->mi_rw_lock));
2646         return (err);
2647 }
2648 
2649 /*
2650  * The mac client requests to cancel its previous mac_margin_add() request.
2651  * We remove the requested margin size from the list.
2652  */
2653 int
2654 mac_margin_remove(mac_handle_t mh, uint32_t margin)
2655 {
2656         mac_impl_t              *mip = (mac_impl_t *)mh;
2657         mac_margin_req_t        **pp, *p;
2658         int                     err = 0;
2659 
2660         rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2661         /*
2662          * Find the entry in the list for the given margin.
2663          */
2664         for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
2665                 if (p->mmr_margin == margin) {
2666                         if (--p->mmr_ref == 0)
2667                                 break;
2668 
2669                         /*
2670                          * There is still a reference to this address so
2671                          * there's nothing more to do.
2672                          */
2673                         goto done;
2674                 }
2675         }
2676 
2677         /*
2678          * We did not find an entry for the given margin.
2679          */
2680         if (p == NULL) {
2681                 err = ENOENT;
2682                 goto done;
2683         }
2684 
2685         ASSERT(p->mmr_ref == 0);
2686 
2687         /*
2688          * Remove it from the list.
2689          */
2690         *pp = p->mmr_nextp;
2691         kmem_free(p, sizeof (mac_margin_req_t));
2692 done:
2693         rw_exit(&(mip->mi_rw_lock));
2694         return (err);
2695 }
2696 
2697 boolean_t
2698 mac_margin_update(mac_handle_t mh, uint32_t margin)
2699 {
2700         mac_impl_t      *mip = (mac_impl_t *)mh;
2701         uint32_t        margin_needed = 0;
2702 
2703         rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2704 
2705         if (mip->mi_mmrp != NULL)
2706                 margin_needed = mip->mi_mmrp->mmr_margin;
2707 
2708         if (margin_needed <= margin)
2709                 mip->mi_margin = margin;
2710 
2711         rw_exit(&(mip->mi_rw_lock));
2712 
2713         if (margin_needed <= margin)
2714                 i_mac_notify(mip, MAC_NOTE_MARGIN);
2715 
2716         return (margin_needed <= margin);
2717 }
2718 
2719 /*
2720  * MAC clients use this interface to request that a MAC device not change its
2721  * MTU below the specified amount. At this time, that amount must be within the
2722  * range of the device's current minimum and the device's current maximum. eg. a
2723  * client cannot request a 3000 byte MTU when the device's MTU is currently
2724  * 2000.
2725  *
2726  * If "current" is set to B_TRUE, then the request is to simply to reserve the
2727  * current underlying mac's maximum for this mac client and return it in mtup.
2728  */
2729 int
2730 mac_mtu_add(mac_handle_t mh, uint32_t *mtup, boolean_t current)
2731 {
2732         mac_impl_t              *mip = (mac_impl_t *)mh;
2733         mac_mtu_req_t           *prev, *cur;
2734         mac_propval_range_t     mpr;
2735         int                     err;
2736 
2737         i_mac_perim_enter(mip);
2738         rw_enter(&mip->mi_rw_lock, RW_WRITER);
2739 
2740         if (current == B_TRUE)
2741                 *mtup = mip->mi_sdu_max;
2742         mpr.mpr_count = 1;
2743         err = mac_prop_info(mh, MAC_PROP_MTU, "mtu", NULL, 0, &mpr, NULL);
2744         if (err != 0) {
2745                 rw_exit(&mip->mi_rw_lock);
2746                 i_mac_perim_exit(mip);
2747                 return (err);
2748         }
2749 
2750         if (*mtup > mip->mi_sdu_max ||
2751             *mtup < mpr.mpr_range_uint32[0].mpur_min) {
2752                 rw_exit(&mip->mi_rw_lock);
2753                 i_mac_perim_exit(mip);
2754                 return (ENOTSUP);
2755         }
2756 
2757         prev = NULL;
2758         for (cur = mip->mi_mtrp; cur != NULL; cur = cur->mtr_nextp) {
2759                 if (*mtup == cur->mtr_mtu) {
2760                         cur->mtr_ref++;
2761                         rw_exit(&mip->mi_rw_lock);
2762                         i_mac_perim_exit(mip);
2763                         return (0);
2764                 }
2765 
2766                 if (*mtup > cur->mtr_mtu)
2767                         break;
2768 
2769                 prev = cur;
2770         }
2771 
2772         cur = kmem_alloc(sizeof (mac_mtu_req_t), KM_SLEEP);
2773         cur->mtr_mtu = *mtup;
2774         cur->mtr_ref = 1;
2775         if (prev != NULL) {
2776                 cur->mtr_nextp = prev->mtr_nextp;
2777                 prev->mtr_nextp = cur;
2778         } else {
2779                 cur->mtr_nextp = mip->mi_mtrp;
2780                 mip->mi_mtrp = cur;
2781         }
2782 
2783         rw_exit(&mip->mi_rw_lock);
2784         i_mac_perim_exit(mip);
2785         return (0);
2786 }
2787 
2788 int
2789 mac_mtu_remove(mac_handle_t mh, uint32_t mtu)
2790 {
2791         mac_impl_t *mip = (mac_impl_t *)mh;
2792         mac_mtu_req_t *cur, *prev;
2793 
2794         i_mac_perim_enter(mip);
2795         rw_enter(&mip->mi_rw_lock, RW_WRITER);
2796 
2797         prev = NULL;
2798         for (cur = mip->mi_mtrp; cur != NULL; cur = cur->mtr_nextp) {
2799                 if (cur->mtr_mtu == mtu) {
2800                         ASSERT(cur->mtr_ref > 0);
2801                         cur->mtr_ref--;
2802                         if (cur->mtr_ref == 0) {
2803                                 if (prev == NULL) {
2804                                         mip->mi_mtrp = cur->mtr_nextp;
2805                                 } else {
2806                                         prev->mtr_nextp = cur->mtr_nextp;
2807                                 }
2808                                 kmem_free(cur, sizeof (mac_mtu_req_t));
2809                         }
2810                         rw_exit(&mip->mi_rw_lock);
2811                         i_mac_perim_exit(mip);
2812                         return (0);
2813                 }
2814 
2815                 prev = cur;
2816         }
2817 
2818         rw_exit(&mip->mi_rw_lock);
2819         i_mac_perim_exit(mip);
2820         return (ENOENT);
2821 }
2822 
2823 /*
2824  * MAC Type Plugin functions.
2825  */
2826 
2827 mactype_t *
2828 mactype_getplugin(const char *pname)
2829 {
2830         mactype_t       *mtype = NULL;
2831         boolean_t       tried_modload = B_FALSE;
2832 
2833         mutex_enter(&i_mactype_lock);
2834 
2835 find_registered_mactype:
2836         if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
2837             (mod_hash_val_t *)&mtype) != 0) {
2838                 if (!tried_modload) {
2839                         /*
2840                          * If the plugin has not yet been loaded, then
2841                          * attempt to load it now.  If modload() succeeds,
2842                          * the plugin should have registered using
2843                          * mactype_register(), in which case we can go back
2844                          * and attempt to find it again.
2845                          */
2846                         if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
2847                                 tried_modload = B_TRUE;
2848                                 goto find_registered_mactype;
2849                         }
2850                 }
2851         } else {
2852                 /*
2853                  * Note that there's no danger that the plugin we've loaded
2854                  * could be unloaded between the modload() step and the
2855                  * reference count bump here, as we're holding
2856                  * i_mactype_lock, which mactype_unregister() also holds.
2857                  */
2858                 atomic_inc_32(&mtype->mt_ref);
2859         }
2860 
2861         mutex_exit(&i_mactype_lock);
2862         return (mtype);
2863 }
2864 
2865 mactype_register_t *
2866 mactype_alloc(uint_t mactype_version)
2867 {
2868         mactype_register_t *mtrp;
2869 
2870         /*
2871          * Make sure there isn't a version mismatch between the plugin and
2872          * the framework.  In the future, if multiple versions are
2873          * supported, this check could become more sophisticated.
2874          */
2875         if (mactype_version != MACTYPE_VERSION)
2876                 return (NULL);
2877 
2878         mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
2879         mtrp->mtr_version = mactype_version;
2880         return (mtrp);
2881 }
2882 
2883 void
2884 mactype_free(mactype_register_t *mtrp)
2885 {
2886         kmem_free(mtrp, sizeof (mactype_register_t));
2887 }
2888 
2889 int
2890 mactype_register(mactype_register_t *mtrp)
2891 {
2892         mactype_t       *mtp;
2893         mactype_ops_t   *ops = mtrp->mtr_ops;
2894 
2895         /* Do some sanity checking before we register this MAC type. */
2896         if (mtrp->mtr_ident == NULL || ops == NULL)
2897                 return (EINVAL);
2898 
2899         /*
2900          * Verify that all mandatory callbacks are set in the ops
2901          * vector.
2902          */
2903         if (ops->mtops_unicst_verify == NULL ||
2904             ops->mtops_multicst_verify == NULL ||
2905             ops->mtops_sap_verify == NULL ||
2906             ops->mtops_header == NULL ||
2907             ops->mtops_header_info == NULL) {
2908                 return (EINVAL);
2909         }
2910 
2911         mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP);
2912         mtp->mt_ident = mtrp->mtr_ident;
2913         mtp->mt_ops = *ops;
2914         mtp->mt_type = mtrp->mtr_mactype;
2915         mtp->mt_nativetype = mtrp->mtr_nativetype;
2916         mtp->mt_addr_length = mtrp->mtr_addrlen;
2917         if (mtrp->mtr_brdcst_addr != NULL) {
2918                 mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP);
2919                 bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr,
2920                     mtrp->mtr_addrlen);
2921         }
2922 
2923         mtp->mt_stats = mtrp->mtr_stats;
2924         mtp->mt_statcount = mtrp->mtr_statcount;
2925 
2926         mtp->mt_mapping = mtrp->mtr_mapping;
2927         mtp->mt_mappingcount = mtrp->mtr_mappingcount;
2928 
2929         if (mod_hash_insert(i_mactype_hash,
2930             (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) {
2931                 kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2932                 kmem_free(mtp, sizeof (*mtp));
2933                 return (EEXIST);
2934         }
2935         return (0);
2936 }
2937 
2938 int
2939 mactype_unregister(const char *ident)
2940 {
2941         mactype_t       *mtp;
2942         mod_hash_val_t  val;
2943         int             err;
2944 
2945         /*
2946          * Let's not allow MAC drivers to use this plugin while we're
2947          * trying to unregister it.  Holding i_mactype_lock also prevents a
2948          * plugin from unregistering while a MAC driver is attempting to
2949          * hold a reference to it in i_mactype_getplugin().
2950          */
2951         mutex_enter(&i_mactype_lock);
2952 
2953         if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident,
2954             (mod_hash_val_t *)&mtp)) != 0) {
2955                 /* A plugin is trying to unregister, but it never registered. */
2956                 err = ENXIO;
2957                 goto done;
2958         }
2959 
2960         if (mtp->mt_ref != 0) {
2961                 err = EBUSY;
2962                 goto done;
2963         }
2964 
2965         err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val);
2966         ASSERT(err == 0);
2967         if (err != 0) {
2968                 /* This should never happen, thus the ASSERT() above. */
2969                 err = EINVAL;
2970                 goto done;
2971         }
2972         ASSERT(mtp == (mactype_t *)val);
2973 
2974         if (mtp->mt_brdcst_addr != NULL)
2975                 kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2976         kmem_free(mtp, sizeof (mactype_t));
2977 done:
2978         mutex_exit(&i_mactype_lock);
2979         return (err);
2980 }
2981 
2982 /*
2983  * Checks the size of the value size specified for a property as
2984  * part of a property operation. Returns B_TRUE if the size is
2985  * correct, B_FALSE otherwise.
2986  */
2987 boolean_t
2988 mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range)
2989 {
2990         uint_t minsize = 0;
2991 
2992         if (is_range)
2993                 return (valsize >= sizeof (mac_propval_range_t));
2994 
2995         switch (id) {
2996         case MAC_PROP_ZONE:
2997                 minsize = sizeof (dld_ioc_zid_t);
2998                 break;
2999         case MAC_PROP_AUTOPUSH:
3000                 if (valsize != 0)
3001                         minsize = sizeof (struct dlautopush);
3002                 break;
3003         case MAC_PROP_TAGMODE:
3004                 minsize = sizeof (link_tagmode_t);
3005                 break;
3006         case MAC_PROP_RESOURCE:
3007         case MAC_PROP_RESOURCE_EFF:
3008                 minsize = sizeof (mac_resource_props_t);
3009                 break;
3010         case MAC_PROP_DUPLEX:
3011                 minsize = sizeof (link_duplex_t);
3012                 break;
3013         case MAC_PROP_SPEED:
3014                 minsize = sizeof (uint64_t);
3015                 break;
3016         case MAC_PROP_STATUS:
3017                 minsize = sizeof (link_state_t);
3018                 break;
3019         case MAC_PROP_AUTONEG:
3020         case MAC_PROP_EN_AUTONEG:
3021                 minsize = sizeof (uint8_t);
3022                 break;
3023         case MAC_PROP_MTU:
3024         case MAC_PROP_LLIMIT:
3025         case MAC_PROP_LDECAY:
3026                 minsize = sizeof (uint32_t);
3027                 break;
3028         case MAC_PROP_FLOWCTRL:
3029                 minsize = sizeof (link_flowctrl_t);
3030                 break;
3031         case MAC_PROP_ADV_10GFDX_CAP:
3032         case MAC_PROP_EN_10GFDX_CAP:
3033         case MAC_PROP_ADV_1000HDX_CAP:
3034         case MAC_PROP_EN_1000HDX_CAP:
3035         case MAC_PROP_ADV_100FDX_CAP:
3036         case MAC_PROP_EN_100FDX_CAP:
3037         case MAC_PROP_ADV_100HDX_CAP:
3038         case MAC_PROP_EN_100HDX_CAP:
3039         case MAC_PROP_ADV_10FDX_CAP:
3040         case MAC_PROP_EN_10FDX_CAP:
3041         case MAC_PROP_ADV_10HDX_CAP:
3042         case MAC_PROP_EN_10HDX_CAP:
3043         case MAC_PROP_ADV_100T4_CAP:
3044         case MAC_PROP_EN_100T4_CAP:
3045                 minsize = sizeof (uint8_t);
3046                 break;
3047         case MAC_PROP_PVID:
3048                 minsize = sizeof (uint16_t);
3049                 break;
3050         case MAC_PROP_IPTUN_HOPLIMIT:
3051                 minsize = sizeof (uint32_t);
3052                 break;
3053         case MAC_PROP_IPTUN_ENCAPLIMIT:
3054                 minsize = sizeof (uint32_t);
3055                 break;
3056         case MAC_PROP_MAX_TX_RINGS_AVAIL:
3057         case MAC_PROP_MAX_RX_RINGS_AVAIL:
3058         case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3059         case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3060                 minsize = sizeof (uint_t);
3061                 break;
3062         case MAC_PROP_WL_ESSID:
3063                 minsize = sizeof (wl_linkstatus_t);
3064                 break;
3065         case MAC_PROP_WL_BSSID:
3066                 minsize = sizeof (wl_bssid_t);
3067                 break;
3068         case MAC_PROP_WL_BSSTYPE:
3069                 minsize = sizeof (wl_bss_type_t);
3070                 break;
3071         case MAC_PROP_WL_LINKSTATUS:
3072                 minsize = sizeof (wl_linkstatus_t);
3073                 break;
3074         case MAC_PROP_WL_DESIRED_RATES:
3075                 minsize = sizeof (wl_rates_t);
3076                 break;
3077         case MAC_PROP_WL_SUPPORTED_RATES:
3078                 minsize = sizeof (wl_rates_t);
3079                 break;
3080         case MAC_PROP_WL_AUTH_MODE:
3081                 minsize = sizeof (wl_authmode_t);
3082                 break;
3083         case MAC_PROP_WL_ENCRYPTION:
3084                 minsize = sizeof (wl_encryption_t);
3085                 break;
3086         case MAC_PROP_WL_RSSI:
3087                 minsize = sizeof (wl_rssi_t);
3088                 break;
3089         case MAC_PROP_WL_PHY_CONFIG:
3090                 minsize = sizeof (wl_phy_conf_t);
3091                 break;
3092         case MAC_PROP_WL_CAPABILITY:
3093                 minsize = sizeof (wl_capability_t);
3094                 break;
3095         case MAC_PROP_WL_WPA:
3096                 minsize = sizeof (wl_wpa_t);
3097                 break;
3098         case MAC_PROP_WL_SCANRESULTS:
3099                 minsize = sizeof (wl_wpa_ess_t);
3100                 break;
3101         case MAC_PROP_WL_POWER_MODE:
3102                 minsize = sizeof (wl_ps_mode_t);
3103                 break;
3104         case MAC_PROP_WL_RADIO:
3105                 minsize = sizeof (wl_radio_t);
3106                 break;
3107         case MAC_PROP_WL_ESS_LIST:
3108                 minsize = sizeof (wl_ess_list_t);
3109                 break;
3110         case MAC_PROP_WL_KEY_TAB:
3111                 minsize = sizeof (wl_wep_key_tab_t);
3112                 break;
3113         case MAC_PROP_WL_CREATE_IBSS:
3114                 minsize = sizeof (wl_create_ibss_t);
3115                 break;
3116         case MAC_PROP_WL_SETOPTIE:
3117                 minsize = sizeof (wl_wpa_ie_t);
3118                 break;
3119         case MAC_PROP_WL_DELKEY:
3120                 minsize = sizeof (wl_del_key_t);
3121                 break;
3122         case MAC_PROP_WL_KEY:
3123                 minsize = sizeof (wl_key_t);
3124                 break;
3125         case MAC_PROP_WL_MLME:
3126                 minsize = sizeof (wl_mlme_t);
3127                 break;
3128         }
3129 
3130         return (valsize >= minsize);
3131 }
3132 
3133 /*
3134  * mac_set_prop() sets MAC or hardware driver properties:
3135  *
3136  * - MAC-managed properties such as resource properties include maxbw,
3137  *   priority, and cpu binding list, as well as the default port VID
3138  *   used by bridging. These properties are consumed by the MAC layer
3139  *   itself and not passed down to the driver. For resource control
3140  *   properties, this function invokes mac_set_resources() which will
3141  *   cache the property value in mac_impl_t and may call
3142  *   mac_client_set_resource() to update property value of the primary
3143  *   mac client, if it exists.
3144  *
3145  * - Properties which act on the hardware and must be passed to the
3146  *   driver, such as MTU, through the driver's mc_setprop() entry point.
3147  */
3148 int
3149 mac_set_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
3150     uint_t valsize)
3151 {
3152         int err = ENOTSUP;
3153         mac_impl_t *mip = (mac_impl_t *)mh;
3154 
3155         ASSERT(MAC_PERIM_HELD(mh));
3156 
3157         switch (id) {
3158         case MAC_PROP_RESOURCE: {
3159                 mac_resource_props_t *mrp;
3160 
3161                 /* call mac_set_resources() for MAC properties */
3162                 ASSERT(valsize >= sizeof (mac_resource_props_t));
3163                 mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3164                 bcopy(val, mrp, sizeof (*mrp));
3165                 err = mac_set_resources(mh, mrp);
3166                 kmem_free(mrp, sizeof (*mrp));
3167                 break;
3168         }
3169 
3170         case MAC_PROP_PVID:
3171                 ASSERT(valsize >= sizeof (uint16_t));
3172                 if (mip->mi_state_flags & MIS_IS_VNIC)
3173                         return (EINVAL);
3174                 err = mac_set_pvid(mh, *(uint16_t *)val);
3175                 break;
3176 
3177         case MAC_PROP_MTU: {
3178                 uint32_t mtu;
3179 
3180                 ASSERT(valsize >= sizeof (uint32_t));
3181                 bcopy(val, &mtu, sizeof (mtu));
3182                 err = mac_set_mtu(mh, mtu, NULL);
3183                 break;
3184         }
3185 
3186         case MAC_PROP_LLIMIT:
3187         case MAC_PROP_LDECAY: {
3188                 uint32_t learnval;
3189 
3190                 if (valsize < sizeof (learnval) ||
3191                     (mip->mi_state_flags & MIS_IS_VNIC))
3192                         return (EINVAL);
3193                 bcopy(val, &learnval, sizeof (learnval));
3194                 if (learnval == 0 && id == MAC_PROP_LDECAY)
3195                         return (EINVAL);
3196                 if (id == MAC_PROP_LLIMIT)
3197                         mip->mi_llimit = learnval;
3198                 else
3199                         mip->mi_ldecay = learnval;
3200                 err = 0;
3201                 break;
3202         }
3203 
3204         default:
3205                 /* For other driver properties, call driver's callback */
3206                 if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
3207                         err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
3208                             name, id, valsize, val);
3209                 }
3210         }
3211         return (err);
3212 }
3213 
3214 /*
3215  * mac_get_prop() gets MAC or device driver properties.
3216  *
3217  * If the property is a driver property, mac_get_prop() calls driver's callback
3218  * entry point to get it.
3219  * If the property is a MAC property, mac_get_prop() invokes mac_get_resources()
3220  * which returns the cached value in mac_impl_t.
3221  */
3222 int
3223 mac_get_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
3224     uint_t valsize)
3225 {
3226         int err = ENOTSUP;
3227         mac_impl_t *mip = (mac_impl_t *)mh;
3228         uint_t  rings;
3229         uint_t  vlinks;
3230 
3231         bzero(val, valsize);
3232 
3233         switch (id) {
3234         case MAC_PROP_RESOURCE: {
3235                 mac_resource_props_t *mrp;
3236 
3237                 /* If mac property, read from cache */
3238                 ASSERT(valsize >= sizeof (mac_resource_props_t));
3239                 mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3240                 mac_get_resources(mh, mrp);
3241                 bcopy(mrp, val, sizeof (*mrp));
3242                 kmem_free(mrp, sizeof (*mrp));
3243                 return (0);
3244         }
3245         case MAC_PROP_RESOURCE_EFF: {
3246                 mac_resource_props_t *mrp;
3247 
3248                 /* If mac effective property, read from client */
3249                 ASSERT(valsize >= sizeof (mac_resource_props_t));
3250                 mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3251                 mac_get_effective_resources(mh, mrp);
3252                 bcopy(mrp, val, sizeof (*mrp));
3253                 kmem_free(mrp, sizeof (*mrp));
3254                 return (0);
3255         }
3256 
3257         case MAC_PROP_PVID:
3258                 ASSERT(valsize >= sizeof (uint16_t));
3259                 if (mip->mi_state_flags & MIS_IS_VNIC)
3260                         return (EINVAL);
3261                 *(uint16_t *)val = mac_get_pvid(mh);
3262                 return (0);
3263 
3264         case MAC_PROP_LLIMIT:
3265         case MAC_PROP_LDECAY:
3266                 ASSERT(valsize >= sizeof (uint32_t));
3267                 if (mip->mi_state_flags & MIS_IS_VNIC)
3268                         return (EINVAL);
3269                 if (id == MAC_PROP_LLIMIT)
3270                         bcopy(&mip->mi_llimit, val, sizeof (mip->mi_llimit));
3271                 else
3272                         bcopy(&mip->mi_ldecay, val, sizeof (mip->mi_ldecay));
3273                 return (0);
3274 
3275         case MAC_PROP_MTU: {
3276                 uint32_t sdu;
3277 
3278                 ASSERT(valsize >= sizeof (uint32_t));
3279                 mac_sdu_get2(mh, NULL, &sdu, NULL);
3280                 bcopy(&sdu, val, sizeof (sdu));
3281 
3282                 return (0);
3283         }
3284         case MAC_PROP_STATUS: {
3285                 link_state_t link_state;
3286 
3287                 if (valsize < sizeof (link_state))
3288                         return (EINVAL);
3289                 link_state = mac_link_get(mh);
3290                 bcopy(&link_state, val, sizeof (link_state));
3291 
3292                 return (0);
3293         }
3294 
3295         case MAC_PROP_MAX_RX_RINGS_AVAIL:
3296         case MAC_PROP_MAX_TX_RINGS_AVAIL:
3297                 ASSERT(valsize >= sizeof (uint_t));
3298                 rings = id == MAC_PROP_MAX_RX_RINGS_AVAIL ?
3299                     mac_rxavail_get(mh) : mac_txavail_get(mh);
3300                 bcopy(&rings, val, sizeof (uint_t));
3301                 return (0);
3302 
3303         case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3304         case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3305                 ASSERT(valsize >= sizeof (uint_t));
3306                 vlinks = id == MAC_PROP_MAX_RXHWCLNT_AVAIL ?
3307                     mac_rxhwlnksavail_get(mh) : mac_txhwlnksavail_get(mh);
3308                 bcopy(&vlinks, val, sizeof (uint_t));
3309                 return (0);
3310 
3311         case MAC_PROP_RXRINGSRANGE:
3312         case MAC_PROP_TXRINGSRANGE:
3313                 /*
3314                  * The value for these properties are returned through
3315                  * the MAC_PROP_RESOURCE property.
3316                  */
3317                 return (0);
3318 
3319         default:
3320                 break;
3321 
3322         }
3323 
3324         /* If driver property, request from driver */
3325         if (mip->mi_callbacks->mc_callbacks & MC_GETPROP) {
3326                 err = mip->mi_callbacks->mc_getprop(mip->mi_driver, name, id,
3327                     valsize, val);
3328         }
3329 
3330         return (err);
3331 }
3332 
3333 /*
3334  * Helper function to initialize the range structure for use in
3335  * mac_get_prop. If the type can be other than uint32, we can
3336  * pass that as an arg.
3337  */
3338 static void
3339 _mac_set_range(mac_propval_range_t *range, uint32_t min, uint32_t max)
3340 {
3341         range->mpr_count = 1;
3342         range->mpr_type = MAC_PROPVAL_UINT32;
3343         range->mpr_range_uint32[0].mpur_min = min;
3344         range->mpr_range_uint32[0].mpur_max = max;
3345 }
3346 
3347 /*
3348  * Returns information about the specified property, such as default
3349  * values or permissions.
3350  */
3351 int
3352 mac_prop_info(mac_handle_t mh, mac_prop_id_t id, char *name,
3353     void *default_val, uint_t default_size, mac_propval_range_t *range,
3354     uint_t *perm)
3355 {
3356         mac_prop_info_state_t state;
3357         mac_impl_t *mip = (mac_impl_t *)mh;
3358         uint_t  max;
3359 
3360         /*
3361          * A property is read/write by default unless the driver says
3362          * otherwise.
3363          */
3364         if (perm != NULL)
3365                 *perm = MAC_PROP_PERM_RW;
3366 
3367         if (default_val != NULL)
3368                 bzero(default_val, default_size);
3369 
3370         /*
3371          * First, handle framework properties for which we don't need to
3372          * involve the driver.
3373          */
3374         switch (id) {
3375         case MAC_PROP_RESOURCE:
3376         case MAC_PROP_PVID:
3377         case MAC_PROP_LLIMIT:
3378         case MAC_PROP_LDECAY:
3379                 return (0);
3380 
3381         case MAC_PROP_MAX_RX_RINGS_AVAIL:
3382         case MAC_PROP_MAX_TX_RINGS_AVAIL:
3383         case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3384         case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3385                 if (perm != NULL)
3386                         *perm = MAC_PROP_PERM_READ;
3387                 return (0);
3388 
3389         case MAC_PROP_RXRINGSRANGE:
3390         case MAC_PROP_TXRINGSRANGE:
3391                 /*
3392                  * Currently, we support range for RX and TX rings properties.
3393                  * When we extend this support to maxbw, cpus and priority,
3394                  * we should move this to mac_get_resources.
3395                  * There is no default value for RX or TX rings.
3396                  */
3397                 if ((mip->mi_state_flags & MIS_IS_VNIC) &&
3398                     mac_is_vnic_primary(mh)) {
3399                         /*
3400                          * We don't support setting rings for a VLAN
3401                          * data link because it shares its ring with the
3402                          * primary MAC client.
3403                          */
3404                         if (perm != NULL)
3405                                 *perm = MAC_PROP_PERM_READ;
3406                         if (range != NULL)
3407                                 range->mpr_count = 0;
3408                 } else if (range != NULL) {
3409                         if (mip->mi_state_flags & MIS_IS_VNIC)
3410                                 mh = mac_get_lower_mac_handle(mh);
3411                         mip = (mac_impl_t *)mh;
3412                         if ((id == MAC_PROP_RXRINGSRANGE &&
3413                             mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) ||
3414                             (id == MAC_PROP_TXRINGSRANGE &&
3415                             mip->mi_tx_group_type == MAC_GROUP_TYPE_STATIC)) {
3416                                 if (id == MAC_PROP_RXRINGSRANGE) {
3417                                         if ((mac_rxhwlnksavail_get(mh) +
3418                                             mac_rxhwlnksrsvd_get(mh)) <= 1) {
3419                                                 /*
3420                                                  * doesn't support groups or
3421                                                  * rings
3422                                                  */
3423                                                 range->mpr_count = 0;
3424                                         } else {
3425                                                 /*
3426                                                  * supports specifying groups,
3427                                                  * but not rings
3428                                                  */
3429                                                 _mac_set_range(range, 0, 0);
3430                                         }
3431                                 } else {
3432                                         if ((mac_txhwlnksavail_get(mh) +
3433                                             mac_txhwlnksrsvd_get(mh)) <= 1) {
3434                                                 /*
3435                                                  * doesn't support groups or
3436                                                  * rings
3437                                                  */
3438                                                 range->mpr_count = 0;
3439                                         } else {
3440                                                 /*
3441                                                  * supports specifying groups,
3442                                                  * but not rings
3443                                                  */
3444                                                 _mac_set_range(range, 0, 0);
3445                                         }
3446                                 }
3447                         } else {
3448                                 max = id == MAC_PROP_RXRINGSRANGE ?
3449                                     mac_rxavail_get(mh) + mac_rxrsvd_get(mh) :
3450                                     mac_txavail_get(mh) + mac_txrsvd_get(mh);
3451                                 if (max <= 1) {
3452                                         /*
3453                                          * doesn't support groups or
3454                                          * rings
3455                                          */
3456                                         range->mpr_count = 0;
3457                                 } else  {
3458                                         /*
3459                                          * -1 because we have to leave out the
3460                                          * default ring.
3461                                          */
3462                                         _mac_set_range(range, 1, max - 1);
3463                                 }
3464                         }
3465                 }
3466                 return (0);
3467 
3468         case MAC_PROP_STATUS:
3469                 if (perm != NULL)
3470                         *perm = MAC_PROP_PERM_READ;
3471                 return (0);
3472         }
3473 
3474         /*
3475          * Get the property info from the driver if it implements the
3476          * property info entry point.
3477          */
3478         bzero(&state, sizeof (state));
3479 
3480         if (mip->mi_callbacks->mc_callbacks & MC_PROPINFO) {
3481                 state.pr_default = default_val;
3482                 state.pr_default_size = default_size;
3483 
3484                 /*
3485                  * The caller specifies the maximum number of ranges
3486                  * it can accomodate using mpr_count. We don't touch
3487                  * this value until the driver returns from its
3488                  * mc_propinfo() callback, and ensure we don't exceed
3489                  * this number of range as the driver defines
3490                  * supported range from its mc_propinfo().
3491                  *
3492                  * pr_range_cur_count keeps track of how many ranges
3493                  * were defined by the driver from its mc_propinfo()
3494                  * entry point.
3495                  *
3496                  * On exit, the user-specified range mpr_count returns
3497                  * the number of ranges specified by the driver on
3498                  * success, or the number of ranges it wanted to
3499                  * define if that number of ranges could not be
3500                  * accomodated by the specified range structure.  In
3501                  * the latter case, the caller will be able to
3502                  * allocate a larger range structure, and query the
3503                  * property again.
3504                  */
3505                 state.pr_range_cur_count = 0;
3506                 state.pr_range = range;
3507 
3508                 mip->mi_callbacks->mc_propinfo(mip->mi_driver, name, id,
3509                     (mac_prop_info_handle_t)&state);
3510 
3511                 if (state.pr_flags & MAC_PROP_INFO_RANGE)
3512                         range->mpr_count = state.pr_range_cur_count;
3513 
3514                 /*
3515                  * The operation could fail if the buffer supplied by
3516                  * the user was too small for the range or default
3517                  * value of the property.
3518                  */
3519                 if (state.pr_errno != 0)
3520                         return (state.pr_errno);
3521 
3522                 if (perm != NULL && state.pr_flags & MAC_PROP_INFO_PERM)
3523                         *perm = state.pr_perm;
3524         }
3525 
3526         /*
3527          * The MAC layer may want to provide default values or allowed
3528          * ranges for properties if the driver does not provide a
3529          * property info entry point, or that entry point exists, but
3530          * it did not provide a default value or allowed ranges for
3531          * that property.
3532          */
3533         switch (id) {
3534         case MAC_PROP_MTU: {
3535                 uint32_t sdu;
3536 
3537                 mac_sdu_get2(mh, NULL, &sdu, NULL);
3538 
3539                 if (range != NULL && !(state.pr_flags &
3540                     MAC_PROP_INFO_RANGE)) {
3541                         /* MTU range */
3542                         _mac_set_range(range, sdu, sdu);
3543                 }
3544 
3545                 if (default_val != NULL && !(state.pr_flags &
3546                     MAC_PROP_INFO_DEFAULT)) {
3547                         if (mip->mi_info.mi_media == DL_ETHER)
3548                                 sdu = ETHERMTU;
3549                         /* default MTU value */
3550                         bcopy(&sdu, default_val, sizeof (sdu));
3551                 }
3552         }
3553         }
3554 
3555         return (0);
3556 }
3557 
3558 int
3559 mac_fastpath_disable(mac_handle_t mh)
3560 {
3561         mac_impl_t      *mip = (mac_impl_t *)mh;
3562 
3563         if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3564                 return (0);
3565 
3566         return (mip->mi_capab_legacy.ml_fastpath_disable(mip->mi_driver));
3567 }
3568 
3569 void
3570 mac_fastpath_enable(mac_handle_t mh)
3571 {
3572         mac_impl_t      *mip = (mac_impl_t *)mh;
3573 
3574         if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3575                 return;
3576 
3577         mip->mi_capab_legacy.ml_fastpath_enable(mip->mi_driver);
3578 }
3579 
3580 void
3581 mac_register_priv_prop(mac_impl_t *mip, char **priv_props)
3582 {
3583         uint_t nprops, i;
3584 
3585         if (priv_props == NULL)
3586                 return;
3587 
3588         nprops = 0;
3589         while (priv_props[nprops] != NULL)
3590                 nprops++;
3591         if (nprops == 0)
3592                 return;
3593 
3594 
3595         mip->mi_priv_prop = kmem_zalloc(nprops * sizeof (char *), KM_SLEEP);
3596 
3597         for (i = 0; i < nprops; i++) {
3598                 mip->mi_priv_prop[i] = kmem_zalloc(MAXLINKPROPNAME, KM_SLEEP);
3599                 (void) strlcpy(mip->mi_priv_prop[i], priv_props[i],
3600                     MAXLINKPROPNAME);
3601         }
3602 
3603         mip->mi_priv_prop_count = nprops;
3604 }
3605 
3606 void
3607 mac_unregister_priv_prop(mac_impl_t *mip)
3608 {
3609         uint_t i;
3610 
3611         if (mip->mi_priv_prop_count == 0) {
3612                 ASSERT(mip->mi_priv_prop == NULL);
3613                 return;
3614         }
3615 
3616         for (i = 0; i < mip->mi_priv_prop_count; i++)
3617                 kmem_free(mip->mi_priv_prop[i], MAXLINKPROPNAME);
3618         kmem_free(mip->mi_priv_prop, mip->mi_priv_prop_count *
3619             sizeof (char *));
3620 
3621         mip->mi_priv_prop = NULL;
3622         mip->mi_priv_prop_count = 0;
3623 }
3624 
3625 /*
3626  * mac_ring_t 'mr' macros. Some rogue drivers may access ring structure
3627  * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
3628  * cases if MAC free's the ring structure after mac_stop_ring(), any
3629  * illegal access to the ring structure coming from the driver will panic
3630  * the system. In order to protect the system from such inadverent access,
3631  * we maintain a cache of rings in the mac_impl_t after they get free'd up.
3632  * When packets are received on free'd up rings, MAC (through the generation
3633  * count mechanism) will drop such packets.
3634  */
3635 static mac_ring_t *
3636 mac_ring_alloc(mac_impl_t *mip)
3637 {
3638         mac_ring_t *ring;
3639 
3640         mutex_enter(&mip->mi_ring_lock);
3641         if (mip->mi_ring_freelist != NULL) {
3642                 ring = mip->mi_ring_freelist;
3643                 mip->mi_ring_freelist = ring->mr_next;
3644                 bzero(ring, sizeof (mac_ring_t));
3645                 mutex_exit(&mip->mi_ring_lock);
3646         } else {
3647                 mutex_exit(&mip->mi_ring_lock);
3648                 ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
3649         }
3650         ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
3651         return (ring);
3652 }
3653 
3654 static void
3655 mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
3656 {
3657         ASSERT(ring->mr_state == MR_FREE);
3658 
3659         mutex_enter(&mip->mi_ring_lock);
3660         ring->mr_state = MR_FREE;
3661         ring->mr_flag = 0;
3662         ring->mr_next = mip->mi_ring_freelist;
3663         ring->mr_mip = NULL;
3664         mip->mi_ring_freelist = ring;
3665         mac_ring_stat_delete(ring);
3666         mutex_exit(&mip->mi_ring_lock);
3667 }
3668 
3669 static void
3670 mac_ring_freeall(mac_impl_t *mip)
3671 {
3672         mac_ring_t *ring_next;
3673         mutex_enter(&mip->mi_ring_lock);
3674         mac_ring_t *ring = mip->mi_ring_freelist;
3675         while (ring != NULL) {
3676                 ring_next = ring->mr_next;
3677                 kmem_cache_free(mac_ring_cache, ring);
3678                 ring = ring_next;
3679         }
3680         mip->mi_ring_freelist = NULL;
3681         mutex_exit(&mip->mi_ring_lock);
3682 }
3683 
3684 int
3685 mac_start_ring(mac_ring_t *ring)
3686 {
3687         int rv = 0;
3688 
3689         ASSERT(ring->mr_state == MR_FREE);
3690 
3691         if (ring->mr_start != NULL) {
3692                 rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
3693                 if (rv != 0)
3694                         return (rv);
3695         }
3696 
3697         ring->mr_state = MR_INUSE;
3698         return (rv);
3699 }
3700 
3701 void
3702 mac_stop_ring(mac_ring_t *ring)
3703 {
3704         ASSERT(ring->mr_state == MR_INUSE);
3705 
3706         if (ring->mr_stop != NULL)
3707                 ring->mr_stop(ring->mr_driver);
3708 
3709         ring->mr_state = MR_FREE;
3710 
3711         /*
3712          * Increment the ring generation number for this ring.
3713          */
3714         ring->mr_gen_num++;
3715 }
3716 
3717 int
3718 mac_start_group(mac_group_t *group)
3719 {
3720         int rv = 0;
3721 
3722         if (group->mrg_start != NULL)
3723                 rv = group->mrg_start(group->mrg_driver);
3724 
3725         return (rv);
3726 }
3727 
3728 void
3729 mac_stop_group(mac_group_t *group)
3730 {
3731         if (group->mrg_stop != NULL)
3732                 group->mrg_stop(group->mrg_driver);
3733 }
3734 
3735 /*
3736  * Called from mac_start() on the default Rx group. Broadcast and multicast
3737  * packets are received only on the default group. Hence the default group
3738  * needs to be up even if the primary client is not up, for the other groups
3739  * to be functional. We do this by calling this function at mac_start time
3740  * itself. However the broadcast packets that are received can't make their
3741  * way beyond mac_rx until a mac client creates a broadcast flow.
3742  */
3743 static int
3744 mac_start_group_and_rings(mac_group_t *group)
3745 {
3746         mac_ring_t      *ring;
3747         int             rv = 0;
3748 
3749         ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
3750         if ((rv = mac_start_group(group)) != 0)
3751                 return (rv);
3752 
3753         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3754                 ASSERT(ring->mr_state == MR_FREE);
3755                 if ((rv = mac_start_ring(ring)) != 0)
3756                         goto error;
3757                 ring->mr_classify_type = MAC_SW_CLASSIFIER;
3758         }
3759         return (0);
3760 
3761 error:
3762         mac_stop_group_and_rings(group);
3763         return (rv);
3764 }
3765 
3766 /* Called from mac_stop on the default Rx group */
3767 static void
3768 mac_stop_group_and_rings(mac_group_t *group)
3769 {
3770         mac_ring_t      *ring;
3771 
3772         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3773                 if (ring->mr_state != MR_FREE) {
3774                         mac_stop_ring(ring);
3775                         ring->mr_flag = 0;
3776                         ring->mr_classify_type = MAC_NO_CLASSIFIER;
3777                 }
3778         }
3779         mac_stop_group(group);
3780 }
3781 
3782 
3783 static mac_ring_t *
3784 mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
3785     mac_capab_rings_t *cap_rings)
3786 {
3787         mac_ring_t *ring, *rnext;
3788         mac_ring_info_t ring_info;
3789         ddi_intr_handle_t ddi_handle;
3790 
3791         ring = mac_ring_alloc(mip);
3792 
3793         /* Prepare basic information of ring */
3794 
3795         /*
3796          * Ring index is numbered to be unique across a particular device.
3797          * Ring index computation makes following assumptions:
3798          *      - For drivers with static grouping (e.g. ixgbe, bge),
3799          *      ring index exchanged with the driver (e.g. during mr_rget)
3800          *      is unique only across the group the ring belongs to.
3801          *      - Drivers with dynamic grouping (e.g. nxge), start
3802          *      with single group (mrg_index = 0).
3803          */
3804         ring->mr_index = group->mrg_index * group->mrg_info.mgi_count + index;
3805         ring->mr_type = group->mrg_type;
3806         ring->mr_gh = (mac_group_handle_t)group;
3807 
3808         /* Insert the new ring to the list. */
3809         ring->mr_next = group->mrg_rings;
3810         group->mrg_rings = ring;
3811 
3812         /* Zero to reuse the info data structure */
3813         bzero(&ring_info, sizeof (ring_info));
3814 
3815         /* Query ring information from driver */
3816         cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
3817             index, &ring_info, (mac_ring_handle_t)ring);
3818 
3819         ring->mr_info = ring_info;
3820 
3821         /*
3822          * The interrupt handle could be shared among multiple rings.
3823          * Thus if there is a bunch of rings that are sharing an
3824          * interrupt, then only one ring among the bunch will be made
3825          * available for interrupt re-targeting; the rest will have
3826          * ddi_shared flag set to TRUE and would not be available for
3827          * be interrupt re-targeting.
3828          */
3829         if ((ddi_handle = ring_info.mri_intr.mi_ddi_handle) != NULL) {
3830                 rnext = ring->mr_next;
3831                 while (rnext != NULL) {
3832                         if (rnext->mr_info.mri_intr.mi_ddi_handle ==
3833                             ddi_handle) {
3834                                 /*
3835                                  * If default ring (mr_index == 0) is part
3836                                  * of a group of rings sharing an
3837                                  * interrupt, then set ddi_shared flag for
3838                                  * the default ring and give another ring
3839                                  * the chance to be re-targeted.
3840                                  */
3841                                 if (rnext->mr_index == 0 &&
3842                                     !rnext->mr_info.mri_intr.mi_ddi_shared) {
3843                                         rnext->mr_info.mri_intr.mi_ddi_shared =
3844                                             B_TRUE;
3845                                 } else {
3846                                         ring->mr_info.mri_intr.mi_ddi_shared =
3847                                             B_TRUE;
3848                                 }
3849                                 break;
3850                         }
3851                         rnext = rnext->mr_next;
3852                 }
3853                 /*
3854                  * If rnext is NULL, then no matching ddi_handle was found.
3855                  * Rx rings get registered first. So if this is a Tx ring,
3856                  * then go through all the Rx rings and see if there is a
3857                  * matching ddi handle.
3858                  */
3859                 if (rnext == NULL && ring->mr_type == MAC_RING_TYPE_TX) {
3860                         mac_compare_ddi_handle(mip->mi_rx_groups,
3861                             mip->mi_rx_group_count, ring);
3862                 }
3863         }
3864 
3865         /* Update ring's status */
3866         ring->mr_state = MR_FREE;
3867         ring->mr_flag = 0;
3868 
3869         /* Update the ring count of the group */
3870         group->mrg_cur_count++;
3871 
3872         /* Create per ring kstats */
3873         if (ring->mr_stat != NULL) {
3874                 ring->mr_mip = mip;
3875                 mac_ring_stat_create(ring);
3876         }
3877 
3878         return (ring);
3879 }
3880 
3881 /*
3882  * Rings are chained together for easy regrouping.
3883  */
3884 static void
3885 mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
3886     mac_capab_rings_t *cap_rings)
3887 {
3888         int index;
3889 
3890         /*
3891          * Initialize all ring members of this group. Size of zero will not
3892          * enter the loop, so it's safe for initializing an empty group.
3893          */
3894         for (index = size - 1; index >= 0; index--)
3895                 (void) mac_init_ring(mip, group, index, cap_rings);
3896 }
3897 
3898 int
3899 mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3900 {
3901         mac_capab_rings_t       *cap_rings;
3902         mac_group_t             *group;
3903         mac_group_t             *groups;
3904         mac_group_info_t        group_info;
3905         uint_t                  group_free = 0;
3906         uint_t                  ring_left;
3907         mac_ring_t              *ring;
3908         int                     g;
3909         int                     err = 0;
3910         uint_t                  grpcnt;
3911         boolean_t               pseudo_txgrp = B_FALSE;
3912 
3913         switch (rtype) {
3914         case MAC_RING_TYPE_RX:
3915                 ASSERT(mip->mi_rx_groups == NULL);
3916 
3917                 cap_rings = &mip->mi_rx_rings_cap;
3918                 cap_rings->mr_type = MAC_RING_TYPE_RX;
3919                 break;
3920         case MAC_RING_TYPE_TX:
3921                 ASSERT(mip->mi_tx_groups == NULL);
3922 
3923                 cap_rings = &mip->mi_tx_rings_cap;
3924                 cap_rings->mr_type = MAC_RING_TYPE_TX;
3925                 break;
3926         default:
3927                 ASSERT(B_FALSE);
3928         }
3929 
3930         if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS, cap_rings))
3931                 return (0);
3932         grpcnt = cap_rings->mr_gnum;
3933 
3934         /*
3935          * If we have multiple TX rings, but only one TX group, we can
3936          * create pseudo TX groups (one per TX ring) in the MAC layer,
3937          * except for an aggr. For an aggr currently we maintain only
3938          * one group with all the rings (for all its ports), going
3939          * forwards we might change this.
3940          */
3941         if (rtype == MAC_RING_TYPE_TX &&
3942             cap_rings->mr_gnum == 0 && cap_rings->mr_rnum >  0 &&
3943             (mip->mi_state_flags & MIS_IS_AGGR) == 0) {
3944                 /*
3945                  * The -1 here is because we create a default TX group
3946                  * with all the rings in it.
3947                  */
3948                 grpcnt = cap_rings->mr_rnum - 1;
3949                 pseudo_txgrp = B_TRUE;
3950         }
3951 
3952         /*
3953          * Allocate a contiguous buffer for all groups.
3954          */
3955         groups = kmem_zalloc(sizeof (mac_group_t) * (grpcnt+ 1), KM_SLEEP);
3956 
3957         ring_left = cap_rings->mr_rnum;
3958 
3959         /*
3960          * Get all ring groups if any, and get their ring members
3961          * if any.
3962          */
3963         for (g = 0; g < grpcnt; g++) {
3964                 group = groups + g;
3965 
3966                 /* Prepare basic information of the group */
3967                 group->mrg_index = g;
3968                 group->mrg_type = rtype;
3969                 group->mrg_state = MAC_GROUP_STATE_UNINIT;
3970                 group->mrg_mh = (mac_handle_t)mip;
3971                 group->mrg_next = group + 1;
3972 
3973                 /* Zero to reuse the info data structure */
3974                 bzero(&group_info, sizeof (group_info));
3975 
3976                 if (pseudo_txgrp) {
3977                         /*
3978                          * This is a pseudo group that we created, apart
3979                          * from setting the state there is nothing to be
3980                          * done.
3981                          */
3982                         group->mrg_state = MAC_GROUP_STATE_REGISTERED;
3983                         group_free++;
3984                         continue;
3985                 }
3986                 /* Query group information from driver */
3987                 cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
3988                     (mac_group_handle_t)group);
3989 
3990                 switch (cap_rings->mr_group_type) {
3991                 case MAC_GROUP_TYPE_DYNAMIC:
3992                         if (cap_rings->mr_gaddring == NULL ||
3993                             cap_rings->mr_gremring == NULL) {
3994                                 DTRACE_PROBE3(
3995                                     mac__init__rings_no_addremring,
3996                                     char *, mip->mi_name,
3997                                     mac_group_add_ring_t,
3998                                     cap_rings->mr_gaddring,
3999                                     mac_group_add_ring_t,
4000                                     cap_rings->mr_gremring);
4001                                 err = EINVAL;
4002                                 goto bail;
4003                         }
4004 
4005                         switch (rtype) {
4006                         case MAC_RING_TYPE_RX:
4007                                 /*
4008                                  * The first RX group must have non-zero
4009                                  * rings, and the following groups must
4010                                  * have zero rings.
4011                                  */
4012                                 if (g == 0 && group_info.mgi_count == 0) {
4013                                         DTRACE_PROBE1(
4014                                             mac__init__rings__rx__def__zero,
4015                                             char *, mip->mi_name);
4016                                         err = EINVAL;
4017                                         goto bail;
4018                                 }
4019                                 if (g > 0 && group_info.mgi_count != 0) {
4020                                         DTRACE_PROBE3(
4021                                             mac__init__rings__rx__nonzero,
4022                                             char *, mip->mi_name,
4023                                             int, g, int, group_info.mgi_count);
4024                                         err = EINVAL;
4025                                         goto bail;
4026                                 }
4027                                 break;
4028                         case MAC_RING_TYPE_TX:
4029                                 /*
4030                                  * All TX ring groups must have zero rings.
4031                                  */
4032                                 if (group_info.mgi_count != 0) {
4033                                         DTRACE_PROBE3(
4034                                             mac__init__rings__tx__nonzero,
4035                                             char *, mip->mi_name,
4036                                             int, g, int, group_info.mgi_count);
4037                                         err = EINVAL;
4038                                         goto bail;
4039                                 }
4040                                 break;
4041                         }
4042                         break;
4043                 case MAC_GROUP_TYPE_STATIC:
4044                         /*
4045                          * Note that an empty group is allowed, e.g., an aggr
4046                          * would start with an empty group.
4047                          */
4048                         break;
4049                 default:
4050                         /* unknown group type */
4051                         DTRACE_PROBE2(mac__init__rings__unknown__type,
4052                             char *, mip->mi_name,
4053                             int, cap_rings->mr_group_type);
4054                         err = EINVAL;
4055                         goto bail;
4056                 }
4057 
4058 
4059                 /*
4060                  * Driver must register group->mgi_addmac/remmac() for rx groups
4061                  * to support multiple MAC addresses.
4062                  */
4063                 if (rtype == MAC_RING_TYPE_RX) {
4064                         if ((group_info.mgi_addmac == NULL) ||
4065                             (group_info.mgi_addmac == NULL)) {
4066                                 goto bail;
4067                         }
4068                 }
4069 
4070                 /* Cache driver-supplied information */
4071                 group->mrg_info = group_info;
4072 
4073                 /* Update the group's status and group count. */
4074                 mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
4075                 group_free++;
4076 
4077                 group->mrg_rings = NULL;
4078                 group->mrg_cur_count = 0;
4079                 mac_init_group(mip, group, group_info.mgi_count, cap_rings);
4080                 ring_left -= group_info.mgi_count;
4081 
4082                 /* The current group size should be equal to default value */
4083                 ASSERT(group->mrg_cur_count == group_info.mgi_count);
4084         }
4085 
4086         /* Build up a dummy group for free resources as a pool */
4087         group = groups + grpcnt;
4088 
4089         /* Prepare basic information of the group */
4090         group->mrg_index = -1;
4091         group->mrg_type = rtype;
4092         group->mrg_state = MAC_GROUP_STATE_UNINIT;
4093         group->mrg_mh = (mac_handle_t)mip;
4094         group->mrg_next = NULL;
4095 
4096         /*
4097          * If there are ungrouped rings, allocate a continuous buffer for
4098          * remaining resources.
4099          */
4100         if (ring_left != 0) {
4101                 group->mrg_rings = NULL;
4102                 group->mrg_cur_count = 0;
4103                 mac_init_group(mip, group, ring_left, cap_rings);
4104 
4105                 /* The current group size should be equal to ring_left */
4106                 ASSERT(group->mrg_cur_count == ring_left);
4107 
4108                 ring_left = 0;
4109 
4110                 /* Update this group's status */
4111                 mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
4112         } else
4113                 group->mrg_rings = NULL;
4114 
4115         ASSERT(ring_left == 0);
4116 
4117 bail:
4118 
4119         /* Cache other important information to finalize the initialization */
4120         switch (rtype) {
4121         case MAC_RING_TYPE_RX:
4122                 mip->mi_rx_group_type = cap_rings->mr_group_type;
4123                 mip->mi_rx_group_count = cap_rings->mr_gnum;
4124                 mip->mi_rx_groups = groups;
4125                 mip->mi_rx_donor_grp = groups;
4126                 if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
4127                         /*
4128                          * The default ring is reserved since it is
4129                          * used for sending the broadcast etc. packets.
4130                          */
4131                         mip->mi_rxrings_avail =
4132                             mip->mi_rx_groups->mrg_cur_count - 1;
4133                         mip->mi_rxrings_rsvd = 1;
4134                 }
4135                 /*
4136                  * The default group cannot be reserved. It is used by
4137                  * all the clients that do not have an exclusive group.
4138                  */
4139                 mip->mi_rxhwclnt_avail = mip->mi_rx_group_count - 1;
4140                 mip->mi_rxhwclnt_used = 1;
4141                 break;
4142         case MAC_RING_TYPE_TX:
4143                 mip->mi_tx_group_type = pseudo_txgrp ? MAC_GROUP_TYPE_DYNAMIC :
4144                     cap_rings->mr_group_type;
4145                 mip->mi_tx_group_count = grpcnt;
4146                 mip->mi_tx_group_free = group_free;
4147                 mip->mi_tx_groups = groups;
4148 
4149                 group = groups + grpcnt;
4150                 ring = group->mrg_rings;
4151                 /*
4152                  * The ring can be NULL in the case of aggr. Aggr will
4153                  * have an empty Tx group which will get populated
4154                  * later when pseudo Tx rings are added after
4155                  * mac_register() is done.
4156                  */
4157                 if (ring == NULL) {
4158                         ASSERT(mip->mi_state_flags & MIS_IS_AGGR);
4159                         /*
4160                          * pass the group to aggr so it can add Tx
4161                          * rings to the group later.
4162                          */
4163                         cap_rings->mr_gget(mip->mi_driver, rtype, 0, NULL,
4164                             (mac_group_handle_t)group);
4165                         /*
4166                          * Even though there are no rings at this time
4167                          * (rings will come later), set the group
4168                          * state to registered.
4169                          */
4170                         group->mrg_state = MAC_GROUP_STATE_REGISTERED;
4171                 } else {
4172                         /*
4173                          * Ring 0 is used as the default one and it could be
4174                          * assigned to a client as well.
4175                          */
4176                         while ((ring->mr_index != 0) && (ring->mr_next != NULL))
4177                                 ring = ring->mr_next;
4178                         ASSERT(ring->mr_index == 0);
4179                         mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
4180                 }
4181                 if (mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC)
4182                         mip->mi_txrings_avail = group->mrg_cur_count - 1;
4183                         /*
4184                          * The default ring cannot be reserved.
4185                          */
4186                         mip->mi_txrings_rsvd = 1;
4187                 /*
4188                  * The default group cannot be reserved. It will be shared
4189                  * by clients that do not have an exclusive group.
4190                  */
4191                 mip->mi_txhwclnt_avail = mip->mi_tx_group_count;
4192                 mip->mi_txhwclnt_used = 1;
4193                 break;
4194         default:
4195                 ASSERT(B_FALSE);
4196         }
4197 
4198         if (err != 0)
4199                 mac_free_rings(mip, rtype);
4200 
4201         return (err);
4202 }
4203 
4204 /*
4205  * The ddi interrupt handle could be shared amoung rings. If so, compare
4206  * the new ring's ddi handle with the existing ones and set ddi_shared
4207  * flag.
4208  */
4209 void
4210 mac_compare_ddi_handle(mac_group_t *groups, uint_t grpcnt, mac_ring_t *cring)
4211 {
4212         mac_group_t *group;
4213         mac_ring_t *ring;
4214         ddi_intr_handle_t ddi_handle;
4215         int g;
4216 
4217         ddi_handle = cring->mr_info.mri_intr.mi_ddi_handle;
4218         for (g = 0; g < grpcnt; g++) {
4219                 group = groups + g;
4220                 for (ring = group->mrg_rings; ring != NULL;
4221                     ring = ring->mr_next) {
4222                         if (ring == cring)
4223                                 continue;
4224                         if (ring->mr_info.mri_intr.mi_ddi_handle ==
4225                             ddi_handle) {
4226                                 if (cring->mr_type == MAC_RING_TYPE_RX &&
4227                                     ring->mr_index == 0 &&
4228                                     !ring->mr_info.mri_intr.mi_ddi_shared) {
4229                                         ring->mr_info.mri_intr.mi_ddi_shared =
4230                                             B_TRUE;
4231                                 } else {
4232                                         cring->mr_info.mri_intr.mi_ddi_shared =
4233                                             B_TRUE;
4234                                 }
4235                                 return;
4236                         }
4237                 }
4238         }
4239 }
4240 
4241 /*
4242  * Called to free all groups of particular type (RX or TX). It's assumed that
4243  * no clients are using these groups.
4244  */
4245 void
4246 mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
4247 {
4248         mac_group_t *group, *groups;
4249         uint_t group_count;
4250 
4251         switch (rtype) {
4252         case MAC_RING_TYPE_RX:
4253                 if (mip->mi_rx_groups == NULL)
4254                         return;
4255 
4256                 groups = mip->mi_rx_groups;
4257                 group_count = mip->mi_rx_group_count;
4258 
4259                 mip->mi_rx_groups = NULL;
4260                 mip->mi_rx_donor_grp = NULL;
4261                 mip->mi_rx_group_count = 0;
4262                 break;
4263         case MAC_RING_TYPE_TX:
4264                 ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
4265 
4266                 if (mip->mi_tx_groups == NULL)
4267                         return;
4268 
4269                 groups = mip->mi_tx_groups;
4270                 group_count = mip->mi_tx_group_count;
4271 
4272                 mip->mi_tx_groups = NULL;
4273                 mip->mi_tx_group_count = 0;
4274                 mip->mi_tx_group_free = 0;
4275                 mip->mi_default_tx_ring = NULL;
4276                 break;
4277         default:
4278                 ASSERT(B_FALSE);
4279         }
4280 
4281         for (group = groups; group != NULL; group = group->mrg_next) {
4282                 mac_ring_t *ring;
4283 
4284                 if (group->mrg_cur_count == 0)
4285                         continue;
4286 
4287                 ASSERT(group->mrg_rings != NULL);
4288 
4289                 while ((ring = group->mrg_rings) != NULL) {
4290                         group->mrg_rings = ring->mr_next;
4291                         mac_ring_free(mip, ring);
4292                 }
4293         }
4294 
4295         /* Free all the cached rings */
4296         mac_ring_freeall(mip);
4297         /* Free the block of group data strutures */
4298         kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
4299 }
4300 
4301 /*
4302  * Associate a MAC address with a receive group.
4303  *
4304  * The return value of this function should always be checked properly, because
4305  * any type of failure could cause unexpected results. A group can be added
4306  * or removed with a MAC address only after it has been reserved. Ideally,
4307  * a successful reservation always leads to calling mac_group_addmac() to
4308  * steer desired traffic. Failure of adding an unicast MAC address doesn't
4309  * always imply that the group is functioning abnormally.
4310  *
4311  * Currently this function is called everywhere, and it reflects assumptions
4312  * about MAC addresses in the implementation. CR 6735196.
4313  */
4314 int
4315 mac_group_addmac(mac_group_t *group, const uint8_t *addr)
4316 {
4317         ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
4318         ASSERT(group->mrg_info.mgi_addmac != NULL);
4319 
4320         return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
4321 }
4322 
4323 /*
4324  * Remove the association between MAC address and receive group.
4325  */
4326 int
4327 mac_group_remmac(mac_group_t *group, const uint8_t *addr)
4328 {
4329         ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
4330         ASSERT(group->mrg_info.mgi_remmac != NULL);
4331 
4332         return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
4333 }
4334 
4335 /*
4336  * This is the entry point for packets transmitted through the bridging code.
4337  * If no bridge is in place, MAC_RING_TX transmits using tx ring. The 'rh'
4338  * pointer may be NULL to select the default ring.
4339  */
4340 mblk_t *
4341 mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp)
4342 {
4343         mac_handle_t mh;
4344 
4345         /*
4346          * Once we take a reference on the bridge link, the bridge
4347          * module itself can't unload, so the callback pointers are
4348          * stable.
4349          */
4350         mutex_enter(&mip->mi_bridge_lock);
4351         if ((mh = mip->mi_bridge_link) != NULL)
4352                 mac_bridge_ref_cb(mh, B_TRUE);
4353         mutex_exit(&mip->mi_bridge_lock);
4354         if (mh == NULL) {
4355                 MAC_RING_TX(mip, rh, mp, mp);
4356         } else {
4357                 mp = mac_bridge_tx_cb(mh, rh, mp);
4358                 mac_bridge_ref_cb(mh, B_FALSE);
4359         }
4360 
4361         return (mp);
4362 }
4363 
4364 /*
4365  * Find a ring from its index.
4366  */
4367 mac_ring_handle_t
4368 mac_find_ring(mac_group_handle_t gh, int index)
4369 {
4370         mac_group_t *group = (mac_group_t *)gh;
4371         mac_ring_t *ring = group->mrg_rings;
4372 
4373         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
4374                 if (ring->mr_index == index)
4375                         break;
4376 
4377         return ((mac_ring_handle_t)ring);
4378 }
4379 /*
4380  * Add a ring to an existing group.
4381  *
4382  * The ring must be either passed directly (for example if the ring
4383  * movement is initiated by the framework), or specified through a driver
4384  * index (for example when the ring is added by the driver.
4385  *
4386  * The caller needs to call mac_perim_enter() before calling this function.
4387  */
4388 int
4389 i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
4390 {
4391         mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
4392         mac_capab_rings_t *cap_rings;
4393         boolean_t driver_call = (ring == NULL);
4394         mac_group_type_t group_type;
4395         int ret = 0;
4396         flow_entry_t *flent;
4397 
4398         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4399 
4400         switch (group->mrg_type) {
4401         case MAC_RING_TYPE_RX:
4402                 cap_rings = &mip->mi_rx_rings_cap;
4403                 group_type = mip->mi_rx_group_type;
4404                 break;
4405         case MAC_RING_TYPE_TX:
4406                 cap_rings = &mip->mi_tx_rings_cap;
4407                 group_type = mip->mi_tx_group_type;
4408                 break;
4409         default:
4410                 ASSERT(B_FALSE);
4411         }
4412 
4413         /*
4414          * There should be no ring with the same ring index in the target
4415          * group.
4416          */
4417         ASSERT(mac_find_ring((mac_group_handle_t)group,
4418             driver_call ? index : ring->mr_index) == NULL);
4419 
4420         if (driver_call) {
4421                 /*
4422                  * The function is called as a result of a request from
4423                  * a driver to add a ring to an existing group, for example
4424                  * from the aggregation driver. Allocate a new mac_ring_t
4425                  * for that ring.
4426                  */
4427                 ring = mac_init_ring(mip, group, index, cap_rings);
4428                 ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
4429         } else {
4430                 /*
4431                  * The function is called as a result of a MAC layer request
4432                  * to add a ring to an existing group. In this case the
4433                  * ring is being moved between groups, which requires
4434                  * the underlying driver to support dynamic grouping,
4435                  * and the mac_ring_t already exists.
4436                  */
4437                 ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
4438                 ASSERT(group->mrg_driver == NULL ||
4439                     cap_rings->mr_gaddring != NULL);
4440                 ASSERT(ring->mr_gh == NULL);
4441         }
4442 
4443         /*
4444          * At this point the ring should not be in use, and it should be
4445          * of the right for the target group.
4446          */
4447         ASSERT(ring->mr_state < MR_INUSE);
4448         ASSERT(ring->mr_srs == NULL);
4449         ASSERT(ring->mr_type == group->mrg_type);
4450 
4451         if (!driver_call) {
4452                 /*
4453                  * Add the driver level hardware ring if the process was not
4454                  * initiated by the driver, and the target group is not the
4455                  * group.
4456                  */
4457                 if (group->mrg_driver != NULL) {
4458                         cap_rings->mr_gaddring(group->mrg_driver,
4459                             ring->mr_driver, ring->mr_type);
4460                 }
4461 
4462                 /*
4463                  * Insert the ring ahead existing rings.
4464                  */
4465                 ring->mr_next = group->mrg_rings;
4466                 group->mrg_rings = ring;
4467                 ring->mr_gh = (mac_group_handle_t)group;
4468                 group->mrg_cur_count++;
4469         }
4470 
4471         /*
4472          * If the group has not been actively used, we're done.
4473          */
4474         if (group->mrg_index != -1 &&
4475             group->mrg_state < MAC_GROUP_STATE_RESERVED)
4476                 return (0);
4477 
4478         /*
4479          * Start the ring if needed. Failure causes to undo the grouping action.
4480          */
4481         if (ring->mr_state != MR_INUSE) {
4482                 if ((ret = mac_start_ring(ring)) != 0) {
4483                         if (!driver_call) {
4484                                 cap_rings->mr_gremring(group->mrg_driver,
4485                                     ring->mr_driver, ring->mr_type);
4486                         }
4487                         group->mrg_cur_count--;
4488                         group->mrg_rings = ring->mr_next;
4489 
4490                         ring->mr_gh = NULL;
4491 
4492                         if (driver_call)
4493                                 mac_ring_free(mip, ring);
4494 
4495                         return (ret);
4496                 }
4497         }
4498 
4499         /*
4500          * Set up SRS/SR according to the ring type.
4501          */
4502         switch (ring->mr_type) {
4503         case MAC_RING_TYPE_RX:
4504                 /*
4505                  * Setup SRS on top of the new ring if the group is
4506                  * reserved for someones exclusive use.
4507                  */
4508                 if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
4509                         mac_client_impl_t *mcip;
4510 
4511                         mcip = MAC_GROUP_ONLY_CLIENT(group);
4512                         /*
4513                          * Even though this group is reserved we migth still
4514                          * have multiple clients, i.e a VLAN shares the
4515                          * group with the primary mac client.
4516                          */
4517                         if (mcip != NULL) {
4518                                 flent = mcip->mci_flent;
4519                                 ASSERT(flent->fe_rx_srs_cnt > 0);
4520                                 mac_rx_srs_group_setup(mcip, flent, SRST_LINK);
4521                                 mac_fanout_setup(mcip, flent,
4522                                     MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver,
4523                                     mcip, NULL, NULL);
4524                         } else {
4525                                 ring->mr_classify_type = MAC_SW_CLASSIFIER;
4526                         }
4527                 }
4528                 break;
4529         case MAC_RING_TYPE_TX:
4530         {
4531                 mac_grp_client_t        *mgcp = group->mrg_clients;
4532                 mac_client_impl_t       *mcip;
4533                 mac_soft_ring_set_t     *mac_srs;
4534                 mac_srs_tx_t            *tx;
4535 
4536                 if (MAC_GROUP_NO_CLIENT(group)) {
4537                         if (ring->mr_state == MR_INUSE)
4538                                 mac_stop_ring(ring);
4539                         ring->mr_flag = 0;
4540                         break;
4541                 }
4542                 /*
4543                  * If the rings are being moved to a group that has
4544                  * clients using it, then add the new rings to the
4545                  * clients SRS.
4546                  */
4547                 while (mgcp != NULL) {
4548                         boolean_t       is_aggr;
4549 
4550                         mcip = mgcp->mgc_client;
4551                         flent = mcip->mci_flent;
4552                         is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR);
4553                         mac_srs = MCIP_TX_SRS(mcip);
4554                         tx = &mac_srs->srs_tx;
4555                         mac_tx_client_quiesce((mac_client_handle_t)mcip);
4556                         /*
4557                          * If we are  growing from 1 to multiple rings.
4558                          */
4559                         if (tx->st_mode == SRS_TX_BW ||
4560                             tx->st_mode == SRS_TX_SERIALIZE ||
4561                             tx->st_mode == SRS_TX_DEFAULT) {
4562                                 mac_ring_t      *tx_ring = tx->st_arg2;
4563 
4564                                 tx->st_arg2 = NULL;
4565                                 mac_tx_srs_stat_recreate(mac_srs, B_TRUE);
4566                                 mac_tx_srs_add_ring(mac_srs, tx_ring);
4567                                 if (mac_srs->srs_type & SRST_BW_CONTROL) {
4568                                         tx->st_mode = is_aggr ? SRS_TX_BW_AGGR :
4569                                             SRS_TX_BW_FANOUT;
4570                                 } else {
4571                                         tx->st_mode = is_aggr ? SRS_TX_AGGR :
4572                                             SRS_TX_FANOUT;
4573                                 }
4574                                 tx->st_func = mac_tx_get_func(tx->st_mode);
4575                         }
4576                         mac_tx_srs_add_ring(mac_srs, ring);
4577                         mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
4578                             mac_rx_deliver, mcip, NULL, NULL);
4579                         mac_tx_client_restart((mac_client_handle_t)mcip);
4580                         mgcp = mgcp->mgc_next;
4581                 }
4582                 break;
4583         }
4584         default:
4585                 ASSERT(B_FALSE);
4586         }
4587         /*
4588          * For aggr, the default ring will be NULL to begin with. If it
4589          * is NULL, then pick the first ring that gets added as the
4590          * default ring. Any ring in an aggregation can be removed at
4591          * any time (by the user action of removing a link) and if the
4592          * current default ring gets removed, then a new one gets
4593          * picked (see i_mac_group_rem_ring()).
4594          */
4595         if (mip->mi_state_flags & MIS_IS_AGGR &&
4596             mip->mi_default_tx_ring == NULL &&
4597             ring->mr_type == MAC_RING_TYPE_TX) {
4598                 mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
4599         }
4600 
4601         MAC_RING_UNMARK(ring, MR_INCIPIENT);
4602         return (0);
4603 }
4604 
4605 /*
4606  * Remove a ring from it's current group. MAC internal function for dynamic
4607  * grouping.
4608  *
4609  * The caller needs to call mac_perim_enter() before calling this function.
4610  */
4611 void
4612 i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
4613     boolean_t driver_call)
4614 {
4615         mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
4616         mac_capab_rings_t *cap_rings = NULL;
4617         mac_group_type_t group_type;
4618 
4619         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4620 
4621         ASSERT(mac_find_ring((mac_group_handle_t)group,
4622             ring->mr_index) == (mac_ring_handle_t)ring);
4623         ASSERT((mac_group_t *)ring->mr_gh == group);
4624         ASSERT(ring->mr_type == group->mrg_type);
4625 
4626         if (ring->mr_state == MR_INUSE)
4627                 mac_stop_ring(ring);
4628         switch (ring->mr_type) {
4629         case MAC_RING_TYPE_RX:
4630                 group_type = mip->mi_rx_group_type;
4631                 cap_rings = &mip->mi_rx_rings_cap;
4632 
4633                 /*
4634                  * Only hardware classified packets hold a reference to the
4635                  * ring all the way up the Rx path. mac_rx_srs_remove()
4636                  * will take care of quiescing the Rx path and removing the
4637                  * SRS. The software classified path neither holds a reference
4638                  * nor any association with the ring in mac_rx.
4639                  */
4640                 if (ring->mr_srs != NULL) {
4641                         mac_rx_srs_remove(ring->mr_srs);
4642                         ring->mr_srs = NULL;
4643                 }
4644 
4645                 break;
4646         case MAC_RING_TYPE_TX:
4647         {
4648                 mac_grp_client_t        *mgcp;
4649                 mac_client_impl_t       *mcip;
4650                 mac_soft_ring_set_t     *mac_srs;
4651                 mac_srs_tx_t            *tx;
4652                 mac_ring_t              *rem_ring;
4653                 mac_group_t             *defgrp;
4654                 uint_t                  ring_info = 0;
4655 
4656                 /*
4657                  * For TX this function is invoked in three
4658                  * cases:
4659                  *
4660                  * 1) In the case of a failure during the
4661                  * initial creation of a group when a share is
4662                  * associated with a MAC client. So the SRS is not
4663                  * yet setup, and will be setup later after the
4664                  * group has been reserved and populated.
4665                  *
4666                  * 2) From mac_release_tx_group() when freeing
4667                  * a TX SRS.
4668                  *
4669                  * 3) In the case of aggr, when a port gets removed,
4670                  * the pseudo Tx rings that it exposed gets removed.
4671                  *
4672                  * In the first two cases the SRS and its soft
4673                  * rings are already quiesced.
4674                  */
4675                 if (driver_call) {
4676                         mac_client_impl_t *mcip;
4677                         mac_soft_ring_set_t *mac_srs;
4678                         mac_soft_ring_t *sringp;
4679                         mac_srs_tx_t *srs_tx;
4680 
4681                         if (mip->mi_state_flags & MIS_IS_AGGR &&
4682                             mip->mi_default_tx_ring ==
4683                             (mac_ring_handle_t)ring) {
4684                                 /* pick a new default Tx ring */
4685                                 mip->mi_default_tx_ring =
4686                                     (group->mrg_rings != ring) ?
4687                                     (mac_ring_handle_t)group->mrg_rings :
4688                                     (mac_ring_handle_t)(ring->mr_next);
4689                         }
4690                         /* Presently only aggr case comes here */
4691                         if (group->mrg_state != MAC_GROUP_STATE_RESERVED)
4692                                 break;
4693 
4694                         mcip = MAC_GROUP_ONLY_CLIENT(group);
4695                         ASSERT(mcip != NULL);
4696                         ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR);
4697                         mac_srs = MCIP_TX_SRS(mcip);
4698                         ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
4699                             mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
4700                         srs_tx = &mac_srs->srs_tx;
4701                         /*
4702                          * Wakeup any callers blocked on this
4703                          * Tx ring due to flow control.
4704                          */
4705                         sringp = srs_tx->st_soft_rings[ring->mr_index];
4706                         ASSERT(sringp != NULL);
4707                         mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)sringp);
4708                         mac_tx_client_quiesce((mac_client_handle_t)mcip);
4709                         mac_tx_srs_del_ring(mac_srs, ring);
4710                         mac_tx_client_restart((mac_client_handle_t)mcip);
4711                         break;
4712                 }
4713                 ASSERT(ring != (mac_ring_t *)mip->mi_default_tx_ring);
4714                 group_type = mip->mi_tx_group_type;
4715                 cap_rings = &mip->mi_tx_rings_cap;
4716                 /*
4717                  * See if we need to take it out of the MAC clients using
4718                  * this group
4719                  */
4720                 if (MAC_GROUP_NO_CLIENT(group))
4721                         break;
4722                 mgcp = group->mrg_clients;
4723                 defgrp = MAC_DEFAULT_TX_GROUP(mip);
4724                 while (mgcp != NULL) {
4725                         mcip = mgcp->mgc_client;
4726                         mac_srs = MCIP_TX_SRS(mcip);
4727                         tx = &mac_srs->srs_tx;
4728                         mac_tx_client_quiesce((mac_client_handle_t)mcip);
4729                         /*
4730                          * If we are here when removing rings from the
4731                          * defgroup, mac_reserve_tx_ring would have
4732                          * already deleted the ring from the MAC
4733                          * clients in the group.
4734                          */
4735                         if (group != defgrp) {
4736                                 mac_tx_invoke_callbacks(mcip,
4737                                     (mac_tx_cookie_t)
4738                                     mac_tx_srs_get_soft_ring(mac_srs, ring));
4739                                 mac_tx_srs_del_ring(mac_srs, ring);
4740                         }
4741                         /*
4742                          * Additionally, if  we are left with only
4743                          * one ring in the group after this, we need
4744                          * to modify the mode etc. to. (We haven't
4745                          * yet taken the ring out, so we check with 2).
4746                          */
4747                         if (group->mrg_cur_count == 2) {
4748                                 if (ring->mr_next == NULL)
4749                                         rem_ring = group->mrg_rings;
4750                                 else
4751                                         rem_ring = ring->mr_next;
4752                                 mac_tx_invoke_callbacks(mcip,
4753                                     (mac_tx_cookie_t)
4754                                     mac_tx_srs_get_soft_ring(mac_srs,
4755                                     rem_ring));
4756                                 mac_tx_srs_del_ring(mac_srs, rem_ring);
4757                                 if (rem_ring->mr_state != MR_INUSE) {
4758                                         (void) mac_start_ring(rem_ring);
4759                                 }
4760                                 tx->st_arg2 = (void *)rem_ring;
4761                                 mac_tx_srs_stat_recreate(mac_srs, B_FALSE);
4762                                 ring_info = mac_hwring_getinfo(
4763                                     (mac_ring_handle_t)rem_ring);
4764                                 /*
4765                                  * We are  shrinking from multiple
4766                                  * to 1 ring.
4767                                  */
4768                                 if (mac_srs->srs_type & SRST_BW_CONTROL) {
4769                                         tx->st_mode = SRS_TX_BW;
4770                                 } else if (mac_tx_serialize ||
4771                                     (ring_info & MAC_RING_TX_SERIALIZE)) {
4772                                         tx->st_mode = SRS_TX_SERIALIZE;
4773                                 } else {
4774                                         tx->st_mode = SRS_TX_DEFAULT;
4775                                 }
4776                                 tx->st_func = mac_tx_get_func(tx->st_mode);
4777                         }
4778                         mac_tx_client_restart((mac_client_handle_t)mcip);
4779                         mgcp = mgcp->mgc_next;
4780                 }
4781                 break;
4782         }
4783         default:
4784                 ASSERT(B_FALSE);
4785         }
4786 
4787         /*
4788          * Remove the ring from the group.
4789          */
4790         if (ring == group->mrg_rings)
4791                 group->mrg_rings = ring->mr_next;
4792         else {
4793                 mac_ring_t *pre;
4794 
4795                 pre = group->mrg_rings;
4796                 while (pre->mr_next != ring)
4797                         pre = pre->mr_next;
4798                 pre->mr_next = ring->mr_next;
4799         }
4800         group->mrg_cur_count--;
4801 
4802         if (!driver_call) {
4803                 ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
4804                 ASSERT(group->mrg_driver == NULL ||
4805                     cap_rings->mr_gremring != NULL);
4806 
4807                 /*
4808                  * Remove the driver level hardware ring.
4809                  */
4810                 if (group->mrg_driver != NULL) {
4811                         cap_rings->mr_gremring(group->mrg_driver,
4812                             ring->mr_driver, ring->mr_type);
4813                 }
4814         }
4815 
4816         ring->mr_gh = NULL;
4817         if (driver_call)
4818                 mac_ring_free(mip, ring);
4819         else
4820                 ring->mr_flag = 0;
4821 }
4822 
4823 /*
4824  * Move a ring to the target group. If needed, remove the ring from the group
4825  * that it currently belongs to (ring->mr_gh, which may be NULL). Returns 0 on
4826  * success, otherwise the error from adding the ring to the target group.
4827  * The caller needs to enter MAC's perimeter by calling mac_perim_enter().
4828  */
4829 static int
4830 mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
4831 {
4832         mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
4833         int rv;
4834 
4835         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4836         ASSERT(d_group != NULL);
4837         ASSERT(s_group == NULL || s_group->mrg_mh == d_group->mrg_mh);
4838 
4839         if (s_group == d_group)
4840                 return (0);
4841 
4842         /*
4843          * Remove it from current group first.
4844          */
4845         if (s_group != NULL)
4846                 i_mac_group_rem_ring(s_group, ring, B_FALSE);
4847 
4848         /* Add it to the new group. */
4849         rv = i_mac_group_add_ring(d_group, ring, 0);
4850         if (rv != 0 && s_group != NULL) {
4851                 /*
4852                  * Adding to the target group failed, so put the ring back
4853                  * into its source group; a ring that had no source group
4854                  * has nothing to be restored to. If the restore fails as
4855                  * well, the ring is stuck in limbo; log a message.
4856                  */
4857                 if (i_mac_group_add_ring(s_group, ring, 0) != 0) {
4858                         cmn_err(CE_WARN, "%s: failed to move ring %p\n",
4859                             mip->mi_name, (void *)ring);
4860                 }
4861         }
4862 
4863         return (rv);
4864 }
4865 
4866 /*
4867  * Return mip's address entry whose value equals mac_addr, NULL if none does.
4868  */
4869 mac_address_t *
4870 mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
4871 {
4872         mac_address_t *cur;
4873 
4874         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4875 
4876         /* Walk the list until an entry matches or the list runs out. */
4877         cur = mip->mi_addresses;
4878         while (cur != NULL && bcmp(mac_addr, cur->ma_addr, cur->ma_len) != 0)
4879                 cur = cur->ma_next;
4880 
4881         return (cur);
4882 }
4883 
4884 /*
4885  * Report whether more than one user currently references this MAC address.
4886  */
4887 boolean_t
4888 mac_check_macaddr_shared(mac_address_t *map)
4889 {
4890         ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
4891 
4892         return (map->ma_nusers >= 2 ? B_TRUE : B_FALSE);
4893 }
4894 
4895 /*
4896  * Unlink the given MAC address from its owner's address list and free it.
4897  */
4898 static void
4899 mac_free_macaddr(mac_address_t *map)
4900 {
4901         mac_impl_t *mip = map->ma_mip;
4902         mac_address_t **linkp;
4903 
4904         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4905         ASSERT(mip->mi_addresses != NULL);
4906 
4907         /* Operate on the list's own entry for this address value. */
4908         map = mac_find_macaddr(mip, map->ma_addr);
4909 
4910         ASSERT(map != NULL);
4911         ASSERT(map->ma_nusers == 0);
4912 
4913         /*
4914          * Locate the link (list head or a predecessor's ma_next) that
4915          * points at the entry, then splice the entry out through it.
4916          */
4917         linkp = &mip->mi_addresses;
4918         while (*linkp != map)
4919                 linkp = &(*linkp)->ma_next;
4920         *linkp = map->ma_next;
4921 
4922         kmem_free(map, sizeof (mac_address_t));
4923 }
4924 
4925 /*
4926  * Add a MAC address reference for a client. If the desired MAC address
4927  * exists, add a reference to it. Otherwise, add the new address by adding
4928  * it to a reserved group or, unless use_hw is set (ENOSPC is returned then),
4929  * by setting promiscuous mode. Won't try a different group if the group is
4930  * non-NULL, so the caller must explicitly share default group when needed.
4931  *
4932  * Note, the primary MAC address is initialized at registration time, so
4933  * adding it to the default group only needs to activate it if its reference
4934  * count is still zero. Also, some drivers may not have advertised RINGS
4935  * capability.
4936  */
4937 int
4938 mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
4939     boolean_t use_hw)
4940 {
4941         mac_address_t *map;
4942         int err = 0;
4943         boolean_t allocated_map = B_FALSE;
4944 
4945         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4946 
4947         map = mac_find_macaddr(mip, mac_addr);
4948 
4949         /*
4950          * If the new MAC address has not been added yet, allocate a new
4951          * entry and set it up.
4952          */
4953         if (map == NULL) {
4954                 map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
4955                 map->ma_len = mip->mi_type->mt_addr_length;
4956                 bcopy(mac_addr, map->ma_addr, map->ma_len);
4957                 map->ma_nusers = 0;
4958                 map->ma_group = group;
4959                 map->ma_mip = mip;
4960 
4961                 /* add the new MAC address to the head of the address list */
4962                 map->ma_next = mip->mi_addresses;
4963                 mip->mi_addresses = map;
4964 
4965                 allocated_map = B_TRUE;
4966         }
4967 
4968         ASSERT(map->ma_group == NULL || map->ma_group == group);
4969         if (map->ma_group == NULL)
4970                 map->ma_group = group;
4971 
4972         /*
4973          * If the MAC address is already in use, simply account for the
4974          * new client.
4975          */
4976         if (map->ma_nusers++ > 0)
4977                 return (0);
4978 
4979         /*
4980          * Activate this MAC address by adding it to the reserved group.
4981          */
4982         if (group != NULL) {
4983                 err = mac_group_addmac(group, (const uint8_t *)mac_addr);
4984                 if (err == 0) {
4985                         map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4986                         return (0);
4987                 }
4988         }
4989 
4990         /*
4991          * The MAC address addition failed. If the client requires a
4992          * hardware classified MAC address, fail the operation.
4993          */
4994         if (use_hw) {
4995                 err = ENOSPC;
4996                 goto bail;
4997         }
4998 
4999         /*
5000          * Try promiscuous mode.
5001          *
5002          * For drivers that don't advertise RINGS capability, do
5003          * nothing for the primary address.
5004          */
5005         if ((group == NULL) &&
5006             (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
5007                 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
5008                 return (0);
5009         }
5010 
5011         /*
5012          * Enable promiscuous mode in order to receive traffic
5013          * to the new MAC address.
5014          */
5015         if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) {
5016                 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
5017                 return (0);
5018         }
5019 
5020         /*
5021          * Free the MAC address that could not be added. Don't free
5022          * a pre-existing address, it could have been the entry
5023          * for the primary MAC address which was pre-allocated by
5024          * mac_init_macaddr(), and which must remain on the list.
5025          */
5026 bail:
5027         map->ma_nusers--;
5028         if (allocated_map)
5029                 mac_free_macaddr(map);
5030         return (err);
5031 }
5032 
5033 /*
5034  * Remove a reference to a MAC address. Dropping the last reference removes
5035  * the address from its group or turns off promiscuous mode. On failure the
5036  * user count stays decremented; the caller needs to handle that properly.
5037  */
5038 int
5039 mac_remove_macaddr(mac_address_t *map)
5040 {
5041         mac_impl_t *mip = map->ma_mip;
5042         int err = 0;
5043 
5044         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5045 
5046         ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
5047 
5048         /*
5049          * If it's not the last client using this MAC address, only update
5050          * the MAC clients count.
5051          */
5052         if (--map->ma_nusers > 0)
5053                 return (0);
5054 
5055         /*
5056          * The MAC address is no longer used by any MAC client, so remove
5057          * it from its associated group, or turn off promiscuous mode
5058          * if it was enabled for the MAC address.
5059          */
5060         switch (map->ma_type) {
5061         case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
5062                 /*
5063                  * Don't free the preset primary address for drivers that
5064                  * don't advertise RINGS capability.
5065                  */
5066                 if (map->ma_group == NULL)
5067                         return (0);
5068 
5069                 err = mac_group_remmac(map->ma_group, map->ma_addr);
5070                 if (err == 0)
5071                         map->ma_group = NULL;
5072                 break;
5073         case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
5074                 err = i_mac_promisc_set(mip, B_FALSE);
5075                 break;
5076         default:
5077                 ASSERT(B_FALSE);
5078         }
5079 
5080         if (err != 0)
5081                 return (err);
5082 
5083         /*
5084          * We created MAC address for the primary one at registration, so we
5085          * won't free it here. mac_fini_macaddr() will take care of it.
5086          */
5087         if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
5088                 mac_free_macaddr(map);
5089 
5090         return (0);
5091 }
5092 
5093 /*
5094  * Update an existing MAC address. The caller needs to make sure that the new
5095  * value has not been used.
5096  */
5097 int
5098 mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
5099 {
5100         mac_impl_t *mip = map->ma_mip;
5101         int err = 0;
5102 
5103         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5104         ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
5105 
5106         switch (map->ma_type) {
5107         case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
5108                 /*
5109                  * Update the primary address for drivers that are not
5110                  * RINGS capable.
5111                  */
5112                 if (mip->mi_rx_groups == NULL) {
5113                         err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
5114                             mac_addr);
5115                         if (err != 0)
5116                                 return (err);
5117                         break;
5118                 }
5119 
5120                 /*
5121                  * If this MAC address is not currently in use,
5122                  * simply break out and update the value.
5123                  */
5124                 if (map->ma_nusers == 0)
5125                         break;
5126 
5127                 /*
5128                  * Need to replace the MAC address associated with a group.
5129                  */
5130                 err = mac_group_remmac(map->ma_group, map->ma_addr);
5131                 if (err != 0)
5132                         return (err);
5133 
5134                 err = mac_group_addmac(map->ma_group, mac_addr);
5135 
5136                 /*
5137                  * Failure hints at a hardware error. The MAC layer needs
5138                  * an error notification facility to handle this. For
5139                  * now, simply try to restore the old value.
5140                  */
5141                 if (err != 0)
5142                         (void) mac_group_addmac(map->ma_group, map->ma_addr);
5143 
5144                 break;
5145         case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
5146                 /*
5147                  * Need to do nothing more if in promiscuous mode.
5148                  */
5149                 break;
5150         default:
5151                 ASSERT(B_FALSE);
5152         }
5153 
5154         /*
5155          * Successfully replaced the MAC address.
5156          */
5157         if (err == 0)
5158                 bcopy(mac_addr, map->ma_addr, map->ma_len);
5159 
5160         return (err);
5161 }
5162 
5163 /*
5164  * Record a new value for a MAC address entry and for its mac_impl_t, then
5165  * propagate it to the MAC clients. The hardware MAC address must already
5166  * have been updated by the caller. This function is meant for handling
5167  * MAC address change notifications from underlying drivers.
5168  */
5169 void
5170 mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
5171 {
5172         mac_impl_t *owner = map->ma_mip;
5173 
5174         ASSERT(MAC_PERIM_HELD((mac_handle_t)owner));
5175         ASSERT(mac_find_macaddr(owner, mac_addr) == NULL);
5176 
5177         /*
5178          * Store the new value in both the mac_impl_t and the list entry.
5179          */
5180         bcopy(mac_addr, owner->mi_addr, map->ma_len);
5181         bcopy(mac_addr, map->ma_addr, map->ma_len);
5182 
5183         /*
5184          * Let every MAC client sharing this address pick up the change.
5185          */
5186         mac_unicast_update_clients(owner, map);
5187 }
5188 
5189 /*
5190  * Create the primary MAC address entry as the sole element of mip's list.
5191  */
5192 void
5193 mac_init_macaddr(mac_impl_t *mip)
5194 {
5195         mac_address_t *primary;
5196 
5197         /*
5198          * kmem_zalloc() leaves the reference count at zero; it stays there
5199          * until the address is really activated.
5200          */
5201         primary = kmem_zalloc(sizeof (*primary), KM_SLEEP);
5202         primary->ma_mip = mip;
5203         primary->ma_len = mip->mi_type->mt_addr_length;
5204         bcopy(mip->mi_addr, primary->ma_addr, primary->ma_len);
5205 
5206         /*
5207          * A driver advertising RINGS capability shouldn't have initialized
5208          * its primary MAC address. For other drivers, including VNIC, the
5209          * primary address must work after registration.
5210          */
5211         if (mip->mi_rx_groups == NULL)
5212                 primary->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
5213 
5214         /* Publish the entry as the head of the address list. */
5215         mip->mi_addresses = primary;
5216 }
5217 
5218 /*
5219  * Free the primary MAC address entry, if there is one. Only one primary MAC
5220  * address is allowed, and every other address must already have been freed.
5221  */
5222 void
5223 mac_fini_macaddr(mac_impl_t *mip)
5224 {
5225         mac_address_t *primary = mip->mi_addresses;
5226 
5227         /* Nothing to clean up unless the address list was initialized. */
5228         if (primary != NULL) {
5229                 /*
5230                  * An initialized list should be down to exactly one
5231                  * entry, and that entry should have no users.
5232                  */
5233                 ASSERT(primary->ma_nusers == 0);
5234                 ASSERT(primary->ma_next == NULL);
5235 
5236                 mip->mi_addresses = NULL;
5237                 kmem_free(primary, sizeof (mac_address_t));
5238         }
5239 }
5240 
5241 /*
5242  * Logging related functions.
5243  *
5244  * Note that Kernel statistics have been extended to maintain fine
5245  * granularity of statistics viz. hardware lane, software lane, fanout
5246  * stats etc. However, extended accounting continues to support only
5247  * aggregate statistics like before.
5248  */
5249 
5250 /* Build a netinfo_t flow description record; NULL if allocation fails */
5251 static netinfo_t *
5252 mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
5253 {
5254         netinfo_t               *ninfo;
5255         net_desc_t              *ndesc;
5256         flow_desc_t             *fdesc;
5257         mac_resource_props_t    *mrp;
5258 
5259         ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5260         if (ninfo == NULL)
5261                 return (NULL);
5262         ndesc = kmem_zalloc(sizeof (net_desc_t), KM_NOSLEEP);
5263         if (ndesc == NULL) {
5264                 kmem_free(ninfo, sizeof (netinfo_t));
5265                 return (NULL);
5266         }
5267 
5268         /*
5269          * Grab the fe_lock to see a self-consistent fe_flow_desc.
5270          * Updates to the fe_flow_desc are done under the fe_lock.
5271          */
5272         mutex_enter(&flent->fe_lock);
5273         fdesc = &flent->fe_flow_desc;
5274         mrp = &flent->fe_resource_props;
5275 
5276         ndesc->nd_name = flent->fe_flow_name;   /* pointer, not a copy */
5277         ndesc->nd_devname = mcip->mci_name;     /* pointer, not a copy */
5278         bcopy(fdesc->fd_src_mac, ndesc->nd_ehost, ETHERADDRL);
5279         bcopy(fdesc->fd_dst_mac, ndesc->nd_edest, ETHERADDRL);
5280         ndesc->nd_sap = htonl(fdesc->fd_sap);
5281         ndesc->nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
5282         ndesc->nd_bw_limit = mrp->mrp_maxbw;
5283         if (ndesc->nd_isv4) {           /* v4: only word [3] is filled in */
5284                 ndesc->nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
5285                 ndesc->nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
5286         } else {
5287                 bcopy(&fdesc->fd_local_addr, ndesc->nd_saddr, IPV6_ADDR_LEN);
5288                 bcopy(&fdesc->fd_remote_addr, ndesc->nd_daddr, IPV6_ADDR_LEN);
5289         }
5290         ndesc->nd_sport = htons(fdesc->fd_local_port);
5291         ndesc->nd_dport = htons(fdesc->fd_remote_port);
5292         ndesc->nd_protocol = (uint8_t)fdesc->fd_protocol;
5293         mutex_exit(&flent->fe_lock);
5294 
5295         ninfo->ni_record = ndesc;
5296         ninfo->ni_size = sizeof (net_desc_t);
5297         ninfo->ni_type = EX_NET_FLDESC_REC;
5298 
5299         return (ninfo);
5300 }
5301 
5302 /* Build a netinfo_t flow statistics record; NULL if allocation fails */
5303 static netinfo_t *
5304 mac_write_flow_stats(flow_entry_t *flent)
5305 {
5306         netinfo_t               *ninfo;
5307         net_stat_t              *nstat;
5308         mac_soft_ring_set_t     *mac_srs;
5309         mac_rx_stats_t          *mac_rx_stat;
5310         mac_tx_stats_t          *mac_tx_stat;
5311         int                     i;
5312 
5313         ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5314         if (ninfo == NULL)
5315                 return (NULL);
5316         nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
5317         if (nstat == NULL) {
5318                 kmem_free(ninfo, sizeof (netinfo_t));
5319                 return (NULL);
5320         }
5321 
5322         nstat->ns_name = flent->fe_flow_name;
5323         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
5324                 mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
5325                 mac_rx_stat = &mac_srs->srs_rx.sr_stat;
5326 
5327                 nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
5328                     mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;
5329                 nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
5330                     mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
5331                 nstat->ns_ierrors += mac_rx_stat->mrs_ierrors; /* rx side */
5332         }
5333 
5334         mac_srs = (mac_soft_ring_set_t *)(flent->fe_tx_srs);
5335         if (mac_srs != NULL) {
5336                 mac_tx_stat = &mac_srs->srs_tx.st_stat;
5337 
5338                 nstat->ns_obytes = mac_tx_stat->mts_obytes;
5339                 nstat->ns_opackets = mac_tx_stat->mts_opackets;
5340                 nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
5341         }
5342 
5343         ninfo->ni_record = nstat;
5344         ninfo->ni_size = sizeof (net_stat_t);
5345         ninfo->ni_type = EX_NET_FLSTAT_REC;
5346 
5347         return (ninfo);
5348 }
5349 
5350 /* Build a netinfo_t link description record; NULL if allocation fails */
5351 static netinfo_t *
5352 mac_write_link_desc(mac_client_impl_t *mcip)
5353 {
5354         flow_entry_t            *flent = mcip->mci_flent;
5355         netinfo_t               *rec;
5356         net_desc_t              *desc;
5357 
5358         rec = kmem_zalloc(sizeof (*rec), KM_NOSLEEP);
5359         if (rec == NULL)
5360                 return (NULL);
5361         desc = kmem_zalloc(sizeof (*desc), KM_NOSLEEP);
5362         if (desc == NULL) {
5363                 kmem_free(rec, sizeof (*rec));
5364                 return (NULL);
5365         }
5366 
5367         rec->ni_type = EX_NET_LNDESC_REC;
5368         rec->ni_size = sizeof (*desc);
5369         rec->ni_record = desc;
5370 
5371         desc->nd_isv4 = B_TRUE;
5372         desc->nd_devname = mcip->mci_name;
5373         desc->nd_name = mcip->mci_name;
5374 
5375         /*
5376          * fe_flow_desc is updated under fe_lock, so hold the lock to copy
5377          * a self-consistent source MAC address out of it.
5378          */
5379         mutex_enter(&flent->fe_lock);
5380         bcopy(flent->fe_flow_desc.fd_src_mac, desc->nd_ehost, ETHERADDRL);
5381         mutex_exit(&flent->fe_lock);
5382 
5383         return (rec);
5384 }
5385 
5386 /* Build a netinfo_t link statistics record; NULL if allocation fails */
5387 static netinfo_t *
5388 mac_write_link_stats(mac_client_impl_t *mcip)
5389 {
5390         netinfo_t               *ninfo;
5391         net_stat_t              *nstat;
5392         flow_entry_t            *flent;
5393         mac_soft_ring_set_t     *mac_srs = NULL;
5394         mac_rx_stats_t          *mac_rx_stat;
5395         mac_tx_stats_t          *mac_tx_stat;
5396         int                     i;
5397 
5398         ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5399         if (ninfo == NULL)
5400                 return (NULL);
5401         nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
5402         if (nstat == NULL) {
5403                 kmem_free(ninfo, sizeof (netinfo_t));
5404                 return (NULL);
5405         }
5406 
5407         nstat->ns_name = mcip->mci_name;
5408         flent = mcip->mci_flent;
5409         if (flent != NULL) {
5410                 for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
5411                         mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
5412                         mac_rx_stat = &mac_srs->srs_rx.sr_stat;
5413 
5414                         nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
5415                             mac_rx_stat->mrs_pollbytes +
5416                             mac_rx_stat->mrs_lclbytes;
5417                         nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
5418                             mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
5419                         nstat->ns_ierrors += mac_rx_stat->mrs_ierrors;
5420                 }
5421                 mac_srs = (mac_soft_ring_set_t *)flent->fe_tx_srs;
5422         }
5423         /* mac_srs is still NULL when the client has no flow entry. */
5424         if (mac_srs != NULL) {
5425                 mac_tx_stat = &mac_srs->srs_tx.st_stat;
5426 
5427                 nstat->ns_obytes = mac_tx_stat->mts_obytes;
5428                 nstat->ns_opackets = mac_tx_stat->mts_opackets;
5429                 nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
5430         }
5431 
5432         ninfo->ni_record = nstat;
5433         ninfo->ni_size = sizeof (net_stat_t);
5434         ninfo->ni_type = EX_NET_LNSTAT_REC;
5435 
5436         return (ninfo);
5437 }
5438 
5439 typedef struct i_mac_log_state_s {
5440         boolean_t       mi_last;        /* last walk: clear "logged" flags */
5441         int             mi_fenable;     /* flow logging is enabled */
5442         int             mi_lenable;     /* link logging is enabled */
5443         list_t          *mi_list;       /* netinfo_t records collected */
5444 } i_mac_log_state_t;
5445 
5446 /*
5447  * For a given flow, if the description has not been logged before, do it now.
5448  * If it is a VNIC, then we have collected information about it from the MAC
5449  * table, so skip it. The flow statistics are logged on every call.
5450  *
5451  * Called through mac_flow_walk_nolock().
5452  *
5453  * Return 0 if successful, or -1 if the statistics record can't be allocated.
5454  */
5455 static int
5456 mac_log_flowinfo(flow_entry_t *flent, void *arg)
5457 {
5458         mac_client_impl_t       *mcip = flent->fe_mcip;
5459         i_mac_log_state_t       *lstate = arg;
5460         netinfo_t               *ninfo;
5461 
5462         if (mcip == NULL)
5463                 return (0);
5464 
5465         /*
5466          * If the name starts with "vnic", and the FLOW_USER bit is set in
5467          * fe_type (to exclude the mcast and active flow entries created
5468          * implicitly for a vnic), it is a VNIC flow, i.e. vnic1 is a vnic
5469          * flow, vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
5470          */
5471         if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
5472             (flent->fe_type & FLOW_USER) != 0) {
5473                 return (0);
5474         }
5475 
5476         if (!flent->fe_desc_logged) {
5477                 /*
5478                  * We don't return error because we want to continue the
5479                  * walk in case this is the last walk which means we
5480                  * need to reset fe_desc_logged in all the flows.
5481                  */
5482                 if ((ninfo = mac_write_flow_desc(flent, mcip)) == NULL)
5483                         return (0);
5484                 list_insert_tail(lstate->mi_list, ninfo);
5485                 flent->fe_desc_logged = B_TRUE;
5486         }
5487 
5488         /*
5489          * NOTE(review): the intent is to proceed regardless of errors so
5490          * fe_desc_logged can be reset, yet a failure here returns early.
5491          */
5492         ninfo = mac_write_flow_stats(flent);
5493         if (ninfo == NULL)
5494                 return (-1);
5495 
5496         list_insert_tail(lstate->mi_list, ninfo);
5497 
5498         if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
5499                 flent->fe_desc_logged = B_FALSE;
5500 
5501         return (0);
5502 }
5503 
5504 /*
5505  * Log the description for each mac client of this mac_impl_t, if it
5506  * hasn't already been done. Additionally, log statistics for the link as
5507  * well. Walk the flow table and log information for each flow as well.
5508  * If it is the last walk (mi_last), then we turn off MCIS_DESC_LOGGED (and
5509  * also fe_desc_logged, if flow logging is on) since we want to log the
5510  * description if and when logging is restarted.
5511  *
5512  * Return 0 upon success or -1 upon failure
5513  */
5514 static int
5515 i_mac_impl_log(mac_impl_t *mip, i_mac_log_state_t *lstate)
5516 {
5517         mac_client_impl_t       *mcip;
5518         netinfo_t               *ninfo;
5519 
5520         i_mac_perim_enter(mip);
5521         /*
5522          * Only walk the client list for NIC and etherstub
5523          */
5524         if ((mip->mi_state_flags & MIS_DISABLED) ||
5525             ((mip->mi_state_flags & MIS_IS_VNIC) &&
5526             (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL))) {
5527                 i_mac_perim_exit(mip);
5528                 return (0);
5529         }
5530 
5531         for (mcip = mip->mi_clients_list; mcip != NULL;
5532             mcip = mcip->mci_client_next) {
5533                 if (!MCIP_DATAPATH_SETUP(mcip))
5534                         continue;
5535                 if (lstate->mi_lenable) {
5536                         if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
5537                                 ninfo = mac_write_link_desc(mcip);
5538                                 if (ninfo == NULL) {
5539                                 /*
5540                                  * We can't terminate it if this is the last
5541                                  * walk, else there might be some links with
5542                                  * MCIS_DESC_LOGGED still set, which means
5543                                  * their description won't be logged the next
5544                                  * time logging is started (similarly for the
5545                                  * flows within such links). We can continue
5546                                  * without walking the flow table (i.e. to
5547                                  * set fe_desc_logged to false) because we
5548                                  * won't have written any flow stuff for this
5549                                  * link as we haven't logged the link itself.
5550                                  */
5551                                         if (lstate->mi_last)
5552                                                 continue;
5553                                         i_mac_perim_exit(mip);
5554                                         return (-1);
5555                                 }
5556                                 mcip->mci_state_flags |= MCIS_DESC_LOGGED;
5557                                 list_insert_tail(lstate->mi_list, ninfo);
5558                         }
5559                 }
5560 
5561                 /*
5562                  * Tolerate a failed allocation only on the last walk, and
5563                  * never put a NULL record on the list.
5564                  */
5565                 ninfo = mac_write_link_stats(mcip);
5566                 if (ninfo != NULL) {
5567                         list_insert_tail(lstate->mi_list, ninfo);
5568                 } else if (!lstate->mi_last) {
5569                         i_mac_perim_exit(mip);
5570                         return (-1);
5571                 }
5572                 if (lstate->mi_last)
5573                         mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
5574 
5575                 if (lstate->mi_fenable && mcip->mci_subflow_tab != NULL) {
5576                         (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
5577                             mac_log_flowinfo, lstate);
5578                 }
5579         }
5580         i_mac_perim_exit(mip);
5581         return (0);
5582 }
5583 
5584 /*
5585  * modhash walker: append each enabled mac_impl_t to a list, bumping mi_ref
5586  */
5587 /*ARGSUSED*/
5588 static uint_t
5589 i_mac_impl_list_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
5590 {
5591         mac_impl_t              *mip = (mac_impl_t *)val;
5592 
5593         /* Disabled mac_impl_ts are left off the list and unreferenced. */
5594         if (mip->mi_state_flags & MIS_DISABLED)
5595                 return (MH_WALK_CONTINUE);
5596 
5597         mip->mi_ref++;
5598         list_insert_tail((list_t *)arg, mip);
5599         return (MH_WALK_CONTINUE);
5600 }
5601 
5602 /*
5603  * Run i_mac_impl_log() on every listed mac_impl_t, then commit and free the
5604  * records on net_log_list. Entered with i_mac_impl_lock held; returns with
5605  * it released.
5606  */
5607 void
5608 i_mac_log_info(list_t *net_log_list, i_mac_log_state_t *lstate)
5609 {
5610         list_t                  mac_impl_list;
5611         mac_impl_t              *mip;
5612         netinfo_t               *ninfo;
5613 
5614         /* Snapshot the mac_impls, then let go of the lock the caller took */
5615         ASSERT(RW_LOCK_HELD(&i_mac_impl_lock));
5616         list_create(&mac_impl_list, sizeof (mac_impl_t), offsetof(mac_impl_t,
5617             mi_node));
5618         mod_hash_walk(i_mac_impl_hash, i_mac_impl_list_walker, &mac_impl_list);
5619         rw_exit(&i_mac_impl_lock);
5620 
5621         /* Create log entries for each mac_impl; failures are not reported */
5622         for (mip = list_head(&mac_impl_list); mip != NULL;
5623             mip = list_next(&mac_impl_list, mip))
5624                 (void) i_mac_impl_log(mip, lstate);
5625 
5626         /* Drop the references taken by the walker and destroy the list */
5627         rw_enter(&i_mac_impl_lock, RW_WRITER);
5628         while ((mip = list_remove_tail(&mac_impl_list)) != NULL)
5629                 mip->mi_ref--;
5630         rw_exit(&i_mac_impl_lock);
5631         list_destroy(&mac_impl_list);
5632 
5633         /* Commit the records outside of locks, freeing each one after */
5634         for (ninfo = list_head(net_log_list); ninfo != NULL;
5635             ninfo = list_head(net_log_list)) {
5636                 (void) exacct_commit_netinfo(ninfo->ni_record, ninfo->ni_type);
5637                 list_remove(net_log_list, ninfo);
5638                 kmem_free(ninfo->ni_record, ninfo->ni_size);
5639                 kmem_free(ninfo, sizeof (*ninfo));
5640         }
5641         list_destroy(net_log_list);
5642 }
5643 
5644 /*
5645  * The timer thread that runs every mac_logging_interval seconds and logs
5646  * link and/or flow information.
5647  */
5648 /* ARGSUSED */
5649 void
5650 mac_log_linkinfo(void *arg)
5651 {
5652         i_mac_log_state_t       lstate;
5653         list_t                  net_log_list;
5654 
5655         rw_enter(&i_mac_impl_lock, RW_READER);
5656         if (!mac_flow_log_enable && !mac_link_log_enable) {
5657                 rw_exit(&i_mac_impl_lock);
5658                 return;
5659         }
5660         /* Only set up the list once we know it will be used and destroyed */
5661         list_create(&net_log_list, sizeof (netinfo_t),
5662             offsetof(netinfo_t, ni_link));
5663         lstate.mi_fenable = mac_flow_log_enable;
5664         lstate.mi_lenable = mac_link_log_enable;
5665         lstate.mi_last = B_FALSE;
5666         lstate.mi_list = &net_log_list;
5667 
5668         /* Log each mac_impl; i_mac_log_info() releases i_mac_impl_lock */
5669         i_mac_log_info(&net_log_list, &lstate);
5670 
5671         if (mac_flow_log_enable || mac_link_log_enable) {
5672                 mac_logging_timer = timeout(mac_log_linkinfo, NULL,
5673                     SEC_TO_TICK(mac_logging_interval));
5674         }
5675 }
5676 
5677 typedef struct i_mac_fastpath_state_s {
5678         boolean_t       mf_disable;
5679         int             mf_err;
5680 } i_mac_fastpath_state_t;
5681 
5682 /* modhash walker function to enable or disable fastpath */
5683 /*ARGSUSED*/
5684 static uint_t
5685 i_mac_fastpath_walker(mod_hash_key_t key, mod_hash_val_t *val,
5686     void *arg)
5687 {
5688         i_mac_fastpath_state_t  *state = arg;
5689         mac_handle_t            mh = (mac_handle_t)val;
5690 
5691         if (state->mf_disable)
5692                 state->mf_err = mac_fastpath_disable(mh);
5693         else
5694                 mac_fastpath_enable(mh);
5695 
5696         return (state->mf_err == 0 ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
5697 }
5698 
5699 /*
5700  * Start the logging timer.
5701  */
5702 int
5703 mac_start_logusage(mac_logtype_t type, uint_t interval)
5704 {
5705         i_mac_fastpath_state_t  dstate = {B_TRUE, 0};
5706         i_mac_fastpath_state_t  estate = {B_FALSE, 0};
5707         int                     err;
5708 
5709         rw_enter(&i_mac_impl_lock, RW_WRITER);
5710         switch (type) {
5711         case MAC_LOGTYPE_FLOW:
5712                 if (mac_flow_log_enable) {
5713                         rw_exit(&i_mac_impl_lock);
5714                         return (0);
5715                 }
5716                 /* FALLTHRU */
5717         case MAC_LOGTYPE_LINK:
5718                 if (mac_link_log_enable) {
5719                         rw_exit(&i_mac_impl_lock);
5720                         return (0);
5721                 }
5722                 break;
5723         default:
5724                 ASSERT(0);
5725         }
5726 
5727         /* Disable fastpath */
5728         mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &dstate);
5729         if ((err = dstate.mf_err) != 0) {
5730                 /* Reenable fastpath  */
5731                 mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
5732                 rw_exit(&i_mac_impl_lock);
5733                 return (err);
5734         }
5735 
5736         switch (type) {
5737         case MAC_LOGTYPE_FLOW:
5738                 mac_flow_log_enable = B_TRUE;
5739                 /* FALLTHRU */
5740         case MAC_LOGTYPE_LINK:
5741                 mac_link_log_enable = B_TRUE;
5742                 break;
5743         }
5744 
5745         mac_logging_interval = interval;
5746         rw_exit(&i_mac_impl_lock);
5747         mac_log_linkinfo(NULL);
5748         return (0);
5749 }
5750 
5751 /*
5752  * Stop the logging timer if both link and flow logging are turned off.
5753  */
5754 void
5755 mac_stop_logusage(mac_logtype_t type)
5756 {
5757         i_mac_log_state_t       lstate;
5758         i_mac_fastpath_state_t  estate = {B_FALSE, 0};
5759         list_t                  net_log_list;
5760 
5761         list_create(&net_log_list, sizeof (netinfo_t),
5762             offsetof(netinfo_t, ni_link));
5763 
5764         rw_enter(&i_mac_impl_lock, RW_WRITER);
5765 
5766         lstate.mi_fenable = mac_flow_log_enable;
5767         lstate.mi_lenable = mac_link_log_enable;
5768         lstate.mi_list = &net_log_list;
5769 
5770         /* Last walk */
5771         lstate.mi_last = B_TRUE;
5772 
5773         switch (type) {
5774         case MAC_LOGTYPE_FLOW:
5775                 if (lstate.mi_fenable) {
5776                         ASSERT(mac_link_log_enable);
5777                         mac_flow_log_enable = B_FALSE;
5778                         mac_link_log_enable = B_FALSE;
5779                         break;
5780                 }
5781                 /* FALLTHRU */
5782         case MAC_LOGTYPE_LINK:
5783                 if (!lstate.mi_lenable || mac_flow_log_enable) {
5784                         rw_exit(&i_mac_impl_lock);
5785                         return;
5786                 }
5787                 mac_link_log_enable = B_FALSE;
5788                 break;
5789         default:
5790                 ASSERT(0);
5791         }
5792 
5793         /* Reenable fastpath */
5794         mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
5795 
5796         (void) untimeout(mac_logging_timer);
5797         mac_logging_timer = 0;
5798 
5799         /* Write log entries for each mac_impl in the list */
5800         i_mac_log_info(&net_log_list, &lstate);
5801 }
5802 
5803 /*
5804  * Walk the rx and tx SRS/SRs for a flow and update the priority value.
5805  */
5806 void
5807 mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
5808 {
5809         pri_t                   pri;
5810         int                     count;
5811         mac_soft_ring_set_t     *mac_srs;
5812 
5813         if (flent->fe_rx_srs_cnt <= 0)
5814                 return;
5815 
5816         if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
5817             SRST_FLOW) {
5818                 pri = FLOW_PRIORITY(mcip->mci_min_pri,
5819                     mcip->mci_max_pri,
5820                     flent->fe_resource_props.mrp_priority);
5821         } else {
5822                 pri = mcip->mci_max_pri;
5823         }
5824 
5825         for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
5826                 mac_srs = flent->fe_rx_srs[count];
5827                 mac_update_srs_priority(mac_srs, pri);
5828         }
5829         /*
5830          * If we have a Tx SRS, we need to modify all the threads associated
5831          * with it.
5832          */
5833         if (flent->fe_tx_srs != NULL)
5834                 mac_update_srs_priority(flent->fe_tx_srs, pri);
5835 }
5836 
/*
 * RX and TX rings are reserved according to different semantics depending
 * on the requests from the MAC clients and type of rings:
 *
 * On the Tx side, by default we reserve individual rings, independently from
 * the groups.
 *
 * On the Rx side, the reservation is at the granularity of the group
 * of rings, and is used for v12n level 1 only. It has a special case for the
 * primary client.
 *
 * If a share is allocated to a MAC client, we allocate a TX group and an
 * RX group to the client, and assign TX rings and RX rings to these
 * groups according to information gathered from the driver through
 * the share capability.
 *
 * The foreseeable evolution of Rx rings will handle v12n level 2 and higher
 * to allocate individual rings out of a group and program the hw classifier
 * based on IP address or higher level criteria.
 */
5857 
5858 /*
5859  * mac_reserve_tx_ring()
5860  * Reserve a unused ring by marking it with MR_INUSE state.
5861  * As reserved, the ring is ready to function.
5862  *
5863  * Notes for Hybrid I/O:
5864  *
5865  * If a specific ring is needed, it is specified through the desired_ring
5866  * argument. Otherwise that argument is set to NULL.
5867  * If the desired ring was previous allocated to another client, this
5868  * function swaps it with a new ring from the group of unassigned rings.
5869  */
5870 mac_ring_t *
5871 mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
5872 {
5873         mac_group_t             *group;
5874         mac_grp_client_t        *mgcp;
5875         mac_client_impl_t       *mcip;
5876         mac_soft_ring_set_t     *srs;
5877 
5878         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5879 
5880         /*
5881          * Find an available ring and start it before changing its status.
5882          * The unassigned rings are at the end of the mi_tx_groups
5883          * array.
5884          */
5885         group = MAC_DEFAULT_TX_GROUP(mip);
5886 
5887         /* Can't take the default ring out of the default group */
5888         ASSERT(desired_ring != (mac_ring_t *)mip->mi_default_tx_ring);
5889 
5890         if (desired_ring->mr_state == MR_FREE) {
5891                 ASSERT(MAC_GROUP_NO_CLIENT(group));
5892                 if (mac_start_ring(desired_ring) != 0)
5893                         return (NULL);
5894                 return (desired_ring);
5895         }
5896         /*
5897          * There are clients using this ring, so let's move the clients
5898          * away from using this ring.
5899          */
5900         for (mgcp = group->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
5901                 mcip = mgcp->mgc_client;
5902                 mac_tx_client_quiesce((mac_client_handle_t)mcip);
5903                 srs = MCIP_TX_SRS(mcip);
5904                 ASSERT(mac_tx_srs_ring_present(srs, desired_ring));
5905                 mac_tx_invoke_callbacks(mcip,
5906                     (mac_tx_cookie_t)mac_tx_srs_get_soft_ring(srs,
5907                     desired_ring));
5908                 mac_tx_srs_del_ring(srs, desired_ring);
5909                 mac_tx_client_restart((mac_client_handle_t)mcip);
5910         }
5911         return (desired_ring);
5912 }
5913 
5914 /*
5915  * For a reserved group with multiple clients, return the primary client.
5916  */
5917 static mac_client_impl_t *
5918 mac_get_grp_primary(mac_group_t *grp)
5919 {
5920         mac_grp_client_t        *mgcp = grp->mrg_clients;
5921         mac_client_impl_t       *mcip;
5922 
5923         while (mgcp != NULL) {
5924                 mcip = mgcp->mgc_client;
5925                 if (mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC)
5926                         return (mcip);
5927                 mgcp = mgcp->mgc_next;
5928         }
5929         return (NULL);
5930 }
5931 
5932 /*
5933  * Hybrid I/O specifies the ring that should be given to a share.
5934  * If the ring is already used by clients, then we need to release
5935  * the ring back to the default group so that we can give it to
5936  * the share. This means the clients using this ring now get a
5937  * replacement ring. If there aren't any replacement rings, this
5938  * function returns a failure.
5939  */
5940 static int
5941 mac_reclaim_ring_from_grp(mac_impl_t *mip, mac_ring_type_t ring_type,
5942     mac_ring_t *ring, mac_ring_t **rings, int nrings)
5943 {
5944         mac_group_t             *group = (mac_group_t *)ring->mr_gh;
5945         mac_resource_props_t    *mrp;
5946         mac_client_impl_t       *mcip;
5947         mac_group_t             *defgrp;
5948         mac_ring_t              *tring;
5949         mac_group_t             *tgrp;
5950         int                     i;
5951         int                     j;
5952 
5953         mcip = MAC_GROUP_ONLY_CLIENT(group);
5954         if (mcip == NULL)
5955                 mcip = mac_get_grp_primary(group);
5956         ASSERT(mcip != NULL);
5957         ASSERT(mcip->mci_share == NULL);
5958 
5959         mrp = MCIP_RESOURCE_PROPS(mcip);
5960         if (ring_type == MAC_RING_TYPE_RX) {
5961                 defgrp = mip->mi_rx_donor_grp;
5962                 if ((mrp->mrp_mask & MRP_RX_RINGS) == 0) {
5963                         /* Need to put this mac client in the default group */
5964                         if (mac_rx_switch_group(mcip, group, defgrp) != 0)
5965                                 return (ENOSPC);
5966                 } else {
5967                         /*
5968                          * Switch this ring with some other ring from
5969                          * the default group.
5970                          */
5971                         for (tring = defgrp->mrg_rings; tring != NULL;
5972                             tring = tring->mr_next) {
5973                                 if (tring->mr_index == 0)
5974                                         continue;
5975                                 for (j = 0; j < nrings; j++) {
5976                                         if (rings[j] == tring)
5977                                                 break;
5978                                 }
5979                                 if (j >= nrings)
5980                                         break;
5981                         }
5982                         if (tring == NULL)
5983                                 return (ENOSPC);
5984                         if (mac_group_mov_ring(mip, group, tring) != 0)
5985                                 return (ENOSPC);
5986                         if (mac_group_mov_ring(mip, defgrp, ring) != 0) {
5987                                 (void) mac_group_mov_ring(mip, defgrp, tring);
5988                                 return (ENOSPC);
5989                         }
5990                 }
5991                 ASSERT(ring->mr_gh == (mac_group_handle_t)defgrp);
5992                 return (0);
5993         }
5994 
5995         defgrp = MAC_DEFAULT_TX_GROUP(mip);
5996         if (ring == (mac_ring_t *)mip->mi_default_tx_ring) {
5997                 /*
5998                  * See if we can get a spare ring to replace the default
5999                  * ring.
6000                  */
6001                 if (defgrp->mrg_cur_count == 1) {
6002                         /*
6003                          * Need to get a ring from another client, see if
6004                          * there are any clients that can be moved to
6005                          * the default group, thereby freeing some rings.
6006                          */
6007                         for (i = 0; i < mip->mi_tx_group_count; i++) {
6008                                 tgrp = &mip->mi_tx_groups[i];
6009                                 if (tgrp->mrg_state ==
6010                                     MAC_GROUP_STATE_REGISTERED) {
6011                                         continue;
6012                                 }
6013                                 mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
6014                                 if (mcip == NULL)
6015                                         mcip = mac_get_grp_primary(tgrp);
6016                                 ASSERT(mcip != NULL);
6017                                 mrp = MCIP_RESOURCE_PROPS(mcip);
6018                                 if ((mrp->mrp_mask & MRP_TX_RINGS) == 0) {
6019                                         ASSERT(tgrp->mrg_cur_count == 1);
6020                                         /*
6021                                          * If this ring is part of the
6022                                          * rings asked by the share we cannot
6023                                          * use it as the default ring.
6024                                          */
6025                                         for (j = 0; j < nrings; j++) {
6026                                                 if (rings[j] == tgrp->mrg_rings)
6027                                                         break;
6028                                         }
6029                                         if (j < nrings)
6030                                                 continue;
6031                                         mac_tx_client_quiesce(
6032                                             (mac_client_handle_t)mcip);
6033                                         mac_tx_switch_group(mcip, tgrp,
6034                                             defgrp);
6035                                         mac_tx_client_restart(
6036                                             (mac_client_handle_t)mcip);
6037                                         break;
6038                                 }
6039                         }
6040                         /*
6041                          * All the rings are reserved, can't give up the
6042                          * default ring.
6043                          */
6044                         if (defgrp->mrg_cur_count <= 1)
6045                                 return (ENOSPC);
6046                 }
6047                 /*
6048                  * Swap the default ring with another.
6049                  */
6050                 for (tring = defgrp->mrg_rings; tring != NULL;
6051                     tring = tring->mr_next) {
6052                         /*
6053                          * If this ring is part of the rings asked by the
6054                          * share we cannot use it as the default ring.
6055                          */
6056                         for (j = 0; j < nrings; j++) {
6057                                 if (rings[j] == tring)
6058                                         break;
6059                         }
6060                         if (j >= nrings)
6061                                 break;
6062                 }
6063                 ASSERT(tring != NULL);
6064                 mip->mi_default_tx_ring = (mac_ring_handle_t)tring;
6065                 return (0);
6066         }
6067         /*
6068          * The Tx ring is with a group reserved by a MAC client. See if
6069          * we can swap it.
6070          */
6071         ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
6072         mcip = MAC_GROUP_ONLY_CLIENT(group);
6073         if (mcip == NULL)
6074                 mcip = mac_get_grp_primary(group);
6075         ASSERT(mcip !=  NULL);
6076         mrp = MCIP_RESOURCE_PROPS(mcip);
6077         mac_tx_client_quiesce((mac_client_handle_t)mcip);
6078         if ((mrp->mrp_mask & MRP_TX_RINGS) == 0) {
6079                 ASSERT(group->mrg_cur_count == 1);
6080                 /* Put this mac client in the default group */
6081                 mac_tx_switch_group(mcip, group, defgrp);
6082         } else {
6083                 /*
6084                  * Switch this ring with some other ring from
6085                  * the default group.
6086                  */
6087                 for (tring = defgrp->mrg_rings; tring != NULL;
6088                     tring = tring->mr_next) {
6089                         if (tring == (mac_ring_t *)mip->mi_default_tx_ring)
6090                                 continue;
6091                         /*
6092                          * If this ring is part of the rings asked by the
6093                          * share we cannot use it for swapping.
6094                          */
6095                         for (j = 0; j < nrings; j++) {
6096                                 if (rings[j] == tring)
6097                                         break;
6098                         }
6099                         if (j >= nrings)
6100                                 break;
6101                 }
6102                 if (tring == NULL) {
6103                         mac_tx_client_restart((mac_client_handle_t)mcip);
6104                         return (ENOSPC);
6105                 }
6106                 if (mac_group_mov_ring(mip, group, tring) != 0) {
6107                         mac_tx_client_restart((mac_client_handle_t)mcip);
6108                         return (ENOSPC);
6109                 }
6110                 if (mac_group_mov_ring(mip, defgrp, ring) != 0) {
6111                         (void) mac_group_mov_ring(mip, defgrp, tring);
6112                         mac_tx_client_restart((mac_client_handle_t)mcip);
6113                         return (ENOSPC);
6114                 }
6115         }
6116         mac_tx_client_restart((mac_client_handle_t)mcip);
6117         ASSERT(ring->mr_gh == (mac_group_handle_t)defgrp);
6118         return (0);
6119 }
6120 
6121 /*
6122  * Populate a zero-ring group with rings. If the share is non-NULL,
6123  * the rings are chosen according to that share.
6124  * Invoked after allocating a new RX or TX group through
6125  * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
6126  * Returns zero on success, an errno otherwise.
6127  */
6128 int
6129 i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
6130     mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share,
6131     uint32_t ringcnt)
6132 {
6133         mac_ring_t **rings, *ring;
6134         uint_t nrings;
6135         int rv = 0, i = 0, j;
6136 
6137         ASSERT((ring_type == MAC_RING_TYPE_RX &&
6138             mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) ||
6139             (ring_type == MAC_RING_TYPE_TX &&
6140             mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC));
6141 
6142         /*
6143          * First find the rings to allocate to the group.
6144          */
6145         if (share != NULL) {
6146                 /* get rings through ms_squery() */
6147                 mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
6148                 ASSERT(nrings != 0);
6149                 rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
6150                     KM_SLEEP);
6151                 mip->mi_share_capab.ms_squery(share, ring_type,
6152                     (mac_ring_handle_t *)rings, &nrings);
6153                 for (i = 0; i < nrings; i++) {
6154                         /*
6155                          * If we have given this ring to a non-default
6156                          * group, we need to check if we can get this
6157                          * ring.
6158                          */
6159                         ring = rings[i];
6160                         if (ring->mr_gh != (mac_group_handle_t)src_group ||
6161                             ring == (mac_ring_t *)mip->mi_default_tx_ring) {
6162                                 if (mac_reclaim_ring_from_grp(mip, ring_type,
6163                                     ring, rings, nrings) != 0) {
6164                                         rv = ENOSPC;
6165                                         goto bail;
6166                                 }
6167                         }
6168                 }
6169         } else {
6170                 /*
6171                  * Pick one ring from default group.
6172                  *
6173                  * for now pick the second ring which requires the first ring
6174                  * at index 0 to stay in the default group, since it is the
6175                  * ring which carries the multicast traffic.
6176                  * We need a better way for a driver to indicate this,
6177                  * for example a per-ring flag.
6178                  */
6179                 rings = kmem_alloc(ringcnt * sizeof (mac_ring_handle_t),
6180                     KM_SLEEP);
6181                 for (ring = src_group->mrg_rings; ring != NULL;
6182                     ring = ring->mr_next) {
6183                         if (ring_type == MAC_RING_TYPE_RX &&
6184                             ring->mr_index == 0) {
6185                                 continue;
6186                         }
6187                         if (ring_type == MAC_RING_TYPE_TX &&
6188                             ring == (mac_ring_t *)mip->mi_default_tx_ring) {
6189                                 continue;
6190                         }
6191                         rings[i++] = ring;
6192                         if (i == ringcnt)
6193                                 break;
6194                 }
6195                 ASSERT(ring != NULL);
6196                 nrings = i;
6197                 /* Not enough rings as required */
6198                 if (nrings != ringcnt) {
6199                         rv = ENOSPC;
6200                         goto bail;
6201                 }
6202         }
6203 
6204         switch (ring_type) {
6205         case MAC_RING_TYPE_RX:
6206                 if (src_group->mrg_cur_count - nrings < 1) {
6207                         /* we ran out of rings */
6208                         rv = ENOSPC;
6209                         goto bail;
6210                 }
6211 
6212                 /* move receive rings to new group */
6213                 for (i = 0; i < nrings; i++) {
6214                         rv = mac_group_mov_ring(mip, new_group, rings[i]);
6215                         if (rv != 0) {
6216                                 /* move rings back on failure */
6217                                 for (j = 0; j < i; j++) {
6218                                         (void) mac_group_mov_ring(mip,
6219                                             src_group, rings[j]);
6220                                 }
6221                                 goto bail;
6222                         }
6223                 }
6224                 break;
6225 
6226         case MAC_RING_TYPE_TX: {
6227                 mac_ring_t *tmp_ring;
6228 
6229                 /* move the TX rings to the new group */
6230                 for (i = 0; i < nrings; i++) {
6231                         /* get the desired ring */
6232                         tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
6233                         if (tmp_ring == NULL) {
6234                                 rv = ENOSPC;
6235                                 goto bail;
6236                         }
6237                         ASSERT(tmp_ring == rings[i]);
6238                         rv = mac_group_mov_ring(mip, new_group, rings[i]);
6239                         if (rv != 0) {
6240                                 /* cleanup on failure */
6241                                 for (j = 0; j < i; j++) {
6242                                         (void) mac_group_mov_ring(mip,
6243                                             MAC_DEFAULT_TX_GROUP(mip),
6244                                             rings[j]);
6245                                 }
6246                                 goto bail;
6247                         }
6248                 }
6249                 break;
6250         }
6251         }
6252 
6253         /* add group to share */
6254         if (share != NULL)
6255                 mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
6256 
6257 bail:
6258         /* free temporary array of rings */
6259         kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
6260 
6261         return (rv);
6262 }
6263 
6264 void
6265 mac_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
6266 {
6267         mac_grp_client_t *mgcp;
6268 
6269         for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
6270                 if (mgcp->mgc_client == mcip)
6271                         break;
6272         }
6273 
6274         VERIFY(mgcp == NULL);
6275 
6276         mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
6277         mgcp->mgc_client = mcip;
6278         mgcp->mgc_next = grp->mrg_clients;
6279         grp->mrg_clients = mgcp;
6280 
6281 }
6282 
6283 void
6284 mac_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
6285 {
6286         mac_grp_client_t *mgcp, **pprev;
6287 
6288         for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
6289             pprev = &mgcp->mgc_next, mgcp = *pprev) {
6290                 if (mgcp->mgc_client == mcip)
6291                         break;
6292         }
6293 
6294         ASSERT(mgcp != NULL);
6295 
6296         *pprev = mgcp->mgc_next;
6297         kmem_free(mgcp, sizeof (mac_grp_client_t));
6298 }
6299 
6300 /*
6301  * mac_reserve_rx_group()
6302  *
6303  * Finds an available group and exclusively reserves it for a client.
6304  * The group is chosen to suit the flow's resource controls (bandwidth and
6305  * fanout requirements) and the address type.
6306  * If the requestor is the pimary MAC then return the group with the
6307  * largest number of rings, otherwise the default ring when available.
6308  */
6309 mac_group_t *
6310 mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
6311 {
6312         mac_share_handle_t      share = mcip->mci_share;
6313         mac_impl_t              *mip = mcip->mci_mip;
6314         mac_group_t             *grp = NULL;
6315         int                     i;
6316         int                     err = 0;
6317         mac_address_t           *map;
6318         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
6319         int                     nrings;
6320         int                     donor_grp_rcnt;
6321         boolean_t               need_exclgrp = B_FALSE;
6322         int                     need_rings = 0;
6323         mac_group_t             *candidate_grp = NULL;
6324         mac_client_impl_t       *gclient;
6325         mac_resource_props_t    *gmrp;
6326         mac_group_t             *donorgrp = NULL;
6327         boolean_t               rxhw = mrp->mrp_mask & MRP_RX_RINGS;
6328         boolean_t               unspec = mrp->mrp_mask & MRP_RXRINGS_UNSPEC;
6329         boolean_t               isprimary;
6330 
6331         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
6332 
6333         isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
6334 
6335         /*
6336          * Check if a group already has this mac address (case of VLANs)
6337          * unless we are moving this MAC client from one group to another.
6338          */
6339         if (!move && (map = mac_find_macaddr(mip, mac_addr)) != NULL) {
6340                 if (map->ma_group != NULL)
6341                         return (map->ma_group);
6342         }
6343         if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0)
6344                 return (NULL);
6345         /*
6346          * If exclusive open, return NULL which will enable the
6347          * caller to use the default group.
6348          */
6349         if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
6350                 return (NULL);
6351 
6352         /* For dynamic groups default unspecified to 1 */
6353         if (rxhw && unspec &&
6354             mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6355                 mrp->mrp_nrxrings = 1;
6356         }
6357         /*
6358          * For static grouping we allow only specifying rings=0 and
6359          * unspecified
6360          */
6361         if (rxhw && mrp->mrp_nrxrings > 0 &&
6362             mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) {
6363                 return (NULL);
6364         }
6365         if (rxhw) {
6366                 /*
6367                  * We have explicitly asked for a group (with nrxrings,
6368                  * if unspec).
6369                  */
6370                 if (unspec || mrp->mrp_nrxrings > 0) {
6371                         need_exclgrp = B_TRUE;
6372                         need_rings = mrp->mrp_nrxrings;
6373                 } else if (mrp->mrp_nrxrings == 0) {
6374                         /*
6375                          * We have asked for a software group.
6376                          */
6377                         return (NULL);
6378                 }
6379         } else if (isprimary && mip->mi_nactiveclients == 1 &&
6380             mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6381                 /*
6382                  * If the primary is the only active client on this
6383                  * mip and we have not asked for any rings, we give
6384                  * it the default group so that the primary gets to
6385                  * use all the rings.
6386                  */
6387                 return (NULL);
6388         }
6389 
6390         /* The group that can donate rings */
6391         donorgrp = mip->mi_rx_donor_grp;
6392 
6393         /*
6394          * The number of rings that the default group can donate.
6395          * We need to leave at least one ring.
6396          */
6397         donor_grp_rcnt = donorgrp->mrg_cur_count - 1;
6398 
6399         /*
6400          * Try to exclusively reserve a RX group.
6401          *
6402          * For flows requiring HW_DEFAULT_RING (unicast flow of the primary
6403          * client), try to reserve the a non-default RX group and give
6404          * it all the rings from the donor group, except the default ring
6405          *
6406          * For flows requiring HW_RING (unicast flow of other clients), try
6407          * to reserve non-default RX group with the specified number of
6408          * rings, if available.
6409          *
6410          * For flows that have not asked for software or hardware ring,
6411          * try to reserve a non-default group with 1 ring, if available.
6412          */
6413         for (i = 1; i < mip->mi_rx_group_count; i++) {
6414                 grp = &mip->mi_rx_groups[i];
6415 
6416                 DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
6417                     int, grp->mrg_index, mac_group_state_t, grp->mrg_state);
6418 
6419                 /*
6420                  * Check if this group could be a candidate group for
6421                  * eviction if we need a group for this MAC client,
6422                  * but there aren't any. A candidate group is one
6423                  * that didn't ask for an exclusive group, but got
6424                  * one and it has enough rings (combined with what
6425                  * the donor group can donate) for the new MAC
6426                  * client
6427                  */
6428                 if (grp->mrg_state >= MAC_GROUP_STATE_RESERVED) {
6429                         /*
6430                          * If the primary/donor group is not the default
6431                          * group, don't bother looking for a candidate group.
6432                          * If we don't have enough rings we will check
6433                          * if the primary group can be vacated.
6434                          */
6435                         if (candidate_grp == NULL &&
6436                             donorgrp == MAC_DEFAULT_RX_GROUP(mip)) {
6437                                 ASSERT(!MAC_GROUP_NO_CLIENT(grp));
6438                                 gclient = MAC_GROUP_ONLY_CLIENT(grp);
6439                                 if (gclient == NULL)
6440                                         gclient = mac_get_grp_primary(grp);
6441                                 ASSERT(gclient != NULL);
6442                                 gmrp = MCIP_RESOURCE_PROPS(gclient);
6443                                 if (gclient->mci_share == NULL &&
6444                                     (gmrp->mrp_mask & MRP_RX_RINGS) == 0 &&
6445                                     (unspec ||
6446                                     (grp->mrg_cur_count + donor_grp_rcnt >=
6447                                     need_rings))) {
6448                                         candidate_grp = grp;
6449                                 }
6450                         }
6451                         continue;
6452                 }
6453                 /*
6454                  * This group could already be SHARED by other multicast
6455                  * flows on this client. In that case, the group would
6456                  * be shared and has already been started.
6457                  */
6458                 ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);
6459 
6460                 if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
6461                     (mac_start_group(grp) != 0)) {
6462                         continue;
6463                 }
6464 
6465                 if (mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6466                         break;
6467                 ASSERT(grp->mrg_cur_count == 0);
6468 
6469                 /*
6470                  * Populate the group. Rings should be taken
6471                  * from the donor group.
6472                  */
6473                 nrings = rxhw ? need_rings : isprimary ? donor_grp_rcnt: 1;
6474 
6475                 /*
6476                  * If the donor group can't donate, let's just walk and
6477                  * see if someone can vacate a group, so that we have
6478                  * enough rings for this, unless we already have
6479                  * identified a candidate group.
6480                  */
6481                 if (nrings <= donor_grp_rcnt) {
6482                         err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
6483                             donorgrp, grp, share, nrings);
6484                         if (err == 0) {
6485                                 /*
6486                                  * For a share i_mac_group_allocate_rings gets
6487                                  * the rings from the driver, let's populate
6488                                  * the property for the client now.
6489                                  */
6490                                 if (share != NULL) {
6491                                         mac_client_set_rings(
6492                                             (mac_client_handle_t)mcip,
6493                                             grp->mrg_cur_count, -1);
6494                                 }
6495                                 if (mac_is_primary_client(mcip) && !rxhw)
6496                                         mip->mi_rx_donor_grp = grp;
6497                                 break;
6498                         }
6499                 }
6500 
6501                 DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
6502                     mip->mi_name, int, grp->mrg_index, int, err);
6503 
6504                 /*
6505                  * It's a dynamic group but the grouping operation
6506                  * failed.
6507                  */
6508                 mac_stop_group(grp);
6509         }
6510         /* We didn't find an exclusive group for this MAC client */
6511         if (i >= mip->mi_rx_group_count) {
6512 
6513                 if (!need_exclgrp)
6514                         return (NULL);
6515 
6516                 /*
6517                  * If we found a candidate group then we switch the
6518                  * MAC client from the candidate_group to the default
6519                  * group and give the group to this MAC client. If
6520                  * we didn't find a candidate_group, check if the
6521                  * primary is in its own group and if it can make way
6522                  * for this MAC client.
6523                  */
6524                 if (candidate_grp == NULL &&
6525                     donorgrp != MAC_DEFAULT_RX_GROUP(mip) &&
6526                     donorgrp->mrg_cur_count >= need_rings) {
6527                         candidate_grp = donorgrp;
6528                 }
6529                 if (candidate_grp != NULL) {
6530                         boolean_t       prim_grp = B_FALSE;
6531 
6532                         /*
6533                          * Switch the MAC client from the candidate group
6534                          * to the default group.. If this group was the
6535                          * donor group, then after the switch we need
6536                          * to update the donor group too.
6537                          */
6538                         grp = candidate_grp;
6539                         gclient = MAC_GROUP_ONLY_CLIENT(grp);
6540                         if (gclient == NULL)
6541                                 gclient = mac_get_grp_primary(grp);
6542                         if (grp == mip->mi_rx_donor_grp)
6543                                 prim_grp = B_TRUE;
6544                         if (mac_rx_switch_group(gclient, grp,
6545                             MAC_DEFAULT_RX_GROUP(mip)) != 0) {
6546                                 return (NULL);
6547                         }
6548                         if (prim_grp) {
6549                                 mip->mi_rx_donor_grp =
6550                                     MAC_DEFAULT_RX_GROUP(mip);
6551                                 donorgrp = MAC_DEFAULT_RX_GROUP(mip);
6552                         }
6553 
6554 
6555                         /*
6556                          * Now give this group with the required rings
6557                          * to this MAC client.
6558                          */
6559                         ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
6560                         if (mac_start_group(grp) != 0)
6561                                 return (NULL);
6562 
6563                         if (mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6564                                 return (grp);
6565 
6566                         donor_grp_rcnt = donorgrp->mrg_cur_count - 1;
6567                         ASSERT(grp->mrg_cur_count == 0);
6568                         ASSERT(donor_grp_rcnt >= need_rings);
6569                         err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
6570                             donorgrp, grp, share, need_rings);
6571                         if (err == 0) {
6572                                 /*
6573                                  * For a share i_mac_group_allocate_rings gets
6574                                  * the rings from the driver, let's populate
6575                                  * the property for the client now.
6576                                  */
6577                                 if (share != NULL) {
6578                                         mac_client_set_rings(
6579                                             (mac_client_handle_t)mcip,
6580                                             grp->mrg_cur_count, -1);
6581                                 }
6582                                 DTRACE_PROBE2(rx__group__reserved,
6583                                     char *, mip->mi_name, int, grp->mrg_index);
6584                                 return (grp);
6585                         }
6586                         DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
6587                             mip->mi_name, int, grp->mrg_index, int, err);
6588                         mac_stop_group(grp);
6589                 }
6590                 return (NULL);
6591         }
6592         ASSERT(grp != NULL);
6593 
6594         DTRACE_PROBE2(rx__group__reserved,
6595             char *, mip->mi_name, int, grp->mrg_index);
6596         return (grp);
6597 }
6598 
6599 /*
6600  * mac_rx_release_group()
6601  *
6602  * This is called when there are no clients left for the group.
6603  * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
6604  * and if it is a non default group, the shares are removed and
6605  * all rings are assigned back to default group.
6606  */
6607 void
6608 mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
6609 {
6610         mac_impl_t              *mip = mcip->mci_mip;
6611         mac_ring_t              *ring;
6612 
6613         ASSERT(group != MAC_DEFAULT_RX_GROUP(mip));
6614 
6615         if (mip->mi_rx_donor_grp == group)
6616                 mip->mi_rx_donor_grp = MAC_DEFAULT_RX_GROUP(mip);
6617 
6618         /*
6619          * This is the case where there are no clients left. Any
6620          * SRS etc on this group have also be quiesced.
6621          */
6622         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
6623                 if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
6624                         ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
6625                         /*
6626                          * Remove the SRS associated with the HW ring.
6627                          * As a result, polling will be disabled.
6628                          */
6629                         ring->mr_srs = NULL;
6630                 }
6631                 ASSERT(group->mrg_state < MAC_GROUP_STATE_RESERVED ||
6632                     ring->mr_state == MR_INUSE);
6633                 if (ring->mr_state == MR_INUSE) {
6634                         mac_stop_ring(ring);
6635                         ring->mr_flag = 0;
6636                 }
6637         }
6638 
6639         /* remove group from share */
6640         if (mcip->mci_share != NULL) {
6641                 mip->mi_share_capab.ms_sremove(mcip->mci_share,
6642                     group->mrg_driver);
6643         }
6644 
6645         if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6646                 mac_ring_t *ring;
6647 
6648                 /*
6649                  * Rings were dynamically allocated to group.
6650                  * Move rings back to default group.
6651                  */
6652                 while ((ring = group->mrg_rings) != NULL) {
6653                         (void) mac_group_mov_ring(mip, mip->mi_rx_donor_grp,
6654                             ring);
6655                 }
6656         }
6657         mac_stop_group(group);
6658         /*
6659          * Possible improvement: See if we can assign the group just released
6660          * to a another client of the mip
6661          */
6662 }
6663 
6664 /*
6665  * When we move the primary's mac address between groups, we need to also
6666  * take all the clients sharing the same mac address along with it (VLANs)
6667  * We remove the mac address for such clients from the group after quiescing
6668  * them. When we add the mac address we restart the client. Note that
6669  * the primary's mac address is removed from the group after all the
6670  * other clients sharing the address are removed. Similarly, the primary's
6671  * mac address is added before all the other client's mac address are
6672  * added. While grp is the group where the clients reside, tgrp is
6673  * the group where the addresses have to be added.
6674  */
6675 static void
6676 mac_rx_move_macaddr_prim(mac_client_impl_t *mcip, mac_group_t *grp,
6677     mac_group_t *tgrp, uint8_t *maddr, boolean_t add)
6678 {
6679         mac_impl_t              *mip = mcip->mci_mip;
6680         mac_grp_client_t        *mgcp = grp->mrg_clients;
6681         mac_client_impl_t       *gmcip;
6682         boolean_t               prim;
6683 
6684         prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
6685 
6686         /*
6687          * If the clients are in a non-default group, we just have to
6688          * walk the group's client list. If it is in the default group
6689          * (which will be shared by other clients as well, we need to
6690          * check if the unicast address matches mcip's unicast.
6691          */
6692         while (mgcp != NULL) {
6693                 gmcip = mgcp->mgc_client;
6694                 if (gmcip != mcip &&
6695                     (grp != MAC_DEFAULT_RX_GROUP(mip) ||
6696                     mcip->mci_unicast == gmcip->mci_unicast)) {
6697                         if (!add) {
6698                                 mac_rx_client_quiesce(
6699                                     (mac_client_handle_t)gmcip);
6700                                 (void) mac_remove_macaddr(mcip->mci_unicast);
6701                         } else {
6702                                 (void) mac_add_macaddr(mip, tgrp, maddr, prim);
6703                                 mac_rx_client_restart(
6704                                     (mac_client_handle_t)gmcip);
6705                         }
6706                 }
6707                 mgcp = mgcp->mgc_next;
6708         }
6709 }
6710 
6711 
6712 /*
6713  * Move the MAC address from fgrp to tgrp. If this is the primary client,
6714  * we need to take any VLANs etc. together too.
6715  */
6716 static int
6717 mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp,
6718     mac_group_t *tgrp)
6719 {
6720         mac_impl_t              *mip = mcip->mci_mip;
6721         uint8_t                 maddr[MAXMACADDRLEN];
6722         int                     err = 0;
6723         boolean_t               prim;
6724         boolean_t               multiclnt = B_FALSE;
6725 
6726         mac_rx_client_quiesce((mac_client_handle_t)mcip);
6727         ASSERT(mcip->mci_unicast != NULL);
6728         bcopy(mcip->mci_unicast->ma_addr, maddr, mcip->mci_unicast->ma_len);
6729 
6730         prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
6731         if (mcip->mci_unicast->ma_nusers > 1) {
6732                 mac_rx_move_macaddr_prim(mcip, fgrp, NULL, maddr, B_FALSE);
6733                 multiclnt = B_TRUE;
6734         }
6735         ASSERT(mcip->mci_unicast->ma_nusers == 1);
6736         err = mac_remove_macaddr(mcip->mci_unicast);
6737         if (err != 0) {
6738                 mac_rx_client_restart((mac_client_handle_t)mcip);
6739                 if (multiclnt) {
6740                         mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
6741                             B_TRUE);
6742                 }
6743                 return (err);
6744         }
6745         /*
6746          * Program the H/W Classifier first, if this fails we need
6747          * not proceed with the other stuff.
6748          */
6749         if ((err = mac_add_macaddr(mip, tgrp, maddr, prim)) != 0) {
6750                 /* Revert back the H/W Classifier */
6751                 if ((err = mac_add_macaddr(mip, fgrp, maddr, prim)) != 0) {
6752                         /*
6753                          * This should not fail now since it worked earlier,
6754                          * should we panic?
6755                          */
6756                         cmn_err(CE_WARN,
6757                             "mac_rx_switch_group: switching %p back"
6758                             " to group %p failed!!", (void *)mcip,
6759                             (void *)fgrp);
6760                 }
6761                 mac_rx_client_restart((mac_client_handle_t)mcip);
6762                 if (multiclnt) {
6763                         mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
6764                             B_TRUE);
6765                 }
6766                 return (err);
6767         }
6768         mcip->mci_unicast = mac_find_macaddr(mip, maddr);
6769         mac_rx_client_restart((mac_client_handle_t)mcip);
6770         if (multiclnt)
6771                 mac_rx_move_macaddr_prim(mcip, fgrp, tgrp, maddr, B_TRUE);
6772         return (err);
6773 }
6774 
6775 /*
6776  * Switch the MAC client from one group to another. This means we need
6777  * to remove the MAC address from the group, remove the MAC client,
6778  * teardown the SRSs and revert the group state. Then, we add the client
6779  * to the destination group, set the SRSs, and add the MAC address to the
6780  * group.
6781  */
6782 int
6783 mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
6784     mac_group_t *tgrp)
6785 {
6786         int                     err;
6787         mac_group_state_t       next_state;
6788         mac_client_impl_t       *group_only_mcip;
6789         mac_client_impl_t       *gmcip;
6790         mac_impl_t              *mip = mcip->mci_mip;
6791         mac_grp_client_t        *mgcp;
6792 
6793         ASSERT(fgrp == mcip->mci_flent->fe_rx_ring_group);
6794 
6795         if ((err = mac_rx_move_macaddr(mcip, fgrp, tgrp)) != 0)
6796                 return (err);
6797 
6798         /*
6799          * The group might be reserved, but SRSs may not be set up, e.g.
6800          * primary and its vlans using a reserved group.
6801          */
6802         if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED &&
6803             MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) {
6804                 mac_rx_srs_group_teardown(mcip->mci_flent, B_TRUE);
6805         }
6806         if (fgrp != MAC_DEFAULT_RX_GROUP(mip)) {
6807                 mgcp = fgrp->mrg_clients;
6808                 while (mgcp != NULL) {
6809                         gmcip = mgcp->mgc_client;
6810                         mgcp = mgcp->mgc_next;
6811                         mac_group_remove_client(fgrp, gmcip);
6812                         mac_group_add_client(tgrp, gmcip);
6813                         gmcip->mci_flent->fe_rx_ring_group = tgrp;
6814                 }
6815                 mac_release_rx_group(mcip, fgrp);
6816                 ASSERT(MAC_GROUP_NO_CLIENT(fgrp));
6817                 mac_set_group_state(fgrp, MAC_GROUP_STATE_REGISTERED);
6818         } else {
6819                 mac_group_remove_client(fgrp, mcip);
6820                 mac_group_add_client(tgrp, mcip);
6821                 mcip->mci_flent->fe_rx_ring_group = tgrp;
6822                 /*
6823                  * If there are other clients (VLANs) sharing this address
6824                  * we should be here only for the primary.
6825                  */
6826                 if (mcip->mci_unicast->ma_nusers > 1) {
6827                         /*
6828                          * We need to move all the clients that are using
6829                          * this h/w address.
6830                          */
6831                         mgcp = fgrp->mrg_clients;
6832                         while (mgcp != NULL) {
6833                                 gmcip = mgcp->mgc_client;
6834                                 mgcp = mgcp->mgc_next;
6835                                 if (mcip->mci_unicast == gmcip->mci_unicast) {
6836                                         mac_group_remove_client(fgrp, gmcip);
6837                                         mac_group_add_client(tgrp, gmcip);
6838                                         gmcip->mci_flent->fe_rx_ring_group =
6839                                             tgrp;
6840                                 }
6841                         }
6842                 }
6843                 /*
6844                  * The default group will still take the multicast,
6845                  * broadcast traffic etc., so it won't go to
6846                  * MAC_GROUP_STATE_REGISTERED.
6847                  */
6848                 if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED)
6849                         mac_rx_group_unmark(fgrp, MR_CONDEMNED);
6850                 mac_set_group_state(fgrp, MAC_GROUP_STATE_SHARED);
6851         }
6852         next_state = mac_group_next_state(tgrp, &group_only_mcip,
6853             MAC_DEFAULT_RX_GROUP(mip), B_TRUE);
6854         mac_set_group_state(tgrp, next_state);
6855         /*
6856          * If the destination group is reserved, setup the SRSs etc.
6857          */
6858         if (tgrp->mrg_state == MAC_GROUP_STATE_RESERVED) {
6859                 mac_rx_srs_group_setup(mcip, mcip->mci_flent, SRST_LINK);
6860                 mac_fanout_setup(mcip, mcip->mci_flent,
6861                     MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver, mcip, NULL,
6862                     NULL);
6863                 mac_rx_group_unmark(tgrp, MR_INCIPIENT);
6864         } else {
6865                 mac_rx_switch_grp_to_sw(tgrp);
6866         }
6867         return (0);
6868 }
6869 
6870 /*
6871  * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
6872  * when a share was allocated to the client.
6873  */
6874 mac_group_t *
6875 mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move)
6876 {
6877         mac_impl_t              *mip = mcip->mci_mip;
6878         mac_group_t             *grp = NULL;
6879         int                     rv;
6880         int                     i;
6881         int                     err;
6882         mac_group_t             *defgrp;
6883         mac_share_handle_t      share = mcip->mci_share;
6884         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
6885         int                     nrings;
6886         int                     defnrings;
6887         boolean_t               need_exclgrp = B_FALSE;
6888         int                     need_rings = 0;
6889         mac_group_t             *candidate_grp = NULL;
6890         mac_client_impl_t       *gclient;
6891         mac_resource_props_t    *gmrp;
6892         boolean_t               txhw = mrp->mrp_mask & MRP_TX_RINGS;
6893         boolean_t               unspec = mrp->mrp_mask & MRP_TXRINGS_UNSPEC;
6894         boolean_t               isprimary;
6895 
6896         isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
6897         /*
6898          * When we come here for a VLAN on the primary (dladm create-vlan),
6899          * we need to pair it along with the primary (to keep it consistent
6900          * with the RX side). So, we check if the primary is already assigned
6901          * to a group and return the group if so. The other way is also
6902          * true, i.e. the VLAN is already created and now we are plumbing
6903          * the primary.
6904          */
6905         if (!move && isprimary) {
6906                 for (gclient = mip->mi_clients_list; gclient != NULL;
6907                     gclient = gclient->mci_client_next) {
6908                         if (gclient->mci_flent->fe_type & FLOW_PRIMARY_MAC &&
6909                             gclient->mci_flent->fe_tx_ring_group != NULL) {
6910                                 return (gclient->mci_flent->fe_tx_ring_group);
6911                         }
6912                 }
6913         }
6914 
6915         if (mip->mi_tx_groups == NULL || mip->mi_tx_group_count == 0)
6916                 return (NULL);
6917 
6918         /* For dynamic groups, default unspec to 1 */
6919         if (txhw && unspec &&
6920             mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6921                 mrp->mrp_ntxrings = 1;
6922         }
6923         /*
6924          * For static grouping we allow only specifying rings=0 and
6925          * unspecified
6926          */
6927         if (txhw && mrp->mrp_ntxrings > 0 &&
6928             mip->mi_tx_group_type == MAC_GROUP_TYPE_STATIC) {
6929                 return (NULL);
6930         }
6931 
6932         if (txhw) {
6933                 /*
6934                  * We have explicitly asked for a group (with ntxrings,
6935                  * if unspec).
6936                  */
6937                 if (unspec || mrp->mrp_ntxrings > 0) {
6938                         need_exclgrp = B_TRUE;
6939                         need_rings = mrp->mrp_ntxrings;
6940                 } else if (mrp->mrp_ntxrings == 0) {
6941                         /*
6942                          * We have asked for a software group.
6943                          */
6944                         return (NULL);
6945                 }
6946         }
6947         defgrp = MAC_DEFAULT_TX_GROUP(mip);
6948         /*
6949          * The number of rings that the default group can donate.
6950          * We need to leave at least one ring - the default ring - in
6951          * this group.
6952          */
6953         defnrings = defgrp->mrg_cur_count - 1;
6954 
6955         /*
6956          * Primary gets default group unless explicitly told not
6957          * to  (i.e. rings > 0).
6958          */
6959         if (isprimary && !need_exclgrp)
6960                 return (NULL);
6961 
6962         nrings = (mrp->mrp_mask & MRP_TX_RINGS) != 0 ? mrp->mrp_ntxrings : 1;
6963         for (i = 0; i <  mip->mi_tx_group_count; i++) {
6964                 grp = &mip->mi_tx_groups[i];
6965                 if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
6966                     (grp->mrg_state == MAC_GROUP_STATE_UNINIT)) {
6967                         /*
6968                          * Select a candidate for replacement if we don't
6969                          * get an exclusive group. A candidate group is one
6970                          * that didn't ask for an exclusive group, but got
6971                          * one and it has enough rings (combined with what
6972                          * the default group can donate) for the new MAC
6973                          * client.
6974                          */
6975                         if (grp->mrg_state == MAC_GROUP_STATE_RESERVED &&
6976                             candidate_grp == NULL) {
6977                                 gclient = MAC_GROUP_ONLY_CLIENT(grp);
6978                                 if (gclient == NULL)
6979                                         gclient = mac_get_grp_primary(grp);
6980                                 gmrp = MCIP_RESOURCE_PROPS(gclient);
6981                                 if (gclient->mci_share == NULL &&
6982                                     (gmrp->mrp_mask & MRP_TX_RINGS) == 0 &&
6983                                     (unspec ||
6984                                     (grp->mrg_cur_count + defnrings) >=
6985                                     need_rings)) {
6986                                         candidate_grp = grp;
6987                                 }
6988                         }
6989                         continue;
6990                 }
6991                 /*
6992                  * If the default can't donate let's just walk and
6993                  * see if someone can vacate a group, so that we have
6994                  * enough rings for this.
6995                  */
6996                 if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC ||
6997                     nrings <= defnrings) {
6998                         if (grp->mrg_state == MAC_GROUP_STATE_REGISTERED) {
6999                                 rv = mac_start_group(grp);
7000                                 ASSERT(rv == 0);
7001                         }
7002                         break;
7003                 }
7004         }
7005 
7006         /* The default group */
7007         if (i >= mip->mi_tx_group_count) {
7008                 /*
7009                  * If we need an exclusive group and have identified a
7010                  * candidate group we switch the MAC client from the
7011                  * candidate group to the default group and give the
7012                  * candidate group to this client.
7013                  */
7014                 if (need_exclgrp && candidate_grp != NULL) {
7015                         /*
7016                          * Switch the MAC client from the candidate group
7017                          * to the default group.
7018                          */
7019                         grp = candidate_grp;
7020                         gclient = MAC_GROUP_ONLY_CLIENT(grp);
7021                         if (gclient == NULL)
7022                                 gclient = mac_get_grp_primary(grp);
7023                         mac_tx_client_quiesce((mac_client_handle_t)gclient);
7024                         mac_tx_switch_group(gclient, grp, defgrp);
7025                         mac_tx_client_restart((mac_client_handle_t)gclient);
7026 
7027                         /*
7028                          * Give the candidate group with the specified number
7029                          * of rings to this MAC client.
7030                          */
7031                         ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
7032                         rv = mac_start_group(grp);
7033                         ASSERT(rv == 0);
7034 
7035                         if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC)
7036                                 return (grp);
7037 
7038                         ASSERT(grp->mrg_cur_count == 0);
7039                         ASSERT(defgrp->mrg_cur_count > need_rings);
7040 
7041                         err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX,
7042                             defgrp, grp, share, need_rings);
7043                         if (err == 0) {
7044                                 /*
7045                                  * For a share i_mac_group_allocate_rings gets
7046                                  * the rings from the driver, let's populate
7047                                  * the property for the client now.
7048                                  */
7049                                 if (share != NULL) {
7050                                         mac_client_set_rings(
7051                                             (mac_client_handle_t)mcip, -1,
7052                                             grp->mrg_cur_count);
7053                                 }
7054                                 mip->mi_tx_group_free--;
7055                                 return (grp);
7056                         }
7057                         DTRACE_PROBE3(tx__group__reserve__alloc__rings, char *,
7058                             mip->mi_name, int, grp->mrg_index, int, err);
7059                         mac_stop_group(grp);
7060                 }
7061                 return (NULL);
7062         }
7063         /*
7064          * We got an exclusive group, but it is not dynamic.
7065          */
7066         if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
7067                 mip->mi_tx_group_free--;
7068                 return (grp);
7069         }
7070 
7071         rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, defgrp, grp,
7072             share, nrings);
7073         if (rv != 0) {
7074                 DTRACE_PROBE3(tx__group__reserve__alloc__rings,
7075                     char *, mip->mi_name, int, grp->mrg_index, int, rv);
7076                 mac_stop_group(grp);
7077                 return (NULL);
7078         }
7079         /*
7080          * For a share i_mac_group_allocate_rings gets the rings from the
7081          * driver, let's populate the property for the client now.
7082          */
7083         if (share != NULL) {
7084                 mac_client_set_rings((mac_client_handle_t)mcip, -1,
7085                     grp->mrg_cur_count);
7086         }
7087         mip->mi_tx_group_free--;
7088         return (grp);
7089 }
7090 
7091 void
7092 mac_release_tx_group(mac_client_impl_t *mcip, mac_group_t *grp)
7093 {
7094         mac_impl_t              *mip = mcip->mci_mip;
7095         mac_share_handle_t      share = mcip->mci_share;
7096         mac_ring_t              *ring;
7097         mac_soft_ring_set_t     *srs = MCIP_TX_SRS(mcip);
7098         mac_group_t             *defgrp;
7099 
7100         defgrp = MAC_DEFAULT_TX_GROUP(mip);
7101         if (srs != NULL) {
7102                 if (srs->srs_soft_ring_count > 0) {
7103                         for (ring = grp->mrg_rings; ring != NULL;
7104                             ring = ring->mr_next) {
7105                                 ASSERT(mac_tx_srs_ring_present(srs, ring));
7106                                 mac_tx_invoke_callbacks(mcip,
7107                                     (mac_tx_cookie_t)
7108                                     mac_tx_srs_get_soft_ring(srs, ring));
7109                                 mac_tx_srs_del_ring(srs, ring);
7110                         }
7111                 } else {
7112                         ASSERT(srs->srs_tx.st_arg2 != NULL);
7113                         srs->srs_tx.st_arg2 = NULL;
7114                         mac_srs_stat_delete(srs);
7115                 }
7116         }
7117         if (share != NULL)
7118                 mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
7119 
7120         /* move the ring back to the pool */
7121         if (mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
7122                 while ((ring = grp->mrg_rings) != NULL)
7123                         (void) mac_group_mov_ring(mip, defgrp, ring);
7124         }
7125         mac_stop_group(grp);
7126         mip->mi_tx_group_free++;
7127 }
7128 
7129 /*
7130  * Disassociate a MAC client from a group, i.e go through the rings in the
7131  * group and delete all the soft rings tied to them.
7132  */
7133 static void
7134 mac_tx_dismantle_soft_rings(mac_group_t *fgrp, flow_entry_t *flent)
7135 {
7136         mac_client_impl_t       *mcip = flent->fe_mcip;
7137         mac_soft_ring_set_t     *tx_srs;
7138         mac_srs_tx_t            *tx;
7139         mac_ring_t              *ring;
7140 
7141         tx_srs = flent->fe_tx_srs;
7142         tx = &tx_srs->srs_tx;
7143 
7144         /* Single ring case we haven't created any soft rings */
7145         if (tx->st_mode == SRS_TX_BW || tx->st_mode == SRS_TX_SERIALIZE ||
7146             tx->st_mode == SRS_TX_DEFAULT) {
7147                 tx->st_arg2 = NULL;
7148                 mac_srs_stat_delete(tx_srs);
7149         /* Fanout case, where we have to dismantle the soft rings */
7150         } else {
7151                 for (ring = fgrp->mrg_rings; ring != NULL;
7152                     ring = ring->mr_next) {
7153                         ASSERT(mac_tx_srs_ring_present(tx_srs, ring));
7154                         mac_tx_invoke_callbacks(mcip,
7155                             (mac_tx_cookie_t)mac_tx_srs_get_soft_ring(tx_srs,
7156                             ring));
7157                         mac_tx_srs_del_ring(tx_srs, ring);
7158                 }
7159                 ASSERT(tx->st_arg2 == NULL);
7160         }
7161 }
7162 
7163 /*
7164  * Switch the MAC client from one group to another. This means we need
7165  * to remove the MAC client, teardown the SRSs and revert the group state.
7166  * Then, we add the client to the destination roup, set the SRSs etc.
7167  */
7168 void
7169 mac_tx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
7170     mac_group_t *tgrp)
7171 {
7172         mac_client_impl_t       *group_only_mcip;
7173         mac_impl_t              *mip = mcip->mci_mip;
7174         flow_entry_t            *flent = mcip->mci_flent;
7175         mac_group_t             *defgrp;
7176         mac_grp_client_t        *mgcp;
7177         mac_client_impl_t       *gmcip;
7178         flow_entry_t            *gflent;
7179 
7180         defgrp = MAC_DEFAULT_TX_GROUP(mip);
7181         ASSERT(fgrp == flent->fe_tx_ring_group);
7182 
7183         if (fgrp == defgrp) {
7184                 /*
7185                  * If this is the primary we need to find any VLANs on
7186                  * the primary and move them too.
7187                  */
7188                 mac_group_remove_client(fgrp, mcip);
7189                 mac_tx_dismantle_soft_rings(fgrp, flent);
7190                 if (mcip->mci_unicast->ma_nusers > 1) {
7191                         mgcp = fgrp->mrg_clients;
7192                         while (mgcp != NULL) {
7193                                 gmcip = mgcp->mgc_client;
7194                                 mgcp = mgcp->mgc_next;
7195                                 if (mcip->mci_unicast != gmcip->mci_unicast)
7196                                         continue;
7197                                 mac_tx_client_quiesce(
7198                                     (mac_client_handle_t)gmcip);
7199 
7200                                 gflent = gmcip->mci_flent;
7201                                 mac_group_remove_client(fgrp, gmcip);
7202                                 mac_tx_dismantle_soft_rings(fgrp, gflent);
7203 
7204                                 mac_group_add_client(tgrp, gmcip);
7205                                 gflent->fe_tx_ring_group = tgrp;
7206                                 /* We could directly set this to SHARED */
7207                                 tgrp->mrg_state = mac_group_next_state(tgrp,
7208                                     &group_only_mcip, defgrp, B_FALSE);
7209 
7210                                 mac_tx_srs_group_setup(gmcip, gflent,
7211                                     SRST_LINK);
7212                                 mac_fanout_setup(gmcip, gflent,
7213                                     MCIP_RESOURCE_PROPS(gmcip), mac_rx_deliver,
7214                                     gmcip, NULL, NULL);
7215 
7216                                 mac_tx_client_restart(
7217                                     (mac_client_handle_t)gmcip);
7218                         }
7219                 }
7220                 if (MAC_GROUP_NO_CLIENT(fgrp)) {
7221                         mac_ring_t      *ring;
7222                         int             cnt;
7223                         int             ringcnt;
7224 
7225                         fgrp->mrg_state = MAC_GROUP_STATE_REGISTERED;
7226                         /*
7227                          * Additionally, we also need to stop all
7228                          * the rings in the default group, except
7229                          * the default ring. The reason being
7230                          * this group won't be released since it is
7231                          * the default group, so the rings won't
7232                          * be stopped otherwise.
7233                          */
7234                         ringcnt = fgrp->mrg_cur_count;
7235                         ring = fgrp->mrg_rings;
7236                         for (cnt = 0; cnt < ringcnt; cnt++) {
7237                                 if (ring->mr_state == MR_INUSE &&
7238                                     ring !=
7239                                     (mac_ring_t *)mip->mi_default_tx_ring) {
7240                                         mac_stop_ring(ring);
7241                                         ring->mr_flag = 0;
7242                                 }
7243                                 ring = ring->mr_next;
7244                         }
7245                 } else if (MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) {
7246                         fgrp->mrg_state = MAC_GROUP_STATE_RESERVED;
7247                 } else {
7248                         ASSERT(fgrp->mrg_state == MAC_GROUP_STATE_SHARED);
7249                 }
7250         } else {
7251                 /*
7252                  * We could have VLANs sharing the non-default group with
7253                  * the primary.
7254                  */
7255                 mgcp = fgrp->mrg_clients;
7256                 while (mgcp != NULL) {
7257                         gmcip = mgcp->mgc_client;
7258                         mgcp = mgcp->mgc_next;
7259                         if (gmcip == mcip)
7260                                 continue;
7261                         mac_tx_client_quiesce((mac_client_handle_t)gmcip);
7262                         gflent = gmcip->mci_flent;
7263 
7264                         mac_group_remove_client(fgrp, gmcip);
7265                         mac_tx_dismantle_soft_rings(fgrp, gflent);
7266 
7267                         mac_group_add_client(tgrp, gmcip);
7268                         gflent->fe_tx_ring_group = tgrp;
7269                         /* We could directly set this to SHARED */
7270                         tgrp->mrg_state = mac_group_next_state(tgrp,
7271                             &group_only_mcip, defgrp, B_FALSE);
7272                         mac_tx_srs_group_setup(gmcip, gflent, SRST_LINK);
7273                         mac_fanout_setup(gmcip, gflent,
7274                             MCIP_RESOURCE_PROPS(gmcip), mac_rx_deliver,
7275                             gmcip, NULL, NULL);
7276 
7277                         mac_tx_client_restart((mac_client_handle_t)gmcip);
7278                 }
7279                 mac_group_remove_client(fgrp, mcip);
7280                 mac_release_tx_group(mcip, fgrp);
7281                 fgrp->mrg_state = MAC_GROUP_STATE_REGISTERED;
7282         }
7283 
7284         /* Add it to the tgroup */
7285         mac_group_add_client(tgrp, mcip);
7286         flent->fe_tx_ring_group = tgrp;
7287         tgrp->mrg_state = mac_group_next_state(tgrp, &group_only_mcip,
7288             defgrp, B_FALSE);
7289 
7290         mac_tx_srs_group_setup(mcip, flent, SRST_LINK);
7291         mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
7292             mac_rx_deliver, mcip, NULL, NULL);
7293 }
7294 
7295 /*
7296  * This is a 1-time control path activity initiated by the client (IP).
7297  * The mac perimeter protects against other simultaneous control activities,
7298  * for example an ioctl that attempts to change the degree of fanout and
7299  * increase or decrease the number of softrings associated with this Tx SRS.
7300  */
7301 static mac_tx_notify_cb_t *
7302 mac_client_tx_notify_add(mac_client_impl_t *mcip,
7303     mac_tx_notify_t notify, void *arg)
7304 {
7305         mac_cb_info_t *mcbi;
7306         mac_tx_notify_cb_t *mtnfp;
7307 
7308         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
7309 
7310         mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
7311         mtnfp->mtnf_fn = notify;
7312         mtnfp->mtnf_arg = arg;
7313         mtnfp->mtnf_link.mcb_objp = mtnfp;
7314         mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
7315         mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
7316 
7317         mcbi = &mcip->mci_tx_notify_cb_info;
7318         mutex_enter(mcbi->mcbi_lockp);
7319         mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
7320         mutex_exit(mcbi->mcbi_lockp);
7321         return (mtnfp);
7322 }
7323 
7324 static void
7325 mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
7326 {
7327         mac_cb_info_t   *mcbi;
7328         mac_cb_t        **cblist;
7329 
7330         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
7331 
7332         if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
7333             &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
7334                 cmn_err(CE_WARN,
7335                     "mac_client_tx_notify_remove: callback not "
7336                     "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
7337                 return;
7338         }
7339 
7340         mcbi = &mcip->mci_tx_notify_cb_info;
7341         cblist = &mcip->mci_tx_notify_cb_list;
7342         mutex_enter(mcbi->mcbi_lockp);
7343         if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
7344                 kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
7345         else
7346                 mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
7347         mutex_exit(mcbi->mcbi_lockp);
7348 }
7349 
7350 /*
7351  * mac_client_tx_notify():
7352  * call to add and remove flow control callback routine.
7353  */
7354 mac_tx_notify_handle_t
7355 mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
7356     void *ptr)
7357 {
7358         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
7359         mac_tx_notify_cb_t      *mtnfp = NULL;
7360 
7361         i_mac_perim_enter(mcip->mci_mip);
7362 
7363         if (callb_func != NULL) {
7364                 /* Add a notify callback */
7365                 mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
7366         } else {
7367                 mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
7368         }
7369         i_mac_perim_exit(mcip->mci_mip);
7370 
7371         return ((mac_tx_notify_handle_t)mtnfp);
7372 }
7373 
7374 void
7375 mac_bridge_vectors(mac_bridge_tx_t txf, mac_bridge_rx_t rxf,
7376     mac_bridge_ref_t reff, mac_bridge_ls_t lsf)
7377 {
7378         mac_bridge_tx_cb = txf;
7379         mac_bridge_rx_cb = rxf;
7380         mac_bridge_ref_cb = reff;
7381         mac_bridge_ls_cb = lsf;
7382 }
7383 
7384 int
7385 mac_bridge_set(mac_handle_t mh, mac_handle_t link)
7386 {
7387         mac_impl_t *mip = (mac_impl_t *)mh;
7388         int retv;
7389 
7390         mutex_enter(&mip->mi_bridge_lock);
7391         if (mip->mi_bridge_link == NULL) {
7392                 mip->mi_bridge_link = link;
7393                 retv = 0;
7394         } else {
7395                 retv = EBUSY;
7396         }
7397         mutex_exit(&mip->mi_bridge_lock);
7398         if (retv == 0) {
7399                 mac_poll_state_change(mh, B_FALSE);
7400                 mac_capab_update(mh);
7401         }
7402         return (retv);
7403 }
7404 
7405 /*
7406  * Disable bridging on the indicated link.
7407  */
7408 void
7409 mac_bridge_clear(mac_handle_t mh, mac_handle_t link)
7410 {
7411         mac_impl_t *mip = (mac_impl_t *)mh;
7412 
7413         mutex_enter(&mip->mi_bridge_lock);
7414         ASSERT(mip->mi_bridge_link == link);
7415         mip->mi_bridge_link = NULL;
7416         mutex_exit(&mip->mi_bridge_lock);
7417         mac_poll_state_change(mh, B_TRUE);
7418         mac_capab_update(mh);
7419 }
7420 
7421 void
7422 mac_no_active(mac_handle_t mh)
7423 {
7424         mac_impl_t *mip = (mac_impl_t *)mh;
7425 
7426         i_mac_perim_enter(mip);
7427         mip->mi_state_flags |= MIS_NO_ACTIVE;
7428         i_mac_perim_exit(mip);
7429 }
7430 
7431 /*
7432  * Walk the primary VLAN clients whenever the primary's rings property
7433  * changes and update the mac_resource_props_t for the VLAN's client.
7434  * We need to do this since we don't support setting these properties
7435  * on the primary's VLAN clients, but the VLAN clients have to
7436  * follow the primary w.r.t the rings property;
7437  */
7438 void
7439 mac_set_prim_vlan_rings(mac_impl_t  *mip, mac_resource_props_t *mrp)
7440 {
7441         mac_client_impl_t       *vmcip;
7442         mac_resource_props_t    *vmrp;
7443 
7444         for (vmcip = mip->mi_clients_list; vmcip != NULL;
7445             vmcip = vmcip->mci_client_next) {
7446                 if (!(vmcip->mci_flent->fe_type & FLOW_PRIMARY_MAC) ||
7447                     mac_client_vid((mac_client_handle_t)vmcip) ==
7448                     VLAN_ID_NONE) {
7449                         continue;
7450                 }
7451                 vmrp = MCIP_RESOURCE_PROPS(vmcip);
7452 
7453                 vmrp->mrp_nrxrings =  mrp->mrp_nrxrings;
7454                 if (mrp->mrp_mask & MRP_RX_RINGS)
7455                         vmrp->mrp_mask |= MRP_RX_RINGS;
7456                 else if (vmrp->mrp_mask & MRP_RX_RINGS)
7457                         vmrp->mrp_mask &= ~MRP_RX_RINGS;
7458 
7459                 vmrp->mrp_ntxrings =  mrp->mrp_ntxrings;
7460                 if (mrp->mrp_mask & MRP_TX_RINGS)
7461                         vmrp->mrp_mask |= MRP_TX_RINGS;
7462                 else if (vmrp->mrp_mask & MRP_TX_RINGS)
7463                         vmrp->mrp_mask &= ~MRP_TX_RINGS;
7464 
7465                 if (mrp->mrp_mask & MRP_RXRINGS_UNSPEC)
7466                         vmrp->mrp_mask |= MRP_RXRINGS_UNSPEC;
7467                 else
7468                         vmrp->mrp_mask &= ~MRP_RXRINGS_UNSPEC;
7469 
7470                 if (mrp->mrp_mask & MRP_TXRINGS_UNSPEC)
7471                         vmrp->mrp_mask |= MRP_TXRINGS_UNSPEC;
7472                 else
7473                         vmrp->mrp_mask &= ~MRP_TXRINGS_UNSPEC;
7474         }
7475 }
7476 
7477 /*
7478  * We are adding or removing ring(s) from a group. The source for taking
7479  * rings is the default group. The destination for giving rings back is
7480  * the default group.
7481  */
7482 int
7483 mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group,
7484     mac_group_t *defgrp)
7485 {
7486         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
7487         uint_t                  modify;
7488         int                     count;
7489         mac_ring_t              *ring;
7490         mac_ring_t              *next;
7491         mac_impl_t              *mip = mcip->mci_mip;
7492         mac_ring_t              **rings;
7493         uint_t                  ringcnt;
7494         int                     i = 0;
7495         boolean_t               rx_group = group->mrg_type == MAC_RING_TYPE_RX;
7496         int                     start;
7497         int                     end;
7498         mac_group_t             *tgrp;
7499         int                     j;
7500         int                     rv = 0;
7501 
7502         /*
7503          * If we are asked for just a group, we give 1 ring, else
7504          * the specified number of rings.
7505          */
7506         if (rx_group) {
7507                 ringcnt = (mrp->mrp_mask & MRP_RXRINGS_UNSPEC) ? 1:
7508                     mrp->mrp_nrxrings;
7509         } else {
7510                 ringcnt = (mrp->mrp_mask & MRP_TXRINGS_UNSPEC) ? 1:
7511                     mrp->mrp_ntxrings;
7512         }
7513 
7514         /* don't allow modifying rings for a share for now. */
7515         ASSERT(mcip->mci_share == NULL);
7516 
7517         if (ringcnt == group->mrg_cur_count)
7518                 return (0);
7519 
7520         if (group->mrg_cur_count > ringcnt) {
7521                 modify = group->mrg_cur_count - ringcnt;
7522                 if (rx_group) {
7523                         if (mip->mi_rx_donor_grp == group) {
7524                                 ASSERT(mac_is_primary_client(mcip));
7525                                 mip->mi_rx_donor_grp = defgrp;
7526                         } else {
7527                                 defgrp = mip->mi_rx_donor_grp;
7528                         }
7529                 }
7530                 ring = group->mrg_rings;
7531                 rings = kmem_alloc(modify * sizeof (mac_ring_handle_t),
7532                     KM_SLEEP);
7533                 j = 0;
7534                 for (count = 0; count < modify; count++) {
7535                         next = ring->mr_next;
7536                         rv = mac_group_mov_ring(mip, defgrp, ring);
7537                         if (rv != 0) {
7538                                 /* cleanup on failure */
7539                                 for (j = 0; j < count; j++) {
7540                                         (void) mac_group_mov_ring(mip, group,
7541                                             rings[j]);
7542                                 }
7543                                 break;
7544                         }
7545                         rings[j++] = ring;
7546                         ring = next;
7547                 }
7548                 kmem_free(rings, modify * sizeof (mac_ring_handle_t));
7549                 return (rv);
7550         }
7551         if (ringcnt >= MAX_RINGS_PER_GROUP)
7552                 return (EINVAL);
7553 
7554         modify = ringcnt - group->mrg_cur_count;
7555 
7556         if (rx_group) {
7557                 if (group != mip->mi_rx_donor_grp)
7558                         defgrp = mip->mi_rx_donor_grp;
7559                 else
7560                         /*
7561                          * This is the donor group with all the remaining
7562                          * rings. Default group now gets to be the donor
7563                          */
7564                         mip->mi_rx_donor_grp = defgrp;
7565                 start = 1;
7566                 end = mip->mi_rx_group_count;
7567         } else {
7568                 start = 0;
7569                 end = mip->mi_tx_group_count - 1;
7570         }
7571         /*
7572          * If the default doesn't have any rings, lets see if we can
7573          * take rings given to an h/w client that doesn't need it.
7574          * For now, we just see if there is  any one client that can donate
7575          * all the required rings.
7576          */
7577         if (defgrp->mrg_cur_count < (modify + 1)) {
7578                 for (i = start; i < end; i++) {
7579                         if (rx_group) {
7580                                 tgrp = &mip->mi_rx_groups[i];
7581                                 if (tgrp == group || tgrp->mrg_state <
7582                                     MAC_GROUP_STATE_RESERVED) {
7583                                         continue;
7584                                 }
7585                                 mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
7586                                 if (mcip == NULL)
7587                                         mcip = mac_get_grp_primary(tgrp);
7588                                 ASSERT(mcip != NULL);
7589                                 mrp = MCIP_RESOURCE_PROPS(mcip);
7590                                 if ((mrp->mrp_mask & MRP_RX_RINGS) != 0)
7591                                         continue;
7592                                 if ((tgrp->mrg_cur_count +
7593                                     defgrp->mrg_cur_count) < (modify + 1)) {
7594                                         continue;
7595                                 }
7596                                 if (mac_rx_switch_group(mcip, tgrp,
7597                                     defgrp) != 0) {
7598                                         return (ENOSPC);
7599                                 }
7600                         } else {
7601                                 tgrp = &mip->mi_tx_groups[i];
7602                                 if (tgrp == group || tgrp->mrg_state <
7603                                     MAC_GROUP_STATE_RESERVED) {
7604                                         continue;
7605                                 }
7606                                 mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
7607                                 if (mcip == NULL)
7608                                         mcip = mac_get_grp_primary(tgrp);
7609                                 mrp = MCIP_RESOURCE_PROPS(mcip);
7610                                 if ((mrp->mrp_mask & MRP_TX_RINGS) != 0)
7611                                         continue;
7612                                 if ((tgrp->mrg_cur_count +
7613                                     defgrp->mrg_cur_count) < (modify + 1)) {
7614                                         continue;
7615                                 }
7616                                 /* OK, we can switch this to s/w */
7617                                 mac_tx_client_quiesce(
7618                                     (mac_client_handle_t)mcip);
7619                                 mac_tx_switch_group(mcip, tgrp, defgrp);
7620                                 mac_tx_client_restart(
7621                                     (mac_client_handle_t)mcip);
7622                         }
7623                 }
7624                 if (defgrp->mrg_cur_count < (modify + 1))
7625                         return (ENOSPC);
7626         }
7627         if ((rv = i_mac_group_allocate_rings(mip, group->mrg_type, defgrp,
7628             group, mcip->mci_share, modify)) != 0) {
7629                 return (rv);
7630         }
7631         return (0);
7632 }
7633 
7634 /*
7635  * Given the poolname in mac_resource_props, find the cpupart
7636  * that is associated with this pool.  The cpupart will be used
7637  * later for finding the cpus to be bound to the networking threads.
7638  *
7639  * use_default is set B_TRUE if pools are enabled and pool_default
7640  * is returned.  This avoids a 2nd lookup to set the poolname
7641  * for pool-effective.
7642  *
7643  * returns:
7644  *
7645  *    NULL -   pools are disabled or if the 'cpus' property is set.
7646  *    cpupart of pool_default  - pools are enabled and the pool
7647  *             is not available or poolname is blank
7648  *    cpupart of named pool    - pools are enabled and the pool
7649  *             is available.
7650  */
7651 cpupart_t *
7652 mac_pset_find(mac_resource_props_t *mrp, boolean_t *use_default)
7653 {
7654         pool_t          *pool;
7655         cpupart_t       *cpupart;
7656 
7657         *use_default = B_FALSE;
7658 
7659         /* CPUs property is set */
7660         if (mrp->mrp_mask & MRP_CPUS)
7661                 return (NULL);
7662 
7663         ASSERT(pool_lock_held());
7664 
7665         /* Pools are disabled, no pset */
7666         if (pool_state == POOL_DISABLED)
7667                 return (NULL);
7668 
7669         /* Pools property is set */
7670         if (mrp->mrp_mask & MRP_POOL) {
7671                 if ((pool = pool_lookup_pool_by_name(mrp->mrp_pool)) == NULL) {
7672                         /* Pool not found */
7673                         DTRACE_PROBE1(mac_pset_find_no_pool, char *,
7674                             mrp->mrp_pool);
7675                         *use_default = B_TRUE;
7676                         pool = pool_default;
7677                 }
7678         /* Pools property is not set */
7679         } else {
7680                 *use_default = B_TRUE;
7681                 pool = pool_default;
7682         }
7683 
7684         /* Find the CPU pset that corresponds to the pool */
7685         mutex_enter(&cpu_lock);
7686         if ((cpupart = cpupart_find(pool->pool_pset->pset_id)) == NULL) {
7687                 DTRACE_PROBE1(mac_find_pset_no_pset, psetid_t,
7688                     pool->pool_pset->pset_id);
7689         }
7690         mutex_exit(&cpu_lock);
7691 
7692         return (cpupart);
7693 }
7694 
7695 void
7696 mac_set_pool_effective(boolean_t use_default, cpupart_t *cpupart,
7697     mac_resource_props_t *mrp, mac_resource_props_t *emrp)
7698 {
7699         ASSERT(pool_lock_held());
7700 
7701         if (cpupart != NULL) {
7702                 emrp->mrp_mask |= MRP_POOL;
7703                 if (use_default) {
7704                         (void) strcpy(emrp->mrp_pool,
7705                             "pool_default");
7706                 } else {
7707                         ASSERT(strlen(mrp->mrp_pool) != 0);
7708                         (void) strcpy(emrp->mrp_pool,
7709                             mrp->mrp_pool);
7710                 }
7711         } else {
7712                 emrp->mrp_mask &= ~MRP_POOL;
7713                 bzero(emrp->mrp_pool, MAXPATHLEN);
7714         }
7715 }
7716 
7717 struct mac_pool_arg {
7718         char            mpa_poolname[MAXPATHLEN];
7719         pool_event_t    mpa_what;
7720 };
7721 
7722 /*ARGSUSED*/
7723 static uint_t
7724 mac_pool_link_update(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
7725 {
7726         struct mac_pool_arg     *mpa = arg;
7727         mac_impl_t              *mip = (mac_impl_t *)val;
7728         mac_client_impl_t       *mcip;
7729         mac_resource_props_t    *mrp, *emrp;
7730         boolean_t               pool_update = B_FALSE;
7731         boolean_t               pool_clear = B_FALSE;
7732         boolean_t               use_default = B_FALSE;
7733         cpupart_t               *cpupart = NULL;
7734 
7735         mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
7736         i_mac_perim_enter(mip);
7737         for (mcip = mip->mi_clients_list; mcip != NULL;
7738             mcip = mcip->mci_client_next) {
7739                 pool_update = B_FALSE;
7740                 pool_clear = B_FALSE;
7741                 use_default = B_FALSE;
7742                 mac_client_get_resources((mac_client_handle_t)mcip, mrp);
7743                 emrp = MCIP_EFFECTIVE_PROPS(mcip);
7744 
7745                 /*
7746                  * When pools are enabled
7747                  */
7748                 if ((mpa->mpa_what == POOL_E_ENABLE) &&
7749                     ((mrp->mrp_mask & MRP_CPUS) == 0)) {
7750                         mrp->mrp_mask |= MRP_POOL;
7751                         pool_update = B_TRUE;
7752                 }
7753 
7754                 /*
7755                  * When pools are disabled
7756                  */
7757                 if ((mpa->mpa_what == POOL_E_DISABLE) &&
7758                     ((mrp->mrp_mask & MRP_CPUS) == 0)) {
7759                         mrp->mrp_mask |= MRP_POOL;
7760                         pool_clear = B_TRUE;
7761                 }
7762 
7763                 /*
7764                  * Look for links with the pool property set and the poolname
7765                  * matching the one which is changing.
7766                  */
7767                 if (strcmp(mrp->mrp_pool, mpa->mpa_poolname) == 0) {
7768                         /*
7769                          * The pool associated with the link has changed.
7770                          */
7771                         if (mpa->mpa_what == POOL_E_CHANGE) {
7772                                 mrp->mrp_mask |= MRP_POOL;
7773                                 pool_update = B_TRUE;
7774                         }
7775                 }
7776 
7777                 /*
7778                  * This link is associated with pool_default and
7779                  * pool_default has changed.
7780                  */
7781                 if ((mpa->mpa_what == POOL_E_CHANGE) &&
7782                     (strcmp(emrp->mrp_pool, "pool_default") == 0) &&
7783                     (strcmp(mpa->mpa_poolname, "pool_default") == 0)) {
7784                         mrp->mrp_mask |= MRP_POOL;
7785                         pool_update = B_TRUE;
7786                 }
7787 
7788                 /*
7789                  * Get new list of cpus for the pool, bind network
7790                  * threads to new list of cpus and update resources.
7791                  */
7792                 if (pool_update) {
7793                         if (MCIP_DATAPATH_SETUP(mcip)) {
7794                                 pool_lock();
7795                                 cpupart = mac_pset_find(mrp, &use_default);
7796                                 mac_fanout_setup(mcip, mcip->mci_flent, mrp,
7797                                     mac_rx_deliver, mcip, NULL, cpupart);
7798                                 mac_set_pool_effective(use_default, cpupart,
7799                                     mrp, emrp);
7800                                 pool_unlock();
7801                         }
7802                         mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip),
7803                             B_FALSE);
7804                 }
7805 
7806                 /*
7807                  * Clear the effective pool and bind network threads
7808                  * to any available CPU.
7809                  */
7810                 if (pool_clear) {
7811                         if (MCIP_DATAPATH_SETUP(mcip)) {
7812                                 emrp->mrp_mask &= ~MRP_POOL;
7813                                 bzero(emrp->mrp_pool, MAXPATHLEN);
7814                                 mac_fanout_setup(mcip, mcip->mci_flent, mrp,
7815                                     mac_rx_deliver, mcip, NULL, NULL);
7816                         }
7817                         mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip),
7818                             B_FALSE);
7819                 }
7820         }
7821         i_mac_perim_exit(mip);
7822         kmem_free(mrp, sizeof (*mrp));
7823         return (MH_WALK_CONTINUE);
7824 }
7825 
7826 static void
7827 mac_pool_update(void *arg)
7828 {
7829         mod_hash_walk(i_mac_impl_hash, mac_pool_link_update, arg);
7830         kmem_free(arg, sizeof (struct mac_pool_arg));
7831 }
7832 
7833 /*
7834  * Callback function to be executed when a noteworthy pool event
7835  * takes place.
7836  */
7837 /* ARGSUSED */
7838 static void
7839 mac_pool_event_cb(pool_event_t what, poolid_t id, void *arg)
7840 {
7841         pool_t                  *pool;
7842         char                    *poolname = NULL;
7843         struct mac_pool_arg     *mpa;
7844 
7845         pool_lock();
7846         mpa = kmem_zalloc(sizeof (struct mac_pool_arg), KM_SLEEP);
7847 
7848         switch (what) {
7849         case POOL_E_ENABLE:
7850         case POOL_E_DISABLE:
7851                 break;
7852 
7853         case POOL_E_CHANGE:
7854                 pool = pool_lookup_pool_by_id(id);
7855                 if (pool == NULL) {
7856                         kmem_free(mpa, sizeof (struct mac_pool_arg));
7857                         pool_unlock();
7858                         return;
7859                 }
7860                 pool_get_name(pool, &poolname);
7861                 (void) strlcpy(mpa->mpa_poolname, poolname,
7862                     sizeof (mpa->mpa_poolname));
7863                 break;
7864 
7865         default:
7866                 kmem_free(mpa, sizeof (struct mac_pool_arg));
7867                 pool_unlock();
7868                 return;
7869         }
7870         pool_unlock();
7871 
7872         mpa->mpa_what = what;
7873 
7874         mac_pool_update(mpa);
7875 }
7876 
7877 /*
7878  * Set effective rings property. This could be called from datapath_setup/
7879  * datapath_teardown or set-linkprop.
7880  * If the group is reserved we just go ahead and set the effective rings.
7881  * Additionally, for TX this could mean the default  group has lost/gained
7882  * some rings, so if the default group is reserved, we need to adjust the
7883  * effective rings for the default group clients. For RX, if we are working
7884  * with the non-default group, we just need * to reset the effective props
7885  * for the default group clients.
7886  */
7887 void
7888 mac_set_rings_effective(mac_client_impl_t *mcip)
7889 {
7890         mac_impl_t              *mip = mcip->mci_mip;
7891         mac_group_t             *grp;
7892         mac_group_t             *defgrp;
7893         flow_entry_t            *flent = mcip->mci_flent;
7894         mac_resource_props_t    *emrp = MCIP_EFFECTIVE_PROPS(mcip);
7895         mac_grp_client_t        *mgcp;
7896         mac_client_impl_t       *gmcip;
7897 
7898         grp = flent->fe_rx_ring_group;
7899         if (grp != NULL) {
7900                 defgrp = MAC_DEFAULT_RX_GROUP(mip);
7901                 /*
7902                  * If we have reserved a group, set the effective rings
7903                  * to the ring count in the group.
7904                  */
7905                 if (grp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7906                         emrp->mrp_mask |= MRP_RX_RINGS;
7907                         emrp->mrp_nrxrings = grp->mrg_cur_count;
7908                 }
7909 
7910                 /*
7911                  * We go through the clients in the shared group and
7912                  * reset the effective properties. It is possible this
7913                  * might have already been done for some client (i.e.
7914                  * if some client is being moved to a group that is
7915                  * already shared). The case where the default group is
7916                  * RESERVED is taken care of above (note in the RX side if
7917                  * there is a non-default group, the default group is always
7918                  * SHARED).
7919                  */
7920                 if (grp != defgrp || grp->mrg_state == MAC_GROUP_STATE_SHARED) {
7921                         if (grp->mrg_state == MAC_GROUP_STATE_SHARED)
7922                                 mgcp = grp->mrg_clients;
7923                         else
7924                                 mgcp = defgrp->mrg_clients;
7925                         while (mgcp != NULL) {
7926                                 gmcip = mgcp->mgc_client;
7927                                 emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7928                                 if (emrp->mrp_mask & MRP_RX_RINGS) {
7929                                         emrp->mrp_mask &= ~MRP_RX_RINGS;
7930                                         emrp->mrp_nrxrings = 0;
7931                                 }
7932                                 mgcp = mgcp->mgc_next;
7933                         }
7934                 }
7935         }
7936 
7937         /* Now the TX side */
7938         grp = flent->fe_tx_ring_group;
7939         if (grp != NULL) {
7940                 defgrp = MAC_DEFAULT_TX_GROUP(mip);
7941 
7942                 if (grp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7943                         emrp->mrp_mask |= MRP_TX_RINGS;
7944                         emrp->mrp_ntxrings = grp->mrg_cur_count;
7945                 } else if (grp->mrg_state == MAC_GROUP_STATE_SHARED) {
7946                         mgcp = grp->mrg_clients;
7947                         while (mgcp != NULL) {
7948                                 gmcip = mgcp->mgc_client;
7949                                 emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7950                                 if (emrp->mrp_mask & MRP_TX_RINGS) {
7951                                         emrp->mrp_mask &= ~MRP_TX_RINGS;
7952                                         emrp->mrp_ntxrings = 0;
7953                                 }
7954                                 mgcp = mgcp->mgc_next;
7955                         }
7956                 }
7957 
7958                 /*
7959                  * If the group is not the default group and the default
7960                  * group is reserved, the ring count in the default group
7961                  * might have changed, update it.
7962                  */
7963                 if (grp != defgrp &&
7964                     defgrp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7965                         gmcip = MAC_GROUP_ONLY_CLIENT(defgrp);
7966                         emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7967                         emrp->mrp_ntxrings = defgrp->mrg_cur_count;
7968                 }
7969         }
7970         emrp = MCIP_EFFECTIVE_PROPS(mcip);
7971 }
7972 
7973 /*
7974  * Check if the primary is in the default group. If so, see if we
7975  * can give it a an exclusive group now that another client is
7976  * being configured. We take the primary out of the default group
7977  * because the multicast/broadcast packets for the all the clients
7978  * will land in the default ring in the default group which means
7979  * any client in the default group, even if it is the only on in
7980  * the group, will lose exclusive access to the rings, hence
7981  * polling.
7982  */
7983 mac_client_impl_t *
7984 mac_check_primary_relocation(mac_client_impl_t *mcip, boolean_t rxhw)
7985 {
7986         mac_impl_t              *mip = mcip->mci_mip;
7987         mac_group_t             *defgrp = MAC_DEFAULT_RX_GROUP(mip);
7988         flow_entry_t            *flent = mcip->mci_flent;
7989         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
7990         uint8_t                 *mac_addr;
7991         mac_group_t             *ngrp;
7992 
7993         /*
7994          * Check if the primary is in the default group, if not
7995          * or if it is explicitly configured to be in the default
7996          * group OR set the RX rings property, return.
7997          */
7998         if (flent->fe_rx_ring_group != defgrp || mrp->mrp_mask & MRP_RX_RINGS)
7999                 return (NULL);
8000 
8001         /*
8002          * If the new client needs an exclusive group and we
8003          * don't have another for the primary, return.
8004          */
8005         if (rxhw && mip->mi_rxhwclnt_avail < 2)
8006                 return (NULL);
8007 
8008         mac_addr = flent->fe_flow_desc.fd_dst_mac;
8009         /*
8010          * We call this when we are setting up the datapath for
8011          * the first non-primary.
8012          */
8013         ASSERT(mip->mi_nactiveclients == 2);
8014         /*
8015          * OK, now we have the primary that needs to be relocated.
8016          */
8017         ngrp =  mac_reserve_rx_group(mcip, mac_addr, B_TRUE);
8018         if (ngrp == NULL)
8019                 return (NULL);
8020         if (mac_rx_switch_group(mcip, defgrp, ngrp) != 0) {
8021                 mac_stop_group(ngrp);
8022                 return (NULL);
8023         }
8024         return (mcip);
8025 }