1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
  27  *
  28  * An instance of the structure aggr_grp_t is allocated for each
  29  * link aggregation group. When created, aggr_grp_t objects are
  30  * entered into the aggr_grp_hash hash table maintained by the modhash
  31  * module. The hash key is the linkid associated with the link
  32  * aggregation group.
  33  *
 * A set of MAC ports is associated with each aggregation group.
  35  *
  36  * Aggr pseudo TX rings
  37  * --------------------
  38  * The underlying ports (NICs) in an aggregation can have TX rings. To
  39  * enhance aggr's performance, these TX rings are made available to the
 * aggr layer as pseudo TX rings. The concept of pseudo rings is not new:
 * it is already present and implemented on the RX side, where they are
 * called pseudo RX rings. The same concept is extended to the TX side where
  43  * each TX ring of an underlying port is reflected in aggr as a pseudo
  44  * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
  45  * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
  46  * TX ring is given to the aggregation layer.
  47  *
  48  * With this change, the outgoing stack depth looks much better:
  49  *
  50  * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
  52  *
  53  * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
  54  * SRS_TX_AGGR and SRS_TX_BW_AGGR.
  55  *
  56  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
  57  * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
  58  * ring belonging to a port on which the packet has to be sent.
  59  * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
  60  * policy and then uses the fanout_hint passed to it to pick a TX ring from
  61  * the selected port.
  62  *
  63  * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
  64  * bandwidth limit is applied first on the outgoing packet and the packets
  65  * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
  66  * particular TX ring.
  67  */
  68 
  69 #include <sys/types.h>
  70 #include <sys/sysmacros.h>
  71 #include <sys/conf.h>
  72 #include <sys/cmn_err.h>
  73 #include <sys/disp.h>
  74 #include <sys/list.h>
  75 #include <sys/ksynch.h>
  76 #include <sys/kmem.h>
  77 #include <sys/stream.h>
  78 #include <sys/modctl.h>
  79 #include <sys/ddi.h>
  80 #include <sys/sunddi.h>
  81 #include <sys/atomic.h>
  82 #include <sys/stat.h>
  83 #include <sys/modhash.h>
  84 #include <sys/id_space.h>
  85 #include <sys/strsun.h>
  86 #include <sys/cred.h>
  87 #include <sys/dlpi.h>
  88 #include <sys/zone.h>
  89 #include <sys/mac_provider.h>
  90 #include <sys/dls.h>
  91 #include <sys/vlan.h>
  92 #include <sys/aggr.h>
  93 #include <sys/aggr_impl.h>
  94 
/*
 * MAC entry points of the aggregation. All of them except aggr_m_unicst()
 * are installed in the aggr_m_callbacks vector below.
 */
static int aggr_m_start(void *);
static void aggr_m_stop(void *);
static int aggr_m_promisc(void *, boolean_t);
static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
static int aggr_m_unicst(void *, const uint8_t *);
static int aggr_m_stat(void *, uint_t, uint64_t *);
static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Port lookup and removal helpers. */
static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
    boolean_t *);

/* Group capability, SDU and margin helpers. */
static void aggr_grp_capab_set(aggr_grp_t *);
static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
static uint_t aggr_grp_max_sdu(aggr_grp_t *);
static uint32_t aggr_grp_max_margin(aggr_grp_t *);
static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);

/* Forward declarations for pseudo ring/group routines and callbacks. */
static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static int aggr_pseudo_disable_intr(mac_intr_handle_t);
static int aggr_pseudo_enable_intr(mac_intr_handle_t);
static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
static void aggr_pseudo_stop_ring(mac_ring_driver_t);
static int aggr_addmac(void *, const uint8_t *);
static int aggr_remmac(void *, const uint8_t *);
static mblk_t *aggr_rx_poll(void *, int);
static void aggr_fill_ring(void *, mac_ring_type_t, const int,
    const int, mac_ring_info_t *, mac_ring_handle_t);
static void aggr_fill_group(void *, mac_ring_type_t, const int,
    mac_group_info_t *, mac_group_handle_t);
 132 
/* kmem cache of aggr_grp_t objects, created in aggr_grp_init() */
static kmem_cache_t     *aggr_grp_cache;
/* hash table of groups, keyed by the group's linkid */
static mod_hash_t       *aggr_grp_hash;
/* held as reader by aggr_grp_count() while it samples aggr_grp_cnt */
static krwlock_t        aggr_grp_lock;
/* group counter reported by aggr_grp_count() */
static uint_t           aggr_grp_cnt;
/* id space for key values used when no key is specified */
static id_space_t       *key_ids;
 138 
 139 #define GRP_HASHSZ              64
 140 #define GRP_HASH_KEY(linkid)    ((mod_hash_key_t)(uintptr_t)linkid)
 141 #define AGGR_PORT_NAME_DELIMIT '-'
 142 
/*
 * Six zero bytes.
 * NOTE(review): presumably used as an all-zero MAC address; the users are
 * not visible in this part of the file -- confirm.
 */
static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};

/* Flag word placed in the first slot of aggr_m_callbacks. */
#define AGGR_M_CALLBACK_FLAGS   \
        (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)

/*
 * MAC callback vector of the aggregation. The initializer is positional,
 * so the order of the entries matters; NULL marks a slot for which no
 * function is supplied.
 * NOTE(review): the meaning of each slot comes from the mac_callbacks_t
 * definition, which is not visible here -- check it before reordering.
 */
static mac_callbacks_t aggr_m_callbacks = {
        AGGR_M_CALLBACK_FLAGS,
        aggr_m_stat,
        aggr_m_start,
        aggr_m_stop,
        aggr_m_promisc,
        aggr_m_multicst,
        NULL,
        NULL,
        NULL,
        aggr_m_ioctl,
        aggr_m_capab_get,
        NULL,
        NULL,
        aggr_m_setprop,
        NULL,
        aggr_m_propinfo
};
 166 
 167 /*ARGSUSED*/
 168 static int
 169 aggr_grp_constructor(void *buf, void *arg, int kmflag)
 170 {
 171         aggr_grp_t *grp = buf;
 172 
 173         bzero(grp, sizeof (*grp));
 174         mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
 175         cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
 176         rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
 177         mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
 178         cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
 179         mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
 180         cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
 181         grp->lg_link_state = LINK_STATE_UNKNOWN;
 182         return (0);
 183 }
 184 
 185 /*ARGSUSED*/
 186 static void
 187 aggr_grp_destructor(void *buf, void *arg)
 188 {
 189         aggr_grp_t *grp = buf;
 190 
 191         if (grp->lg_tx_ports != NULL) {
 192                 kmem_free(grp->lg_tx_ports,
 193                     grp->lg_tx_ports_size * sizeof (aggr_port_t *));
 194         }
 195 
 196         mutex_destroy(&grp->lg_lacp_lock);
 197         cv_destroy(&grp->lg_lacp_cv);
 198         mutex_destroy(&grp->lg_port_lock);
 199         cv_destroy(&grp->lg_port_cv);
 200         rw_destroy(&grp->lg_tx_lock);
 201         mutex_destroy(&grp->lg_tx_flowctl_lock);
 202         cv_destroy(&grp->lg_tx_flowctl_cv);
 203 }
 204 
 205 void
 206 aggr_grp_init(void)
 207 {
 208         aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
 209             sizeof (aggr_grp_t), 0, aggr_grp_constructor,
 210             aggr_grp_destructor, NULL, NULL, NULL, 0);
 211 
 212         aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
 213             GRP_HASHSZ, mod_hash_null_valdtor);
 214         rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
 215         aggr_grp_cnt = 0;
 216 
 217         /*
 218          * Allocate an id space to manage key values (when key is not
 219          * specified). The range of the id space will be from
 220          * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
 221          * uses a 16-bit key.
 222          */
 223         key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
 224         ASSERT(key_ids != NULL);
 225 }
 226 
 227 void
 228 aggr_grp_fini(void)
 229 {
 230         id_space_destroy(key_ids);
 231         rw_destroy(&aggr_grp_lock);
 232         mod_hash_destroy_idhash(aggr_grp_hash);
 233         kmem_cache_destroy(aggr_grp_cache);
 234 }
 235 
 236 uint_t
 237 aggr_grp_count(void)
 238 {
 239         uint_t  count;
 240 
 241         rw_enter(&aggr_grp_lock, RW_READER);
 242         count = aggr_grp_cnt;
 243         rw_exit(&aggr_grp_lock);
 244         return (count);
 245 }
 246 
 247 /*
 248  * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
 249  * requires the mac perimeter, this function holds a reference of the aggr
 250  * and aggr won't call mac_unregister() until this reference drops to 0.
 251  */
 252 void
 253 aggr_grp_port_hold(aggr_port_t *port)
 254 {
 255         aggr_grp_t      *grp = port->lp_grp;
 256 
 257         AGGR_PORT_REFHOLD(port);
 258         mutex_enter(&grp->lg_port_lock);
 259         grp->lg_port_ref++;
 260         mutex_exit(&grp->lg_port_lock);
 261 }
 262 
 263 /*
 264  * Release the reference of the grp and inform aggr_grp_delete() calling
 265  * mac_unregister() is now safe.
 266  */
 267 void
 268 aggr_grp_port_rele(aggr_port_t *port)
 269 {
 270         aggr_grp_t      *grp = port->lp_grp;
 271 
 272         mutex_enter(&grp->lg_port_lock);
 273         if (--grp->lg_port_ref == 0)
 274                 cv_signal(&grp->lg_port_cv);
 275         mutex_exit(&grp->lg_port_lock);
 276         AGGR_PORT_REFRELE(port);
 277 }
 278 
 279 /*
 280  * Wait for the port's lacp timer thread and the port's notification callback
 281  * to exit.
 282  */
 283 void
 284 aggr_grp_port_wait(aggr_grp_t *grp)
 285 {
 286         mutex_enter(&grp->lg_port_lock);
 287         if (grp->lg_port_ref != 0)
 288                 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
 289         mutex_exit(&grp->lg_port_lock);
 290 }
 291 
 292 /*
 293  * Attach a port to a link aggregation group.
 294  *
 295  * A port is attached to a link aggregation group once its speed
 296  * and link state have been verified.
 297  *
 298  * Returns B_TRUE if the group link state or speed has changed. If
 299  * it's the case, the caller must notify the MAC layer via a call
 300  * to mac_link().
 301  */
 302 boolean_t
 303 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
 304 {
 305         boolean_t link_state_changed = B_FALSE;
 306 
 307         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 308         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 309 
 310         if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
 311                 return (B_FALSE);
 312 
 313         /*
 314          * Validate the MAC port link speed and update the group
 315          * link speed if needed.
 316          */
 317         if (port->lp_ifspeed == 0 ||
 318             port->lp_link_state != LINK_STATE_UP ||
 319             port->lp_link_duplex != LINK_DUPLEX_FULL) {
 320                 /*
 321                  * Can't attach a MAC port with unknown link speed,
 322                  * down link, or not in full duplex mode.
 323                  */
 324                 return (B_FALSE);
 325         }
 326 
 327         if (grp->lg_ifspeed == 0) {
 328                 /*
 329                  * The group inherits the speed of the first link being
 330                  * attached.
 331                  */
 332                 grp->lg_ifspeed = port->lp_ifspeed;
 333                 link_state_changed = B_TRUE;
 334         } else if (grp->lg_ifspeed != port->lp_ifspeed) {
 335                 /*
 336                  * The link speed of the MAC port must be the same as
 337                  * the group link speed, as per 802.3ad. Since it is
 338                  * not, the attach is cancelled.
 339                  */
 340                 return (B_FALSE);
 341         }
 342 
 343         grp->lg_nattached_ports++;
 344 
 345         /*
 346          * Update the group link state.
 347          */
 348         if (grp->lg_link_state != LINK_STATE_UP) {
 349                 grp->lg_link_state = LINK_STATE_UP;
 350                 grp->lg_link_duplex = LINK_DUPLEX_FULL;
 351                 link_state_changed = B_TRUE;
 352         }
 353 
 354         /*
 355          * Update port's state.
 356          */
 357         port->lp_state = AGGR_PORT_STATE_ATTACHED;
 358 
 359         aggr_grp_multicst_port(port, B_TRUE);
 360 
 361         /*
 362          * Set port's receive callback
 363          */
 364         mac_rx_set(port->lp_mch, aggr_recv_cb, port);
 365 
 366         /*
 367          * If LACP is OFF, the port can be used to send data as soon
 368          * as its link is up and verified to be compatible with the
 369          * aggregation.
 370          *
 371          * If LACP is active or passive, notify the LACP subsystem, which
 372          * will enable sending on the port following the LACP protocol.
 373          */
 374         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
 375                 aggr_send_port_enable(port);
 376         else
 377                 aggr_lacp_port_attached(port);
 378 
 379         return (link_state_changed);
 380 }
 381 
 382 boolean_t
 383 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
 384 {
 385         boolean_t link_state_changed = B_FALSE;
 386 
 387         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 388         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 389 
 390         /* update state */
 391         if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
 392                 return (B_FALSE);
 393 
 394         mac_rx_clear(port->lp_mch);
 395 
 396         aggr_grp_multicst_port(port, B_FALSE);
 397 
 398         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
 399                 aggr_send_port_disable(port);
 400         else
 401                 aggr_lacp_port_detached(port);
 402 
 403         port->lp_state = AGGR_PORT_STATE_STANDBY;
 404 
 405         grp->lg_nattached_ports--;
 406         if (grp->lg_nattached_ports == 0) {
 407                 /* the last attached MAC port of the group is being detached */
 408                 grp->lg_ifspeed = 0;
 409                 grp->lg_link_state = LINK_STATE_DOWN;
 410                 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
 411                 link_state_changed = B_TRUE;
 412         }
 413 
 414         return (link_state_changed);
 415 }
 416 
 417 /*
 418  * Update the MAC addresses of the constituent ports of the specified
 419  * group. This function is invoked:
 420  * - after creating a new aggregation group.
 421  * - after adding new ports to an aggregation group.
 422  * - after removing a port from a group when the MAC address of
 423  *   that port was used for the MAC address of the group.
 424  * - after the MAC address of a port changed when the MAC address
 425  *   of that port was used for the MAC address of the group.
 426  *
 427  * Return true if the link state of the aggregation changed, for example
 428  * as a result of a failure changing the MAC address of one of the
 429  * constituent ports.
 430  */
 431 boolean_t
 432 aggr_grp_update_ports_mac(aggr_grp_t *grp)
 433 {
 434         aggr_port_t *cport;
 435         boolean_t link_state_changed = B_FALSE;
 436         mac_perim_handle_t mph;
 437 
 438         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 439 
 440         for (cport = grp->lg_ports; cport != NULL;
 441             cport = cport->lp_next) {
 442                 mac_perim_enter_by_mh(cport->lp_mh, &mph);
 443                 if (aggr_port_unicst(cport) != 0) {
 444                         if (aggr_grp_detach_port(grp, cport))
 445                                 link_state_changed = B_TRUE;
 446                 } else {
 447                         /*
 448                          * If a port was detached because of a previous
 449                          * failure changing the MAC address, the port is
 450                          * reattached when it successfully changes the MAC
 451                          * address now, and this might cause the link state
 452                          * of the aggregation to change.
 453                          */
 454                         if (aggr_grp_attach_port(grp, cport))
 455                                 link_state_changed = B_TRUE;
 456                 }
 457                 mac_perim_exit(mph);
 458         }
 459         return (link_state_changed);
 460 }
 461 
 462 /*
 463  * Invoked when the MAC address of a port has changed. If the port's
 464  * MAC address was used for the group MAC address, set mac_addr_changedp
 465  * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
 466  * notification. If the link state changes due to detach/attach of
 467  * the constituent port, set link_state_changedp to B_TRUE to indicate
 468  * to the caller that it should send a MAC_NOTE_LINK notification. In both
 469  * cases, it is the responsibility of the caller to invoke notification
 470  * functions after releasing the the port lock.
 471  */
 472 void
 473 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
 474     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
 475 {
 476         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 477         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 478         ASSERT(mac_addr_changedp != NULL);
 479         ASSERT(link_state_changedp != NULL);
 480 
 481         *mac_addr_changedp = B_FALSE;
 482         *link_state_changedp = B_FALSE;
 483 
 484         if (grp->lg_addr_fixed) {
 485                 /*
 486                  * The group is using a fixed MAC address or an automatic
 487                  * MAC address has not been set.
 488                  */
 489                 return;
 490         }
 491 
 492         if (grp->lg_mac_addr_port == port) {
 493                 /*
 494                  * The MAC address of the port was assigned to the group
 495                  * MAC address. Update the group MAC address.
 496                  */
 497                 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
 498                 *mac_addr_changedp = B_TRUE;
 499         } else {
 500                 /*
 501                  * Update the actual port MAC address to the MAC address
 502                  * of the group.
 503                  */
 504                 if (aggr_port_unicst(port) != 0) {
 505                         *link_state_changedp = aggr_grp_detach_port(grp, port);
 506                 } else {
 507                         /*
 508                          * If a port was detached because of a previous
 509                          * failure changing the MAC address, the port is
 510                          * reattached when it successfully changes the MAC
 511                          * address now, and this might cause the link state
 512                          * of the aggregation to change.
 513                          */
 514                         *link_state_changedp = aggr_grp_attach_port(grp, port);
 515                 }
 516         }
 517 }
 518 
 519 /*
 520  * Add a port to a link aggregation group.
 521  */
 522 static int
 523 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
 524     aggr_port_t **pp)
 525 {
 526         aggr_port_t *port, **cport;
 527         mac_perim_handle_t mph;
 528         zoneid_t port_zoneid = ALL_ZONES;
 529         int err;
 530 
 531         /* The port must be int the same zone as the aggregation. */
 532         if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
 533                 port_zoneid = GLOBAL_ZONEID;
 534         if (grp->lg_zoneid != port_zoneid)
 535                 return (EBUSY);
 536 
 537         /*
 538          * lg_mh could be NULL when the function is called during the creation
 539          * of the aggregation.
 540          */
 541         ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
 542 
 543         /* create new port */
 544         err = aggr_port_create(grp, port_linkid, force, &port);
 545         if (err != 0)
 546                 return (err);
 547 
 548         mac_perim_enter_by_mh(port->lp_mh, &mph);
 549 
 550         /* add port to list of group constituent ports */
 551         cport = &grp->lg_ports;
 552         while (*cport != NULL)
 553                 cport = &((*cport)->lp_next);
 554         *cport = port;
 555 
 556         /*
 557          * Back reference to the group it is member of. A port always
 558          * holds a reference to its group to ensure that the back
 559          * reference is always valid.
 560          */
 561         port->lp_grp = grp;
 562         AGGR_GRP_REFHOLD(grp);
 563         grp->lg_nports++;
 564 
 565         aggr_lacp_init_port(port);
 566         mac_perim_exit(mph);
 567 
 568         if (pp != NULL)
 569                 *pp = port;
 570 
 571         return (0);
 572 }
 573 
 574 /*
 575  * Add a pseudo RX ring for the given HW ring handle.
 576  */
 577 static int
 578 aggr_add_pseudo_rx_ring(aggr_port_t *port,
 579     aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
 580 {
 581         aggr_pseudo_rx_ring_t   *ring;
 582         int                     err;
 583         int                     j;
 584 
 585         for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
 586                 ring = rx_grp->arg_rings + j;
 587                 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
 588                         break;
 589         }
 590 
 591         /*
 592          * No slot for this new RX ring.
 593          */
 594         if (j == MAX_RINGS_PER_GROUP)
 595                 return (EIO);
 596 
 597         ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
 598         ring->arr_hw_rh = hw_rh;
 599         ring->arr_port = port;
 600         rx_grp->arg_ring_cnt++;
 601 
 602         /*
 603          * The group is already registered, dynamically add a new ring to the
 604          * mac group.
 605          */
 606         if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
 607                 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
 608                 ring->arr_hw_rh = NULL;
 609                 ring->arr_port = NULL;
 610                 rx_grp->arg_ring_cnt--;
 611         } else {
 612                 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
 613                     mac_find_ring(rx_grp->arg_gh, j));
 614         }
 615         return (err);
 616 }
 617 
 618 /*
 619  * Remove the pseudo RX ring of the given HW ring handle.
 620  */
 621 static void
 622 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
 623 {
 624         aggr_pseudo_rx_ring_t   *ring;
 625         int                     j;
 626 
 627         for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
 628                 ring = rx_grp->arg_rings + j;
 629                 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
 630                     ring->arr_hw_rh != hw_rh) {
 631                         continue;
 632                 }
 633 
 634                 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
 635 
 636                 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
 637                 ring->arr_hw_rh = NULL;
 638                 ring->arr_port = NULL;
 639                 rx_grp->arg_ring_cnt--;
 640                 mac_hwring_teardown(hw_rh);
 641                 break;
 642         }
 643 }
 644 
 645 /*
 646  * This function is called to create pseudo rings over the hardware rings of
 647  * the underlying device. Note that there is a 1:1 mapping between the pseudo
 648  * RX rings of the aggr and the hardware rings of the underlying port.
 649  */
 650 static int
 651 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
 652 {
 653         aggr_grp_t              *grp = port->lp_grp;
 654         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP];
 655         aggr_unicst_addr_t      *addr, *a;
 656         mac_perim_handle_t      pmph;
 657         int                     hw_rh_cnt, i = 0, j;
 658         int                     err = 0;
 659 
 660         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 661         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 662 
 663         /*
 664          * This function must be called after the aggr registers its mac
 665          * and its RX group has been initialized.
 666          */
 667         ASSERT(rx_grp->arg_gh != NULL);
 668 
 669         /*
 670          * Get the list the the underlying HW rings.
 671          */
 672         hw_rh_cnt = mac_hwrings_get(port->lp_mch,
 673             &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
 674 
 675         if (port->lp_hwgh != NULL) {
 676                 /*
 677                  * Quiesce the HW ring and the mac srs on the ring. Note
 678                  * that the HW ring will be restarted when the pseudo ring
 679                  * is started. At that time all the packets will be
 680                  * directly passed up to the pseudo RX ring and handled
 681                  * by mac srs created over the pseudo RX ring.
 682                  */
 683                 mac_rx_client_quiesce(port->lp_mch);
 684                 mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
 685         }
 686 
 687         /*
 688          * Add all the unicast addresses to the newly added port.
 689          */
 690         for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
 691                 if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
 692                         break;
 693         }
 694 
 695         for (i = 0; err == 0 && i < hw_rh_cnt; i++)
 696                 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
 697 
 698         if (err != 0) {
 699                 for (j = 0; j < i; j++)
 700                         aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
 701 
 702                 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
 703                         aggr_port_remmac(port, a->aua_addr);
 704 
 705                 if (port->lp_hwgh != NULL) {
 706                         mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
 707                         mac_rx_client_restart(port->lp_mch);
 708                         port->lp_hwgh = NULL;
 709                 }
 710         } else {
 711                 port->lp_rx_grp_added = B_TRUE;
 712         }
 713 done:
 714         mac_perim_exit(pmph);
 715         return (err);
 716 }
 717 
 718 /*
 719  * This function is called by aggr to remove pseudo RX rings over the
 720  * HW rings of the underlying port.
 721  */
 722 static void
 723 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
 724 {
 725         aggr_grp_t              *grp = port->lp_grp;
 726         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP];
 727         aggr_unicst_addr_t      *addr;
 728         mac_group_handle_t      hwgh;
 729         mac_perim_handle_t      pmph;
 730         int                     hw_rh_cnt, i;
 731 
 732         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 733         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 734 
 735         if (!port->lp_rx_grp_added)
 736                 goto done;
 737 
 738         ASSERT(rx_grp->arg_gh != NULL);
 739         hw_rh_cnt = mac_hwrings_get(port->lp_mch,
 740             &hwgh, hw_rh, MAC_RING_TYPE_RX);
 741 
 742         /*
 743          * If hw_rh_cnt is 0, it means that the underlying port does not
 744          * support RX rings. Directly return in this case.
 745          */
 746         for (i = 0; i < hw_rh_cnt; i++)
 747                 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
 748 
 749         for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
 750                 aggr_port_remmac(port, addr->aua_addr);
 751 
 752         if (port->lp_hwgh != NULL) {
 753                 port->lp_hwgh = NULL;
 754 
 755                 /*
 756                  * First clear the permanent-quiesced flag of the RX srs then
 757                  * restart the HW ring and the mac srs on the ring. Note that
 758                  * the HW ring and associated SRS will soon been removed when
 759                  * the port is removed from the aggr.
 760                  */
 761                 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
 762                 mac_rx_client_restart(port->lp_mch);
 763         }
 764 
 765         port->lp_rx_grp_added = B_FALSE;
 766 done:
 767         mac_perim_exit(pmph);
 768 }
 769 
 770 /*
 771  * Add a pseudo TX ring for the given HW ring handle.
 772  */
 773 static int
 774 aggr_add_pseudo_tx_ring(aggr_port_t *port,
 775     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
 776     mac_ring_handle_t *pseudo_rh)
 777 {
 778         aggr_pseudo_tx_ring_t   *ring;
 779         int                     err;
 780         int                     i;
 781 
 782         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 783         for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
 784                 ring = tx_grp->atg_rings + i;
 785                 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
 786                         break;
 787         }
 788         /*
 789          * No slot for this new TX ring.
 790          */
 791         if (i == MAX_RINGS_PER_GROUP)
 792                 return (EIO);
 793         /*
 794          * The following 4 statements needs to be done before
 795          * calling mac_group_add_ring(). Otherwise it will
 796          * result in an assertion failure in mac_init_ring().
 797          */
 798         ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
 799         ring->atr_hw_rh = hw_rh;
 800         ring->atr_port = port;
 801         tx_grp->atg_ring_cnt++;
 802 
 803         /*
 804          * The TX side has no concept of ring groups unlike RX groups.
 805          * There is just a single group which stores all the TX rings.
 806          * This group will be used to store aggr's pseudo TX rings.
 807          */
 808         if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
 809                 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
 810                 ring->atr_hw_rh = NULL;
 811                 ring->atr_port = NULL;
 812                 tx_grp->atg_ring_cnt--;
 813         } else {
 814                 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
 815                 if (hw_rh != NULL) {
 816                         mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
 817                             mac_find_ring(tx_grp->atg_gh, i));
 818                 }
 819         }
 820         return (err);
 821 }
 822 
 823 /*
 824  * Remove the pseudo TX ring of the given HW ring handle.
 825  */
 826 static void
 827 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
 828     mac_ring_handle_t pseudo_hw_rh)
 829 {
 830         aggr_pseudo_tx_ring_t   *ring;
 831         int                     i;
 832 
 833         for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
 834                 ring = tx_grp->atg_rings + i;
 835                 if (ring->atr_rh != pseudo_hw_rh)
 836                         continue;
 837 
 838                 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
 839                 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
 840                 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
 841                 mac_hwring_teardown(ring->atr_hw_rh);
 842                 ring->atr_hw_rh = NULL;
 843                 ring->atr_port = NULL;
 844                 tx_grp->atg_ring_cnt--;
 845                 break;
 846         }
 847 }
 848 
 849 /*
 850  * This function is called to create pseudo rings over hardware rings of
 851  * the underlying device. There is a 1:1 mapping between the pseudo TX
 852  * rings of the aggr and the hardware rings of the underlying port.
 853  */
 854 static int
 855 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
 856 {
 857         aggr_grp_t              *grp = port->lp_grp;
 858         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
 859         mac_perim_handle_t      pmph;
 860         int                     hw_rh_cnt, i = 0, j;
 861         int                     err = 0;
 862 
 863         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 864         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 865 
 866         /*
 867          * Get the list the the underlying HW rings.
 868          */
 869         hw_rh_cnt = mac_hwrings_get(port->lp_mch,
 870             NULL, hw_rh, MAC_RING_TYPE_TX);
 871 
 872         /*
 873          * Even if the underlying NIC does not have TX rings, we
 874          * still make a psuedo TX ring for that NIC with NULL as
 875          * the ring handle.
 876          */
 877         if (hw_rh_cnt == 0)
 878                 port->lp_tx_ring_cnt = 1;
 879         else
 880                 port->lp_tx_ring_cnt = hw_rh_cnt;
 881 
 882         port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
 883             port->lp_tx_ring_cnt), KM_SLEEP);
 884         port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
 885             port->lp_tx_ring_cnt), KM_SLEEP);
 886 
 887         if (hw_rh_cnt == 0) {
 888                 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
 889                     NULL, &pseudo_rh)) == 0) {
 890                         port->lp_tx_rings[0] = NULL;
 891                         port->lp_pseudo_tx_rings[0] = pseudo_rh;
 892                 }
 893         } else {
 894                 for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
 895                         err = aggr_add_pseudo_tx_ring(port,
 896                             tx_grp, hw_rh[i], &pseudo_rh);
 897                         if (err != 0)
 898                                 break;
 899                         port->lp_tx_rings[i] = hw_rh[i];
 900                         port->lp_pseudo_tx_rings[i] = pseudo_rh;
 901                 }
 902         }
 903 
 904         if (err != 0) {
 905                 if (hw_rh_cnt != 0) {
 906                         for (j = 0; j < i; j++) {
 907                                 aggr_rem_pseudo_tx_ring(tx_grp,
 908                                     port->lp_pseudo_tx_rings[j]);
 909                         }
 910                 }
 911                 kmem_free(port->lp_tx_rings,
 912                     (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
 913                 kmem_free(port->lp_pseudo_tx_rings,
 914                     (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
 915                 port->lp_tx_ring_cnt = 0;
 916         } else {
 917                 port->lp_tx_grp_added = B_TRUE;
 918                 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
 919                     aggr_tx_ring_update, port);
 920         }
 921         mac_perim_exit(pmph);
 922         return (err);
 923 }
 924 
 925 /*
 926  * This function is called by aggr to remove pseudo TX rings over the
 927  * HW rings of the underlying port.
 928  */
 929 static void
 930 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
 931 {
 932         aggr_grp_t              *grp = port->lp_grp;
 933         mac_perim_handle_t      pmph;
 934         int                     i;
 935 
 936         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 937         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 938 
 939         if (!port->lp_tx_grp_added)
 940                 goto done;
 941 
 942         ASSERT(tx_grp->atg_gh != NULL);
 943 
 944         for (i = 0; i < port->lp_tx_ring_cnt; i++)
 945                 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
 946 
 947         kmem_free(port->lp_tx_rings,
 948             (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
 949         kmem_free(port->lp_pseudo_tx_rings,
 950             (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
 951 
 952         port->lp_tx_ring_cnt = 0;
 953         (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
 954         port->lp_tx_grp_added = B_FALSE;
 955 done:
 956         mac_perim_exit(pmph);
 957 }
 958 
 959 static int
 960 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
 961 {
 962         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
 963         return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
 964 }
 965 
 966 static int
 967 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
 968 {
 969         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
 970         return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
 971 }
 972 
 973 static int
 974 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
 975 {
 976         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
 977         int err;
 978 
 979         err = mac_hwring_start(rr_ring->arr_hw_rh);
 980         if (err == 0)
 981                 rr_ring->arr_gen = mr_gen;
 982         return (err);
 983 }
 984 
 985 static void
 986 aggr_pseudo_stop_ring(mac_ring_driver_t arg)
 987 {
 988         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
 989         mac_hwring_stop(rr_ring->arr_hw_rh);
 990 }
 991 
 992 /*
 993  * Add one or more ports to an existing link aggregation group.
 994  */
 995 int
 996 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
 997     laioc_port_t *ports)
 998 {
 999         int rc, i, nadded = 0;
1000         aggr_grp_t *grp = NULL;
1001         aggr_port_t *port;
1002         boolean_t link_state_changed = B_FALSE;
1003         mac_perim_handle_t mph, pmph;
1004 
1005         /* get group corresponding to linkid */
1006         rw_enter(&aggr_grp_lock, RW_READER);
1007         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1008             (mod_hash_val_t *)&grp) != 0) {
1009                 rw_exit(&aggr_grp_lock);
1010                 return (ENOENT);
1011         }
1012         AGGR_GRP_REFHOLD(grp);
1013 
1014         /*
1015          * Hold the perimeter so that the aggregation won't be destroyed.
1016          */
1017         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1018         rw_exit(&aggr_grp_lock);
1019 
1020         /* add the specified ports to group */
1021         for (i = 0; i < nports; i++) {
1022                 /* add port to group */
1023                 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1024                     force, &port)) != 0) {
1025                         goto bail;
1026                 }
1027                 ASSERT(port != NULL);
1028                 nadded++;
1029 
1030                 /* check capabilities */
1031                 if (!aggr_grp_capab_check(grp, port) ||
1032                     !aggr_grp_sdu_check(grp, port) ||
1033                     !aggr_grp_margin_check(grp, port)) {
1034                         rc = ENOTSUP;
1035                         goto bail;
1036                 }
1037 
1038                 /*
1039                  * Create the pseudo ring for each HW ring of the underlying
1040                  * port.
1041                  */
1042                 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1043                 if (rc != 0)
1044                         goto bail;
1045                 rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1046                 if (rc != 0)
1047                         goto bail;
1048 
1049                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1050 
1051                 /* set LACP mode */
1052                 aggr_port_lacp_set_mode(grp, port);
1053 
1054                 /* start port if group has already been started */
1055                 if (grp->lg_started) {
1056                         rc = aggr_port_start(port);
1057                         if (rc != 0) {
1058                                 mac_perim_exit(pmph);
1059                                 goto bail;
1060                         }
1061 
1062                         /*
1063                          * Turn on the promiscuous mode over the port when it
1064                          * is requested to be turned on to receive the
1065                          * non-primary address over a port, or the promiscous
1066                          * mode is enabled over the aggr.
1067                          */
1068                         if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1069                                 rc = aggr_port_promisc(port, B_TRUE);
1070                                 if (rc != 0) {
1071                                         mac_perim_exit(pmph);
1072                                         goto bail;
1073                                 }
1074                         }
1075                 }
1076                 mac_perim_exit(pmph);
1077 
1078                 /*
1079                  * Attach each port if necessary.
1080                  */
1081                 if (aggr_port_notify_link(grp, port))
1082                         link_state_changed = B_TRUE;
1083 
1084                 /*
1085                  * Initialize the callback functions for this port.
1086                  */
1087                 aggr_port_init_callbacks(port);
1088         }
1089 
1090         /* update the MAC address of the constituent ports */
1091         if (aggr_grp_update_ports_mac(grp))
1092                 link_state_changed = B_TRUE;
1093 
1094         if (link_state_changed)
1095                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1096 
1097 bail:
1098         if (rc != 0) {
1099                 /* stop and remove ports that have been added */
1100                 for (i = 0; i < nadded; i++) {
1101                         port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1102                         ASSERT(port != NULL);
1103                         if (grp->lg_started) {
1104                                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1105                                 (void) aggr_port_promisc(port, B_FALSE);
1106                                 aggr_port_stop(port);
1107                                 mac_perim_exit(pmph);
1108                         }
1109                         aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1110                         aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1111                         (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1112                 }
1113         }
1114 
1115         mac_perim_exit(mph);
1116         AGGR_GRP_REFRELE(grp);
1117         return (rc);
1118 }
1119 
1120 static int
1121 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1122     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1123     aggr_lacp_timer_t lacp_timer)
1124 {
1125         boolean_t mac_addr_changed = B_FALSE;
1126         boolean_t link_state_changed = B_FALSE;
1127         mac_perim_handle_t pmph;
1128 
1129         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1130 
1131         /* validate fixed address if specified */
1132         if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1133             ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1134             (mac_addr[0] & 0x01))) {
1135                 return (EINVAL);
1136         }
1137 
1138         /* update policy if requested */
1139         if (update_mask & AGGR_MODIFY_POLICY)
1140                 aggr_send_update_policy(grp, policy);
1141 
1142         /* update unicast MAC address if requested */
1143         if (update_mask & AGGR_MODIFY_MAC) {
1144                 if (mac_fixed) {
1145                         /* user-supplied MAC address */
1146                         grp->lg_mac_addr_port = NULL;
1147                         if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1148                                 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1149                                 mac_addr_changed = B_TRUE;
1150                         }
1151                 } else if (grp->lg_addr_fixed) {
1152                         /* switch from user-supplied to automatic */
1153                         aggr_port_t *port = grp->lg_ports;
1154 
1155                         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1156                         bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1157                         grp->lg_mac_addr_port = port;
1158                         mac_addr_changed = B_TRUE;
1159                         mac_perim_exit(pmph);
1160                 }
1161                 grp->lg_addr_fixed = mac_fixed;
1162         }
1163 
1164         if (mac_addr_changed)
1165                 link_state_changed = aggr_grp_update_ports_mac(grp);
1166 
1167         if (update_mask & AGGR_MODIFY_LACP_MODE)
1168                 aggr_lacp_update_mode(grp, lacp_mode);
1169 
1170         if (update_mask & AGGR_MODIFY_LACP_TIMER)
1171                 aggr_lacp_update_timer(grp, lacp_timer);
1172 
1173         if (link_state_changed)
1174                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1175 
1176         if (mac_addr_changed)
1177                 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1178 
1179         return (0);
1180 }
1181 
1182 /*
1183  * Update properties of an existing link aggregation group.
1184  */
1185 int
1186 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1187     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1188     aggr_lacp_timer_t lacp_timer)
1189 {
1190         aggr_grp_t *grp = NULL;
1191         mac_perim_handle_t mph;
1192         int err;
1193 
1194         /* get group corresponding to linkid */
1195         rw_enter(&aggr_grp_lock, RW_READER);
1196         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1197             (mod_hash_val_t *)&grp) != 0) {
1198                 rw_exit(&aggr_grp_lock);
1199                 return (ENOENT);
1200         }
1201         AGGR_GRP_REFHOLD(grp);
1202 
1203         /*
1204          * Hold the perimeter so that the aggregation won't be destroyed.
1205          */
1206         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1207         rw_exit(&aggr_grp_lock);
1208 
1209         err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1210             mac_addr, lacp_mode, lacp_timer);
1211 
1212         mac_perim_exit(mph);
1213         AGGR_GRP_REFRELE(grp);
1214         return (err);
1215 }
1216 
1217 /*
1218  * Create a new link aggregation group upon request from administrator.
1219  * Returns 0 on success, an errno on failure.
1220  */
1221 int
1222 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1223     laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1224     uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1225     cred_t *credp)
1226 {
1227         aggr_grp_t *grp = NULL;
1228         aggr_port_t *port;
1229         mac_register_t *mac;
1230         boolean_t link_state_changed;
1231         mac_perim_handle_t mph;
1232         int err;
1233         int i;
1234         kt_did_t tid = 0;
1235 
1236         /* need at least one port */
1237         if (nports == 0)
1238                 return (EINVAL);
1239 
1240         rw_enter(&aggr_grp_lock, RW_WRITER);
1241 
1242         /* does a group with the same linkid already exist? */
1243         err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1244             (mod_hash_val_t *)&grp);
1245         if (err == 0) {
1246                 rw_exit(&aggr_grp_lock);
1247                 return (EEXIST);
1248         }
1249 
1250         grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1251 
1252         grp->lg_refs = 1;
1253         grp->lg_closing = B_FALSE;
1254         grp->lg_force = force;
1255         grp->lg_linkid = linkid;
1256         grp->lg_zoneid = crgetzoneid(credp);
1257         grp->lg_ifspeed = 0;
1258         grp->lg_link_state = LINK_STATE_UNKNOWN;
1259         grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1260         grp->lg_started = B_FALSE;
1261         grp->lg_promisc = B_FALSE;
1262         grp->lg_lacp_done = B_FALSE;
1263         grp->lg_tx_notify_done = B_FALSE;
1264         grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1265         grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1266             aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1267         grp->lg_tx_notify_thread = thread_create(NULL, 0,
1268             aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1269         grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1270             MAX_RINGS_PER_GROUP), KM_SLEEP);
1271         grp->lg_tx_blocked_cnt = 0;
1272         bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
1273         bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1274         aggr_lacp_init_grp(grp);
1275 
1276         /* add MAC ports to group */
1277         grp->lg_ports = NULL;
1278         grp->lg_nports = 0;
1279         grp->lg_nattached_ports = 0;
1280         grp->lg_ntx_ports = 0;
1281 
1282         /*
1283          * If key is not specified by the user, allocate the key.
1284          */
1285         if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1286                 err = ENOMEM;
1287                 goto bail;
1288         }
1289         grp->lg_key = key;
1290 
1291         for (i = 0; i < nports; i++) {
1292                 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL);
1293                 if (err != 0)
1294                         goto bail;
1295         }
1296 
1297         /*
1298          * If no explicit MAC address was specified by the administrator,
1299          * set it to the MAC address of the first port.
1300          */
1301         grp->lg_addr_fixed = mac_fixed;
1302         if (grp->lg_addr_fixed) {
1303                 /* validate specified address */
1304                 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1305                         err = EINVAL;
1306                         goto bail;
1307                 }
1308                 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1309         } else {
1310                 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1311                 grp->lg_mac_addr_port = grp->lg_ports;
1312         }
1313 
1314         /* set the initial group capabilities */
1315         aggr_grp_capab_set(grp);
1316 
1317         if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1318                 err = ENOMEM;
1319                 goto bail;
1320         }
1321         mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1322         mac->m_driver = grp;
1323         mac->m_dip = aggr_dip;
1324         mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1325         mac->m_src_addr = grp->lg_addr;
1326         mac->m_callbacks = &aggr_m_callbacks;
1327         mac->m_min_sdu = 0;
1328         mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1329         mac->m_margin = aggr_grp_max_margin(grp);
1330         mac->m_v12n = MAC_VIRT_LEVEL1;
1331         err = mac_register(mac, &grp->lg_mh);
1332         mac_free(mac);
1333         if (err != 0)
1334                 goto bail;
1335 
1336         err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1337         if (err != 0) {
1338                 (void) mac_unregister(grp->lg_mh);
1339                 grp->lg_mh = NULL;
1340                 goto bail;
1341         }
1342 
1343         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1344 
1345         /*
1346          * Update the MAC address of the constituent ports.
1347          * None of the port is attached at this time, the link state of the
1348          * aggregation will not change.
1349          */
1350         link_state_changed = aggr_grp_update_ports_mac(grp);
1351         ASSERT(!link_state_changed);
1352 
1353         /* update outbound load balancing policy */
1354         aggr_send_update_policy(grp, policy);
1355 
1356         /* set LACP mode */
1357         aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1358 
1359         /*
1360          * Attach each port if necessary.
1361          */
1362         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1363                 /*
1364                  * Create the pseudo ring for each HW ring of the underlying
1365                  * port. Note that this is done after the aggr registers the
1366                  * mac.
1367                  */
1368                 VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
1369                 VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
1370                 if (aggr_port_notify_link(grp, port))
1371                         link_state_changed = B_TRUE;
1372 
1373                 /*
1374                  * Initialize the callback functions for this port.
1375                  */
1376                 aggr_port_init_callbacks(port);
1377         }
1378 
1379         if (link_state_changed)
1380                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1381 
1382         /* add new group to hash table */
1383         err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1384             (mod_hash_val_t)grp);
1385         ASSERT(err == 0);
1386         aggr_grp_cnt++;
1387 
1388         mac_perim_exit(mph);
1389         rw_exit(&aggr_grp_lock);
1390         return (0);
1391 
1392 bail:
1393 
1394         grp->lg_closing = B_TRUE;
1395 
1396         port = grp->lg_ports;
1397         while (port != NULL) {
1398                 aggr_port_t *cport;
1399 
1400                 cport = port->lp_next;
1401                 aggr_port_delete(port);
1402                 port = cport;
1403         }
1404 
1405         /*
1406          * Inform the lacp_rx thread to exit.
1407          */
1408         mutex_enter(&grp->lg_lacp_lock);
1409         grp->lg_lacp_done = B_TRUE;
1410         cv_signal(&grp->lg_lacp_cv);
1411         while (grp->lg_lacp_rx_thread != NULL)
1412                 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1413         mutex_exit(&grp->lg_lacp_lock);
1414         /*
1415          * Inform the tx_notify thread to exit.
1416          */
1417         mutex_enter(&grp->lg_tx_flowctl_lock);
1418         if (grp->lg_tx_notify_thread != NULL) {
1419                 tid = grp->lg_tx_notify_thread->t_did;
1420                 grp->lg_tx_notify_done = B_TRUE;
1421                 cv_signal(&grp->lg_tx_flowctl_cv);
1422         }
1423         mutex_exit(&grp->lg_tx_flowctl_lock);
1424         if (tid != 0)
1425                 thread_join(tid);
1426 
1427         kmem_free(grp->lg_tx_blocked_rings,
1428             (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1429         rw_exit(&aggr_grp_lock);
1430         AGGR_GRP_REFRELE(grp);
1431         return (err);
1432 }
1433 
1434 /*
1435  * Return a pointer to the member of a group with specified linkid.
1436  */
1437 static aggr_port_t *
1438 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1439 {
1440         aggr_port_t *port;
1441 
1442         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1443 
1444         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1445                 if (port->lp_linkid == linkid)
1446                         break;
1447         }
1448 
1449         return (port);
1450 }
1451 
1452 /*
1453  * Stop, detach and remove a port from a link aggregation group.
1454  */
1455 static int
1456 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1457     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1458 {
1459         int rc = 0;
1460         aggr_port_t **pport;
1461         boolean_t mac_addr_changed = B_FALSE;
1462         boolean_t link_state_changed = B_FALSE;
1463         mac_perim_handle_t mph;
1464         uint64_t val;
1465         uint_t i;
1466         uint_t stat;
1467 
1468         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1469         ASSERT(grp->lg_nports > 1);
1470         ASSERT(!grp->lg_closing);
1471 
1472         /* unlink port */
1473         for (pport = &grp->lg_ports; *pport != port;
1474             pport = &(*pport)->lp_next) {
1475                 if (*pport == NULL) {
1476                         rc = ENOENT;
1477                         goto done;
1478                 }
1479         }
1480         *pport = port->lp_next;
1481 
1482         mac_perim_enter_by_mh(port->lp_mh, &mph);
1483 
1484         /*
1485          * If the MAC address of the port being removed was assigned
1486          * to the group, update the group MAC address
1487          * using the MAC address of a different port.
1488          */
1489         if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1490                 /*
1491                  * Set the MAC address of the group to the
1492                  * MAC address of its first port.
1493                  */
1494                 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1495                 grp->lg_mac_addr_port = grp->lg_ports;
1496                 mac_addr_changed = B_TRUE;
1497         }
1498 
1499         link_state_changed = aggr_grp_detach_port(grp, port);
1500 
1501         /*
1502          * Add the counter statistics of the ports while it was aggregated
1503          * to the group's residual statistics.  This is done by obtaining
1504          * the current counter from the underlying MAC then subtracting the
1505          * value of the counter at the moment it was added to the
1506          * aggregation.
1507          */
1508         for (i = 0; i < MAC_NSTAT; i++) {
1509                 stat = i + MAC_STAT_MIN;
1510                 if (!MAC_STAT_ISACOUNTER(stat))
1511                         continue;
1512                 val = aggr_port_stat(port, stat);
1513                 val -= port->lp_stat[i];
1514                 grp->lg_stat[i] += val;
1515         }
1516         for (i = 0; i < ETHER_NSTAT; i++) {
1517                 stat = i + MACTYPE_STAT_MIN;
1518                 if (!ETHER_STAT_ISACOUNTER(stat))
1519                         continue;
1520                 val = aggr_port_stat(port, stat);
1521                 val -= port->lp_ether_stat[i];
1522                 grp->lg_ether_stat[i] += val;
1523         }
1524 
1525         grp->lg_nports--;
1526         mac_perim_exit(mph);
1527 
1528         aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1529         aggr_port_delete(port);
1530 
1531         /*
1532          * If the group MAC address has changed, update the MAC address of
1533          * the remaining constituent ports according to the new MAC
1534          * address of the group.
1535          */
1536         if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1537                 link_state_changed = B_TRUE;
1538 
1539 done:
1540         if (mac_addr_changedp != NULL)
1541                 *mac_addr_changedp = mac_addr_changed;
1542         if (link_state_changedp != NULL)
1543                 *link_state_changedp = link_state_changed;
1544 
1545         return (rc);
1546 }
1547 
1548 /*
1549  * Remove one or more ports from an existing link aggregation group.
1550  */
1551 int
1552 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1553 {
1554         int rc = 0, i;
1555         aggr_grp_t *grp = NULL;
1556         aggr_port_t *port;
1557         boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1558         boolean_t link_state_update = B_FALSE, link_state_changed;
1559         mac_perim_handle_t mph, pmph;
1560 
1561         /* get group corresponding to linkid */
1562         rw_enter(&aggr_grp_lock, RW_READER);
1563         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1564             (mod_hash_val_t *)&grp) != 0) {
1565                 rw_exit(&aggr_grp_lock);
1566                 return (ENOENT);
1567         }
1568         AGGR_GRP_REFHOLD(grp);
1569 
1570         /*
1571          * Hold the perimeter so that the aggregation won't be destroyed.
1572          */
1573         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1574         rw_exit(&aggr_grp_lock);
1575 
1576         /* we need to keep at least one port per group */
1577         if (nports >= grp->lg_nports) {
1578                 rc = EINVAL;
1579                 goto bail;
1580         }
1581 
1582         /* first verify that all the groups are valid */
1583         for (i = 0; i < nports; i++) {
1584                 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1585                         /* port not found */
1586                         rc = ENOENT;
1587                         goto bail;
1588                 }
1589         }
1590 
1591         /* clear the promiscous mode for the specified ports */
1592         for (i = 0; i < nports && rc == 0; i++) {
1593                 /* lookup port */
1594                 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1595                 ASSERT(port != NULL);
1596 
1597                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1598                 rc = aggr_port_promisc(port, B_FALSE);
1599                 mac_perim_exit(pmph);
1600         }
1601         if (rc != 0) {
1602                 for (i = 0; i < nports; i++) {
1603                         port = aggr_grp_port_lookup(grp,
1604                             ports[i].lp_linkid);
1605                         ASSERT(port != NULL);
1606 
1607                         /*
1608                          * Turn the promiscuous mode back on if it is required
1609                          * to receive the non-primary address over a port, or
1610                          * the promiscous mode is enabled over the aggr.
1611                          */
1612                         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1613                         if (port->lp_started && (grp->lg_promisc ||
1614                             port->lp_prom_addr != NULL)) {
1615                                 (void) aggr_port_promisc(port, B_TRUE);
1616                         }
1617                         mac_perim_exit(pmph);
1618                 }
1619                 goto bail;
1620         }
1621 
1622         /* remove the specified ports from group */
1623         for (i = 0; i < nports; i++) {
1624                 /* lookup port */
1625                 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1626                 ASSERT(port != NULL);
1627 
1628                 /* stop port if group has already been started */
1629                 if (grp->lg_started) {
1630                         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1631                         aggr_port_stop(port);
1632                         mac_perim_exit(pmph);
1633                 }
1634 
1635                 /*
1636                  * aggr_rem_pseudo_tx_group() is not called here. Instead
1637                  * it is called from inside aggr_grp_rem_port() after the
1638                  * port has been detached. The reason is that
1639                  * aggr_rem_pseudo_tx_group() removes one ring at a time
1640                  * and if there is still traffic going on, then there
1641                  * is the possibility of aggr_find_tx_ring() returning a
1642                  * removed ring for transmission. Once the port has been
1643                  * detached, that port will not be used and
1644                  * aggr_find_tx_ring() will not return any rings
1645                  * belonging to it.
1646                  */
1647                 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1648 
1649                 /* remove port from group */
1650                 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1651                     &link_state_changed);
1652                 ASSERT(rc == 0);
1653                 mac_addr_update = mac_addr_update || mac_addr_changed;
1654                 link_state_update = link_state_update || link_state_changed;
1655         }
1656 
1657 bail:
1658         if (mac_addr_update)
1659                 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1660         if (link_state_update)
1661                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1662 
1663         mac_perim_exit(mph);
1664         AGGR_GRP_REFRELE(grp);
1665 
1666         return (rc);
1667 }
1668 
1669 int
1670 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1671 {
1672         aggr_grp_t *grp = NULL;
1673         aggr_port_t *port, *cport;
1674         datalink_id_t tmpid;
1675         mod_hash_val_t val;
1676         mac_perim_handle_t mph, pmph;
1677         int err;
1678         kt_did_t tid = 0;
1679 
1680         rw_enter(&aggr_grp_lock, RW_WRITER);
1681 
1682         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1683             (mod_hash_val_t *)&grp) != 0) {
1684                 rw_exit(&aggr_grp_lock);
1685                 return (ENOENT);
1686         }
1687 
1688         /*
1689          * Note that dls_devnet_destroy() must be called before lg_lock is
1690          * held. Otherwise, it will deadlock if another thread is in
1691          * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1692          * dls_devnet_destroy() needs to delete.
1693          */
1694         if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1695                 rw_exit(&aggr_grp_lock);
1696                 return (err);
1697         }
1698         ASSERT(linkid == tmpid);
1699 
1700         /*
1701          * Unregister from the MAC service module. Since this can
1702          * fail if a client hasn't closed the MAC port, we gracefully
1703          * fail the operation.
1704          */
1705         if ((err = mac_disable(grp->lg_mh)) != 0) {
1706                 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1707                 rw_exit(&aggr_grp_lock);
1708                 return (err);
1709         }
1710         (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1711         ASSERT(grp == (aggr_grp_t *)val);
1712 
1713         ASSERT(aggr_grp_cnt > 0);
1714         aggr_grp_cnt--;
1715         rw_exit(&aggr_grp_lock);
1716 
1717         /*
1718          * Inform the lacp_rx thread to exit.
1719          */
1720         mutex_enter(&grp->lg_lacp_lock);
1721         grp->lg_lacp_done = B_TRUE;
1722         cv_signal(&grp->lg_lacp_cv);
1723         while (grp->lg_lacp_rx_thread != NULL)
1724                 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1725         mutex_exit(&grp->lg_lacp_lock);
1726         /*
1727          * Inform the tx_notify_thread to exit.
1728          */
1729         mutex_enter(&grp->lg_tx_flowctl_lock);
1730         if (grp->lg_tx_notify_thread != NULL) {
1731                 tid = grp->lg_tx_notify_thread->t_did;
1732                 grp->lg_tx_notify_done = B_TRUE;
1733                 cv_signal(&grp->lg_tx_flowctl_cv);
1734         }
1735         mutex_exit(&grp->lg_tx_flowctl_lock);
1736         if (tid != 0)
1737                 thread_join(tid);
1738 
1739         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1740 
1741         grp->lg_closing = B_TRUE;
1742         /* detach and free MAC ports associated with group */
1743         port = grp->lg_ports;
1744         while (port != NULL) {
1745                 cport = port->lp_next;
1746                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1747                 if (grp->lg_started)
1748                         aggr_port_stop(port);
1749                 (void) aggr_grp_detach_port(grp, port);
1750                 mac_perim_exit(pmph);
1751                 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1752                 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1753                 aggr_port_delete(port);
1754                 port = cport;
1755         }
1756 
1757         mac_perim_exit(mph);
1758 
1759         kmem_free(grp->lg_tx_blocked_rings,
1760             (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1761         /*
1762          * Wait for the port's lacp timer thread and its notification callback
1763          * to exit before calling mac_unregister() since both needs to access
1764          * the mac perimeter of the grp.
1765          */
1766         aggr_grp_port_wait(grp);
1767 
1768         VERIFY(mac_unregister(grp->lg_mh) == 0);
1769         grp->lg_mh = NULL;
1770 
1771         AGGR_GRP_REFRELE(grp);
1772         return (0);
1773 }
1774 
1775 void
1776 aggr_grp_free(aggr_grp_t *grp)
1777 {
1778         ASSERT(grp->lg_refs == 0);
1779         ASSERT(grp->lg_port_ref == 0);
1780         if (grp->lg_key > AGGR_MAX_KEY) {
1781                 id_free(key_ids, grp->lg_key);
1782                 grp->lg_key = 0;
1783         }
1784         kmem_cache_free(aggr_grp_cache, grp);
1785 }
1786 
1787 int
1788 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1789     aggr_grp_info_new_grp_fn_t new_grp_fn,
1790     aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1791 {
1792         aggr_grp_t      *grp;
1793         aggr_port_t     *port;
1794         mac_perim_handle_t mph, pmph;
1795         int             rc = 0;
1796 
1797         /*
1798          * Make sure that the aggregation link is visible from the caller's
1799          * zone.
1800          */
1801         if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
1802                 return (ENOENT);
1803 
1804         rw_enter(&aggr_grp_lock, RW_READER);
1805 
1806         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1807             (mod_hash_val_t *)&grp) != 0) {
1808                 rw_exit(&aggr_grp_lock);
1809                 return (ENOENT);
1810         }
1811         AGGR_GRP_REFHOLD(grp);
1812 
1813         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1814         rw_exit(&aggr_grp_lock);
1815 
1816         rc = new_grp_fn(fn_arg, grp->lg_linkid,
1817             (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
1818             grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
1819             grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1820 
1821         if (rc != 0)
1822                 goto bail;
1823 
1824         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1825                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1826                 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
1827                     port->lp_state, &port->lp_lacp.ActorOperPortState);
1828                 mac_perim_exit(pmph);
1829 
1830                 if (rc != 0)
1831                         goto bail;
1832         }
1833 
1834 bail:
1835         mac_perim_exit(mph);
1836         AGGR_GRP_REFRELE(grp);
1837         return (rc);
1838 }
1839 
1840 /*ARGSUSED*/
1841 static void
1842 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1843 {
1844         miocnak(q, mp, 0, ENOTSUP);
1845 }
1846 
1847 static int
1848 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
1849 {
1850         aggr_port_t     *port;
1851         uint_t          stat_index;
1852 
1853         /* We only aggregate counter statistics. */
1854         if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
1855             IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
1856                 return (ENOTSUP);
1857         }
1858 
1859         /*
1860          * Counter statistics for a group are computed by aggregating the
1861          * counters of the members MACs while they were aggregated, plus
1862          * the residual counter of the group itself, which is updated each
1863          * time a MAC is removed from the group.
1864          */
1865         *val = 0;
1866         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1867                 /* actual port statistic */
1868                 *val += aggr_port_stat(port, stat);
1869                 /*
1870                  * minus the port stat when it was added, plus any residual
1871                  * amount for the group.
1872                  */
1873                 if (IS_MAC_STAT(stat)) {
1874                         stat_index = stat - MAC_STAT_MIN;
1875                         *val -= port->lp_stat[stat_index];
1876                         *val += grp->lg_stat[stat_index];
1877                 } else if (IS_MACTYPE_STAT(stat)) {
1878                         stat_index = stat - MACTYPE_STAT_MIN;
1879                         *val -= port->lp_ether_stat[stat_index];
1880                         *val += grp->lg_ether_stat[stat_index];
1881                 }
1882         }
1883         return (0);
1884 }
1885 
1886 int
1887 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1888 {
1889         aggr_pseudo_rx_ring_t   *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
1890 
1891         if (rx_ring->arr_hw_rh != NULL) {
1892                 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
1893         } else {
1894                 aggr_port_t     *port = rx_ring->arr_port;
1895 
1896                 *val = mac_stat_get(port->lp_mh, stat);
1897 
1898         }
1899         return (0);
1900 }
1901 
1902 int
1903 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1904 {
1905         aggr_pseudo_tx_ring_t   *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
1906 
1907         if (tx_ring->atr_hw_rh != NULL) {
1908                 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
1909         } else {
1910                 aggr_port_t     *port = tx_ring->atr_port;
1911 
1912                 *val = mac_stat_get(port->lp_mh, stat);
1913         }
1914         return (0);
1915 }
1916 
1917 static int
1918 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
1919 {
1920         aggr_grp_t              *grp = arg;
1921         mac_perim_handle_t      mph;
1922         int                     rval = 0;
1923 
1924         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1925 
1926         switch (stat) {
1927         case MAC_STAT_IFSPEED:
1928                 *val = grp->lg_ifspeed;
1929                 break;
1930 
1931         case ETHER_STAT_LINK_DUPLEX:
1932                 *val = grp->lg_link_duplex;
1933                 break;
1934 
1935         default:
1936                 /*
1937                  * For all other statistics, we return the aggregated stat
1938                  * from the underlying ports.  aggr_grp_stat() will set
1939                  * rval appropriately if the statistic isn't a counter.
1940                  */
1941                 rval = aggr_grp_stat(grp, stat, val);
1942         }
1943 
1944         mac_perim_exit(mph);
1945         return (rval);
1946 }
1947 
1948 static int
1949 aggr_m_start(void *arg)
1950 {
1951         aggr_grp_t *grp = arg;
1952         aggr_port_t *port;
1953         mac_perim_handle_t mph, pmph;
1954 
1955         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1956 
1957         /*
1958          * Attempts to start all configured members of the group.
1959          * Group members will be attached when their link-up notification
1960          * is received.
1961          */
1962         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1963                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1964                 if (aggr_port_start(port) != 0) {
1965                         mac_perim_exit(pmph);
1966                         continue;
1967                 }
1968 
1969                 /*
1970                  * Turn on the promiscuous mode if it is required to receive
1971                  * the non-primary address over a port, or the promiscous
1972                  * mode is enabled over the aggr.
1973                  */
1974                 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1975                         if (aggr_port_promisc(port, B_TRUE) != 0)
1976                                 aggr_port_stop(port);
1977                 }
1978                 mac_perim_exit(pmph);
1979         }
1980 
1981         grp->lg_started = B_TRUE;
1982 
1983         mac_perim_exit(mph);
1984         return (0);
1985 }
1986 
1987 static void
1988 aggr_m_stop(void *arg)
1989 {
1990         aggr_grp_t *grp = arg;
1991         aggr_port_t *port;
1992         mac_perim_handle_t mph, pmph;
1993 
1994         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1995 
1996         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1997                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1998 
1999                 /* reset port promiscuous mode */
2000                 (void) aggr_port_promisc(port, B_FALSE);
2001 
2002                 aggr_port_stop(port);
2003                 mac_perim_exit(pmph);
2004         }
2005 
2006         grp->lg_started = B_FALSE;
2007         mac_perim_exit(mph);
2008 }
2009 
2010 static int
2011 aggr_m_promisc(void *arg, boolean_t on)
2012 {
2013         aggr_grp_t *grp = arg;
2014         aggr_port_t *port;
2015         boolean_t link_state_changed = B_FALSE;
2016         mac_perim_handle_t mph, pmph;
2017 
2018         AGGR_GRP_REFHOLD(grp);
2019         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2020 
2021         ASSERT(!grp->lg_closing);
2022 
2023         if (on == grp->lg_promisc)
2024                 goto bail;
2025 
2026         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2027                 int     err = 0;
2028 
2029                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2030                 AGGR_PORT_REFHOLD(port);
2031                 if (!on && (port->lp_prom_addr == NULL))
2032                         err = aggr_port_promisc(port, B_FALSE);
2033                 else if (on && port->lp_started)
2034                         err = aggr_port_promisc(port, B_TRUE);
2035 
2036                 if (err != 0) {
2037                         if (aggr_grp_detach_port(grp, port))
2038                                 link_state_changed = B_TRUE;
2039                 } else {
2040                         /*
2041                          * If a port was detached because of a previous
2042                          * failure changing the promiscuity, the port
2043                          * is reattached when it successfully changes
2044                          * the promiscuity now, and this might cause
2045                          * the link state of the aggregation to change.
2046                          */
2047                         if (aggr_grp_attach_port(grp, port))
2048                                 link_state_changed = B_TRUE;
2049                 }
2050                 mac_perim_exit(pmph);
2051                 AGGR_PORT_REFRELE(port);
2052         }
2053 
2054         grp->lg_promisc = on;
2055 
2056         if (link_state_changed)
2057                 mac_link_update(grp->lg_mh, grp->lg_link_state);
2058 
2059 bail:
2060         mac_perim_exit(mph);
2061         AGGR_GRP_REFRELE(grp);
2062 
2063         return (0);
2064 }
2065 
2066 static void
2067 aggr_grp_port_rename(const char *new_name, void *arg)
2068 {
2069         /*
2070          * aggr port's mac client name is the format of "aggr link name" plus
2071          * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2072          */
2073         int aggr_len, link_len, clnt_name_len, i;
2074         char *str_end, *str_st, *str_del;
2075         char aggr_name[MAXNAMELEN];
2076         char link_name[MAXNAMELEN];
2077         char *clnt_name;
2078         aggr_grp_t *aggr_grp = arg;
2079         aggr_port_t *aggr_port = aggr_grp->lg_ports;
2080 
2081         for (i = 0; i < aggr_grp->lg_nports; i++) {
2082                 clnt_name = mac_client_name(aggr_port->lp_mch);
2083                 clnt_name_len = strlen(clnt_name);
2084                 str_st = clnt_name;
2085                 str_end = &(clnt_name[clnt_name_len]);
2086                 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2087                 ASSERT(str_del != NULL);
2088                 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2089                 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2090                 bzero(aggr_name, MAXNAMELEN);
2091                 bzero(link_name, MAXNAMELEN);
2092                 bcopy(clnt_name, aggr_name, aggr_len);
2093                 bcopy(str_del, link_name, link_len + 1);
2094                 bzero(clnt_name, MAXNAMELEN);
2095                 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2096                     link_name);
2097 
2098                 (void) mac_rename_primary(aggr_port->lp_mh, NULL);
2099                 aggr_port = aggr_port->lp_next;
2100         }
2101 }
2102 
2103 /*
2104  * Initialize the capabilities that are advertised for the group
2105  * according to the capabilities of the constituent ports.
2106  */
2107 static boolean_t
2108 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2109 {
2110         aggr_grp_t *grp = arg;
2111 
2112         switch (cap) {
2113         case MAC_CAPAB_HCKSUM: {
2114                 uint32_t *hcksum_txflags = cap_data;
2115                 *hcksum_txflags = grp->lg_hcksum_txflags;
2116                 break;
2117         }
2118         case MAC_CAPAB_LSO: {
2119                 mac_capab_lso_t *cap_lso = cap_data;
2120 
2121                 if (grp->lg_lso) {
2122                         *cap_lso = grp->lg_cap_lso;
2123                         break;
2124                 } else {
2125                         return (B_FALSE);
2126                 }
2127         }
2128         case MAC_CAPAB_NO_NATIVEVLAN:
2129                 return (!grp->lg_vlan);
2130         case MAC_CAPAB_NO_ZCOPY:
2131                 return (!grp->lg_zcopy);
2132         case MAC_CAPAB_RINGS: {
2133                 mac_capab_rings_t *cap_rings = cap_data;
2134 
2135                 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2136                         cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2137                         cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2138 
2139                         /*
2140                          * An aggregation advertises only one (pseudo) RX
2141                          * group, which virtualizes the main/primary group of
2142                          * the underlying devices.
2143                          */
2144                         cap_rings->mr_gnum = 1;
2145                         cap_rings->mr_gaddring = NULL;
2146                         cap_rings->mr_gremring = NULL;
2147                 } else {
2148                         cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2149                         cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2150                         cap_rings->mr_gnum = 0;
2151                 }
2152                 cap_rings->mr_rget = aggr_fill_ring;
2153                 cap_rings->mr_gget = aggr_fill_group;
2154                 break;
2155         }
2156         case MAC_CAPAB_AGGR:
2157         {
2158                 mac_capab_aggr_t *aggr_cap;
2159 
2160                 if (cap_data != NULL) {
2161                         aggr_cap = cap_data;
2162                         aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2163                         aggr_cap->mca_unicst = aggr_m_unicst;
2164                         aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2165                         aggr_cap->mca_arg = arg;
2166                 }
2167                 return (B_TRUE);
2168         }
2169         default:
2170                 return (B_FALSE);
2171         }
2172         return (B_TRUE);
2173 }
2174 
2175 /*
2176  * Callback funtion for MAC layer to register groups.
2177  */
2178 static void
2179 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2180     mac_group_info_t *infop, mac_group_handle_t gh)
2181 {
2182         aggr_grp_t *grp = arg;
2183         aggr_pseudo_rx_group_t *rx_group;
2184         aggr_pseudo_tx_group_t *tx_group;
2185 
2186         ASSERT(index == 0);
2187         if (rtype == MAC_RING_TYPE_RX) {
2188                 rx_group = &grp->lg_rx_group;
2189                 rx_group->arg_gh = gh;
2190                 rx_group->arg_grp = grp;
2191 
2192                 infop->mgi_driver = (mac_group_driver_t)rx_group;
2193                 infop->mgi_start = NULL;
2194                 infop->mgi_stop = NULL;
2195                 infop->mgi_addmac = aggr_addmac;
2196                 infop->mgi_remmac = aggr_remmac;
2197                 infop->mgi_count = rx_group->arg_ring_cnt;
2198         } else {
2199                 tx_group = &grp->lg_tx_group;
2200                 tx_group->atg_gh = gh;
2201         }
2202 }
2203 
2204 /*
2205  * Callback funtion for MAC layer to register all rings.
2206  */
2207 static void
2208 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2209     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2210 {
2211         aggr_grp_t      *grp = arg;
2212 
2213         switch (rtype) {
2214         case MAC_RING_TYPE_RX: {
2215                 aggr_pseudo_rx_group_t  *rx_group = &grp->lg_rx_group;
2216                 aggr_pseudo_rx_ring_t   *rx_ring;
2217                 mac_intr_t              aggr_mac_intr;
2218 
2219                 ASSERT(rg_index == 0);
2220 
2221                 ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2222                 rx_ring = rx_group->arg_rings + index;
2223                 rx_ring->arr_rh = rh;
2224 
2225                 /*
2226                  * Entrypoint to enable interrupt (disable poll) and
2227                  * disable interrupt (enable poll).
2228                  */
2229                 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2230                 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2231                 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2232                 aggr_mac_intr.mi_ddi_handle = NULL;
2233 
2234                 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2235                 infop->mri_start = aggr_pseudo_start_ring;
2236                 infop->mri_stop = aggr_pseudo_stop_ring;
2237 
2238                 infop->mri_intr = aggr_mac_intr;
2239                 infop->mri_poll = aggr_rx_poll;
2240 
2241                 infop->mri_stat = aggr_rx_ring_stat;
2242                 break;
2243         }
2244         case MAC_RING_TYPE_TX: {
2245                 aggr_pseudo_tx_group_t  *tx_group = &grp->lg_tx_group;
2246                 aggr_pseudo_tx_ring_t   *tx_ring;
2247 
2248                 ASSERT(rg_index == -1);
2249                 ASSERT(index < tx_group->atg_ring_cnt);
2250 
2251                 tx_ring = &tx_group->atg_rings[index];
2252                 tx_ring->atr_rh = rh;
2253 
2254                 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2255                 infop->mri_start = NULL;
2256                 infop->mri_stop = NULL;
2257                 infop->mri_tx = aggr_ring_tx;
2258                 infop->mri_stat = aggr_tx_ring_stat;
2259                 /*
2260                  * Use the hw TX ring handle to find if the ring needs
2261                  * serialization or not. For NICs that do not expose
2262                  * Tx rings, atr_hw_rh will be NULL.
2263                  */
2264                 if (tx_ring->atr_hw_rh != NULL) {
2265                         infop->mri_flags =
2266                             mac_hwring_getinfo(tx_ring->atr_hw_rh);
2267                 }
2268                 break;
2269         }
2270         default:
2271                 break;
2272         }
2273 }
2274 
2275 static mblk_t *
2276 aggr_rx_poll(void *arg, int bytes_to_pickup)
2277 {
2278         aggr_pseudo_rx_ring_t *rr_ring = arg;
2279         aggr_port_t *port = rr_ring->arr_port;
2280         aggr_grp_t *grp = port->lp_grp;
2281         mblk_t *mp_chain, *mp, **mpp;
2282 
2283         mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2284 
2285         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2286                 return (mp_chain);
2287 
2288         mpp = &mp_chain;
2289         while ((mp = *mpp) != NULL) {
2290                 if (MBLKL(mp) >= sizeof (struct ether_header)) {
2291                         struct ether_header *ehp;
2292 
2293                         ehp = (struct ether_header *)mp->b_rptr;
2294                         if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2295                                 *mpp = mp->b_next;
2296                                 mp->b_next = NULL;
2297                                 aggr_recv_lacp(port,
2298                                     (mac_resource_handle_t)rr_ring, mp);
2299                                 continue;
2300                         }
2301                 }
2302 
2303                 if (!port->lp_collector_enabled) {
2304                         *mpp = mp->b_next;
2305                         mp->b_next = NULL;
2306                         freemsg(mp);
2307                         continue;
2308                 }
2309                 mpp = &mp->b_next;
2310         }
2311         return (mp_chain);
2312 }
2313 
2314 static int
2315 aggr_addmac(void *arg, const uint8_t *mac_addr)
2316 {
2317         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)arg;
2318         aggr_unicst_addr_t      *addr, **pprev;
2319         aggr_grp_t              *grp = rx_group->arg_grp;
2320         aggr_port_t             *port, *p;
2321         mac_perim_handle_t      mph;
2322         int                     err = 0;
2323 
2324         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2325 
2326         if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2327                 mac_perim_exit(mph);
2328                 return (0);
2329         }
2330 
2331         /*
2332          * Insert this mac address into the list of mac addresses owned by
2333          * the aggregation pseudo group.
2334          */
2335         pprev = &rx_group->arg_macaddr;
2336         while ((addr = *pprev) != NULL) {
2337                 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2338                         mac_perim_exit(mph);
2339                         return (EEXIST);
2340                 }
2341                 pprev = &addr->aua_next;
2342         }
2343         addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2344         bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2345         addr->aua_next = NULL;
2346         *pprev = addr;
2347 
2348         for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2349                 if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2350                         break;
2351 
2352         if (err != 0) {
2353                 for (p = grp->lg_ports; p != port; p = p->lp_next)
2354                         aggr_port_remmac(p, mac_addr);
2355 
2356                 *pprev = NULL;
2357                 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2358         }
2359 
2360         mac_perim_exit(mph);
2361         return (err);
2362 }
2363 
2364 static int
2365 aggr_remmac(void *arg, const uint8_t *mac_addr)
2366 {
2367         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)arg;
2368         aggr_unicst_addr_t      *addr, **pprev;
2369         aggr_grp_t              *grp = rx_group->arg_grp;
2370         aggr_port_t             *port;
2371         mac_perim_handle_t      mph;
2372         int                     err = 0;
2373 
2374         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2375 
2376         if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2377                 mac_perim_exit(mph);
2378                 return (0);
2379         }
2380 
2381         /*
2382          * Insert this mac address into the list of mac addresses owned by
2383          * the aggregation pseudo group.
2384          */
2385         pprev = &rx_group->arg_macaddr;
2386         while ((addr = *pprev) != NULL) {
2387                 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2388                         pprev = &addr->aua_next;
2389                         continue;
2390                 }
2391                 break;
2392         }
2393         if (addr == NULL) {
2394                 mac_perim_exit(mph);
2395                 return (EINVAL);
2396         }
2397 
2398         for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2399                 aggr_port_remmac(port, mac_addr);
2400 
2401         *pprev = addr->aua_next;
2402         kmem_free(addr, sizeof (aggr_unicst_addr_t));
2403 
2404         mac_perim_exit(mph);
2405         return (err);
2406 }
2407 
2408 /*
2409  * Add or remove the multicast addresses that are defined for the group
2410  * to or from the specified port.
2411  *
2412  * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2413  * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2414  * called when the port is either stopped or detached.
2415  */
2416 void
2417 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2418 {
2419         aggr_grp_t *grp = port->lp_grp;
2420 
2421         ASSERT(MAC_PERIM_HELD(port->lp_mh));
2422         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2423 
2424         if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2425                 return;
2426 
2427         mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2428 }
2429 
2430 static int
2431 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2432 {
2433         aggr_grp_t *grp = arg;
2434         aggr_port_t *port = NULL, *errport = NULL;
2435         mac_perim_handle_t mph;
2436         int err = 0;
2437 
2438         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2439         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2440                 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2441                     !port->lp_started) {
2442                         continue;
2443                 }
2444                 err = aggr_port_multicst(port, add, addrp);
2445                 if (err != 0) {
2446                         errport = port;
2447                         break;
2448                 }
2449         }
2450 
2451         /*
2452          * At least one port caused error return and this error is returned to
2453          * mac, eventually a NAK would be sent upwards.
2454          * Some ports have this multicast address listed now, and some don't.
2455          * Treat this error as a whole aggr failure not individual port failure.
2456          * Therefore remove this multicast address from other ports.
2457          */
2458         if ((err != 0) && add) {
2459                 for (port = grp->lg_ports; port != errport;
2460                     port = port->lp_next) {
2461                         if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2462                             !port->lp_started) {
2463                                 continue;
2464                         }
2465                         (void) aggr_port_multicst(port, B_FALSE, addrp);
2466                 }
2467         }
2468         mac_perim_exit(mph);
2469         return (err);
2470 }
2471 
2472 static int
2473 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2474 {
2475         aggr_grp_t *grp = arg;
2476         mac_perim_handle_t mph;
2477         int err;
2478 
2479         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2480         err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2481             0, 0);
2482         mac_perim_exit(mph);
2483         return (err);
2484 }
2485 
2486 /*
2487  * Initialize the capabilities that are advertised for the group
2488  * according to the capabilities of the constituent ports.
2489  */
2490 static void
2491 aggr_grp_capab_set(aggr_grp_t *grp)
2492 {
2493         uint32_t cksum;
2494         aggr_port_t *port;
2495         mac_capab_lso_t cap_lso;
2496 
2497         ASSERT(grp->lg_mh == NULL);
2498         ASSERT(grp->lg_ports != NULL);
2499 
2500         grp->lg_hcksum_txflags = (uint32_t)-1;
2501         grp->lg_zcopy = B_TRUE;
2502         grp->lg_vlan = B_TRUE;
2503 
2504         grp->lg_lso = B_TRUE;
2505         grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2506         grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2507 
2508         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2509                 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2510                         cksum = 0;
2511                 grp->lg_hcksum_txflags &= cksum;
2512 
2513                 grp->lg_vlan &=
2514                     !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2515 
2516                 grp->lg_zcopy &=
2517                     !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2518 
2519                 grp->lg_lso &=
2520                     mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2521                 if (grp->lg_lso) {
2522                         grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2523                         if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2524                             cap_lso.lso_basic_tcp_ipv4.lso_max)
2525                                 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2526                                     cap_lso.lso_basic_tcp_ipv4.lso_max;
2527                 }
2528         }
2529 }
2530 
2531 /*
2532  * Checks whether the capabilities of the port being added are compatible
2533  * with the current capabilities of the aggregation.
2534  */
2535 static boolean_t
2536 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2537 {
2538         uint32_t hcksum_txflags;
2539 
2540         ASSERT(grp->lg_ports != NULL);
2541 
2542         if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2543             grp->lg_vlan) != grp->lg_vlan) {
2544                 return (B_FALSE);
2545         }
2546 
2547         if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2548             grp->lg_zcopy) != grp->lg_zcopy) {
2549                 return (B_FALSE);
2550         }
2551 
2552         if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2553                 if (grp->lg_hcksum_txflags != 0)
2554                         return (B_FALSE);
2555         } else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2556             grp->lg_hcksum_txflags) {
2557                 return (B_FALSE);
2558         }
2559 
2560         if (grp->lg_lso) {
2561                 mac_capab_lso_t cap_lso;
2562 
2563                 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2564                         if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2565                             grp->lg_cap_lso.lso_flags)
2566                                 return (B_FALSE);
2567                         if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2568                             cap_lso.lso_basic_tcp_ipv4.lso_max)
2569                                 return (B_FALSE);
2570                 } else {
2571                         return (B_FALSE);
2572                 }
2573         }
2574 
2575         return (B_TRUE);
2576 }
2577 
2578 /*
2579  * Returns the maximum SDU according to the SDU of the constituent ports.
2580  */
2581 static uint_t
2582 aggr_grp_max_sdu(aggr_grp_t *grp)
2583 {
2584         uint_t max_sdu = (uint_t)-1;
2585         aggr_port_t *port;
2586 
2587         ASSERT(grp->lg_ports != NULL);
2588 
2589         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2590                 uint_t port_sdu_max;
2591 
2592                 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2593                 if (max_sdu > port_sdu_max)
2594                         max_sdu = port_sdu_max;
2595         }
2596 
2597         return (max_sdu);
2598 }
2599 
2600 /*
2601  * Checks if the maximum SDU of the specified port is compatible
2602  * with the maximum SDU of the specified aggregation group, returns
2603  * B_TRUE if it is, B_FALSE otherwise.
2604  */
2605 static boolean_t
2606 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2607 {
2608         uint_t port_sdu_max;
2609 
2610         mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2611         return (port_sdu_max >= grp->lg_max_sdu);
2612 }
2613 
2614 /*
2615  * Returns the maximum margin according to the margin of the constituent ports.
2616  */
2617 static uint32_t
2618 aggr_grp_max_margin(aggr_grp_t *grp)
2619 {
2620         uint32_t margin = UINT32_MAX;
2621         aggr_port_t *port;
2622 
2623         ASSERT(grp->lg_mh == NULL);
2624         ASSERT(grp->lg_ports != NULL);
2625 
2626         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2627                 if (margin > port->lp_margin)
2628                         margin = port->lp_margin;
2629         }
2630 
2631         grp->lg_margin = margin;
2632         return (margin);
2633 }
2634 
2635 /*
2636  * Checks if the maximum margin of the specified port is compatible
2637  * with the maximum margin of the specified aggregation group, returns
2638  * B_TRUE if it is, B_FALSE otherwise.
2639  */
2640 static boolean_t
2641 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
2642 {
2643         if (port->lp_margin >= grp->lg_margin)
2644                 return (B_TRUE);
2645 
2646         /*
2647          * See whether the current margin value is allowed to be changed to
2648          * the new value.
2649          */
2650         if (!mac_margin_update(grp->lg_mh, port->lp_margin))
2651                 return (B_FALSE);
2652 
2653         grp->lg_margin = port->lp_margin;
2654         return (B_TRUE);
2655 }
2656 
/*
 * Set MTU on individual ports of an aggregation group
 *
 * grp      - aggregation group the port belongs to
 * port     - port whose MTU is changed
 * sdu      - new MTU, handed to mac_set_mtu()
 * old_mtu  - passed through to mac_set_mtu(); callers in this file pass
 *            either a slot for the previous MTU or NULL
 *
 * If the port currently has a unicast address handle (lp_mah), the handle
 * is removed before the MTU change and added back afterwards, whether or
 * not the MTU change succeeded. If the address cannot be added back, the
 * port is detached from the group and a warning is logged.
 *
 * Returns the result of mac_set_mtu(). A failure to re-add the unicast
 * address is not reflected in the return value.
 */
static int
aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
    uint32_t *old_mtu)
{
        boolean_t               removed = B_FALSE;
        mac_perim_handle_t      mph;
        mac_diag_t              diag;
        int                     err, rv, retry = 0;

        /* Drop the unicast address, if any, for the duration of the change */
        if (port->lp_mah != NULL) {
                (void) mac_unicast_remove(port->lp_mch, port->lp_mah);
                port->lp_mah = NULL;
                removed = B_TRUE;
        }
        err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
try_again:
        /* Only re-add the address if it was removed above */
        if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
            MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
            &port->lp_mah, 0, &diag)) != 0) {
                /*
                 * following is a workaround for a bug in 'bge' driver.
                 * See CR 6794654 for more information and this work around
                 * will be removed once the CR is fixed.
                 *
                 * On EIO the add is retried up to 3 times, waiting
                 * 2 * hz ticks before each retry.
                 */
                if (rv == EIO && retry++ < 3) {
                        delay(2 * hz);
                        goto try_again;
                }
                /*
                 * if mac_unicast_add() failed while setting the MTU,
                 * detach the port from the group.
                 */
                mac_perim_enter_by_mh(port->lp_mh, &mph);
                (void) aggr_grp_detach_port(grp, port);
                mac_perim_exit(mph);
                cmn_err(CE_WARN, "Unable to restart the port %s while "
                    "setting MTU. Detaching the port from the aggregation.",
                    mac_client_name(port->lp_mch));
        }
        /* Result of mac_set_mtu(), not of the unicast re-add */
        return (err);
}
2701 
2702 static int
2703 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
2704 {
2705         int                     err = 0, i, rv;
2706         aggr_port_t             *port;
2707         uint32_t                *mtu;
2708 
2709         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2710 
2711         /*
2712          * If the MTU being set is equal to aggr group's maximum
2713          * allowable value, then there is nothing to change
2714          */
2715         if (sdu == grp->lg_max_sdu)
2716                 return (0);
2717 
2718         /* 0 is aggr group's min sdu */
2719         if (sdu == 0)
2720                 return (EINVAL);
2721 
2722         mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
2723         for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
2724             port = port->lp_next, i++) {
2725                 err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
2726         }
2727         if (err != 0) {
2728                 /* recover from error: reset the mtus of the ports */
2729                 aggr_port_t *tmp;
2730 
2731                 for (tmp = grp->lg_ports, i = 0; tmp != port;
2732                     tmp = tmp->lp_next, i++) {
2733                         (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
2734                 }
2735                 goto bail;
2736         }
2737         grp->lg_max_sdu = aggr_grp_max_sdu(grp);
2738         rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
2739         ASSERT(rv == 0);
2740 bail:
2741         kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
2742         return (err);
2743 }
2744 
2745 /*
2746  * Callback functions for set/get of properties
2747  */
2748 /*ARGSUSED*/
2749 static int
2750 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2751     uint_t pr_valsize, const void *pr_val)
2752 {
2753         int             err = ENOTSUP;
2754         aggr_grp_t      *grp = m_driver;
2755 
2756         switch (pr_num) {
2757         case MAC_PROP_MTU: {
2758                 uint32_t        mtu;
2759 
2760                 if (pr_valsize < sizeof (mtu)) {
2761                         err = EINVAL;
2762                         break;
2763                 }
2764                 bcopy(pr_val, &mtu, sizeof (mtu));
2765                 err = aggr_sdu_update(grp, mtu);
2766                 break;
2767         }
2768         default:
2769                 break;
2770         }
2771         return (err);
2772 }
2773 
/*
 * One boundary (lower or upper end) of an MTU range, as used by
 * aggr_mtu_range_intersection() in its temporary array.
 */
typedef struct rboundary {
        uint32_t        bval;   /* boundary value (a range's min or max) */
        int             btype;  /* +1 for a min boundary, -1 for a max */
} rboundary_t;
2778 
2779 /*
2780  * This function finds the intersection of mtu ranges stored in arrays -
2781  * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
2782  * Individual arrays are assumed to contain non-overlapping ranges.
2783  * Algorithm:
2784  *   A range has two boundaries - min and max. We scan all arrays and store
2785  * each boundary as a separate element in a temporary array. We also store
2786  * the boundary types, min or max, as +1 or -1 respectively in the temporary
2787  * array. Then we sort the temporary array in ascending order. We scan the
2788  * sorted array from lower to higher values and keep a cumulative sum of
2789  * boundary types. Element in the temporary array for which the sum reaches
2790  * mcount is a min boundary of a range in the result and next element will be
2791  * max boundary.
2792  *
2793  * Example for mcount = 3,
2794  *
2795  *  ----|_________|-------|_______|----|__|------ mrange[0]
2796  *
2797  *  -------|________|--|____________|-----|___|-- mrange[1]
2798  *
2799  *  --------|________________|-------|____|------ mrange[2]
2800  *
2801  *                                      3 2 1
2802  *                                       \|/
2803  *      1  23     2 1  2  3  2    1 01 2  V   0  <- the sum
2804  *  ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
2805  *
2806  *                                 same min and max
2807  *                                        V
2808  *  --------|_____|-------|__|------------|------ intersecting ranges
2809  */
2810 void
2811 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
2812     mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
2813 {
2814         mac_propval_uint32_range_t      *rval, *ur;
2815         int                             rmaxcnt, rcount;
2816         size_t                          sz_range32;
2817         rboundary_t                     *ta; /* temporary array */
2818         rboundary_t                     temp;
2819         boolean_t                       range_started = B_FALSE;
2820         int                             i, j, m, sum;
2821 
2822         sz_range32 = sizeof (mac_propval_uint32_range_t);
2823 
2824         for (i = 0, rmaxcnt = 0; i < mcount; i++)
2825                 rmaxcnt += mrange[i]->mpr_count;
2826 
2827         /* Allocate enough space to store the results */
2828         rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
2829 
2830         /* Number of boundaries are twice as many as ranges */
2831         ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
2832 
2833         for (i = 0, m = 0; i < mcount; i++) {
2834                 ur = &(mrange[i]->mpr_range_uint32[0]);
2835                 for (j = 0; j < mrange[i]->mpr_count; j++) {
2836                         ta[m].bval = ur[j].mpur_min;
2837                         ta[m++].btype = 1;
2838                         ta[m].bval = ur[j].mpur_max;
2839                         ta[m++].btype = -1;
2840                 }
2841         }
2842 
2843         /*
2844          * Sort the temporary array in ascending order of bval;
2845          * if boundary values are same then sort on btype.
2846          */
2847         for (i = 0; i < m-1; i++) {
2848                 for (j = i+1; j < m; j++) {
2849                         if ((ta[i].bval > ta[j].bval) ||
2850                             ((ta[i].bval == ta[j].bval) &&
2851                             (ta[i].btype < ta[j].btype))) {
2852                                 temp = ta[i];
2853                                 ta[i] = ta[j];
2854                                 ta[j] = temp;
2855                         }
2856                 }
2857         }
2858 
2859         /* Walk through temporary array to find all ranges in the results */
2860         for (i = 0, sum = 0, rcount = 0; i < m; i++) {
2861                 sum += ta[i].btype;
2862                 if (sum == mcount) {
2863                         rval[rcount].mpur_min = ta[i].bval;
2864                         range_started = B_TRUE;
2865                 } else if (sum < mcount && range_started) {
2866                         rval[rcount++].mpur_max = ta[i].bval;
2867                         range_started = B_FALSE;
2868                 }
2869         }
2870 
2871         *prval = rval;
2872         *prmaxcnt = rmaxcnt;
2873         *prcount = rcount;
2874 
2875         kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
2876 }
2877 
2878 /*
2879  * Returns the mtu ranges which could be supported by aggr group.
2880  * prmaxcnt returns the size of the buffer prval, prcount returns
2881  * the number of valid entries in prval. Caller is responsible
2882  * for freeing up prval.
2883  */
2884 int
2885 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
2886     int *prmaxcnt, int *prcount)
2887 {
2888         mac_propval_range_t             **vals;
2889         aggr_port_t                     *port;
2890         mac_perim_handle_t              mph;
2891         uint_t                          i, numr;
2892         int                             err = 0;
2893         size_t                          sz_propval, sz_range32;
2894         size_t                          size;
2895 
2896         sz_propval = sizeof (mac_propval_range_t);
2897         sz_range32 = sizeof (mac_propval_uint32_range_t);
2898 
2899         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2900 
2901         vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
2902             KM_SLEEP);
2903 
2904         for (port = grp->lg_ports, i = 0; port != NULL;
2905             port = port->lp_next, i++) {
2906 
2907                 size = sz_propval;
2908                 vals[i] = kmem_alloc(size, KM_SLEEP);
2909                 vals[i]->mpr_count = 1;
2910 
2911                 mac_perim_enter_by_mh(port->lp_mh, &mph);
2912 
2913                 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2914                     NULL, 0, vals[i], NULL);
2915                 if (err == ENOSPC) {
2916                         /*
2917                          * Not enough space to hold all ranges.
2918                          * Allocate extra space as indicated and retry.
2919                          */
2920                         numr = vals[i]->mpr_count;
2921                         kmem_free(vals[i], sz_propval);
2922                         size = sz_propval + (numr - 1) * sz_range32;
2923                         vals[i] = kmem_alloc(size, KM_SLEEP);
2924                         vals[i]->mpr_count = numr;
2925                         err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2926                             NULL, 0, vals[i], NULL);
2927                         ASSERT(err != ENOSPC);
2928                 }
2929                 mac_perim_exit(mph);
2930                 if (err != 0) {
2931                         kmem_free(vals[i], size);
2932                         vals[i] = NULL;
2933                         break;
2934                 }
2935         }
2936 
2937         /*
2938          * if any of the underlying ports does not support changing MTU then
2939          * just return ENOTSUP
2940          */
2941         if (port != NULL) {
2942                 ASSERT(err != 0);
2943                 goto done;
2944         }
2945 
2946         aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
2947             prcount);
2948 
2949 done:
2950         for (i = 0; i < grp->lg_nports; i++) {
2951                 if (vals[i] != NULL) {
2952                         numr = vals[i]->mpr_count;
2953                         size = sz_propval + (numr - 1) * sz_range32;
2954                         kmem_free(vals[i], size);
2955                 }
2956         }
2957 
2958         kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
2959         return (err);
2960 }
2961 
2962 static void
2963 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2964     mac_prop_info_handle_t prh)
2965 {
2966         aggr_grp_t                      *grp = m_driver;
2967         mac_propval_uint32_range_t      *rval = NULL;
2968         int                             i, rcount, rmaxcnt;
2969         int                             err = 0;
2970 
2971         _NOTE(ARGUNUSED(pr_name));
2972 
2973         switch (pr_num) {
2974         case MAC_PROP_MTU:
2975 
2976                 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
2977                     &rcount);
2978                 if (err != 0) {
2979                         ASSERT(rval == NULL);
2980                         return;
2981                 }
2982                 for (i = 0; i < rcount; i++) {
2983                         mac_prop_info_set_range_uint32(prh,
2984                             rval[i].mpur_min, rval[i].mpur_max);
2985                 }
2986                 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
2987                 break;
2988         }
2989 }