Print this page
6274 MAC tries to use aggr rings from downed links
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/io/aggr/aggr_grp.c
+++ new/usr/src/uts/common/io/aggr/aggr_grp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
|
↓ open down ↓ |
12 lines elided |
↑ open up ↑ |
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2015 Joyent, Inc.
23 24 */
24 25
25 26 /*
26 27 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
27 28 *
28 29 * An instance of the structure aggr_grp_t is allocated for each
29 30 * link aggregation group. When created, aggr_grp_t objects are
30 31 * entered into the aggr_grp_hash hash table maintained by the modhash
31 32 * module. The hash key is the linkid associated with the link
32 33 * aggregation group.
33 34 *
34 35 * A set of MAC ports is associated with each aggregation group.
35 36 *
36 37 * Aggr pseudo TX rings
37 38 * --------------------
38 39 * The underlying ports (NICs) in an aggregation can have TX rings. To
39 40 * enhance aggr's performance, these TX rings are made available to the
39 40 * aggr layer as pseudo TX rings. The concept of pseudo rings is not new:
40 41 * they are already implemented on the RX side, where they are called
41 42 * pseudo RX rings. The same concept is extended to the TX side, where
43 44 * each TX ring of an underlying port is reflected in aggr as a pseudo
44 45 * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
45 46 * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
46 47 * TX ring is given to the aggregation layer.
47 48 *
48 49 * With this change, the outgoing stack depth looks much better:
49 50 *
50 51 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
50 51 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
52 53 *
53 54 * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
54 55 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
55 56 *
56 57 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
57 58 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
58 59 * ring belonging to a port on which the packet has to be sent.
59 60 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
60 61 * policy and then uses the fanout_hint passed to it to pick a TX ring from
61 62 * the selected port.
62 63 *
63 64 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
64 65 * bandwidth limit is applied first on the outgoing packet and the packets
65 66 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
66 67 * particular TX ring.
67 68 */
68 69
69 70 #include <sys/types.h>
70 71 #include <sys/sysmacros.h>
71 72 #include <sys/conf.h>
72 73 #include <sys/cmn_err.h>
73 74 #include <sys/disp.h>
74 75 #include <sys/list.h>
75 76 #include <sys/ksynch.h>
76 77 #include <sys/kmem.h>
77 78 #include <sys/stream.h>
78 79 #include <sys/modctl.h>
79 80 #include <sys/ddi.h>
80 81 #include <sys/sunddi.h>
81 82 #include <sys/atomic.h>
82 83 #include <sys/stat.h>
83 84 #include <sys/modhash.h>
84 85 #include <sys/id_space.h>
85 86 #include <sys/strsun.h>
86 87 #include <sys/cred.h>
87 88 #include <sys/dlpi.h>
88 89 #include <sys/zone.h>
89 90 #include <sys/mac_provider.h>
90 91 #include <sys/dls.h>
91 92 #include <sys/vlan.h>
92 93 #include <sys/aggr.h>
93 94 #include <sys/aggr_impl.h>
94 95
95 96 static int aggr_m_start(void *);
96 97 static void aggr_m_stop(void *);
97 98 static int aggr_m_promisc(void *, boolean_t);
98 99 static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
99 100 static int aggr_m_unicst(void *, const uint8_t *);
100 101 static int aggr_m_stat(void *, uint_t, uint64_t *);
101 102 static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
102 103 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
103 104 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
104 105 const void *);
105 106 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
106 107 mac_prop_info_handle_t);
107 108
108 109 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
109 110 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
110 111 boolean_t *);
111 112
112 113 static void aggr_grp_capab_set(aggr_grp_t *);
113 114 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
114 115 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
115 116 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
116 117 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
117 118 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
118 119
119 120 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
120 121 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
121 122 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
122 123 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
123 124 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
124 125 static void aggr_pseudo_stop_ring(mac_ring_driver_t);
125 126 static int aggr_addmac(void *, const uint8_t *);
126 127 static int aggr_remmac(void *, const uint8_t *);
127 128 static mblk_t *aggr_rx_poll(void *, int);
128 129 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
129 130 const int, mac_ring_info_t *, mac_ring_handle_t);
130 131 static void aggr_fill_group(void *, mac_ring_type_t, const int,
131 132 mac_group_info_t *, mac_group_handle_t);
132 133
133 134 static kmem_cache_t *aggr_grp_cache;
134 135 static mod_hash_t *aggr_grp_hash;
135 136 static krwlock_t aggr_grp_lock;
136 137 static uint_t aggr_grp_cnt;
137 138 static id_space_t *key_ids;
138 139
139 140 #define GRP_HASHSZ 64
140 141 #define GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid)
141 142 #define AGGR_PORT_NAME_DELIMIT '-'
142 143
143 144 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
144 145
145 146 #define AGGR_M_CALLBACK_FLAGS \
146 147 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
147 148
148 149 static mac_callbacks_t aggr_m_callbacks = {
149 150 AGGR_M_CALLBACK_FLAGS,
150 151 aggr_m_stat,
151 152 aggr_m_start,
152 153 aggr_m_stop,
153 154 aggr_m_promisc,
154 155 aggr_m_multicst,
155 156 NULL,
156 157 NULL,
157 158 NULL,
158 159 aggr_m_ioctl,
159 160 aggr_m_capab_get,
160 161 NULL,
161 162 NULL,
162 163 aggr_m_setprop,
163 164 NULL,
164 165 aggr_m_propinfo
165 166 };
166 167
167 168 /*ARGSUSED*/
168 169 static int
169 170 aggr_grp_constructor(void *buf, void *arg, int kmflag)
170 171 {
171 172 aggr_grp_t *grp = buf;
172 173
173 174 bzero(grp, sizeof (*grp));
174 175 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
175 176 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
176 177 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
177 178 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
178 179 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
179 180 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
180 181 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
181 182 grp->lg_link_state = LINK_STATE_UNKNOWN;
182 183 return (0);
183 184 }
184 185
185 186 /*ARGSUSED*/
186 187 static void
187 188 aggr_grp_destructor(void *buf, void *arg)
188 189 {
189 190 aggr_grp_t *grp = buf;
190 191
191 192 if (grp->lg_tx_ports != NULL) {
192 193 kmem_free(grp->lg_tx_ports,
193 194 grp->lg_tx_ports_size * sizeof (aggr_port_t *));
194 195 }
195 196
196 197 mutex_destroy(&grp->lg_lacp_lock);
197 198 cv_destroy(&grp->lg_lacp_cv);
198 199 mutex_destroy(&grp->lg_port_lock);
199 200 cv_destroy(&grp->lg_port_cv);
200 201 rw_destroy(&grp->lg_tx_lock);
201 202 mutex_destroy(&grp->lg_tx_flowctl_lock);
202 203 cv_destroy(&grp->lg_tx_flowctl_cv);
203 204 }
204 205
205 206 void
206 207 aggr_grp_init(void)
207 208 {
208 209 aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
209 210 sizeof (aggr_grp_t), 0, aggr_grp_constructor,
210 211 aggr_grp_destructor, NULL, NULL, NULL, 0);
211 212
212 213 aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
213 214 GRP_HASHSZ, mod_hash_null_valdtor);
214 215 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
215 216 aggr_grp_cnt = 0;
216 217
217 218 /*
218 219 * Allocate an id space to manage key values (when key is not
219 220 * specified). The range of the id space will be from
220 221 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
221 222 * uses a 16-bit key.
222 223 */
223 224 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
224 225 ASSERT(key_ids != NULL);
225 226 }
226 227
227 228 void
228 229 aggr_grp_fini(void)
229 230 {
230 231 id_space_destroy(key_ids);
231 232 rw_destroy(&aggr_grp_lock);
232 233 mod_hash_destroy_idhash(aggr_grp_hash);
233 234 kmem_cache_destroy(aggr_grp_cache);
234 235 }
235 236
236 237 uint_t
237 238 aggr_grp_count(void)
238 239 {
239 240 uint_t count;
240 241
241 242 rw_enter(&aggr_grp_lock, RW_READER);
242 243 count = aggr_grp_cnt;
243 244 rw_exit(&aggr_grp_lock);
244 245 return (count);
245 246 }
246 247
247 248 /*
248 249 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
249 250 * requires the mac perimeter, this function holds a reference of the aggr
250 251 * and aggr won't call mac_unregister() until this reference drops to 0.
251 252 */
252 253 void
253 254 aggr_grp_port_hold(aggr_port_t *port)
254 255 {
255 256 aggr_grp_t *grp = port->lp_grp;
256 257
257 258 AGGR_PORT_REFHOLD(port);
258 259 mutex_enter(&grp->lg_port_lock);
259 260 grp->lg_port_ref++;
260 261 mutex_exit(&grp->lg_port_lock);
261 262 }
262 263
263 264 /*
264 265 * Release the reference of the grp and inform aggr_grp_delete() calling
265 266 * mac_unregister() is now safe.
266 267 */
267 268 void
268 269 aggr_grp_port_rele(aggr_port_t *port)
269 270 {
270 271 aggr_grp_t *grp = port->lp_grp;
271 272
272 273 mutex_enter(&grp->lg_port_lock);
273 274 if (--grp->lg_port_ref == 0)
274 275 cv_signal(&grp->lg_port_cv);
275 276 mutex_exit(&grp->lg_port_lock);
276 277 AGGR_PORT_REFRELE(port);
277 278 }
278 279
279 280 /*
280 281 * Wait for the port's lacp timer thread and the port's notification callback
281 282 * to exit.
282 283 */
283 284 void
284 285 aggr_grp_port_wait(aggr_grp_t *grp)
285 286 {
286 287 mutex_enter(&grp->lg_port_lock);
287 288 if (grp->lg_port_ref != 0)
288 289 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
289 290 mutex_exit(&grp->lg_port_lock);
290 291 }
291 292
292 293 /*
293 294 * Attach a port to a link aggregation group.
294 295 *
295 296 * A port is attached to a link aggregation group once its speed
296 297 * and link state have been verified.
297 298 *
298 299 * Returns B_TRUE if the group link state or speed has changed. If
299 300 * it's the case, the caller must notify the MAC layer via a call
300 301 * to mac_link().
301 302 */
302 303 boolean_t
303 304 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
304 305 {
305 306 boolean_t link_state_changed = B_FALSE;
306 307
307 308 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
308 309 ASSERT(MAC_PERIM_HELD(port->lp_mh));
309 310
310 311 if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
311 312 return (B_FALSE);
312 313
313 314 /*
314 315 * Validate the MAC port link speed and update the group
315 316 * link speed if needed.
316 317 */
317 318 if (port->lp_ifspeed == 0 ||
318 319 port->lp_link_state != LINK_STATE_UP ||
319 320 port->lp_link_duplex != LINK_DUPLEX_FULL) {
320 321 /*
321 322 * Can't attach a MAC port with unknown link speed,
322 323 * down link, or not in full duplex mode.
323 324 */
324 325 return (B_FALSE);
325 326 }
326 327
327 328 if (grp->lg_ifspeed == 0) {
328 329 /*
329 330 * The group inherits the speed of the first link being
330 331 * attached.
331 332 */
332 333 grp->lg_ifspeed = port->lp_ifspeed;
333 334 link_state_changed = B_TRUE;
334 335 } else if (grp->lg_ifspeed != port->lp_ifspeed) {
335 336 /*
336 337 * The link speed of the MAC port must be the same as
337 338 * the group link speed, as per 802.3ad. Since it is
338 339 * not, the attach is cancelled.
339 340 */
340 341 return (B_FALSE);
341 342 }
342 343
343 344 grp->lg_nattached_ports++;
344 345
345 346 /*
346 347 * Update the group link state.
347 348 */
348 349 if (grp->lg_link_state != LINK_STATE_UP) {
349 350 grp->lg_link_state = LINK_STATE_UP;
350 351 grp->lg_link_duplex = LINK_DUPLEX_FULL;
351 352 link_state_changed = B_TRUE;
352 353 }
353 354
354 355 /*
355 356 * Update port's state.
356 357 */
357 358 port->lp_state = AGGR_PORT_STATE_ATTACHED;
358 359
359 360 aggr_grp_multicst_port(port, B_TRUE);
360 361
361 362 /*
362 363 * Set port's receive callback
363 364 */
364 365 mac_rx_set(port->lp_mch, aggr_recv_cb, port);
365 366
366 367 /*
367 368 * If LACP is OFF, the port can be used to send data as soon
368 369 * as its link is up and verified to be compatible with the
369 370 * aggregation.
370 371 *
371 372 * If LACP is active or passive, notify the LACP subsystem, which
372 373 * will enable sending on the port following the LACP protocol.
373 374 */
374 375 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
375 376 aggr_send_port_enable(port);
376 377 else
377 378 aggr_lacp_port_attached(port);
378 379
379 380 return (link_state_changed);
380 381 }
381 382
382 383 boolean_t
383 384 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
384 385 {
385 386 boolean_t link_state_changed = B_FALSE;
386 387
387 388 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
388 389 ASSERT(MAC_PERIM_HELD(port->lp_mh));
389 390
390 391 /* update state */
391 392 if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
392 393 return (B_FALSE);
393 394
394 395 mac_rx_clear(port->lp_mch);
395 396
396 397 aggr_grp_multicst_port(port, B_FALSE);
397 398
398 399 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
399 400 aggr_send_port_disable(port);
400 401 else
401 402 aggr_lacp_port_detached(port);
402 403
403 404 port->lp_state = AGGR_PORT_STATE_STANDBY;
404 405
405 406 grp->lg_nattached_ports--;
406 407 if (grp->lg_nattached_ports == 0) {
407 408 /* the last attached MAC port of the group is being detached */
408 409 grp->lg_ifspeed = 0;
409 410 grp->lg_link_state = LINK_STATE_DOWN;
410 411 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
411 412 link_state_changed = B_TRUE;
412 413 }
413 414
414 415 return (link_state_changed);
415 416 }
416 417
417 418 /*
418 419 * Update the MAC addresses of the constituent ports of the specified
419 420 * group. This function is invoked:
420 421 * - after creating a new aggregation group.
421 422 * - after adding new ports to an aggregation group.
422 423 * - after removing a port from a group when the MAC address of
423 424 * that port was used for the MAC address of the group.
424 425 * - after the MAC address of a port changed when the MAC address
425 426 * of that port was used for the MAC address of the group.
426 427 *
427 428 * Return true if the link state of the aggregation changed, for example
428 429 * as a result of a failure changing the MAC address of one of the
429 430 * constituent ports.
430 431 */
431 432 boolean_t
432 433 aggr_grp_update_ports_mac(aggr_grp_t *grp)
433 434 {
434 435 aggr_port_t *cport;
435 436 boolean_t link_state_changed = B_FALSE;
436 437 mac_perim_handle_t mph;
437 438
438 439 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
439 440
440 441 for (cport = grp->lg_ports; cport != NULL;
441 442 cport = cport->lp_next) {
442 443 mac_perim_enter_by_mh(cport->lp_mh, &mph);
443 444 if (aggr_port_unicst(cport) != 0) {
444 445 if (aggr_grp_detach_port(grp, cport))
445 446 link_state_changed = B_TRUE;
446 447 } else {
447 448 /*
448 449 * If a port was detached because of a previous
449 450 * failure changing the MAC address, the port is
450 451 * reattached when it successfully changes the MAC
451 452 * address now, and this might cause the link state
452 453 * of the aggregation to change.
453 454 */
454 455 if (aggr_grp_attach_port(grp, cport))
455 456 link_state_changed = B_TRUE;
456 457 }
457 458 mac_perim_exit(mph);
458 459 }
459 460 return (link_state_changed);
460 461 }
461 462
462 463 /*
463 464 * Invoked when the MAC address of a port has changed. If the port's
464 465 * MAC address was used for the group MAC address, set mac_addr_changedp
465 466 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
466 467 * notification. If the link state changes due to detach/attach of
467 468 * the constituent port, set link_state_changedp to B_TRUE to indicate
468 469 * to the caller that it should send a MAC_NOTE_LINK notification. In both
469 470 * cases, it is the responsibility of the caller to invoke notification
470 471 * functions after releasing the the port lock.
471 472 */
472 473 void
473 474 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
474 475 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
475 476 {
476 477 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
477 478 ASSERT(MAC_PERIM_HELD(port->lp_mh));
478 479 ASSERT(mac_addr_changedp != NULL);
479 480 ASSERT(link_state_changedp != NULL);
480 481
481 482 *mac_addr_changedp = B_FALSE;
482 483 *link_state_changedp = B_FALSE;
483 484
484 485 if (grp->lg_addr_fixed) {
485 486 /*
486 487 * The group is using a fixed MAC address or an automatic
487 488 * MAC address has not been set.
488 489 */
489 490 return;
490 491 }
491 492
492 493 if (grp->lg_mac_addr_port == port) {
493 494 /*
494 495 * The MAC address of the port was assigned to the group
495 496 * MAC address. Update the group MAC address.
496 497 */
497 498 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
498 499 *mac_addr_changedp = B_TRUE;
499 500 } else {
500 501 /*
501 502 * Update the actual port MAC address to the MAC address
502 503 * of the group.
503 504 */
504 505 if (aggr_port_unicst(port) != 0) {
505 506 *link_state_changedp = aggr_grp_detach_port(grp, port);
506 507 } else {
507 508 /*
508 509 * If a port was detached because of a previous
509 510 * failure changing the MAC address, the port is
510 511 * reattached when it successfully changes the MAC
511 512 * address now, and this might cause the link state
512 513 * of the aggregation to change.
513 514 */
514 515 *link_state_changedp = aggr_grp_attach_port(grp, port);
515 516 }
516 517 }
517 518 }
518 519
519 520 /*
520 521 * Add a port to a link aggregation group.
521 522 */
522 523 static int
523 524 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
524 525 aggr_port_t **pp)
525 526 {
526 527 aggr_port_t *port, **cport;
527 528 mac_perim_handle_t mph;
528 529 zoneid_t port_zoneid = ALL_ZONES;
529 530 int err;
530 531
531 532 /* The port must be int the same zone as the aggregation. */
532 533 if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
533 534 port_zoneid = GLOBAL_ZONEID;
534 535 if (grp->lg_zoneid != port_zoneid)
535 536 return (EBUSY);
536 537
537 538 /*
538 539 * lg_mh could be NULL when the function is called during the creation
539 540 * of the aggregation.
540 541 */
541 542 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
542 543
543 544 /* create new port */
544 545 err = aggr_port_create(grp, port_linkid, force, &port);
545 546 if (err != 0)
546 547 return (err);
547 548
548 549 mac_perim_enter_by_mh(port->lp_mh, &mph);
549 550
550 551 /* add port to list of group constituent ports */
551 552 cport = &grp->lg_ports;
552 553 while (*cport != NULL)
553 554 cport = &((*cport)->lp_next);
554 555 *cport = port;
555 556
556 557 /*
557 558 * Back reference to the group it is member of. A port always
558 559 * holds a reference to its group to ensure that the back
559 560 * reference is always valid.
560 561 */
561 562 port->lp_grp = grp;
562 563 AGGR_GRP_REFHOLD(grp);
563 564 grp->lg_nports++;
564 565
|
↓ open down ↓ |
532 lines elided |
↑ open up ↑ |
565 566 aggr_lacp_init_port(port);
566 567 mac_perim_exit(mph);
567 568
568 569 if (pp != NULL)
569 570 *pp = port;
570 571
571 572 return (0);
572 573 }
573 574
574 575 /*
576 + * This is called in response to either our LACP state machine or a MAC
577 + * notification that the link has gone down via aggr_send_port_disable(). At
578 + * this point, we may need to update our default ring. To that end, we go
579 + * through the set of ports (underlying datalinks in an aggregation) that are
580 + * currently enabled to transmit data. If all our links have been disabled for
581 + * transmit, then we don't do anything.
582 + *
583 + * Note, because we only have a single TX group, we don't have to worry about
584 + * the rings moving between groups and the chance that mac will reassign it
585 + * unless someone removes a port, at which point, we play it safe and call this
586 + * again.
587 + */
588 +void
589 +aggr_grp_update_default(aggr_grp_t *grp)
590 +{
591 + aggr_port_t *port;
592 + ASSERT(MAC_PERIM_HELD(grp->lg_mh));
593 +
594 + rw_enter(&grp->lg_tx_lock, RW_WRITER);
595 +
596 + if (grp->lg_ntx_ports == 0) {
597 + rw_exit(&grp->lg_tx_lock);
598 + return;
599 + }
600 +
601 + port = grp->lg_tx_ports[0];
602 + ASSERT(port->lp_tx_ring_cnt > 0);
603 + mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
604 + rw_exit(&grp->lg_tx_lock);
605 +}
606 +
607 +/*
575 608 * Add a pseudo RX ring for the given HW ring handle.
576 609 */
577 610 static int
578 611 aggr_add_pseudo_rx_ring(aggr_port_t *port,
579 612 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
580 613 {
581 614 aggr_pseudo_rx_ring_t *ring;
582 615 int err;
583 616 int j;
584 617
585 618 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
586 619 ring = rx_grp->arg_rings + j;
587 620 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
588 621 break;
589 622 }
590 623
591 624 /*
592 625 * No slot for this new RX ring.
593 626 */
594 627 if (j == MAX_RINGS_PER_GROUP)
595 628 return (EIO);
596 629
597 630 ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
598 631 ring->arr_hw_rh = hw_rh;
599 632 ring->arr_port = port;
600 633 rx_grp->arg_ring_cnt++;
601 634
602 635 /*
603 636 * The group is already registered, dynamically add a new ring to the
604 637 * mac group.
605 638 */
606 639 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
607 640 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
608 641 ring->arr_hw_rh = NULL;
609 642 ring->arr_port = NULL;
610 643 rx_grp->arg_ring_cnt--;
611 644 } else {
612 645 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
613 646 mac_find_ring(rx_grp->arg_gh, j));
614 647 }
615 648 return (err);
616 649 }
617 650
618 651 /*
619 652 * Remove the pseudo RX ring of the given HW ring handle.
620 653 */
621 654 static void
622 655 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
623 656 {
624 657 aggr_pseudo_rx_ring_t *ring;
625 658 int j;
626 659
627 660 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
628 661 ring = rx_grp->arg_rings + j;
629 662 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
630 663 ring->arr_hw_rh != hw_rh) {
631 664 continue;
632 665 }
633 666
634 667 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
635 668
636 669 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
637 670 ring->arr_hw_rh = NULL;
638 671 ring->arr_port = NULL;
639 672 rx_grp->arg_ring_cnt--;
640 673 mac_hwring_teardown(hw_rh);
641 674 break;
642 675 }
643 676 }
644 677
645 678 /*
646 679 * This function is called to create pseudo rings over the hardware rings of
647 680 * the underlying device. Note that there is a 1:1 mapping between the pseudo
648 681 * RX rings of the aggr and the hardware rings of the underlying port.
649 682 */
650 683 static int
651 684 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
652 685 {
653 686 aggr_grp_t *grp = port->lp_grp;
654 687 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
655 688 aggr_unicst_addr_t *addr, *a;
656 689 mac_perim_handle_t pmph;
657 690 int hw_rh_cnt, i = 0, j;
658 691 int err = 0;
659 692
660 693 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
661 694 mac_perim_enter_by_mh(port->lp_mh, &pmph);
662 695
663 696 /*
664 697 * This function must be called after the aggr registers its mac
665 698 * and its RX group has been initialized.
666 699 */
667 700 ASSERT(rx_grp->arg_gh != NULL);
668 701
669 702 /*
670 703 * Get the list the the underlying HW rings.
671 704 */
672 705 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
673 706 &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
674 707
675 708 if (port->lp_hwgh != NULL) {
676 709 /*
677 710 * Quiesce the HW ring and the mac srs on the ring. Note
678 711 * that the HW ring will be restarted when the pseudo ring
679 712 * is started. At that time all the packets will be
680 713 * directly passed up to the pseudo RX ring and handled
681 714 * by mac srs created over the pseudo RX ring.
682 715 */
683 716 mac_rx_client_quiesce(port->lp_mch);
684 717 mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
685 718 }
686 719
687 720 /*
688 721 * Add all the unicast addresses to the newly added port.
689 722 */
690 723 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
691 724 if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
692 725 break;
693 726 }
694 727
695 728 for (i = 0; err == 0 && i < hw_rh_cnt; i++)
696 729 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
697 730
698 731 if (err != 0) {
699 732 for (j = 0; j < i; j++)
700 733 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
701 734
702 735 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
703 736 aggr_port_remmac(port, a->aua_addr);
704 737
705 738 if (port->lp_hwgh != NULL) {
706 739 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
707 740 mac_rx_client_restart(port->lp_mch);
708 741 port->lp_hwgh = NULL;
709 742 }
710 743 } else {
711 744 port->lp_rx_grp_added = B_TRUE;
712 745 }
713 746 done:
714 747 mac_perim_exit(pmph);
715 748 return (err);
716 749 }
717 750
718 751 /*
719 752 * This function is called by aggr to remove pseudo RX rings over the
720 753 * HW rings of the underlying port.
721 754 */
722 755 static void
723 756 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
724 757 {
725 758 aggr_grp_t *grp = port->lp_grp;
726 759 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
727 760 aggr_unicst_addr_t *addr;
728 761 mac_group_handle_t hwgh;
729 762 mac_perim_handle_t pmph;
730 763 int hw_rh_cnt, i;
731 764
732 765 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
733 766 mac_perim_enter_by_mh(port->lp_mh, &pmph);
734 767
735 768 if (!port->lp_rx_grp_added)
736 769 goto done;
737 770
738 771 ASSERT(rx_grp->arg_gh != NULL);
739 772 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
740 773 &hwgh, hw_rh, MAC_RING_TYPE_RX);
741 774
742 775 /*
743 776 * If hw_rh_cnt is 0, it means that the underlying port does not
744 777 * support RX rings. Directly return in this case.
745 778 */
746 779 for (i = 0; i < hw_rh_cnt; i++)
747 780 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
748 781
749 782 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
750 783 aggr_port_remmac(port, addr->aua_addr);
751 784
752 785 if (port->lp_hwgh != NULL) {
753 786 port->lp_hwgh = NULL;
754 787
755 788 /*
756 789 * First clear the permanent-quiesced flag of the RX srs then
757 790 * restart the HW ring and the mac srs on the ring. Note that
758 791 * the HW ring and associated SRS will soon been removed when
759 792 * the port is removed from the aggr.
760 793 */
761 794 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
762 795 mac_rx_client_restart(port->lp_mch);
763 796 }
764 797
765 798 port->lp_rx_grp_added = B_FALSE;
766 799 done:
767 800 mac_perim_exit(pmph);
768 801 }
769 802
770 803 /*
771 804 * Add a pseudo TX ring for the given HW ring handle.
772 805 */
773 806 static int
774 807 aggr_add_pseudo_tx_ring(aggr_port_t *port,
775 808 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
776 809 mac_ring_handle_t *pseudo_rh)
777 810 {
778 811 aggr_pseudo_tx_ring_t *ring;
779 812 int err;
780 813 int i;
781 814
782 815 ASSERT(MAC_PERIM_HELD(port->lp_mh));
783 816 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
784 817 ring = tx_grp->atg_rings + i;
785 818 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
786 819 break;
787 820 }
788 821 /*
789 822 * No slot for this new TX ring.
790 823 */
791 824 if (i == MAX_RINGS_PER_GROUP)
792 825 return (EIO);
793 826 /*
794 827 * The following 4 statements needs to be done before
795 828 * calling mac_group_add_ring(). Otherwise it will
796 829 * result in an assertion failure in mac_init_ring().
797 830 */
798 831 ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
799 832 ring->atr_hw_rh = hw_rh;
800 833 ring->atr_port = port;
801 834 tx_grp->atg_ring_cnt++;
802 835
803 836 /*
804 837 * The TX side has no concept of ring groups unlike RX groups.
805 838 * There is just a single group which stores all the TX rings.
806 839 * This group will be used to store aggr's pseudo TX rings.
807 840 */
808 841 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
809 842 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
|
↓ open down ↓ |
225 lines elided |
↑ open up ↑ |
810 843 ring->atr_hw_rh = NULL;
811 844 ring->atr_port = NULL;
812 845 tx_grp->atg_ring_cnt--;
813 846 } else {
814 847 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
815 848 if (hw_rh != NULL) {
816 849 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
817 850 mac_find_ring(tx_grp->atg_gh, i));
818 851 }
819 852 }
853 +
820 854 return (err);
821 855 }
822 856
823 857 /*
824 858 * Remove the pseudo TX ring of the given HW ring handle.
825 859 */
826 860 static void
827 861 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
828 862 mac_ring_handle_t pseudo_hw_rh)
829 863 {
830 864 aggr_pseudo_tx_ring_t *ring;
831 865 int i;
832 866
833 867 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
834 868 ring = tx_grp->atg_rings + i;
835 869 if (ring->atr_rh != pseudo_hw_rh)
836 870 continue;
837 871
838 872 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
839 873 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
840 874 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
841 875 mac_hwring_teardown(ring->atr_hw_rh);
842 876 ring->atr_hw_rh = NULL;
843 877 ring->atr_port = NULL;
844 878 tx_grp->atg_ring_cnt--;
845 879 break;
846 880 }
847 881 }
848 882
849 883 /*
850 884 * This function is called to create pseudo rings over hardware rings of
851 885 * the underlying device. There is a 1:1 mapping between the pseudo TX
852 886 * rings of the aggr and the hardware rings of the underlying port.
*
* The caller must hold the MAC perimeter of the group (asserted); the
* perimeter of the port is entered and exited here.
*
* Returns 0 on success, otherwise the error returned by
* aggr_add_pseudo_tx_ring(). On failure the pseudo rings added so far
* are removed again, both per-port ring arrays are freed and
* lp_tx_ring_cnt is reset to 0. On success lp_tx_grp_added is set and
* aggr_tx_ring_update is registered through mac_client_tx_notify().
* In both cases aggr_grp_update_default() is called on the group after
* the port perimeter has been exited.
853 887 */
854 888 static int
855 889 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
856 890 {
857 891 aggr_grp_t *grp = port->lp_grp;
858 892 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
859 893 mac_perim_handle_t pmph;
860 894 int hw_rh_cnt, i = 0, j;
861 895 int err = 0;
862 896
863 897 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
864 898 mac_perim_enter_by_mh(port->lp_mh, &pmph);
865 899
866 900 /*
867 901 * Get the list of the underlying HW rings.
868 902 */
869 903 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
870 904 NULL, hw_rh, MAC_RING_TYPE_TX);
871 905
872 906 /*
873 907 * Even if the underlying NIC does not have TX rings, we
874 908 * still make a pseudo TX ring for that NIC with NULL as
875 909 * the ring handle.
876 910 */
877 911 if (hw_rh_cnt == 0)
878 912 port->lp_tx_ring_cnt = 1;
879 913 else
880 914 port->lp_tx_ring_cnt = hw_rh_cnt;
881 915
/*
* NOTE(review): the element size is written as
* sizeof (mac_ring_handle_t *) although the arrays are filled with
* mac_ring_handle_t values below; presumably both are pointer-sized,
* confirm against the type definition.
*/
882 916 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
883 917 port->lp_tx_ring_cnt), KM_SLEEP);
884 918 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
885 919 port->lp_tx_ring_cnt), KM_SLEEP);
886 920
887 921 if (hw_rh_cnt == 0) {
888 922 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
889 923 NULL, &pseudo_rh)) == 0) {
890 924 port->lp_tx_rings[0] = NULL;
891 925 port->lp_pseudo_tx_rings[0] = pseudo_rh;
892 926 }
893 927 } else {
894 928 for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
895 929 err = aggr_add_pseudo_tx_ring(port,
896 930 tx_grp, hw_rh[i], &pseudo_rh);
897 931 if (err != 0)
898 932 break;
899 933 port->lp_tx_rings[i] = hw_rh[i];
900 934 port->lp_pseudo_tx_rings[i] = pseudo_rh;
901 935 }
902 936 }
903 937
904 938 if (err != 0) {
/*
* Undo the rings added before the failing one. i is the index of
* the ring whose creation failed, so only entries [0, i) are valid.
* In the hw_rh_cnt == 0 case the single add failed and there is
* nothing to remove.
*/
905 939 if (hw_rh_cnt != 0) {
906 940 for (j = 0; j < i; j++) {
907 941 aggr_rem_pseudo_tx_ring(tx_grp,
908 942 port->lp_pseudo_tx_rings[j]);
909 943 }
910 944 }
911 945 kmem_free(port->lp_tx_rings,
|
↓ open down ↓ |
82 lines elided |
↑ open up ↑ |
912 946 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
913 947 kmem_free(port->lp_pseudo_tx_rings,
914 948 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
915 949 port->lp_tx_ring_cnt = 0;
916 950 } else {
917 951 port->lp_tx_grp_added = B_TRUE;
918 952 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
919 953 aggr_tx_ring_update, port);
920 954 }
921 955 mac_perim_exit(pmph);
956 + aggr_grp_update_default(grp);
922 957 return (err);
923 958 }
924 959
925 960 /*
926 961 * This function is called by aggr to remove pseudo TX rings over the
927 962 * HW rings of the underlying port.
*
* The caller must hold the MAC perimeter of the group (asserted); the
* perimeter of the port is entered and exited here. The call does
* nothing when port->lp_tx_grp_added is not set, so calling it again
* for the same port is harmless. Otherwise every pseudo ring recorded
* in lp_pseudo_tx_rings is removed, both per-port ring arrays are
* freed, lp_tx_ring_cnt is reset to 0, mac_client_tx_notify() is called
* with a NULL callback and the handle saved in lp_tx_notify_mh
* (presumably unregistering the callback -- confirm against the MAC
* client API), lp_tx_grp_added is cleared and
* aggr_grp_update_default() is called on the group.
928 963 */
929 964 static void
930 965 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
931 966 {
932 967 aggr_grp_t *grp = port->lp_grp;
933 968 mac_perim_handle_t pmph;
934 969 int i;
935 970
936 971 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
937 972 mac_perim_enter_by_mh(port->lp_mh, &pmph);
938 973
939 974 if (!port->lp_tx_grp_added)
940 975 goto done;
941 976
942 977 ASSERT(tx_grp->atg_gh != NULL);
943 978
944 979 for (i = 0; i < port->lp_tx_ring_cnt; i++)
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
945 980 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
946 981
947 982 kmem_free(port->lp_tx_rings,
948 983 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
949 984 kmem_free(port->lp_pseudo_tx_rings,
950 985 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
951 986
952 987 port->lp_tx_ring_cnt = 0;
953 988 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
954 989 port->lp_tx_grp_added = B_FALSE;
990 + aggr_grp_update_default(grp);
955 991 done:
956 992 mac_perim_exit(pmph);
957 993 }
958 994
/*
* Interrupt-disable entry point of a pseudo RX ring. The handle is
* treated as the aggr_pseudo_rx_ring_t itself and the request is passed
* to mac_hwring_disable_intr() on its arr_hw_rh; that call's return
* value is returned unchanged.
*/
959 995 static int
960 996 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
961 997 {
962 998 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
963 999 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
964 1000 }
965 1001
/*
* Interrupt-enable entry point of a pseudo RX ring. The handle is
* treated as the aggr_pseudo_rx_ring_t itself and the request is passed
* to mac_hwring_enable_intr() on its arr_hw_rh; that call's return
* value is returned unchanged.
*/
966 1002 static int
967 1003 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
968 1004 {
969 1005 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
970 1006 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
971 1007 }
972 1008
/*
* Start entry point of a pseudo RX ring. Starts the HW ring referenced
* by arr_hw_rh with mac_hwring_start() and, only if that succeeds,
* records the generation number mr_gen in arr_gen. Returns the result
* of mac_hwring_start().
*/
973 1009 static int
974 1010 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
975 1011 {
976 1012 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
977 1013 int err;
978 1014
979 1015 err = mac_hwring_start(rr_ring->arr_hw_rh);
980 1016 if (err == 0)
981 1017 rr_ring->arr_gen = mr_gen;
982 1018 return (err);
983 1019 }
984 1020
/*
* Stop entry point of a pseudo RX ring: calls mac_hwring_stop() on the
* HW ring referenced by the pseudo ring's arr_hw_rh.
*/
985 1021 static void
986 1022 aggr_pseudo_stop_ring(mac_ring_driver_t arg)
987 1023 {
988 1024 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
989 1025 mac_hwring_stop(rr_ring->arr_hw_rh);
990 1026 }
991 1027
992 1028 /*
993 1029 * Add one or more ports to an existing link aggregation group.
*
* Returns ENOENT if no group is registered under linkid, ENOTSUP if a
* new port fails the capability, SDU or margin check, or the error
* returned by aggr_grp_add_port(), aggr_add_pseudo_tx_group(),
* aggr_add_pseudo_rx_group(), aggr_port_start() or aggr_port_promisc().
* When any port fails, all ports added by this call so far are stopped
* (if the group is started) and removed again before returning.
* Returns 0 when all ports were added.
994 1030 */
995 1031 int
996 1032 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
997 1033 laioc_port_t *ports)
998 1034 {
999 1035 int rc, i, nadded = 0;
1000 1036 aggr_grp_t *grp = NULL;
1001 1037 aggr_port_t *port;
1002 1038 boolean_t link_state_changed = B_FALSE;
1003 1039 mac_perim_handle_t mph, pmph;
1004 1040
1005 1041 /* get group corresponding to linkid */
1006 1042 rw_enter(&aggr_grp_lock, RW_READER);
1007 1043 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1008 1044 (mod_hash_val_t *)&grp) != 0) {
1009 1045 rw_exit(&aggr_grp_lock);
1010 1046 return (ENOENT);
1011 1047 }
1012 1048 AGGR_GRP_REFHOLD(grp);
1013 1049
1014 1050 /*
1015 1051 * Hold the perimeter so that the aggregation won't be destroyed.
1016 1052 */
1017 1053 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1018 1054 rw_exit(&aggr_grp_lock);
1019 1055
1020 1056 /* add the specified ports to group */
1021 1057 for (i = 0; i < nports; i++) {
1022 1058 /* add port to group */
1023 1059 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1024 1060 force, &port)) != 0) {
1025 1061 goto bail;
1026 1062 }
1027 1063 ASSERT(port != NULL);
/* counted before the checks below so that bail also removes this port */
1028 1064 nadded++;
1029 1065
1030 1066 /* check capabilities */
1031 1067 if (!aggr_grp_capab_check(grp, port) ||
1032 1068 !aggr_grp_sdu_check(grp, port) ||
1033 1069 !aggr_grp_margin_check(grp, port)) {
1034 1070 rc = ENOTSUP;
1035 1071 goto bail;
1036 1072 }
1037 1073
1038 1074 /*
1039 1075 * Create the pseudo ring for each HW ring of the underlying
1040 1076 * port.
1041 1077 */
1042 1078 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1043 1079 if (rc != 0)
1044 1080 goto bail;
1045 1081 rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1046 1082 if (rc != 0)
1047 1083 goto bail;
1048 1084
1049 1085 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1050 1086
1051 1087 /* set LACP mode */
1052 1088 aggr_port_lacp_set_mode(grp, port);
1053 1089
1054 1090 /* start port if group has already been started */
1055 1091 if (grp->lg_started) {
1056 1092 rc = aggr_port_start(port);
1057 1093 if (rc != 0) {
1058 1094 mac_perim_exit(pmph);
1059 1095 goto bail;
1060 1096 }
1061 1097
1062 1098 /*
1063 1099 * Turn on the promiscuous mode over the port when it
1064 1100 * is requested to be turned on to receive the
1065 1101 * non-primary address over a port, or the promiscuous
1066 1102 * mode is enabled over the aggr.
1067 1103 */
1068 1104 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1069 1105 rc = aggr_port_promisc(port, B_TRUE);
1070 1106 if (rc != 0) {
1071 1107 mac_perim_exit(pmph);
1072 1108 goto bail;
1073 1109 }
1074 1110 }
1075 1111 }
1076 1112 mac_perim_exit(pmph);
1077 1113
1078 1114 /*
1079 1115 * Attach each port if necessary.
1080 1116 */
1081 1117 if (aggr_port_notify_link(grp, port))
1082 1118 link_state_changed = B_TRUE;
1083 1119
1084 1120 /*
1085 1121 * Initialize the callback functions for this port.
1086 1122 */
1087 1123 aggr_port_init_callbacks(port);
1088 1124 }
1089 1125
1090 1126 /* update the MAC address of the constituent ports */
1091 1127 if (aggr_grp_update_ports_mac(grp))
1092 1128 link_state_changed = B_TRUE;
1093 1129
1094 1130 if (link_state_changed)
1095 1131 mac_link_update(grp->lg_mh, grp->lg_link_state);
1096 1132
/* reached on success as well; the cleanup below runs only when rc != 0 */
1097 1133 bail:
1098 1134 if (rc != 0) {
1099 1135 /* stop and remove ports that have been added */
1100 1136 for (i = 0; i < nadded; i++) {
1101 1137 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1102 1138 ASSERT(port != NULL);
1103 1139 if (grp->lg_started) {
1104 1140 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1105 1141 (void) aggr_port_promisc(port, B_FALSE);
1106 1142 aggr_port_stop(port);
1107 1143 mac_perim_exit(pmph);
1108 1144 }
1109 1145 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1110 1146 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1111 1147 (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1112 1148 }
1113 1149 }
1114 1150
1115 1151 mac_perim_exit(mph);
1116 1152 AGGR_GRP_REFRELE(grp);
1117 1153 return (rc);
1118 1154 }
1119 1155
/*
* Apply the property changes selected by update_mask to a group whose
* MAC perimeter is already held by the caller (asserted).
*
* update_mask is a combination of AGGR_MODIFY_POLICY, AGGR_MODIFY_MAC,
* AGGR_MODIFY_LACP_MODE and AGGR_MODIFY_LACP_TIMER; only the arguments
* belonging to the selected bits are used.
*
* Returns EINVAL, before anything is changed, when a fixed MAC address
* is requested that is all zeroes or has the low-order bit of its
* first octet set. Returns 0 otherwise.
*/
1120 1156 static int
1121 1157 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1122 1158 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1123 1159 aggr_lacp_timer_t lacp_timer)
1124 1160 {
1125 1161 boolean_t mac_addr_changed = B_FALSE;
1126 1162 boolean_t link_state_changed = B_FALSE;
1127 1163 mac_perim_handle_t pmph;
1128 1164
1129 1165 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1130 1166
1131 1167 /* validate fixed address if specified */
1132 1168 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1133 1169 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1134 1170 (mac_addr[0] & 0x01))) {
1135 1171 return (EINVAL);
1136 1172 }
1137 1173
1138 1174 /* update policy if requested */
1139 1175 if (update_mask & AGGR_MODIFY_POLICY)
1140 1176 aggr_send_update_policy(grp, policy);
1141 1177
1142 1178 /* update unicast MAC address if requested */
1143 1179 if (update_mask & AGGR_MODIFY_MAC) {
1144 1180 if (mac_fixed) {
1145 1181 /* user-supplied MAC address */
1146 1182 grp->lg_mac_addr_port = NULL;
1147 1183 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1148 1184 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1149 1185 mac_addr_changed = B_TRUE;
1150 1186 }
1151 1187 } else if (grp->lg_addr_fixed) {
1152 1188 /* switch from user-supplied to automatic */
/* the address of the first port in the list becomes the group address */
1153 1189 aggr_port_t *port = grp->lg_ports;
1154 1190
1155 1191 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1156 1192 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1157 1193 grp->lg_mac_addr_port = port;
1158 1194 mac_addr_changed = B_TRUE;
1159 1195 mac_perim_exit(pmph);
1160 1196 }
1161 1197 grp->lg_addr_fixed = mac_fixed;
1162 1198 }
1163 1199
1164 1200 if (mac_addr_changed)
1165 1201 link_state_changed = aggr_grp_update_ports_mac(grp);
1166 1202
1167 1203 if (update_mask & AGGR_MODIFY_LACP_MODE)
1168 1204 aggr_lacp_update_mode(grp, lacp_mode);
1169 1205
1170 1206 if (update_mask & AGGR_MODIFY_LACP_TIMER)
1171 1207 aggr_lacp_update_timer(grp, lacp_timer);
1172 1208
1173 1209 if (link_state_changed)
1174 1210 mac_link_update(grp->lg_mh, grp->lg_link_state);
1175 1211
1176 1212 if (mac_addr_changed)
1177 1213 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1178 1214
1179 1215 return (0);
1180 1216 }
1181 1217
1182 1218 /*
1183 1219 * Update properties of an existing link aggregation group.
*
* Looks the group up by linkid, takes a reference and the group's MAC
* perimeter, and hands the request to aggr_grp_modify_common().
* Returns ENOENT if no group is registered under linkid, otherwise the
* value returned by aggr_grp_modify_common().
1184 1220 */
1185 1221 int
1186 1222 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1187 1223 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1188 1224 aggr_lacp_timer_t lacp_timer)
1189 1225 {
1190 1226 aggr_grp_t *grp = NULL;
1191 1227 mac_perim_handle_t mph;
1192 1228 int err;
1193 1229
1194 1230 /* get group corresponding to linkid */
1195 1231 rw_enter(&aggr_grp_lock, RW_READER);
1196 1232 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1197 1233 (mod_hash_val_t *)&grp) != 0) {
1198 1234 rw_exit(&aggr_grp_lock);
1199 1235 return (ENOENT);
1200 1236 }
1201 1237 AGGR_GRP_REFHOLD(grp);
1202 1238
1203 1239 /*
1204 1240 * Hold the perimeter so that the aggregation won't be destroyed.
1205 1241 */
1206 1242 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1207 1243 rw_exit(&aggr_grp_lock);
1208 1244
1209 1245 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1210 1246 mac_addr, lacp_mode, lacp_timer);
1211 1247
1212 1248 mac_perim_exit(mph);
1213 1249 AGGR_GRP_REFRELE(grp);
1214 1250 return (err);
1215 1251 }
1216 1252
1217 1253 /*
1218 1254 * Create a new link aggregation group upon request from administrator.
1219 1255 * Returns 0 on success, an errno on failure.
*
* aggr_grp_lock is held as writer for the whole call. Errors returned
* directly by this function are EINVAL (nports is 0, or a fixed MAC
* address of all zeroes was given), EEXIST (a group is already
* registered under linkid) and ENOMEM (key allocation or mac_alloc()
* failed); errors from aggr_grp_add_port(), mac_register() and
* dls_devnet_create() are passed through.
*
* On failure after the group structure has been allocated, the ports
* added so far are deleted, the LACP RX and TX notify threads are told
* to exit and waited for, lg_tx_blocked_rings is freed and the initial
* reference on the group is released.
1220 1256 */
1221 1257 int
1222 1258 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1223 1259 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1224 1260 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1225 1261 cred_t *credp)
1226 1262 {
1227 1263 aggr_grp_t *grp = NULL;
1228 1264 aggr_port_t *port;
1229 1265 mac_register_t *mac;
1230 1266 boolean_t link_state_changed;
1231 1267 mac_perim_handle_t mph;
1232 1268 int err;
1233 1269 int i;
1234 1270 kt_did_t tid = 0;
1235 1271
1236 1272 /* need at least one port */
1237 1273 if (nports == 0)
1238 1274 return (EINVAL);
1239 1275
1240 1276 rw_enter(&aggr_grp_lock, RW_WRITER);
1241 1277
1242 1278 /* does a group with the same linkid already exist? */
1243 1279 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1244 1280 (mod_hash_val_t *)&grp);
1245 1281 if (err == 0) {
1246 1282 rw_exit(&aggr_grp_lock);
1247 1283 return (EEXIST);
1248 1284 }
1249 1285
1250 1286 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1251 1287
1252 1288 grp->lg_refs = 1;
1253 1289 grp->lg_closing = B_FALSE;
1254 1290 grp->lg_force = force;
1255 1291 grp->lg_linkid = linkid;
1256 1292 grp->lg_zoneid = crgetzoneid(credp);
1257 1293 grp->lg_ifspeed = 0;
1258 1294 grp->lg_link_state = LINK_STATE_UNKNOWN;
1259 1295 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1260 1296 grp->lg_started = B_FALSE;
1261 1297 grp->lg_promisc = B_FALSE;
1262 1298 grp->lg_lacp_done = B_FALSE;
1263 1299 grp->lg_tx_notify_done = B_FALSE;
1264 1300 grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1265 1301 grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1266 1302 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1267 1303 grp->lg_tx_notify_thread = thread_create(NULL, 0,
1268 1304 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1269 1305 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1270 1306 MAX_RINGS_PER_GROUP), KM_SLEEP);
1271 1307 grp->lg_tx_blocked_cnt = 0;
1272 1308 bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
1273 1309 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1274 1310 aggr_lacp_init_grp(grp);
1275 1311
1276 1312 /* add MAC ports to group */
1277 1313 grp->lg_ports = NULL;
1278 1314 grp->lg_nports = 0;
1279 1315 grp->lg_nattached_ports = 0;
1280 1316 grp->lg_ntx_ports = 0;
1281 1317
1282 1318 /*
1283 1319 * If key is not specified by the user, allocate the key.
1284 1320 */
1285 1321 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1286 1322 err = ENOMEM;
1287 1323 goto bail;
1288 1324 }
1289 1325 grp->lg_key = key;
1290 1326
1291 1327 for (i = 0; i < nports; i++) {
1292 1328 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL);
1293 1329 if (err != 0)
1294 1330 goto bail;
1295 1331 }
1296 1332
1297 1333 /*
1298 1334 * If no explicit MAC address was specified by the administrator,
1299 1335 * set it to the MAC address of the first port.
1300 1336 */
1301 1337 grp->lg_addr_fixed = mac_fixed;
1302 1338 if (grp->lg_addr_fixed) {
1303 1339 /* validate specified address */
/*
* NOTE(review): aggr_grp_modify_common() also rejects a fixed
* address whose first octet has its low-order bit set; only the
* all-zero case is rejected here. Confirm whether the difference is
* intended.
*/
1304 1340 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1305 1341 err = EINVAL;
1306 1342 goto bail;
1307 1343 }
1308 1344 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1309 1345 } else {
1310 1346 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1311 1347 grp->lg_mac_addr_port = grp->lg_ports;
1312 1348 }
1313 1349
1314 1350 /* set the initial group capabilities */
1315 1351 aggr_grp_capab_set(grp);
1316 1352
1317 1353 if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1318 1354 err = ENOMEM;
1319 1355 goto bail;
1320 1356 }
1321 1357 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1322 1358 mac->m_driver = grp;
1323 1359 mac->m_dip = aggr_dip;
1324 1360 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1325 1361 mac->m_src_addr = grp->lg_addr;
1326 1362 mac->m_callbacks = &aggr_m_callbacks;
1327 1363 mac->m_min_sdu = 0;
1328 1364 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1329 1365 mac->m_margin = aggr_grp_max_margin(grp);
1330 1366 mac->m_v12n = MAC_VIRT_LEVEL1;
1331 1367 err = mac_register(mac, &grp->lg_mh);
1332 1368 mac_free(mac);
1333 1369 if (err != 0)
1334 1370 goto bail;
1335 1371
1336 1372 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1337 1373 if (err != 0) {
1338 1374 (void) mac_unregister(grp->lg_mh);
1339 1375 grp->lg_mh = NULL;
1340 1376 goto bail;
1341 1377 }
1342 1378
1343 1379 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1344 1380
1345 1381 /*
1346 1382 * Update the MAC address of the constituent ports.
1347 1383 * None of the ports is attached at this time, so the link state of the
1348 1384 * aggregation will not change.
1349 1385 */
1350 1386 link_state_changed = aggr_grp_update_ports_mac(grp);
1351 1387 ASSERT(!link_state_changed);
1352 1388
1353 1389 /* update outbound load balancing policy */
1354 1390 aggr_send_update_policy(grp, policy);
1355 1391
1356 1392 /* set LACP mode */
1357 1393 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1358 1394
1359 1395 /*
1360 1396 * Attach each port if necessary.
1361 1397 */
1362 1398 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1363 1399 /*
1364 1400 * Create the pseudo ring for each HW ring of the underlying
1365 1401 * port. Note that this is done after the aggr registers the
1366 1402 * mac.
1367 1403 */
/* a failure here is fatal (VERIFY); there is no unwind on this path */
1368 1404 VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
1369 1405 VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
1370 1406 if (aggr_port_notify_link(grp, port))
1371 1407 link_state_changed = B_TRUE;
1372 1408
1373 1409 /*
1374 1410 * Initialize the callback functions for this port.
1375 1411 */
1376 1412 aggr_port_init_callbacks(port);
1377 1413 }
1378 1414
1379 1415 if (link_state_changed)
1380 1416 mac_link_update(grp->lg_mh, grp->lg_link_state);
1381 1417
1382 1418 /* add new group to hash table */
1383 1419 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1384 1420 (mod_hash_val_t)grp);
1385 1421 ASSERT(err == 0);
1386 1422 aggr_grp_cnt++;
1387 1423
1388 1424 mac_perim_exit(mph);
1389 1425 rw_exit(&aggr_grp_lock);
1390 1426 return (0);
1391 1427
1392 1428 bail:
1393 1429
1394 1430 grp->lg_closing = B_TRUE;
1395 1431
1396 1432 port = grp->lg_ports;
1397 1433 while (port != NULL) {
1398 1434 aggr_port_t *cport;
1399 1435
/* save the successor before the port is deleted */
1400 1436 cport = port->lp_next;
1401 1437 aggr_port_delete(port);
1402 1438 port = cport;
1403 1439 }
1404 1440
1405 1441 /*
1406 1442 * Inform the lacp_rx thread to exit.
1407 1443 */
1408 1444 mutex_enter(&grp->lg_lacp_lock);
1409 1445 grp->lg_lacp_done = B_TRUE;
1410 1446 cv_signal(&grp->lg_lacp_cv);
1411 1447 while (grp->lg_lacp_rx_thread != NULL)
1412 1448 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1413 1449 mutex_exit(&grp->lg_lacp_lock);
1414 1450 /*
1415 1451 * Inform the tx_notify thread to exit.
1416 1452 */
1417 1453 mutex_enter(&grp->lg_tx_flowctl_lock);
1418 1454 if (grp->lg_tx_notify_thread != NULL) {
1419 1455 tid = grp->lg_tx_notify_thread->t_did;
1420 1456 grp->lg_tx_notify_done = B_TRUE;
1421 1457 cv_signal(&grp->lg_tx_flowctl_cv);
1422 1458 }
1423 1459 mutex_exit(&grp->lg_tx_flowctl_lock);
1424 1460 if (tid != 0)
1425 1461 thread_join(tid);
1426 1462
1427 1463 kmem_free(grp->lg_tx_blocked_rings,
1428 1464 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1429 1465 rw_exit(&aggr_grp_lock);
1430 1466 AGGR_GRP_REFRELE(grp);
1431 1467 return (err);
1432 1468 }
1433 1469
1434 1470 /*
1435 1471 * Return a pointer to the member of a group with specified linkid.
* Returns NULL when no port of the group has that linkid. The caller
* must hold the MAC perimeter of the group (asserted).
1436 1472 */
1437 1473 static aggr_port_t *
1438 1474 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1439 1475 {
1440 1476 aggr_port_t *port;
1441 1477
1442 1478 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1443 1479
1444 1480 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1445 1481 if (port->lp_linkid == linkid)
1446 1482 break;
1447 1483 }
1448 1484
/* port is NULL here when the list was exhausted without a match */
1449 1485 return (port);
1450 1486 }
1451 1487
1452 1488 /*
1453 1489 * Stop, detach and remove a port from a link aggregation group.
*
* The caller must hold the MAC perimeter of the group, the group must
* have more than one port and must not be closing (all asserted).
*
* Returns ENOENT if the port is not on the group's port list, 0
* otherwise. mac_addr_changedp and link_state_changedp may be NULL;
* when non-NULL they receive whether the group MAC address or link
* state changed as a result of the removal (B_FALSE for both on the
* ENOENT path).
1454 1490 */
1455 1491 static int
1456 1492 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1457 1493 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1458 1494 {
1459 1495 int rc = 0;
1460 1496 aggr_port_t **pport;
1461 1497 boolean_t mac_addr_changed = B_FALSE;
1462 1498 boolean_t link_state_changed = B_FALSE;
1463 1499 mac_perim_handle_t mph;
1464 1500 uint64_t val;
1465 1501 uint_t i;
1466 1502 uint_t stat;
1467 1503
1468 1504 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1469 1505 ASSERT(grp->lg_nports > 1);
1470 1506 ASSERT(!grp->lg_closing);
1471 1507
1472 1508 /* unlink port */
1473 1509 for (pport = &grp->lg_ports; *pport != port;
1474 1510 pport = &(*pport)->lp_next) {
1475 1511 if (*pport == NULL) {
1476 1512 rc = ENOENT;
1477 1513 goto done;
1478 1514 }
1479 1515 }
1480 1516 *pport = port->lp_next;
1481 1517
1482 1518 mac_perim_enter_by_mh(port->lp_mh, &mph);
1483 1519
1484 1520 /*
1485 1521 * If the MAC address of the port being removed was assigned
1486 1522 * to the group, update the group MAC address
1487 1523 * using the MAC address of a different port.
1488 1524 */
1489 1525 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1490 1526 /*
1491 1527 * Set the MAC address of the group to the
1492 1528 * MAC address of its first port.
1493 1529 */
/* the removed port is already unlinked, so lg_ports is a remaining port */
1494 1530 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1495 1531 grp->lg_mac_addr_port = grp->lg_ports;
1496 1532 mac_addr_changed = B_TRUE;
1497 1533 }
1498 1534
1499 1535 link_state_changed = aggr_grp_detach_port(grp, port);
1500 1536
1501 1537 /*
1502 1538 * Add the counter statistics of the ports while it was aggregated
1503 1539 * to the group's residual statistics. This is done by obtaining
1504 1540 * the current counter from the underlying MAC then subtracting the
1505 1541 * value of the counter at the moment it was added to the
1506 1542 * aggregation.
1507 1543 */
1508 1544 for (i = 0; i < MAC_NSTAT; i++) {
1509 1545 stat = i + MAC_STAT_MIN;
1510 1546 if (!MAC_STAT_ISACOUNTER(stat))
1511 1547 continue;
1512 1548 val = aggr_port_stat(port, stat);
1513 1549 val -= port->lp_stat[i];
1514 1550 grp->lg_stat[i] += val;
1515 1551 }
1516 1552 for (i = 0; i < ETHER_NSTAT; i++) {
1517 1553 stat = i + MACTYPE_STAT_MIN;
1518 1554 if (!ETHER_STAT_ISACOUNTER(stat))
1519 1555 continue;
1520 1556 val = aggr_port_stat(port, stat);
1521 1557 val -= port->lp_ether_stat[i];
1522 1558 grp->lg_ether_stat[i] += val;
1523 1559 }
1524 1560
1525 1561 grp->lg_nports--;
1526 1562 mac_perim_exit(mph);
1527 1563
/* the pseudo TX rings are removed only after the port has been detached */
1528 1564 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1529 1565 aggr_port_delete(port);
1530 1566
1531 1567 /*
1532 1568 * If the group MAC address has changed, update the MAC address of
1533 1569 * the remaining constituent ports according to the new MAC
1534 1570 * address of the group.
1535 1571 */
1536 1572 if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1537 1573 link_state_changed = B_TRUE;
1538 1574
1539 1575 done:
1540 1576 if (mac_addr_changedp != NULL)
1541 1577 *mac_addr_changedp = mac_addr_changed;
1542 1578 if (link_state_changedp != NULL)
1543 1579 *link_state_changedp = link_state_changed;
1544 1580
1545 1581 return (rc);
1546 1582 }
1547 1583
1548 1584 /*
1549 1585 * Remove one or more ports from an existing link aggregation group.
*
* Returns ENOENT if no group is registered under linkid or if one of
* the listed ports is not a member of the group, EINVAL if the request
* would remove every port of the group, or the error returned by
* aggr_port_promisc() when promiscuous mode cannot be cleared on a
* port. In all of those cases no port is removed. Returns 0 when all
* listed ports were removed.
1550 1586 */
1551 1587 int
1552 1588 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1553 1589 {
1554 1590 int rc = 0, i;
1555 1591 aggr_grp_t *grp = NULL;
1556 1592 aggr_port_t *port;
1557 1593 boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1558 1594 boolean_t link_state_update = B_FALSE, link_state_changed;
1559 1595 mac_perim_handle_t mph, pmph;
1560 1596
1561 1597 /* get group corresponding to linkid */
1562 1598 rw_enter(&aggr_grp_lock, RW_READER);
1563 1599 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1564 1600 (mod_hash_val_t *)&grp) != 0) {
1565 1601 rw_exit(&aggr_grp_lock);
1566 1602 return (ENOENT);
1567 1603 }
1568 1604 AGGR_GRP_REFHOLD(grp);
1569 1605
1570 1606 /*
1571 1607 * Hold the perimeter so that the aggregation won't be destroyed.
1572 1608 */
1573 1609 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1574 1610 rw_exit(&aggr_grp_lock);
1575 1611
1576 1612 /* we need to keep at least one port per group */
1577 1613 if (nports >= grp->lg_nports) {
1578 1614 rc = EINVAL;
1579 1615 goto bail;
1580 1616 }
1581 1617
1582 1618 /* first verify that all the specified ports are in the group */
1583 1619 for (i = 0; i < nports; i++) {
1584 1620 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1585 1621 /* port not found */
1586 1622 rc = ENOENT;
1587 1623 goto bail;
1588 1624 }
1589 1625 }
1590 1626
1591 1627 /* clear the promiscuous mode for the specified ports */
1592 1628 for (i = 0; i < nports && rc == 0; i++) {
1593 1629 /* lookup port */
1594 1630 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1595 1631 ASSERT(port != NULL);
1596 1632
1597 1633 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1598 1634 rc = aggr_port_promisc(port, B_FALSE);
1599 1635 mac_perim_exit(pmph);
1600 1636 }
1601 1637 if (rc != 0) {
/* clearing failed on some port: restore promiscuous mode on all of them */
1602 1638 for (i = 0; i < nports; i++) {
1603 1639 port = aggr_grp_port_lookup(grp,
1604 1640 ports[i].lp_linkid);
1605 1641 ASSERT(port != NULL);
1606 1642
1607 1643 /*
1608 1644 * Turn the promiscuous mode back on if it is required
1609 1645 * to receive the non-primary address over a port, or
1610 1646 * the promiscuous mode is enabled over the aggr.
1611 1647 */
1612 1648 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1613 1649 if (port->lp_started && (grp->lg_promisc ||
1614 1650 port->lp_prom_addr != NULL)) {
1615 1651 (void) aggr_port_promisc(port, B_TRUE);
1616 1652 }
1617 1653 mac_perim_exit(pmph);
1618 1654 }
1619 1655 goto bail;
1620 1656 }
1621 1657
1622 1658 /* remove the specified ports from group */
1623 1659 for (i = 0; i < nports; i++) {
1624 1660 /* lookup port */
1625 1661 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1626 1662 ASSERT(port != NULL);
1627 1663
1628 1664 /* stop port if group has already been started */
1629 1665 if (grp->lg_started) {
1630 1666 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1631 1667 aggr_port_stop(port);
1632 1668 mac_perim_exit(pmph);
1633 1669 }
1634 1670
1635 1671 /*
1636 1672 * aggr_rem_pseudo_tx_group() is not called here. Instead
1637 1673 * it is called from inside aggr_grp_rem_port() after the
1638 1674 * port has been detached. The reason is that
1639 1675 * aggr_rem_pseudo_tx_group() removes one ring at a time
1640 1676 * and if there is still traffic going on, then there
1641 1677 * is the possibility of aggr_find_tx_ring() returning a
1642 1678 * removed ring for transmission. Once the port has been
1643 1679 * detached, that port will not be used and
1644 1680 * aggr_find_tx_ring() will not return any rings
1645 1681 * belonging to it.
1646 1682 */
1647 1683 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1648 1684
1649 1685 /* remove port from group */
1650 1686 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1651 1687 &link_state_changed);
1652 1688 ASSERT(rc == 0);
1653 1689 mac_addr_update = mac_addr_update || mac_addr_changed;
1654 1690 link_state_update = link_state_update || link_state_changed;
1655 1691 }
1656 1692
/* reached on success as well; the update flags are B_FALSE on early exits */
1657 1693 bail:
1658 1694 if (mac_addr_update)
1659 1695 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1660 1696 if (link_state_update)
1661 1697 mac_link_update(grp->lg_mh, grp->lg_link_state);
1662 1698
1663 1699 mac_perim_exit(mph);
1664 1700 AGGR_GRP_REFRELE(grp);
1665 1701
1666 1702 return (rc);
1667 1703 }
1668 1704
/*
* Delete the link aggregation group registered under linkid.
*
* Returns ENOENT if no such group exists, or the error returned by
* dls_devnet_destroy() or mac_disable(); in the mac_disable() case the
* devnet is created again before returning. Otherwise the group is
* removed from aggr_grp_hash, its LACP RX and TX notify threads are told
* to exit and waited for, every port is stopped (if the group is
* started), detached and deleted, the MAC is unregistered and the
* reference on the group is released; 0 is returned.
*/
1669 1705 int
1670 1706 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1671 1707 {
1672 1708 aggr_grp_t *grp = NULL;
1673 1709 aggr_port_t *port, *cport;
1674 1710 datalink_id_t tmpid;
1675 1711 mod_hash_val_t val;
1676 1712 mac_perim_handle_t mph, pmph;
1677 1713 int err;
1678 1714 kt_did_t tid = 0;
1679 1715
1680 1716 rw_enter(&aggr_grp_lock, RW_WRITER);
1681 1717
1682 1718 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1683 1719 (mod_hash_val_t *)&grp) != 0) {
1684 1720 rw_exit(&aggr_grp_lock);
1685 1721 return (ENOENT);
1686 1722 }
1687 1723
1688 1724 /*
1689 1725 * Note that dls_devnet_destroy() must be called before lg_lock is
1690 1726 * held. Otherwise, it will deadlock if another thread is in
1691 1727 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1692 1728 * dls_devnet_destroy() needs to delete.
1693 1729 */
1694 1730 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1695 1731 rw_exit(&aggr_grp_lock);
1696 1732 return (err);
1697 1733 }
1698 1734 ASSERT(linkid == tmpid);
1699 1735
1700 1736 /*
1701 1737 * Unregister from the MAC service module. Since this can
1702 1738 * fail if a client hasn't closed the MAC port, we gracefully
1703 1739 * fail the operation.
1704 1740 */
1705 1741 if ((err = mac_disable(grp->lg_mh)) != 0) {
1706 1742 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1707 1743 rw_exit(&aggr_grp_lock);
1708 1744 return (err);
1709 1745 }
1710 1746 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1711 1747 ASSERT(grp == (aggr_grp_t *)val);
1712 1748
1713 1749 ASSERT(aggr_grp_cnt > 0);
1714 1750 aggr_grp_cnt--;
1715 1751 rw_exit(&aggr_grp_lock);
1716 1752
1717 1753 /*
1718 1754 * Inform the lacp_rx thread to exit.
1719 1755 */
1720 1756 mutex_enter(&grp->lg_lacp_lock);
1721 1757 grp->lg_lacp_done = B_TRUE;
1722 1758 cv_signal(&grp->lg_lacp_cv);
1723 1759 while (grp->lg_lacp_rx_thread != NULL)
1724 1760 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1725 1761 mutex_exit(&grp->lg_lacp_lock);
1726 1762 /*
1727 1763 * Inform the tx_notify_thread to exit.
1728 1764 */
1729 1765 mutex_enter(&grp->lg_tx_flowctl_lock);
1730 1766 if (grp->lg_tx_notify_thread != NULL) {
1731 1767 tid = grp->lg_tx_notify_thread->t_did;
1732 1768 grp->lg_tx_notify_done = B_TRUE;
1733 1769 cv_signal(&grp->lg_tx_flowctl_cv);
1734 1770 }
1735 1771 mutex_exit(&grp->lg_tx_flowctl_lock);
1736 1772 if (tid != 0)
1737 1773 thread_join(tid);
1738 1774
1739 1775 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1740 1776
1741 1777 grp->lg_closing = B_TRUE;
1742 1778 /* detach and free MAC ports associated with group */
1743 1779 port = grp->lg_ports;
1744 1780 while (port != NULL) {
/* save the successor before the port is deleted */
1745 1781 cport = port->lp_next;
1746 1782 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1747 1783 if (grp->lg_started)
1748 1784 aggr_port_stop(port);
1749 1785 (void) aggr_grp_detach_port(grp, port);
1750 1786 mac_perim_exit(pmph);
1751 1787 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1752 1788 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1753 1789 aggr_port_delete(port);
1754 1790 port = cport;
1755 1791 }
1756 1792
1757 1793 mac_perim_exit(mph);
1758 1794
1759 1795 kmem_free(grp->lg_tx_blocked_rings,
1760 1796 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1761 1797 /*
1762 1798 * Wait for the port's lacp timer thread and its notification callback
1763 1799 * to exit before calling mac_unregister() since both need to access
1764 1800 * the mac perimeter of the grp.
1765 1801 */
1766 1802 aggr_grp_port_wait(grp);
1767 1803
1768 1804 VERIFY(mac_unregister(grp->lg_mh) == 0);
1769 1805 grp->lg_mh = NULL;
1770 1806
1771 1807 AGGR_GRP_REFRELE(grp);
1772 1808 return (0);
1773 1809 }
1774 1810
/*
* Release a group structure back to aggr_grp_cache. The group must have
* no remaining references (lg_refs and lg_port_ref are asserted to be
* 0). A key greater than AGGR_MAX_KEY is returned to key_ids first;
* presumably those are the keys aggr_grp_create() allocated on the
* caller's behalf -- confirm against the key_ids id space setup.
*/
1775 1811 void
1776 1812 aggr_grp_free(aggr_grp_t *grp)
1777 1813 {
1778 1814 ASSERT(grp->lg_refs == 0);
1779 1815 ASSERT(grp->lg_port_ref == 0);
1780 1816 if (grp->lg_key > AGGR_MAX_KEY) {
1781 1817 id_free(key_ids, grp->lg_key);
1782 1818 grp->lg_key = 0;
1783 1819 }
1784 1820 kmem_cache_free(aggr_grp_cache, grp);
1785 1821 }
1786 1822
/*
* Report the configuration of the group registered under linkid through
* caller-supplied callbacks.
*
* new_grp_fn is called once with the group's properties (a key greater
* than AGGR_MAX_KEY is reported as 0), then new_port_fn is called once
* per port, each under that port's MAC perimeter. fn_arg is passed
* through to both callbacks. The walk stops at the first callback that
* returns non-zero and that value is returned.
*
* Returns ENOENT if the link is not visible from the zone of cred or if
* no group is registered under linkid; 0 when every callback returned
* 0.
*/
1787 1823 int
1788 1824 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1789 1825 aggr_grp_info_new_grp_fn_t new_grp_fn,
1790 1826 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1791 1827 {
1792 1828 aggr_grp_t *grp;
1793 1829 aggr_port_t *port;
1794 1830 mac_perim_handle_t mph, pmph;
1795 1831 int rc = 0;
1796 1832
1797 1833 /*
1798 1834 * Make sure that the aggregation link is visible from the caller's
1799 1835 * zone.
1800 1836 */
1801 1837 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
1802 1838 return (ENOENT);
1803 1839
1804 1840 rw_enter(&aggr_grp_lock, RW_READER);
1805 1841
1806 1842 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1807 1843 (mod_hash_val_t *)&grp) != 0) {
1808 1844 rw_exit(&aggr_grp_lock);
1809 1845 return (ENOENT);
1810 1846 }
1811 1847 AGGR_GRP_REFHOLD(grp);
1812 1848
1813 1849 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1814 1850 rw_exit(&aggr_grp_lock);
1815 1851
1816 1852 rc = new_grp_fn(fn_arg, grp->lg_linkid,
1817 1853 (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
1818 1854 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
1819 1855 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1820 1856
1821 1857 if (rc != 0)
1822 1858 goto bail;
1823 1859
1824 1860 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1825 1861 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1826 1862 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
1827 1863 port->lp_state, &port->lp_lacp.ActorOperPortState);
1828 1864 mac_perim_exit(pmph);
1829 1865
1830 1866 if (rc != 0)
1831 1867 goto bail;
1832 1868 }
1833 1869
1834 1870 bail:
1835 1871 mac_perim_exit(mph);
1836 1872 AGGR_GRP_REFRELE(grp);
1837 1873 return (rc);
1838 1874 }
1839 1875
/*
 * ioctl entry point.  No ioctl is handled here: every message is
 * negatively acknowledged with ENOTSUP.
 */
/*ARGSUSED*/
static void
aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
	miocnak(q, mp, 0, ENOTSUP);
}
1846 1882
1847 1883 static int
1848 1884 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
1849 1885 {
1850 1886 aggr_port_t *port;
1851 1887 uint_t stat_index;
1852 1888
1853 1889 /* We only aggregate counter statistics. */
1854 1890 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
1855 1891 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
1856 1892 return (ENOTSUP);
1857 1893 }
1858 1894
1859 1895 /*
1860 1896 * Counter statistics for a group are computed by aggregating the
1861 1897 * counters of the members MACs while they were aggregated, plus
1862 1898 * the residual counter of the group itself, which is updated each
1863 1899 * time a MAC is removed from the group.
1864 1900 */
1865 1901 *val = 0;
1866 1902 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1867 1903 /* actual port statistic */
1868 1904 *val += aggr_port_stat(port, stat);
1869 1905 /*
1870 1906 * minus the port stat when it was added, plus any residual
1871 1907 * amount for the group.
1872 1908 */
1873 1909 if (IS_MAC_STAT(stat)) {
1874 1910 stat_index = stat - MAC_STAT_MIN;
1875 1911 *val -= port->lp_stat[stat_index];
1876 1912 *val += grp->lg_stat[stat_index];
1877 1913 } else if (IS_MACTYPE_STAT(stat)) {
1878 1914 stat_index = stat - MACTYPE_STAT_MIN;
1879 1915 *val -= port->lp_ether_stat[stat_index];
1880 1916 *val += grp->lg_ether_stat[stat_index];
1881 1917 }
1882 1918 }
1883 1919 return (0);
1884 1920 }
1885 1921
1886 1922 int
1887 1923 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1888 1924 {
1889 1925 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
1890 1926
1891 1927 if (rx_ring->arr_hw_rh != NULL) {
1892 1928 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
1893 1929 } else {
1894 1930 aggr_port_t *port = rx_ring->arr_port;
1895 1931
1896 1932 *val = mac_stat_get(port->lp_mh, stat);
1897 1933
1898 1934 }
1899 1935 return (0);
1900 1936 }
1901 1937
1902 1938 int
1903 1939 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1904 1940 {
1905 1941 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
1906 1942
1907 1943 if (tx_ring->atr_hw_rh != NULL) {
1908 1944 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
1909 1945 } else {
1910 1946 aggr_port_t *port = tx_ring->atr_port;
1911 1947
1912 1948 *val = mac_stat_get(port->lp_mh, stat);
1913 1949 }
1914 1950 return (0);
1915 1951 }
1916 1952
1917 1953 static int
1918 1954 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
1919 1955 {
1920 1956 aggr_grp_t *grp = arg;
1921 1957 mac_perim_handle_t mph;
1922 1958 int rval = 0;
1923 1959
1924 1960 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1925 1961
1926 1962 switch (stat) {
1927 1963 case MAC_STAT_IFSPEED:
1928 1964 *val = grp->lg_ifspeed;
1929 1965 break;
1930 1966
1931 1967 case ETHER_STAT_LINK_DUPLEX:
1932 1968 *val = grp->lg_link_duplex;
1933 1969 break;
1934 1970
1935 1971 default:
1936 1972 /*
1937 1973 * For all other statistics, we return the aggregated stat
1938 1974 * from the underlying ports. aggr_grp_stat() will set
1939 1975 * rval appropriately if the statistic isn't a counter.
1940 1976 */
1941 1977 rval = aggr_grp_stat(grp, stat, val);
1942 1978 }
1943 1979
1944 1980 mac_perim_exit(mph);
1945 1981 return (rval);
1946 1982 }
1947 1983
1948 1984 static int
1949 1985 aggr_m_start(void *arg)
1950 1986 {
1951 1987 aggr_grp_t *grp = arg;
1952 1988 aggr_port_t *port;
1953 1989 mac_perim_handle_t mph, pmph;
1954 1990
1955 1991 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1956 1992
1957 1993 /*
1958 1994 * Attempts to start all configured members of the group.
1959 1995 * Group members will be attached when their link-up notification
1960 1996 * is received.
1961 1997 */
1962 1998 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1963 1999 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1964 2000 if (aggr_port_start(port) != 0) {
1965 2001 mac_perim_exit(pmph);
1966 2002 continue;
1967 2003 }
1968 2004
1969 2005 /*
1970 2006 * Turn on the promiscuous mode if it is required to receive
1971 2007 * the non-primary address over a port, or the promiscous
1972 2008 * mode is enabled over the aggr.
1973 2009 */
1974 2010 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1975 2011 if (aggr_port_promisc(port, B_TRUE) != 0)
1976 2012 aggr_port_stop(port);
1977 2013 }
1978 2014 mac_perim_exit(pmph);
1979 2015 }
1980 2016
1981 2017 grp->lg_started = B_TRUE;
1982 2018
1983 2019 mac_perim_exit(mph);
1984 2020 return (0);
1985 2021 }
1986 2022
1987 2023 static void
1988 2024 aggr_m_stop(void *arg)
1989 2025 {
1990 2026 aggr_grp_t *grp = arg;
1991 2027 aggr_port_t *port;
1992 2028 mac_perim_handle_t mph, pmph;
1993 2029
1994 2030 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1995 2031
1996 2032 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1997 2033 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1998 2034
1999 2035 /* reset port promiscuous mode */
2000 2036 (void) aggr_port_promisc(port, B_FALSE);
2001 2037
2002 2038 aggr_port_stop(port);
2003 2039 mac_perim_exit(pmph);
2004 2040 }
2005 2041
2006 2042 grp->lg_started = B_FALSE;
2007 2043 mac_perim_exit(mph);
2008 2044 }
2009 2045
2010 2046 static int
2011 2047 aggr_m_promisc(void *arg, boolean_t on)
2012 2048 {
2013 2049 aggr_grp_t *grp = arg;
2014 2050 aggr_port_t *port;
2015 2051 boolean_t link_state_changed = B_FALSE;
2016 2052 mac_perim_handle_t mph, pmph;
2017 2053
2018 2054 AGGR_GRP_REFHOLD(grp);
2019 2055 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2020 2056
2021 2057 ASSERT(!grp->lg_closing);
2022 2058
2023 2059 if (on == grp->lg_promisc)
2024 2060 goto bail;
2025 2061
2026 2062 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2027 2063 int err = 0;
2028 2064
2029 2065 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2030 2066 AGGR_PORT_REFHOLD(port);
2031 2067 if (!on && (port->lp_prom_addr == NULL))
2032 2068 err = aggr_port_promisc(port, B_FALSE);
2033 2069 else if (on && port->lp_started)
2034 2070 err = aggr_port_promisc(port, B_TRUE);
2035 2071
2036 2072 if (err != 0) {
2037 2073 if (aggr_grp_detach_port(grp, port))
2038 2074 link_state_changed = B_TRUE;
2039 2075 } else {
2040 2076 /*
2041 2077 * If a port was detached because of a previous
2042 2078 * failure changing the promiscuity, the port
2043 2079 * is reattached when it successfully changes
2044 2080 * the promiscuity now, and this might cause
2045 2081 * the link state of the aggregation to change.
2046 2082 */
2047 2083 if (aggr_grp_attach_port(grp, port))
2048 2084 link_state_changed = B_TRUE;
2049 2085 }
2050 2086 mac_perim_exit(pmph);
2051 2087 AGGR_PORT_REFRELE(port);
2052 2088 }
2053 2089
2054 2090 grp->lg_promisc = on;
2055 2091
2056 2092 if (link_state_changed)
2057 2093 mac_link_update(grp->lg_mh, grp->lg_link_state);
2058 2094
2059 2095 bail:
2060 2096 mac_perim_exit(mph);
2061 2097 AGGR_GRP_REFRELE(grp);
2062 2098
2063 2099 return (0);
2064 2100 }
2065 2101
2066 2102 static void
2067 2103 aggr_grp_port_rename(const char *new_name, void *arg)
2068 2104 {
2069 2105 /*
2070 2106 * aggr port's mac client name is the format of "aggr link name" plus
2071 2107 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2072 2108 */
2073 2109 int aggr_len, link_len, clnt_name_len, i;
2074 2110 char *str_end, *str_st, *str_del;
2075 2111 char aggr_name[MAXNAMELEN];
2076 2112 char link_name[MAXNAMELEN];
2077 2113 char *clnt_name;
2078 2114 aggr_grp_t *aggr_grp = arg;
2079 2115 aggr_port_t *aggr_port = aggr_grp->lg_ports;
2080 2116
2081 2117 for (i = 0; i < aggr_grp->lg_nports; i++) {
2082 2118 clnt_name = mac_client_name(aggr_port->lp_mch);
2083 2119 clnt_name_len = strlen(clnt_name);
2084 2120 str_st = clnt_name;
2085 2121 str_end = &(clnt_name[clnt_name_len]);
2086 2122 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2087 2123 ASSERT(str_del != NULL);
2088 2124 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2089 2125 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2090 2126 bzero(aggr_name, MAXNAMELEN);
2091 2127 bzero(link_name, MAXNAMELEN);
2092 2128 bcopy(clnt_name, aggr_name, aggr_len);
2093 2129 bcopy(str_del, link_name, link_len + 1);
2094 2130 bzero(clnt_name, MAXNAMELEN);
2095 2131 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2096 2132 link_name);
2097 2133
2098 2134 (void) mac_rename_primary(aggr_port->lp_mh, NULL);
2099 2135 aggr_port = aggr_port->lp_next;
2100 2136 }
2101 2137 }
2102 2138
2103 2139 /*
2104 2140 * Initialize the capabilities that are advertised for the group
2105 2141 * according to the capabilities of the constituent ports.
2106 2142 */
2107 2143 static boolean_t
2108 2144 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2109 2145 {
2110 2146 aggr_grp_t *grp = arg;
2111 2147
2112 2148 switch (cap) {
2113 2149 case MAC_CAPAB_HCKSUM: {
2114 2150 uint32_t *hcksum_txflags = cap_data;
2115 2151 *hcksum_txflags = grp->lg_hcksum_txflags;
2116 2152 break;
2117 2153 }
2118 2154 case MAC_CAPAB_LSO: {
2119 2155 mac_capab_lso_t *cap_lso = cap_data;
2120 2156
2121 2157 if (grp->lg_lso) {
2122 2158 *cap_lso = grp->lg_cap_lso;
2123 2159 break;
2124 2160 } else {
2125 2161 return (B_FALSE);
2126 2162 }
2127 2163 }
2128 2164 case MAC_CAPAB_NO_NATIVEVLAN:
2129 2165 return (!grp->lg_vlan);
2130 2166 case MAC_CAPAB_NO_ZCOPY:
2131 2167 return (!grp->lg_zcopy);
2132 2168 case MAC_CAPAB_RINGS: {
2133 2169 mac_capab_rings_t *cap_rings = cap_data;
2134 2170
2135 2171 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2136 2172 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2137 2173 cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2138 2174
2139 2175 /*
2140 2176 * An aggregation advertises only one (pseudo) RX
2141 2177 * group, which virtualizes the main/primary group of
2142 2178 * the underlying devices.
2143 2179 */
2144 2180 cap_rings->mr_gnum = 1;
2145 2181 cap_rings->mr_gaddring = NULL;
2146 2182 cap_rings->mr_gremring = NULL;
2147 2183 } else {
2148 2184 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2149 2185 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2150 2186 cap_rings->mr_gnum = 0;
2151 2187 }
2152 2188 cap_rings->mr_rget = aggr_fill_ring;
2153 2189 cap_rings->mr_gget = aggr_fill_group;
2154 2190 break;
2155 2191 }
2156 2192 case MAC_CAPAB_AGGR:
2157 2193 {
2158 2194 mac_capab_aggr_t *aggr_cap;
2159 2195
2160 2196 if (cap_data != NULL) {
2161 2197 aggr_cap = cap_data;
2162 2198 aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2163 2199 aggr_cap->mca_unicst = aggr_m_unicst;
2164 2200 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2165 2201 aggr_cap->mca_arg = arg;
2166 2202 }
2167 2203 return (B_TRUE);
2168 2204 }
2169 2205 default:
2170 2206 return (B_FALSE);
2171 2207 }
2172 2208 return (B_TRUE);
2173 2209 }
2174 2210
2175 2211 /*
2176 2212 * Callback funtion for MAC layer to register groups.
2177 2213 */
2178 2214 static void
2179 2215 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2180 2216 mac_group_info_t *infop, mac_group_handle_t gh)
2181 2217 {
2182 2218 aggr_grp_t *grp = arg;
2183 2219 aggr_pseudo_rx_group_t *rx_group;
2184 2220 aggr_pseudo_tx_group_t *tx_group;
2185 2221
2186 2222 ASSERT(index == 0);
2187 2223 if (rtype == MAC_RING_TYPE_RX) {
2188 2224 rx_group = &grp->lg_rx_group;
2189 2225 rx_group->arg_gh = gh;
2190 2226 rx_group->arg_grp = grp;
2191 2227
2192 2228 infop->mgi_driver = (mac_group_driver_t)rx_group;
2193 2229 infop->mgi_start = NULL;
2194 2230 infop->mgi_stop = NULL;
2195 2231 infop->mgi_addmac = aggr_addmac;
2196 2232 infop->mgi_remmac = aggr_remmac;
2197 2233 infop->mgi_count = rx_group->arg_ring_cnt;
2198 2234 } else {
2199 2235 tx_group = &grp->lg_tx_group;
2200 2236 tx_group->atg_gh = gh;
2201 2237 }
2202 2238 }
2203 2239
2204 2240 /*
2205 2241 * Callback funtion for MAC layer to register all rings.
2206 2242 */
2207 2243 static void
2208 2244 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2209 2245 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2210 2246 {
2211 2247 aggr_grp_t *grp = arg;
2212 2248
2213 2249 switch (rtype) {
2214 2250 case MAC_RING_TYPE_RX: {
2215 2251 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group;
2216 2252 aggr_pseudo_rx_ring_t *rx_ring;
2217 2253 mac_intr_t aggr_mac_intr;
2218 2254
2219 2255 ASSERT(rg_index == 0);
2220 2256
2221 2257 ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2222 2258 rx_ring = rx_group->arg_rings + index;
2223 2259 rx_ring->arr_rh = rh;
2224 2260
2225 2261 /*
2226 2262 * Entrypoint to enable interrupt (disable poll) and
2227 2263 * disable interrupt (enable poll).
2228 2264 */
2229 2265 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2230 2266 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2231 2267 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2232 2268 aggr_mac_intr.mi_ddi_handle = NULL;
2233 2269
2234 2270 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2235 2271 infop->mri_start = aggr_pseudo_start_ring;
2236 2272 infop->mri_stop = aggr_pseudo_stop_ring;
2237 2273
2238 2274 infop->mri_intr = aggr_mac_intr;
2239 2275 infop->mri_poll = aggr_rx_poll;
2240 2276
2241 2277 infop->mri_stat = aggr_rx_ring_stat;
2242 2278 break;
2243 2279 }
2244 2280 case MAC_RING_TYPE_TX: {
2245 2281 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2246 2282 aggr_pseudo_tx_ring_t *tx_ring;
2247 2283
2248 2284 ASSERT(rg_index == -1);
2249 2285 ASSERT(index < tx_group->atg_ring_cnt);
2250 2286
2251 2287 tx_ring = &tx_group->atg_rings[index];
2252 2288 tx_ring->atr_rh = rh;
2253 2289
2254 2290 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2255 2291 infop->mri_start = NULL;
2256 2292 infop->mri_stop = NULL;
2257 2293 infop->mri_tx = aggr_ring_tx;
2258 2294 infop->mri_stat = aggr_tx_ring_stat;
2259 2295 /*
2260 2296 * Use the hw TX ring handle to find if the ring needs
2261 2297 * serialization or not. For NICs that do not expose
2262 2298 * Tx rings, atr_hw_rh will be NULL.
2263 2299 */
2264 2300 if (tx_ring->atr_hw_rh != NULL) {
2265 2301 infop->mri_flags =
2266 2302 mac_hwring_getinfo(tx_ring->atr_hw_rh);
2267 2303 }
2268 2304 break;
2269 2305 }
2270 2306 default:
2271 2307 break;
2272 2308 }
2273 2309 }
2274 2310
2275 2311 static mblk_t *
2276 2312 aggr_rx_poll(void *arg, int bytes_to_pickup)
2277 2313 {
2278 2314 aggr_pseudo_rx_ring_t *rr_ring = arg;
2279 2315 aggr_port_t *port = rr_ring->arr_port;
2280 2316 aggr_grp_t *grp = port->lp_grp;
2281 2317 mblk_t *mp_chain, *mp, **mpp;
2282 2318
2283 2319 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2284 2320
2285 2321 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2286 2322 return (mp_chain);
2287 2323
2288 2324 mpp = &mp_chain;
2289 2325 while ((mp = *mpp) != NULL) {
2290 2326 if (MBLKL(mp) >= sizeof (struct ether_header)) {
2291 2327 struct ether_header *ehp;
2292 2328
2293 2329 ehp = (struct ether_header *)mp->b_rptr;
2294 2330 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2295 2331 *mpp = mp->b_next;
2296 2332 mp->b_next = NULL;
2297 2333 aggr_recv_lacp(port,
2298 2334 (mac_resource_handle_t)rr_ring, mp);
2299 2335 continue;
2300 2336 }
2301 2337 }
2302 2338
2303 2339 if (!port->lp_collector_enabled) {
2304 2340 *mpp = mp->b_next;
2305 2341 mp->b_next = NULL;
2306 2342 freemsg(mp);
2307 2343 continue;
2308 2344 }
2309 2345 mpp = &mp->b_next;
2310 2346 }
2311 2347 return (mp_chain);
2312 2348 }
2313 2349
2314 2350 static int
2315 2351 aggr_addmac(void *arg, const uint8_t *mac_addr)
2316 2352 {
2317 2353 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2318 2354 aggr_unicst_addr_t *addr, **pprev;
2319 2355 aggr_grp_t *grp = rx_group->arg_grp;
2320 2356 aggr_port_t *port, *p;
2321 2357 mac_perim_handle_t mph;
2322 2358 int err = 0;
2323 2359
2324 2360 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2325 2361
2326 2362 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2327 2363 mac_perim_exit(mph);
2328 2364 return (0);
2329 2365 }
2330 2366
2331 2367 /*
2332 2368 * Insert this mac address into the list of mac addresses owned by
2333 2369 * the aggregation pseudo group.
2334 2370 */
2335 2371 pprev = &rx_group->arg_macaddr;
2336 2372 while ((addr = *pprev) != NULL) {
2337 2373 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2338 2374 mac_perim_exit(mph);
2339 2375 return (EEXIST);
2340 2376 }
2341 2377 pprev = &addr->aua_next;
2342 2378 }
2343 2379 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2344 2380 bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2345 2381 addr->aua_next = NULL;
2346 2382 *pprev = addr;
2347 2383
2348 2384 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2349 2385 if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2350 2386 break;
2351 2387
2352 2388 if (err != 0) {
2353 2389 for (p = grp->lg_ports; p != port; p = p->lp_next)
2354 2390 aggr_port_remmac(p, mac_addr);
2355 2391
2356 2392 *pprev = NULL;
2357 2393 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2358 2394 }
2359 2395
2360 2396 mac_perim_exit(mph);
2361 2397 return (err);
2362 2398 }
2363 2399
2364 2400 static int
2365 2401 aggr_remmac(void *arg, const uint8_t *mac_addr)
2366 2402 {
2367 2403 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2368 2404 aggr_unicst_addr_t *addr, **pprev;
2369 2405 aggr_grp_t *grp = rx_group->arg_grp;
2370 2406 aggr_port_t *port;
2371 2407 mac_perim_handle_t mph;
2372 2408 int err = 0;
2373 2409
2374 2410 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2375 2411
2376 2412 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2377 2413 mac_perim_exit(mph);
2378 2414 return (0);
2379 2415 }
2380 2416
2381 2417 /*
2382 2418 * Insert this mac address into the list of mac addresses owned by
2383 2419 * the aggregation pseudo group.
2384 2420 */
2385 2421 pprev = &rx_group->arg_macaddr;
2386 2422 while ((addr = *pprev) != NULL) {
2387 2423 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2388 2424 pprev = &addr->aua_next;
2389 2425 continue;
2390 2426 }
2391 2427 break;
2392 2428 }
2393 2429 if (addr == NULL) {
2394 2430 mac_perim_exit(mph);
2395 2431 return (EINVAL);
2396 2432 }
2397 2433
2398 2434 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2399 2435 aggr_port_remmac(port, mac_addr);
2400 2436
2401 2437 *pprev = addr->aua_next;
2402 2438 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2403 2439
2404 2440 mac_perim_exit(mph);
2405 2441 return (err);
2406 2442 }
2407 2443
2408 2444 /*
2409 2445 * Add or remove the multicast addresses that are defined for the group
2410 2446 * to or from the specified port.
2411 2447 *
2412 2448 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2413 2449 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2414 2450 * called when the port is either stopped or detached.
2415 2451 */
2416 2452 void
2417 2453 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2418 2454 {
2419 2455 aggr_grp_t *grp = port->lp_grp;
2420 2456
2421 2457 ASSERT(MAC_PERIM_HELD(port->lp_mh));
2422 2458 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2423 2459
2424 2460 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2425 2461 return;
2426 2462
2427 2463 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2428 2464 }
2429 2465
2430 2466 static int
2431 2467 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2432 2468 {
2433 2469 aggr_grp_t *grp = arg;
2434 2470 aggr_port_t *port = NULL, *errport = NULL;
2435 2471 mac_perim_handle_t mph;
2436 2472 int err = 0;
2437 2473
2438 2474 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2439 2475 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2440 2476 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2441 2477 !port->lp_started) {
2442 2478 continue;
2443 2479 }
2444 2480 err = aggr_port_multicst(port, add, addrp);
2445 2481 if (err != 0) {
2446 2482 errport = port;
2447 2483 break;
2448 2484 }
2449 2485 }
2450 2486
2451 2487 /*
2452 2488 * At least one port caused error return and this error is returned to
2453 2489 * mac, eventually a NAK would be sent upwards.
2454 2490 * Some ports have this multicast address listed now, and some don't.
2455 2491 * Treat this error as a whole aggr failure not individual port failure.
2456 2492 * Therefore remove this multicast address from other ports.
2457 2493 */
2458 2494 if ((err != 0) && add) {
2459 2495 for (port = grp->lg_ports; port != errport;
2460 2496 port = port->lp_next) {
2461 2497 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2462 2498 !port->lp_started) {
2463 2499 continue;
2464 2500 }
2465 2501 (void) aggr_port_multicst(port, B_FALSE, addrp);
2466 2502 }
2467 2503 }
2468 2504 mac_perim_exit(mph);
2469 2505 return (err);
2470 2506 }
2471 2507
2472 2508 static int
2473 2509 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2474 2510 {
2475 2511 aggr_grp_t *grp = arg;
2476 2512 mac_perim_handle_t mph;
2477 2513 int err;
2478 2514
2479 2515 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2480 2516 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2481 2517 0, 0);
2482 2518 mac_perim_exit(mph);
2483 2519 return (err);
2484 2520 }
2485 2521
2486 2522 /*
2487 2523 * Initialize the capabilities that are advertised for the group
2488 2524 * according to the capabilities of the constituent ports.
2489 2525 */
2490 2526 static void
2491 2527 aggr_grp_capab_set(aggr_grp_t *grp)
2492 2528 {
2493 2529 uint32_t cksum;
2494 2530 aggr_port_t *port;
2495 2531 mac_capab_lso_t cap_lso;
2496 2532
2497 2533 ASSERT(grp->lg_mh == NULL);
2498 2534 ASSERT(grp->lg_ports != NULL);
2499 2535
2500 2536 grp->lg_hcksum_txflags = (uint32_t)-1;
2501 2537 grp->lg_zcopy = B_TRUE;
2502 2538 grp->lg_vlan = B_TRUE;
2503 2539
2504 2540 grp->lg_lso = B_TRUE;
2505 2541 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2506 2542 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2507 2543
2508 2544 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2509 2545 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2510 2546 cksum = 0;
2511 2547 grp->lg_hcksum_txflags &= cksum;
2512 2548
2513 2549 grp->lg_vlan &=
2514 2550 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2515 2551
2516 2552 grp->lg_zcopy &=
2517 2553 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2518 2554
2519 2555 grp->lg_lso &=
2520 2556 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2521 2557 if (grp->lg_lso) {
2522 2558 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2523 2559 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2524 2560 cap_lso.lso_basic_tcp_ipv4.lso_max)
2525 2561 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2526 2562 cap_lso.lso_basic_tcp_ipv4.lso_max;
2527 2563 }
2528 2564 }
2529 2565 }
2530 2566
2531 2567 /*
2532 2568 * Checks whether the capabilities of the port being added are compatible
2533 2569 * with the current capabilities of the aggregation.
2534 2570 */
2535 2571 static boolean_t
2536 2572 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2537 2573 {
2538 2574 uint32_t hcksum_txflags;
2539 2575
2540 2576 ASSERT(grp->lg_ports != NULL);
2541 2577
2542 2578 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2543 2579 grp->lg_vlan) != grp->lg_vlan) {
2544 2580 return (B_FALSE);
2545 2581 }
2546 2582
2547 2583 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2548 2584 grp->lg_zcopy) != grp->lg_zcopy) {
2549 2585 return (B_FALSE);
2550 2586 }
2551 2587
2552 2588 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2553 2589 if (grp->lg_hcksum_txflags != 0)
2554 2590 return (B_FALSE);
2555 2591 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2556 2592 grp->lg_hcksum_txflags) {
2557 2593 return (B_FALSE);
2558 2594 }
2559 2595
2560 2596 if (grp->lg_lso) {
2561 2597 mac_capab_lso_t cap_lso;
2562 2598
2563 2599 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2564 2600 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2565 2601 grp->lg_cap_lso.lso_flags)
2566 2602 return (B_FALSE);
2567 2603 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2568 2604 cap_lso.lso_basic_tcp_ipv4.lso_max)
2569 2605 return (B_FALSE);
2570 2606 } else {
2571 2607 return (B_FALSE);
2572 2608 }
2573 2609 }
2574 2610
2575 2611 return (B_TRUE);
2576 2612 }
2577 2613
2578 2614 /*
2579 2615 * Returns the maximum SDU according to the SDU of the constituent ports.
2580 2616 */
2581 2617 static uint_t
2582 2618 aggr_grp_max_sdu(aggr_grp_t *grp)
2583 2619 {
2584 2620 uint_t max_sdu = (uint_t)-1;
2585 2621 aggr_port_t *port;
2586 2622
2587 2623 ASSERT(grp->lg_ports != NULL);
2588 2624
2589 2625 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2590 2626 uint_t port_sdu_max;
2591 2627
2592 2628 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2593 2629 if (max_sdu > port_sdu_max)
2594 2630 max_sdu = port_sdu_max;
2595 2631 }
2596 2632
2597 2633 return (max_sdu);
2598 2634 }
2599 2635
2600 2636 /*
2601 2637 * Checks if the maximum SDU of the specified port is compatible
2602 2638 * with the maximum SDU of the specified aggregation group, returns
2603 2639 * B_TRUE if it is, B_FALSE otherwise.
2604 2640 */
2605 2641 static boolean_t
2606 2642 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2607 2643 {
2608 2644 uint_t port_sdu_max;
2609 2645
2610 2646 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2611 2647 return (port_sdu_max >= grp->lg_max_sdu);
2612 2648 }
2613 2649
2614 2650 /*
2615 2651 * Returns the maximum margin according to the margin of the constituent ports.
2616 2652 */
2617 2653 static uint32_t
2618 2654 aggr_grp_max_margin(aggr_grp_t *grp)
2619 2655 {
2620 2656 uint32_t margin = UINT32_MAX;
2621 2657 aggr_port_t *port;
2622 2658
2623 2659 ASSERT(grp->lg_mh == NULL);
2624 2660 ASSERT(grp->lg_ports != NULL);
2625 2661
2626 2662 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2627 2663 if (margin > port->lp_margin)
2628 2664 margin = port->lp_margin;
2629 2665 }
2630 2666
2631 2667 grp->lg_margin = margin;
2632 2668 return (margin);
2633 2669 }
2634 2670
2635 2671 /*
2636 2672 * Checks if the maximum margin of the specified port is compatible
2637 2673 * with the maximum margin of the specified aggregation group, returns
2638 2674 * B_TRUE if it is, B_FALSE otherwise.
2639 2675 */
2640 2676 static boolean_t
2641 2677 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
2642 2678 {
2643 2679 if (port->lp_margin >= grp->lg_margin)
2644 2680 return (B_TRUE);
2645 2681
2646 2682 /*
2647 2683 * See whether the current margin value is allowed to be changed to
2648 2684 * the new value.
2649 2685 */
2650 2686 if (!mac_margin_update(grp->lg_mh, port->lp_margin))
2651 2687 return (B_FALSE);
2652 2688
2653 2689 grp->lg_margin = port->lp_margin;
2654 2690 return (B_TRUE);
2655 2691 }
2656 2692
2657 2693 /*
2658 2694 * Set MTU on individual ports of an aggregation group
2659 2695 */
2660 2696 static int
2661 2697 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
2662 2698 uint32_t *old_mtu)
2663 2699 {
2664 2700 boolean_t removed = B_FALSE;
2665 2701 mac_perim_handle_t mph;
2666 2702 mac_diag_t diag;
2667 2703 int err, rv, retry = 0;
2668 2704
2669 2705 if (port->lp_mah != NULL) {
2670 2706 (void) mac_unicast_remove(port->lp_mch, port->lp_mah);
2671 2707 port->lp_mah = NULL;
2672 2708 removed = B_TRUE;
2673 2709 }
2674 2710 err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
2675 2711 try_again:
2676 2712 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
2677 2713 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
2678 2714 &port->lp_mah, 0, &diag)) != 0) {
2679 2715 /*
2680 2716 * following is a workaround for a bug in 'bge' driver.
2681 2717 * See CR 6794654 for more information and this work around
2682 2718 * will be removed once the CR is fixed.
2683 2719 */
2684 2720 if (rv == EIO && retry++ < 3) {
2685 2721 delay(2 * hz);
2686 2722 goto try_again;
2687 2723 }
2688 2724 /*
2689 2725 * if mac_unicast_add() failed while setting the MTU,
2690 2726 * detach the port from the group.
2691 2727 */
2692 2728 mac_perim_enter_by_mh(port->lp_mh, &mph);
2693 2729 (void) aggr_grp_detach_port(grp, port);
2694 2730 mac_perim_exit(mph);
2695 2731 cmn_err(CE_WARN, "Unable to restart the port %s while "
2696 2732 "setting MTU. Detaching the port from the aggregation.",
2697 2733 mac_client_name(port->lp_mch));
2698 2734 }
2699 2735 return (err);
2700 2736 }
2701 2737
2702 2738 static int
2703 2739 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
2704 2740 {
2705 2741 int err = 0, i, rv;
2706 2742 aggr_port_t *port;
2707 2743 uint32_t *mtu;
2708 2744
2709 2745 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2710 2746
2711 2747 /*
2712 2748 * If the MTU being set is equal to aggr group's maximum
2713 2749 * allowable value, then there is nothing to change
2714 2750 */
2715 2751 if (sdu == grp->lg_max_sdu)
2716 2752 return (0);
2717 2753
2718 2754 /* 0 is aggr group's min sdu */
2719 2755 if (sdu == 0)
2720 2756 return (EINVAL);
2721 2757
2722 2758 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
2723 2759 for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
2724 2760 port = port->lp_next, i++) {
2725 2761 err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
2726 2762 }
2727 2763 if (err != 0) {
2728 2764 /* recover from error: reset the mtus of the ports */
2729 2765 aggr_port_t *tmp;
2730 2766
2731 2767 for (tmp = grp->lg_ports, i = 0; tmp != port;
2732 2768 tmp = tmp->lp_next, i++) {
2733 2769 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
2734 2770 }
2735 2771 goto bail;
2736 2772 }
2737 2773 grp->lg_max_sdu = aggr_grp_max_sdu(grp);
2738 2774 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
2739 2775 ASSERT(rv == 0);
2740 2776 bail:
2741 2777 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
2742 2778 return (err);
2743 2779 }
2744 2780
2745 2781 /*
2746 2782 * Callback functions for set/get of properties
2747 2783 */
2748 2784 /*ARGSUSED*/
2749 2785 static int
2750 2786 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2751 2787 uint_t pr_valsize, const void *pr_val)
2752 2788 {
2753 2789 int err = ENOTSUP;
2754 2790 aggr_grp_t *grp = m_driver;
2755 2791
2756 2792 switch (pr_num) {
2757 2793 case MAC_PROP_MTU: {
2758 2794 uint32_t mtu;
2759 2795
2760 2796 if (pr_valsize < sizeof (mtu)) {
2761 2797 err = EINVAL;
2762 2798 break;
2763 2799 }
2764 2800 bcopy(pr_val, &mtu, sizeof (mtu));
2765 2801 err = aggr_sdu_update(grp, mtu);
2766 2802 break;
2767 2803 }
2768 2804 default:
2769 2805 break;
2770 2806 }
2771 2807 return (err);
2772 2808 }
2773 2809
/*
 * One boundary of an MTU range, as used by the range-intersection code
 * that follows: a boundary value together with its type.
 */
typedef struct rboundary {
	uint32_t	bval;	/* boundary value */
	int		btype;	/* boundary type: +1 for min, -1 for max */
} rboundary_t;
2778 2814
2779 2815 /*
2780 2816 * This function finds the intersection of mtu ranges stored in arrays -
2781 2817 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
2782 2818 * Individual arrays are assumed to contain non-overlapping ranges.
2783 2819 * Algorithm:
2784 2820 * A range has two boundaries - min and max. We scan all arrays and store
2785 2821 * each boundary as a separate element in a temporary array. We also store
2786 2822 * the boundary types, min or max, as +1 or -1 respectively in the temporary
2787 2823 * array. Then we sort the temporary array in ascending order. We scan the
2788 2824 * sorted array from lower to higher values and keep a cumulative sum of
2789 2825 * boundary types. Element in the temporary array for which the sum reaches
2790 2826 * mcount is a min boundary of a range in the result and next element will be
2791 2827 * max boundary.
2792 2828 *
2793 2829 * Example for mcount = 3,
2794 2830 *
2795 2831 * ----|_________|-------|_______|----|__|------ mrange[0]
2796 2832 *
2797 2833 * -------|________|--|____________|-----|___|-- mrange[1]
2798 2834 *
2799 2835 * --------|________________|-------|____|------ mrange[2]
2800 2836 *
2801 2837 * 3 2 1
2802 2838 * \|/
2803 2839 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum
2804 2840 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
2805 2841 *
2806 2842 * same min and max
2807 2843 * V
2808 2844 * --------|_____|-------|__|------------|------ intersecting ranges
2809 2845 */
2810 2846 void
2811 2847 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
2812 2848 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
2813 2849 {
2814 2850 mac_propval_uint32_range_t *rval, *ur;
2815 2851 int rmaxcnt, rcount;
2816 2852 size_t sz_range32;
2817 2853 rboundary_t *ta; /* temporary array */
2818 2854 rboundary_t temp;
2819 2855 boolean_t range_started = B_FALSE;
2820 2856 int i, j, m, sum;
2821 2857
2822 2858 sz_range32 = sizeof (mac_propval_uint32_range_t);
2823 2859
2824 2860 for (i = 0, rmaxcnt = 0; i < mcount; i++)
2825 2861 rmaxcnt += mrange[i]->mpr_count;
2826 2862
2827 2863 /* Allocate enough space to store the results */
2828 2864 rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
2829 2865
2830 2866 /* Number of boundaries are twice as many as ranges */
2831 2867 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
2832 2868
2833 2869 for (i = 0, m = 0; i < mcount; i++) {
2834 2870 ur = &(mrange[i]->mpr_range_uint32[0]);
2835 2871 for (j = 0; j < mrange[i]->mpr_count; j++) {
2836 2872 ta[m].bval = ur[j].mpur_min;
2837 2873 ta[m++].btype = 1;
2838 2874 ta[m].bval = ur[j].mpur_max;
2839 2875 ta[m++].btype = -1;
2840 2876 }
2841 2877 }
2842 2878
2843 2879 /*
2844 2880 * Sort the temporary array in ascending order of bval;
2845 2881 * if boundary values are same then sort on btype.
2846 2882 */
2847 2883 for (i = 0; i < m-1; i++) {
2848 2884 for (j = i+1; j < m; j++) {
2849 2885 if ((ta[i].bval > ta[j].bval) ||
2850 2886 ((ta[i].bval == ta[j].bval) &&
2851 2887 (ta[i].btype < ta[j].btype))) {
2852 2888 temp = ta[i];
2853 2889 ta[i] = ta[j];
2854 2890 ta[j] = temp;
2855 2891 }
2856 2892 }
2857 2893 }
2858 2894
2859 2895 /* Walk through temporary array to find all ranges in the results */
2860 2896 for (i = 0, sum = 0, rcount = 0; i < m; i++) {
2861 2897 sum += ta[i].btype;
2862 2898 if (sum == mcount) {
2863 2899 rval[rcount].mpur_min = ta[i].bval;
2864 2900 range_started = B_TRUE;
2865 2901 } else if (sum < mcount && range_started) {
2866 2902 rval[rcount++].mpur_max = ta[i].bval;
2867 2903 range_started = B_FALSE;
2868 2904 }
2869 2905 }
2870 2906
2871 2907 *prval = rval;
2872 2908 *prmaxcnt = rmaxcnt;
2873 2909 *prcount = rcount;
2874 2910
2875 2911 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
2876 2912 }
2877 2913
2878 2914 /*
2879 2915 * Returns the mtu ranges which could be supported by aggr group.
2880 2916 * prmaxcnt returns the size of the buffer prval, prcount returns
2881 2917 * the number of valid entries in prval. Caller is responsible
2882 2918 * for freeing up prval.
2883 2919 */
2884 2920 int
2885 2921 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
2886 2922 int *prmaxcnt, int *prcount)
2887 2923 {
2888 2924 mac_propval_range_t **vals;
2889 2925 aggr_port_t *port;
2890 2926 mac_perim_handle_t mph;
2891 2927 uint_t i, numr;
2892 2928 int err = 0;
2893 2929 size_t sz_propval, sz_range32;
2894 2930 size_t size;
2895 2931
2896 2932 sz_propval = sizeof (mac_propval_range_t);
2897 2933 sz_range32 = sizeof (mac_propval_uint32_range_t);
2898 2934
2899 2935 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2900 2936
2901 2937 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
2902 2938 KM_SLEEP);
2903 2939
2904 2940 for (port = grp->lg_ports, i = 0; port != NULL;
2905 2941 port = port->lp_next, i++) {
2906 2942
2907 2943 size = sz_propval;
2908 2944 vals[i] = kmem_alloc(size, KM_SLEEP);
2909 2945 vals[i]->mpr_count = 1;
2910 2946
2911 2947 mac_perim_enter_by_mh(port->lp_mh, &mph);
2912 2948
2913 2949 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2914 2950 NULL, 0, vals[i], NULL);
2915 2951 if (err == ENOSPC) {
2916 2952 /*
2917 2953 * Not enough space to hold all ranges.
2918 2954 * Allocate extra space as indicated and retry.
2919 2955 */
2920 2956 numr = vals[i]->mpr_count;
2921 2957 kmem_free(vals[i], sz_propval);
2922 2958 size = sz_propval + (numr - 1) * sz_range32;
2923 2959 vals[i] = kmem_alloc(size, KM_SLEEP);
2924 2960 vals[i]->mpr_count = numr;
2925 2961 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2926 2962 NULL, 0, vals[i], NULL);
2927 2963 ASSERT(err != ENOSPC);
2928 2964 }
2929 2965 mac_perim_exit(mph);
2930 2966 if (err != 0) {
2931 2967 kmem_free(vals[i], size);
2932 2968 vals[i] = NULL;
2933 2969 break;
2934 2970 }
2935 2971 }
2936 2972
2937 2973 /*
2938 2974 * if any of the underlying ports does not support changing MTU then
2939 2975 * just return ENOTSUP
2940 2976 */
2941 2977 if (port != NULL) {
2942 2978 ASSERT(err != 0);
2943 2979 goto done;
2944 2980 }
2945 2981
2946 2982 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
2947 2983 prcount);
2948 2984
2949 2985 done:
2950 2986 for (i = 0; i < grp->lg_nports; i++) {
2951 2987 if (vals[i] != NULL) {
2952 2988 numr = vals[i]->mpr_count;
2953 2989 size = sz_propval + (numr - 1) * sz_range32;
2954 2990 kmem_free(vals[i], size);
2955 2991 }
2956 2992 }
2957 2993
2958 2994 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
2959 2995 return (err);
2960 2996 }
2961 2997
2962 2998 static void
2963 2999 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2964 3000 mac_prop_info_handle_t prh)
2965 3001 {
2966 3002 aggr_grp_t *grp = m_driver;
2967 3003 mac_propval_uint32_range_t *rval = NULL;
2968 3004 int i, rcount, rmaxcnt;
2969 3005 int err = 0;
2970 3006
2971 3007 _NOTE(ARGUNUSED(pr_name));
2972 3008
2973 3009 switch (pr_num) {
2974 3010 case MAC_PROP_MTU:
2975 3011
2976 3012 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
2977 3013 &rcount);
2978 3014 if (err != 0) {
2979 3015 ASSERT(rval == NULL);
2980 3016 return;
2981 3017 }
2982 3018 for (i = 0; i < rcount; i++) {
2983 3019 mac_prop_info_set_range_uint32(prh,
2984 3020 rval[i].mpur_min, rval[i].mpur_max);
2985 3021 }
2986 3022 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
2987 3023 break;
2988 3024 }
2989 3025 }
|
↓ open down ↓ |
2025 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX