Print this page
6274 MAC tries to use aggr rings from downed links
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/io/aggr/aggr_send.c
+++ new/usr/src/uts/common/io/aggr/aggr_send.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /*
27 27 * IEEE 802.3ad Link Aggregation - Send code.
28 28 *
29 29 * Implements the Distributor function.
30 30 */
31 31
32 32 #include <sys/conf.h>
33 33 #include <sys/modctl.h>
34 34 #include <sys/sunddi.h>
35 35 #include <sys/callb.h>
36 36 #include <sys/vlan.h>
37 37 #include <sys/strsun.h>
38 38 #include <sys/strsubr.h>
39 39 #include <sys/dlpi.h>
40 40
41 41 #include <inet/common.h>
42 42 #include <inet/led.h>
43 43 #include <inet/ip.h>
44 44 #include <inet/ip6.h>
45 45 #include <inet/tcp.h>
46 46 #include <netinet/udp.h>
47 47
48 48 #include <sys/aggr.h>
49 49 #include <sys/aggr_impl.h>
50 50
51 51 /*
52 52 * Update the TX load balancing policy of the specified group.
53 53 */
54 54 void
55 55 aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
56 56 {
57 57 uint8_t mac_policy = 0;
58 58
59 59 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
60 60
61 61 if ((policy & AGGR_POLICY_L2) != 0)
62 62 mac_policy |= MAC_PKT_HASH_L2;
63 63 if ((policy & AGGR_POLICY_L3) != 0)
64 64 mac_policy |= MAC_PKT_HASH_L3;
65 65 if ((policy & AGGR_POLICY_L4) != 0)
66 66 mac_policy |= MAC_PKT_HASH_L4;
67 67
68 68 grp->lg_tx_policy = policy;
69 69 grp->lg_mac_tx_policy = mac_policy;
70 70 }
71 71
/*
 * Fold a hint onto itself by XOR-ing it with copies of itself shifted
 * right by 8, 16 and 24 bits.  The argument is expanded four times, so it
 * must not have side effects.
 */
#define	HASH_HINT(hint)	\
	((hint) ^ ((hint) >> 8) ^ ((hint) >> 16) ^ ((hint) >> 24))
74 74
75 75 /*
76 76 * Function invoked by mac layer to find a specific TX ring on a port
77 77 * to send data.
78 78 */
79 79 mblk_t *
80 80 aggr_find_tx_ring(void *arg, mblk_t *mp, uintptr_t hint, mac_ring_handle_t *rh)
81 81 {
82 82 aggr_grp_t *grp = arg;
83 83 aggr_port_t *port;
84 84 uint64_t hash;
85 85
86 86 rw_enter(&grp->lg_tx_lock, RW_READER);
87 87 if (grp->lg_ntx_ports == 0) {
88 88 /*
89 89 * We could have returned from aggr_m_start() before
90 90 * the ports were actually attached. Drop the chain.
91 91 */
92 92 rw_exit(&grp->lg_tx_lock);
93 93 freemsgchain(mp);
94 94 return (NULL);
95 95 }
96 96 hash = mac_pkt_hash(DL_ETHER, mp, grp->lg_mac_tx_policy, B_TRUE);
97 97 port = grp->lg_tx_ports[hash % grp->lg_ntx_ports];
98 98
99 99 /*
100 100 * Use hash as the hint so to direct traffic to
101 101 * different TX rings. Note below bit operation
102 102 * is needed in case hint is 0 to get the most
103 103 * benefit from HASH_HINT() algorithm.
104 104 */
105 105 if (port->lp_tx_ring_cnt > 1) {
106 106 if (hint == 0) {
107 107 hash = (hash << 24 | hash << 16 | hash);
108 108 hash = (hash << 32 | hash);
109 109 } else {
110 110 hash = hint;
111 111 }
112 112 hash = HASH_HINT(hash);
113 113 *rh = port->lp_pseudo_tx_rings[hash % port->lp_tx_ring_cnt];
114 114 } else {
115 115 *rh = port->lp_pseudo_tx_rings[0];
116 116 }
117 117 rw_exit(&grp->lg_tx_lock);
118 118
119 119 return (mp);
120 120 }
121 121
122 122 /*
123 123 * aggr_tx_notify_thread:
124 124 *
125 125 * aggr_tx_ring_update() callback function wakes up this thread when
126 126 * it gets called. This thread will call mac_tx_ring_update() to
127 127 * notify upper mac of flow control getting relieved. Note that
128 128 * aggr_tx_ring_update() cannot call mac_tx_ring_update() directly
129 129 * because aggr_tx_ring_update() is called from lower mac with
130 130 * mi_rw_lock held.
131 131 */
132 132 void
133 133 aggr_tx_notify_thread(void *arg)
134 134 {
135 135 callb_cpr_t cprinfo;
136 136 aggr_grp_t *grp = (aggr_grp_t *)arg;
137 137 mac_ring_handle_t pseudo_mrh;
138 138
139 139 CALLB_CPR_INIT(&cprinfo, &grp->lg_tx_flowctl_lock, callb_generic_cpr,
140 140 "aggr_tx_notify_thread");
141 141
142 142 mutex_enter(&grp->lg_tx_flowctl_lock);
143 143 while (!grp->lg_tx_notify_done) {
144 144 if ((grp->lg_tx_blocked_cnt) == 0) {
145 145 CALLB_CPR_SAFE_BEGIN(&cprinfo);
146 146 cv_wait(&grp->lg_tx_flowctl_cv,
147 147 &grp->lg_tx_flowctl_lock);
148 148 CALLB_CPR_SAFE_END(&cprinfo, &grp->lg_tx_flowctl_lock);
149 149 continue;
150 150 }
151 151 while (grp->lg_tx_blocked_cnt != 0) {
152 152 grp->lg_tx_blocked_cnt--;
153 153 pseudo_mrh =
154 154 grp->lg_tx_blocked_rings[grp->lg_tx_blocked_cnt];
155 155 mutex_exit(&grp->lg_tx_flowctl_lock);
156 156 mac_tx_ring_update(grp->lg_mh, pseudo_mrh);
157 157 mutex_enter(&grp->lg_tx_flowctl_lock);
158 158 }
159 159 }
160 160 /*
161 161 * The grp is being destroyed, exit the thread.
162 162 */
163 163 grp->lg_tx_notify_thread = NULL;
164 164 CALLB_CPR_EXIT(&cprinfo);
165 165 thread_exit();
166 166 }
167 167
168 168 /*
169 169 * Callback function registered with lower mac to receive wakeups from
170 170 * drivers when flow control is relieved (i.e. Tx descriptors are
171 171 * available).
172 172 */
173 173 void
174 174 aggr_tx_ring_update(void *arg1, uintptr_t arg2)
175 175 {
176 176 aggr_port_t *port = (aggr_port_t *)arg1;
177 177 mac_ring_handle_t mrh = (mac_ring_handle_t)arg2;
178 178 mac_ring_handle_t pseudo_mrh;
179 179 aggr_grp_t *grp = port->lp_grp;
180 180 int i = 0;
181 181
182 182 if (mrh == NULL) {
183 183 /*
184 184 * If the underlying NIC does not expose TX rings,
185 185 * still as pseudo TX ring is presented to the
186 186 * aggr mac.
187 187 */
188 188 pseudo_mrh = port->lp_pseudo_tx_rings[0];
189 189 } else {
190 190 for (i = 0; i < port->lp_tx_ring_cnt; i++) {
191 191 if (port->lp_tx_rings[i] == mrh)
192 192 break;
193 193 }
194 194 ASSERT(i < port->lp_tx_ring_cnt);
195 195 pseudo_mrh = port->lp_pseudo_tx_rings[i];
196 196 }
197 197 mutex_enter(&grp->lg_tx_flowctl_lock);
198 198 /*
199 199 * It could be possible that some (broken?) device driver
200 200 * could send more than one wakeup on the same ring. In
201 201 * such a case, multiple instances of the same pseudo TX
202 202 * ring should not be saved in lg_tx_blocked_rings[]
203 203 * array. So first check if woken up ring (pseudo_mrh) is
204 204 * already in the lg_tx_blocked_rings[] array.
205 205 */
206 206 for (i = 0; i < grp->lg_tx_blocked_cnt; i++) {
207 207 if (grp->lg_tx_blocked_rings[i] == pseudo_mrh) {
208 208 mutex_exit(&grp->lg_tx_flowctl_lock);
209 209 return;
210 210 }
211 211 }
212 212 /* A distinct mac_ring_handle. Save and increment count */
213 213 grp->lg_tx_blocked_rings[grp->lg_tx_blocked_cnt] = pseudo_mrh;
214 214 grp->lg_tx_blocked_cnt++;
215 215 cv_signal(&grp->lg_tx_flowctl_cv);
216 216 mutex_exit(&grp->lg_tx_flowctl_lock);
217 217 }
218 218
219 219 /*
220 220 * Send function invoked by the MAC service module.
221 221 */
222 222 mblk_t *
223 223 aggr_ring_tx(void *arg, mblk_t *mp)
224 224 {
225 225 aggr_pseudo_tx_ring_t *pseudo_ring = (aggr_pseudo_tx_ring_t *)arg;
226 226 aggr_port_t *port = pseudo_ring->atr_port;
227 227
228 228 return (mac_hwring_send_priv(port->lp_mch, pseudo_ring->atr_hw_rh, mp));
229 229 }
230 230
231 231 /*
232 232 * Enable sending on the specified port.
233 233 */
234 234 void
235 235 aggr_send_port_enable(aggr_port_t *port)
236 236 {
237 237 aggr_grp_t *grp = port->lp_grp;
238 238
239 239 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
240 240
241 241 if (port->lp_tx_enabled || (port->lp_state !=
242 242 AGGR_PORT_STATE_ATTACHED)) {
243 243 /* already enabled or port not yet attached */
244 244 return;
245 245 }
246 246
247 247 /*
248 248 * Add to group's array of tx ports.
249 249 */
250 250 rw_enter(&grp->lg_tx_lock, RW_WRITER);
251 251 if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) {
252 252 /* current array too small */
253 253 aggr_port_t **new_ports;
254 254 uint_t new_size;
255 255
256 256 new_size = grp->lg_ntx_ports+1;
257 257 new_ports = kmem_zalloc(new_size * sizeof (aggr_port_t *),
258 258 KM_SLEEP);
259 259
260 260 if (grp->lg_tx_ports_size > 0) {
261 261 ASSERT(grp->lg_tx_ports != NULL);
262 262 bcopy(grp->lg_tx_ports, new_ports,
263 263 grp->lg_ntx_ports * sizeof (aggr_port_t *));
264 264 kmem_free(grp->lg_tx_ports,
265 265 grp->lg_tx_ports_size * sizeof (aggr_port_t *));
266 266 }
|
↓ open down ↓ |
266 lines elided |
↑ open up ↑ |
267 267
268 268 grp->lg_tx_ports = new_ports;
269 269 grp->lg_tx_ports_size = new_size;
270 270 }
271 271
272 272 grp->lg_tx_ports[grp->lg_ntx_ports++] = port;
273 273 port->lp_tx_idx = grp->lg_ntx_ports-1;
274 274 rw_exit(&grp->lg_tx_lock);
275 275
276 276 port->lp_tx_enabled = B_TRUE;
277 +
278 + aggr_grp_update_default(grp);
277 279 }
278 280
279 281 /*
280 282 * Disable sending from the specified port.
281 283 */
282 284 void
283 285 aggr_send_port_disable(aggr_port_t *port)
284 286 {
285 287 uint_t idx, ntx;
286 288 aggr_grp_t *grp = port->lp_grp;
287 289
288 290 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
289 291 ASSERT(MAC_PERIM_HELD(port->lp_mh));
290 292
291 293 if (!port->lp_tx_enabled) {
292 294 /* not yet enabled */
293 295 return;
294 296 }
295 297
296 298 rw_enter(&grp->lg_tx_lock, RW_WRITER);
297 299 idx = port->lp_tx_idx;
298 300 ntx = grp->lg_ntx_ports;
299 301 ASSERT(idx < ntx);
300 302
301 303 /* remove from array of attached ports */
302 304 if (idx == (ntx - 1)) {
303 305 grp->lg_tx_ports[idx] = NULL;
304 306 } else {
305 307 /* not the last entry, replace with last one */
306 308 aggr_port_t *victim;
307 309
308 310 victim = grp->lg_tx_ports[ntx - 1];
|
↓ open down ↓ |
22 lines elided |
↑ open up ↑ |
309 311 grp->lg_tx_ports[ntx - 1] = NULL;
310 312 victim->lp_tx_idx = idx;
311 313 grp->lg_tx_ports[idx] = victim;
312 314 }
313 315
314 316 port->lp_tx_idx = 0;
315 317 grp->lg_ntx_ports--;
316 318 rw_exit(&grp->lg_tx_lock);
317 319
318 320 port->lp_tx_enabled = B_FALSE;
321 +
322 + aggr_grp_update_default(grp);
319 323 }
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX