Print this page
10472 Limit number of multicast NCEs
Reviewed by: Cody Peter Mello <melloc@writev.io>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/inet/ip/ip_ndp.c
+++ new/usr/src/uts/common/inet/ip/ip_ndp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
|
↓ open down ↓ |
15 lines elided |
↑ open up ↑ |
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /*
26 - * Copyright (c) 2018, Joyent, Inc.
26 + * Copyright (c) 2019, Joyent, Inc.
27 27 */
28 28
29 29 #include <sys/types.h>
30 30 #include <sys/stream.h>
31 31 #include <sys/stropts.h>
32 32 #include <sys/strsun.h>
33 33 #include <sys/sysmacros.h>
34 34 #include <sys/errno.h>
35 35 #include <sys/dlpi.h>
36 36 #include <sys/socket.h>
37 37 #include <sys/ddi.h>
38 38 #include <sys/sunddi.h>
39 39 #include <sys/cmn_err.h>
40 40 #include <sys/debug.h>
41 41 #include <sys/vtrace.h>
42 42 #include <sys/kmem.h>
43 43 #include <sys/zone.h>
44 44 #include <sys/ethernet.h>
45 45 #include <sys/sdt.h>
46 46 #include <sys/mac.h>
47 47
48 48 #include <net/if.h>
49 49 #include <net/if_types.h>
50 50 #include <net/if_dl.h>
51 51 #include <net/route.h>
52 52 #include <netinet/in.h>
53 53 #include <netinet/ip6.h>
54 54 #include <netinet/icmp6.h>
55 55
56 56 #include <inet/common.h>
57 57 #include <inet/mi.h>
58 58 #include <inet/mib2.h>
59 59 #include <inet/nd.h>
60 60 #include <inet/ip.h>
61 61 #include <inet/ip_impl.h>
62 62 #include <inet/ipclassifier.h>
63 63 #include <inet/ip_if.h>
64 64 #include <inet/ip_ire.h>
65 65 #include <inet/ip_rts.h>
66 66 #include <inet/ip6.h>
67 67 #include <inet/ip_ndp.h>
68 68 #include <inet/sctp_ip.h>
69 69 #include <inet/ip_arp.h>
70 70 #include <inet/ip2mac_impl.h>
71 71
/*
 * Interval between unsolicited announcements of a local address:
 * unsolicited Neighbor Advertisements for IPv6, address-publish ARP
 * for IPv4.  NOTE(review): expansion relies on a local "ipst" being
 * in scope at the use site.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	(isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

/*
 * Interval between defensive advertisements of an address we publish.
 * NOTE(review): also expands a local "ipst".
 */
#define	DEFENSE_INTERVAL(isv6) \
	(isv6 ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval (ms), based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)

/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 * Matches 169.254/16 by inspecting the first two address bytes.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)ptr)[0] == 169 && \
	((uchar_t *)ptr)[1] == 254)

/*
 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
 * in to the ncec*add* functions.
 *
 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
 * that we will respond to requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK \
	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
103 103 /*
104 104 * Lock ordering:
105 105 *
106 106 * ndp_g_lock -> ill_lock -> ncec_lock
107 107 *
108 108 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
109 109 * ncec_next. ncec_lock protects the contents of the NCE (particularly
110 110 * ncec_refcnt).
111 111 */
112 112
113 113 static void nce_cleanup_list(ncec_t *ncec);
114 114 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
115 115 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
116 116 ncec_t *);
117 117 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *);
118 118 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
119 119 uint16_t ncec_flags, nce_t **newnce);
|
↓ open down ↓ |
83 lines elided |
↑ open up ↑ |
120 120 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
121 121 uint16_t ncec_flags, nce_t **newnce);
122 122 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation,
123 123 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
124 124 const in6_addr_t *target, int flag);
125 125 static void ncec_refhold_locked(ncec_t *);
126 126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
127 127 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
128 128 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
129 129 uint16_t, uint16_t, nce_t **);
130 -static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
131 -static nce_t *nce_add(ill_t *, ncec_t *);
130 +static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
131 +static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
132 132 static void nce_inactive(nce_t *);
133 133 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
134 134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
135 135 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
136 136 uint16_t, uint16_t, nce_t **);
137 137 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
138 138 uint16_t, uint16_t, nce_t **);
139 139 static int nce_add_v6_postprocess(nce_t *);
140 140 static int nce_add_v4_postprocess(nce_t *);
141 141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
142 142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
143 143 static void nce_resolv_ipmp_ok(ncec_t *);
144 144 static void nce_walk_common(ill_t *, pfi_t, void *);
145 145 static void nce_start_timer(ncec_t *, uint_t);
146 146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
147 147 static void nce_fastpath_trigger(nce_t *);
148 148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
149 149
150 150 #ifdef DEBUG
151 151 static void ncec_trace_cleanup(const ncec_t *);
152 152 #endif
153 153
154 154 #define NCE_HASH_PTR_V4(ipst, addr) \
155 155 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
156 156
157 157 #define NCE_HASH_PTR_V6(ipst, addr) \
158 158 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
159 159 NCE_TABLE_SIZE)]))
160 160
161 161 extern kmem_cache_t *ncec_cache;
162 162 extern kmem_cache_t *nce_cache;
163 163
/*
 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
 * If src_ill is not null, the ncec_addr is bound to src_ill. The
 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
 * the probe is sent on the ncec_ill (in the non-IPMP case) or the
 * IPMP cast_ill (in the IPMP case).
 *
 * Note that the probe interval is based on the src_ill for IPv6, and
 * the ncec_xmit_interval for IPv4.
 */
static void
nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
{
	boolean_t dropped;
	uint32_t probe_interval;

	/* DAD probes are only ever sent for unicast entries. */
	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
	if (ncec->ncec_ipversion == IPV6_VERSION) {
		/* NS probe: unspecified source, target is the DAD address. */
		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
		probe_interval = ILL_PROBE_INTERVAL(src_ill);
	} else {
		/* IPv4 DAD delay the initial probe. */
		if (send_probe)
			dropped = arp_probe(ncec);
		else
			dropped = B_TRUE;
		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
		    !send_probe);
	}
	/* Only charge the probe count when a probe actually went out. */
	if (!dropped) {
		mutex_enter(&ncec->ncec_lock);
		ncec->ncec_pcnt--;
		mutex_exit(&ncec->ncec_lock);
	}
	/* Rearm the timer; no-op if the ncec is already CONDEMNED. */
	nce_restart_timer(ncec, probe_interval);
}
203 203
204 204 /*
205 205 * Compute default flags to use for an advertisement of this ncec's address.
206 206 */
207 207 static int
208 208 nce_advert_flags(const ncec_t *ncec)
209 209 {
210 210 int flag = 0;
211 211
212 212 if (ncec->ncec_flags & NCE_F_ISROUTER)
213 213 flag |= NDP_ISROUTER;
214 214 if (!(ncec->ncec_flags & NCE_F_ANYCAST))
215 215 flag |= NDP_ORIDE;
216 216
217 217 return (flag);
218 218 }
219 219
220 220 /*
221 221 * NDP Cache Entry creation routine.
222 222 * This routine must always be called with ndp6->ndp_g_lock held.
223 223 */
224 224 int
225 225 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
226 226 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
227 227 {
228 228 int err;
229 229 nce_t *nce;
230 230
231 231 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
232 232 ASSERT(ill != NULL && ill->ill_isv6);
233 233
234 234 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
235 235 &nce);
236 236 if (err != 0)
237 237 return (err);
238 238 ASSERT(newnce != NULL);
239 239 *newnce = nce;
240 240 return (err);
241 241 }
242 242
/*
 * Post-processing routine to be executed after nce_add_v6(). This function
 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
 * and must be called without any locks held.
 *
 * Returns 0, or EINPROGRESS when DAD has been started on a published
 * address (the entry is not yet usable).
 */
int
nce_add_v6_postprocess(nce_t *nce)
{
	ncec_t		*ncec = nce->nce_common;
	boolean_t	dropped = B_FALSE;
	uchar_t		*hw_addr = ncec->ncec_lladdr;
	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
	ill_t		*ill = ncec->ncec_ill;
	int		err = 0;
	uint16_t	flags = ncec->ncec_flags;
	ip_stack_t	*ipst = ill->ill_ipst;
	boolean_t	trigger_fastpath = B_TRUE;

	/*
	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
	 * We call nce_fastpath from nce_update if the link layer address of
	 * the peer changes from nce_update
	 */
	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
		trigger_fastpath = B_FALSE;

	if (trigger_fastpath)
		nce_fastpath_trigger(nce);
	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
		ill_t *hwaddr_ill;
		/*
		 * Unicast entry that needs DAD.  Under IPMP the probe must
		 * go out on the ill that owns the hardware address.
		 */
		if (IS_IPMP(ill)) {
			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
			    hw_addr, hw_addr_len);
		} else {
			hwaddr_ill = ill;
		}
		nce_dad(ncec, hwaddr_ill, B_TRUE);
		err = EINPROGRESS;
	} else if (flags & NCE_F_UNSOL_ADV) {
		/*
		 * We account for the transmit below by assigning one
		 * less than the ndd variable. Subsequent decrements
		 * are done in nce_timer.
		 */
		mutex_enter(&ncec->ncec_lock);
		ncec->ncec_unsolicit_count =
		    ipst->ips_ip_ndp_unsolicit_count - 1;
		mutex_exit(&ncec->ncec_lock);
		dropped = ndp_xmit(ill,
		    ND_NEIGHBOR_ADVERT,
		    hw_addr,
		    hw_addr_len,
		    &ncec->ncec_addr,	/* Source and target of the adv */
		    &ipv6_all_hosts_mcast, /* Destination of the packet */
		    nce_advert_flags(ncec));
		mutex_enter(&ncec->ncec_lock);
		/* If the advert was dropped, put the credit back. */
		if (dropped)
			ncec->ncec_unsolicit_count++;
		else
			ncec->ncec_last_time_defended = ddi_get_lbolt();
		if (ncec->ncec_unsolicit_count != 0) {
			nce_start_timer(ncec,
			    ipst->ips_ip_ndp_unsolicit_interval);
		}
		mutex_exit(&ncec->ncec_lock);
	}
	return (err);
}
316 316
/*
 * Atomically lookup and add (if needed) Neighbor Cache information for
 * an address.
 *
 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
 * are always added pointing at the ipmp_ill. Thus, when the ill passed
 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 * entries will be created, both pointing at the same ncec_t. The nce_t
 * entries will have their nce_ill set to the ipmp_ill and the under_ill
 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 * Local addresses are always created on the ill passed to nce_add_v6.
 *
 * Returns 0 or EINPROGRESS on success (EEXIST is mapped through the
 * postprocess path), or an errno on failure.  On success, a held nce_t
 * is returned through *newnce when newnce is non-NULL.
 */
int
nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int		err = 0;
	ip_stack_t	*ipst = ill->ill_ipst;
	nce_t		*nce, *upper_nce = NULL;
	ill_t		*in_ill = ill;
	boolean_t	need_ill_refrele = B_FALSE;

	if (flags & NCE_F_MCAST) {
		/*
		 * hw_addr will be figured out in nce_set_multicast_v6;
		 * caller has to select the cast_ill
		 */
		ASSERT(hw_addr == NULL);
		ASSERT(!IS_IPMP(ill));
		err = nce_set_multicast_v6(ill, addr, flags, newnce);
		return (err);
	}
	ASSERT(ill->ill_isv6);
	/* Non-local addresses on an under-ill live on the ipmp_ill. */
	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
		ill = ipmp_ill_hold_ipmp_ill(ill);
		if (ill == NULL)
			return (ENXIO);
		need_ill_refrele = B_TRUE;
	}

	/* Lookup and (if absent) insert atomically under ndp_g_lock. */
	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	nce = nce_lookup_addr(ill, addr);
	if (nce == NULL) {
		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
		    &nce);
	} else {
		err = EEXIST;
	}
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	if (err == 0)
		err = nce_add_v6_postprocess(nce);
	if (in_ill != ill && nce != NULL) {
		nce_t *under_nce = NULL;

		/*
		 * in_ill was the under_ill. Try to create the under_nce.
		 * Hold the ill_g_lock to prevent changes to group membership
		 * until we are done.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
			    ill_t *, ill);
			rw_exit(&ipst->ips_ill_g_lock);
			err = ENXIO;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
		if (under_nce == NULL) {
			rw_exit(&ipst->ips_ill_g_lock);
			err = EINVAL;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		rw_exit(&ipst->ips_ill_g_lock);
		upper_nce = nce;
		nce = under_nce; /* will be returned to caller */
		if (NCE_ISREACHABLE(nce->nce_common))
			nce_fastpath_trigger(under_nce);
	}
	/* nce_refrele is deferred until the lock is dropped */
	if (nce != NULL) {
		if (newnce != NULL)
			*newnce = nce;
		else
			nce_refrele(nce);
	}
bail:
	if (upper_nce != NULL)
		nce_refrele(upper_nce);
	if (need_ill_refrele)
		ill_refrele(ill);
	return (err);
}
414 414
/*
 * Remove all the CONDEMNED nces from the appropriate hash table.
 * We create a private list of NCEs, these may have ires pointing
 * to them, so the list will be passed through to clean up dependent
 * ires and only then we can do ncec_refrele() which can make NCE inactive.
 *
 * Caller must hold ndp_g_lock and there must be no active walkers.
 */
static void
nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
{
	ncec_t *ncec1;
	ncec_t **ptpn;

	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
	ASSERT(ndp->ndp_g_walker == 0);
	for (; ncec; ncec = ncec1) {
		ncec1 = ncec->ncec_next;
		mutex_enter(&ncec->ncec_lock);
		if (NCE_ISCONDEMNED(ncec)) {
			/* Unlink from the hash chain via the back-pointer. */
			ptpn = ncec->ncec_ptpn;
			ncec1 = ncec->ncec_next;
			if (ncec1 != NULL)
				ncec1->ncec_ptpn = ptpn;
			*ptpn = ncec1;
			ncec->ncec_ptpn = NULL;
			ncec->ncec_next = NULL;
			/* Prepend to the caller's private free list. */
			ncec->ncec_next = *free_nce_list;
			*free_nce_list = ncec;
		}
		mutex_exit(&ncec->ncec_lock);
	}
}
446 446
/*
 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
 *    will return this NCE. Also no new timeouts will
 *    be started (See nce_restart_timer).
 * 2. Cancel any currently running timeouts.
 * 3. If there is an ndp walker, return. The walker will do the cleanup.
 *    This ensures that walkers see a consistent list of NCEs while walking.
 * 4. Otherwise remove the NCE from the list of NCEs
 */
void
ncec_delete(ncec_t *ncec)
{
	ncec_t	**ptpn;
	ncec_t	*ncec1;
	int	ipversion = ncec->ncec_ipversion;
	ndp_g_t *ndp;
	ip_stack_t *ipst = ncec->ncec_ipst;

	if (ipversion == IPV4_VERSION)
		ndp = ipst->ips_ndp4;
	else
		ndp = ipst->ips_ndp6;

	/* Serialize deletes */
	mutex_enter(&ncec->ncec_lock);
	if (NCE_ISCONDEMNED(ncec)) {
		/* Some other thread is doing the delete */
		mutex_exit(&ncec->ncec_lock);
		return;
	}
	/*
	 * Caller has a refhold. Also 1 ref for being in the list. Thus
	 * refcnt has to be >= 2
	 */
	ASSERT(ncec->ncec_refcnt >= 2);
	ncec->ncec_flags |= NCE_F_CONDEMNED;
	mutex_exit(&ncec->ncec_lock);

	/* Count how many condemned ires for kmem_cache callback */
	atomic_inc_32(&ipst->ips_num_nce_condemned);
	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);

	/* Complete any waiting callbacks */
	ncec_cb_dispatch(ncec);

	/*
	 * Cancel any running timer. Timeout can't be restarted
	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
	 * Passing invalid timeout id is fine.
	 */
	if (ncec->ncec_timeout_id != 0) {
		(void) untimeout(ncec->ncec_timeout_id);
		ncec->ncec_timeout_id = 0;
	}

	mutex_enter(&ndp->ndp_g_lock);
	if (ncec->ncec_ptpn == NULL) {
		/*
		 * The last ndp walker has already removed this ncec from
		 * the list after we marked the ncec CONDEMNED and before
		 * we grabbed the global lock.
		 */
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}
	if (ndp->ndp_g_walker > 0) {
		/*
		 * Can't unlink. The walker will clean up
		 */
		ndp->ndp_g_walker_cleanup = B_TRUE;
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}

	/*
	 * Now remove the ncec from the list. nce_restart_timer won't restart
	 * the timer since it is marked CONDEMNED.
	 */
	ptpn = ncec->ncec_ptpn;
	ncec1 = ncec->ncec_next;
	if (ncec1 != NULL)
		ncec1->ncec_ptpn = ptpn;
	*ptpn = ncec1;
	ncec->ncec_ptpn = NULL;
	ncec->ncec_next = NULL;
	mutex_exit(&ndp->ndp_g_lock);

	/* Removed from ncec_ptpn/ncec_next list */
	ncec_refrele_notr(ncec);
}
537 537
/*
 * Final teardown of an ncec whose last reference has been dropped.
 * Called with ncec_lock held and refcnt at zero; frees queued packets,
 * the link-layer address, drops the ill's ncec count, and returns the
 * ncec to its kmem cache.
 */
void
ncec_inactive(ncec_t *ncec)
{
	mblk_t **mpp;
	ill_t *ill = ncec->ncec_ill;
	ip_stack_t *ipst = ncec->ncec_ipst;

	ASSERT(ncec->ncec_refcnt == 0);
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* Count how many condemned nces for kmem_cache callback */
	if (NCE_ISCONDEMNED(ncec))
		atomic_add_32(&ipst->ips_num_nce_condemned, -1);

	/* Free all allocated messages */
	mpp = &ncec->ncec_qd_mp;
	while (*mpp != NULL) {
		mblk_t *mp;

		mp = *mpp;
		*mpp = mp->b_next;

		inet_freemsg(mp);
	}
	/*
	 * must have been cleaned up in ncec_delete
	 */
	ASSERT(list_is_empty(&ncec->ncec_cb));
	list_destroy(&ncec->ncec_cb);
	/*
	 * free the ncec_lladdr if one was allocated in nce_add_common()
	 */
	if (ncec->ncec_lladdr_length > 0)
		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);

#ifdef DEBUG
	ncec_trace_cleanup(ncec);
#endif

	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "ncec", (void *), ncec);
	ill->ill_ncec_cnt--;
	ncec->ncec_ill = NULL;
	/*
	 * If the number of ncec's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&ncec->ncec_lock);
	kmem_cache_free(ncec_cache, ncec);
}
597 597
598 598 /*
599 599 * ncec_walk routine. Delete the ncec if it is associated with the ill
600 600 * that is going away. Always called as a writer.
601 601 */
602 602 void
603 603 ncec_delete_per_ill(ncec_t *ncec, void *arg)
604 604 {
605 605 if ((ncec != NULL) && ncec->ncec_ill == arg) {
606 606 ncec_delete(ncec);
607 607 }
608 608 }
609 609
/*
 * Neighbor Cache cleanup logic for a list of ncec_t entries.
 * The list is the private chain built by nce_remove(); each entry's
 * hash-list reference is released here.
 */
static void
nce_cleanup_list(ncec_t *ncec)
{
	ncec_t *ncec_next;

	ASSERT(ncec != NULL);
	while (ncec != NULL) {
		ncec_next = ncec->ncec_next;
		ncec->ncec_next = NULL;

		/*
		 * It is possible for the last ndp walker (this thread)
		 * to come here after ncec_delete has marked the ncec CONDEMNED
		 * and before it has removed the ncec from the fastpath list
		 * or called untimeout. So we need to do it here. It is safe
		 * for both ncec_delete and this thread to do it twice or
		 * even simultaneously since each of the threads has a
		 * reference on the ncec.
		 */
		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
		/*
		 * Cancel any running timer. Timeout can't be restarted
		 * since CONDEMNED is set. The ncec_lock can't be
		 * held across untimeout though passing invalid timeout
		 * id is fine.
		 */
		if (ncec->ncec_timeout_id != 0) {
			(void) untimeout(ncec->ncec_timeout_id);
			ncec->ncec_timeout_id = 0;
		}
		/* Removed from ncec_ptpn/ncec_next list */
		ncec_refrele_notr(ncec);
		ncec = ncec_next;
	}
}
648 648
/*
 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted.
 *
 * ND_PROBE: DAD already in progress, report success without resending.
 * ND_REACHABLE: push the entry back to ND_PROBE and send a fresh probe.
 * Any other state: nothing to do, return B_FALSE.
 */
boolean_t
nce_restart_dad(ncec_t *ncec)
{
	boolean_t started;
	ill_t *ill, *hwaddr_ill;

	if (ncec == NULL)
		return (B_FALSE);
	ill = ncec->ncec_ill;
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_state == ND_PROBE) {
		mutex_exit(&ncec->ncec_lock);
		started = B_TRUE;
	} else if (ncec->ncec_state == ND_REACHABLE) {
		ASSERT(ncec->ncec_lladdr != NULL);
		ncec->ncec_state = ND_PROBE;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		/*
		 * Slight cheat here: we don't use the initial probe delay
		 * for IPv4 in this obscure case.
		 */
		mutex_exit(&ncec->ncec_lock);
		/* Under IPMP, probe on the ill owning the hardware address. */
		if (IS_IPMP(ill)) {
			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
		} else {
			hwaddr_ill = ill;
		}
		nce_dad(ncec, hwaddr_ill, B_TRUE);
		started = B_TRUE;
	} else {
		mutex_exit(&ncec->ncec_lock);
		started = B_FALSE;
	}
	return (started);
}
688 688
689 689 /*
690 690 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed.
691 691 * If one is found, the refcnt on the ncec will be incremented.
692 692 */
693 693 ncec_t *
694 694 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
695 695 {
696 696 ncec_t *ncec;
697 697 ip_stack_t *ipst = ill->ill_ipst;
698 698
699 699 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
700 700 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
701 701
702 702 /* Get head of v6 hash table */
703 703 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
704 704 ncec = ncec_lookup_illgrp(ill, addr, ncec);
705 705 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
706 706 rw_exit(&ipst->ips_ill_g_lock);
707 707 return (ncec);
708 708 }
709 709 /*
710 710 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed.
711 711 * If one is found, the refcnt on the ncec will be incremented.
712 712 */
713 713 ncec_t *
714 714 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
715 715 {
716 716 ncec_t *ncec = NULL;
717 717 in6_addr_t addr6;
718 718 ip_stack_t *ipst = ill->ill_ipst;
719 719
720 720 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
721 721 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
722 722
723 723 /* Get head of v4 hash table */
724 724 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
725 725 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
726 726 ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
727 727 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
728 728 rw_exit(&ipst->ips_ill_g_lock);
729 729 return (ncec);
730 730 }
731 731
/*
 * Cache entry lookup. Try to find an ncec matching the parameters passed.
 * If an ncec is found, increment the hold count on that ncec.
 * The caller passes in the start of the appropriate hash table, and must
 * be holding the appropriate global lock (ndp_g_lock). In addition, since
 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
 * must be held as reader.
 *
 * This function always matches across the ipmp group.
 */
ncec_t *
ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
{
	ndp_g_t		*ndp;
	ip_stack_t	*ipst = ill->ill_ipst;

	if (ill->ill_isv6)
		ndp = ipst->ips_ndp6;
	else
		ndp = ipst->ips_ndp4;

	ASSERT(ill != NULL);
	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
	/* The unspecified address never matches anything. */
	if (IN6_IS_ADDR_UNSPECIFIED(addr))
		return (NULL);
	for (; ncec != NULL; ncec = ncec->ncec_next) {
		if (ncec->ncec_ill == ill ||
		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
				/* Skip entries already being deleted. */
				mutex_enter(&ncec->ncec_lock);
				if (!NCE_ISCONDEMNED(ncec)) {
					ncec_refhold_locked(ncec);
					mutex_exit(&ncec->ncec_lock);
					break;
				}
				mutex_exit(&ncec->ncec_lock);
			}
		}
	}
	return (ncec);
}
773 773
774 774 /*
775 775 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
776 776 * entries for ill only, i.e., when ill is part of an ipmp group,
777 777 * nce_lookup_v4 will never try to match across the group.
778 778 */
779 779 nce_t *
780 780 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
781 781 {
782 782 nce_t *nce;
783 783 in6_addr_t addr6;
784 784 ip_stack_t *ipst = ill->ill_ipst;
785 785
786 786 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
787 787 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
788 788 nce = nce_lookup_addr(ill, &addr6);
789 789 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
790 790 return (nce);
791 791 }
792 792
793 793 /*
794 794 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
795 795 * entries for ill only, i.e., when ill is part of an ipmp group,
796 796 * nce_lookup_v6 will never try to match across the group.
797 797 */
798 798 nce_t *
799 799 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
800 800 {
801 801 nce_t *nce;
802 802 ip_stack_t *ipst = ill->ill_ipst;
803 803
804 804 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
805 805 nce = nce_lookup_addr(ill, addr6);
806 806 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
807 807 return (nce);
808 808 }
809 809
810 810 static nce_t *
811 811 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
812 812 {
813 813 nce_t *nce;
814 814
815 815 ASSERT(ill != NULL);
816 816 #ifdef DEBUG
817 817 if (ill->ill_isv6)
818 818 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
819 819 else
820 820 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
821 821 #endif
822 822 mutex_enter(&ill->ill_lock);
823 823 nce = nce_lookup(ill, addr);
824 824 mutex_exit(&ill->ill_lock);
825 825 return (nce);
826 826 }
827 827
828 828
/*
 * Router turned to host. We need to make sure that cached copies of the ncec
 * are not used for forwarding packets if they were derived from the default
 * route, and that the default route itself is removed, as required by
 * section 7.2.5 of RFC 2461.
 *
 * Note that the ncec itself probably has valid link-layer information for the
 * nexthop, so that there is no reason to delete the ncec, as long as the
 * ISROUTER flag is turned off.
 */
static void
ncec_router_to_host(ncec_t *ncec)
{
	ire_t		*ire;
	ip_stack_t	*ipst = ncec->ncec_ipst;

	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_flags &= ~NCE_F_ISROUTER;
	mutex_exit(&ncec->ncec_lock);

	/* Find and remove the default route through this former router. */
	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
	if (ire != NULL) {
		/* Notify routing sockets before deleting the route. */
		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
		ire_delete(ire);
		ire_refrele(ire);
	}
}
858 858
/*
 * Process passed in parameters either from an incoming packet or via
 * user ioctl.  Implements the IPv6 neighbor cache state transitions of
 * RFC 4861 for a received solicitation/advertisement: hw_addr is the
 * peer's link-layer address (may be NULL), flag carries the NA flag
 * bits (ND_NA_FLAG_*), and is_adv distinguishes advertisements from
 * solicitations.
 */
void
nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
{
	ill_t	*ill = ncec->ncec_ill;
	uint32_t hw_addr_len = ill->ill_phys_addr_length;
	boolean_t ll_updated = B_FALSE;
	boolean_t ll_changed;
	nce_t	*nce;

	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
	/*
	 * No updates of link layer address or the neighbor state is
	 * allowed, when the cache is in NONUD state.  This still
	 * allows for responding to reachability solicitation.
	 */
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_state == ND_INCOMPLETE) {
		/* Nothing to learn from a packet without an address option. */
		if (hw_addr == NULL) {
			mutex_exit(&ncec->ncec_lock);
			return;
		}
		nce_set_ll(ncec, hw_addr);
		/*
		 * Update ncec state and send the queued packets
		 * back to ip this time ire will be added.
		 */
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(ncec, ND_REACHABLE, NULL);
		} else {
			nce_update(ncec, ND_STALE, NULL);
		}
		mutex_exit(&ncec->ncec_lock);
		nce = nce_fastpath(ncec, B_TRUE, NULL);
		nce_resolv_ok(ncec);
		if (nce != NULL)
			nce_refrele(nce);
		return;
	}
	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
	if (!is_adv) {
		/* If this is a SOLICITATION request only */
		if (ll_changed)
			nce_update(ncec, ND_STALE, hw_addr);
		mutex_exit(&ncec->ncec_lock);
		ncec_cb_dispatch(ncec);
		return;
	}
	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
		/* If in any other state than REACHABLE, ignore */
		if (ncec->ncec_state == ND_REACHABLE) {
			nce_update(ncec, ND_STALE, NULL);
		}
		mutex_exit(&ncec->ncec_lock);
		ncec_cb_dispatch(ncec);
		return;
	} else {
		if (ll_changed) {
			nce_update(ncec, ND_UNCHANGED, hw_addr);
			ll_updated = B_TRUE;
		}
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(ncec, ND_REACHABLE, NULL);
		} else {
			/* Unsolicited with a new address: confidence drops. */
			if (ll_updated) {
				nce_update(ncec, ND_STALE, NULL);
			}
		}
		mutex_exit(&ncec->ncec_lock);
		/* Peer stopped advertising itself as a router. */
		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
		    NCE_F_ISROUTER)) {
			ncec_router_to_host(ncec);
		} else {
			ncec_cb_dispatch(ncec);
		}
	}
}
939 939
940 940 /*
941 941 * Pass arg1 to the cbf supplied, along with each ncec in existence.
942 942 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
943 943 * walking the hash list.
944 944 */
void
ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
    void *arg1, boolean_t trace)
{
	ncec_t *ncec;
	ncec_t *ncec1;
	ncec_t **ncep;
	ncec_t *free_nce_list = NULL;

	mutex_enter(&ndp->ndp_g_lock);
	/* Prevent ncec_delete from unlink and free of NCE */
	ndp->ndp_g_walker++;
	mutex_exit(&ndp->ndp_g_lock);
	for (ncep = ndp->nce_hash_tbl;
	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
		/*
		 * Capture the next pointer before invoking the callback,
		 * since the callback may condemn the current entry.
		 */
		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
			ncec1 = ncec->ncec_next;
			/* ill == NULL means walk every ill's entries. */
			if (ill == NULL || ncec->ncec_ill == ill) {
				if (trace) {
					ncec_refhold(ncec);
					(*cbf)(ncec, arg1);
					ncec_refrele(ncec);
				} else {
					/* Untraced hold (no mh_trace). */
					ncec_refhold_notr(ncec);
					(*cbf)(ncec, arg1);
					ncec_refrele_notr(ncec);
				}
			}
		}
	}
	mutex_enter(&ndp->ndp_g_lock);
	ndp->ndp_g_walker--;
	/* Last walker out performs any deferred deletions. */
	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
		/* Time to delete condemned entries */
		for (ncep = ndp->nce_hash_tbl;
		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
			ncec = *ncep;
			if (ncec != NULL) {
				nce_remove(ndp, ncec, &free_nce_list);
			}
		}
		ndp->ndp_g_walker_cleanup = B_FALSE;
	}

	mutex_exit(&ndp->ndp_g_lock);

	/* Free the condemned entries after dropping ndp_g_lock. */
	if (free_nce_list != NULL) {
		nce_cleanup_list(free_nce_list);
	}
}
995 995
/*
 * Walk everything.
 * Note that ill can be NULL hence can't derive the ipst from it.
 */
void
ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
{
	/* Walk both the v4 (ARP) and v6 (ND) caches, with ref tracing. */
	ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
	ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
}
1006 1006
1007 1007 /*
1008 + * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast
1009 + * NCEs, and the number to reclaim if we hit the limit. Used by
1010 + * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
1011 + * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
1012 + */
1013 +
1014 +/* Maximum number of multicast NCEs on an ill. */
1015 +uint_t ip_max_ill_mcast_nces = 16384;
1016 +/*
1017 + * Number of NCEs to delete if we hit the maximum above. 0 means *don't* and
1018 + * return an error. Non-zero means delete so many, and if the number is >=
1019 + * the max above, that means delete them all.
1020 + */
1021 +uint_t ip_ill_mcast_reclaim = 256;
1022 +
1023 +/*
 * Encapsulate multicast ill capping in a function, for easier DTrace
 * detection.  Return a list of refheld NCEs to destroy-via-refrele.  That
1026 + * list can be NULL, but can only be non-NULL if we successfully reclaimed.
1027 + *
1028 + * NOTE: This function must be called while holding the ill_lock AND
1029 + * JUST PRIOR to making the insertion into the ill_nce list.
1030 + *
1031 + * We can't release the ones we delete ourselves because the ill_lock is held
1032 + * by the caller. They are, instead, passed back in a list_t for deletion
1033 + * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
1034 + *
1035 + * While this covers nce_t, ncec_t gets done even further down the road. See
1036 + * nce_graveyard_free() for why.
1037 + */
static boolean_t
nce_too_many_mcast(ill_t *ill, list_t *graveyard)
{
	uint_t reclaim_count, max_count, reclaimed = 0;
	boolean_t too_many;
	nce_t *nce, *deadman;

	ASSERT(graveyard != NULL);
	ASSERT(list_is_empty(graveyard));
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	/*
	 * NOTE: Some grinning weirdo may have lowered the global max beyond
	 * what this ill currently has. The behavior in this case will be
	 * trim-back just by the reclaim amount for any new ones.
	 */
	max_count = ip_max_ill_mcast_nces;
	/* Never attempt to reclaim more entries than the cap itself. */
	reclaim_count = min(ip_ill_mcast_reclaim, max_count);

	/* All good? */
	if (ill->ill_mcast_nces < max_count)
		return (B_FALSE);	/* Yes, all good. */

	if (reclaim_count == 0)
		return (B_TRUE);	/* Don't bother - we're stuck. */

	/* We need to reclaim now. Exploit our held ill_lock. */

	/*
	 * Start at the tail and work backwards, new nces are head-inserted,
	 * so we'll be reaping the oldest entries.
	 */
	nce = list_tail(&ill->ill_nce);
	while (reclaimed < reclaim_count) {
		/* Skip ahead to a multicast NCE. */
		while (nce != NULL &&
		    (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
			nce = list_prev(&ill->ill_nce, nce);
		}
		if (nce == NULL)
			break;

		/*
		 * NOTE: For now, we just delete the first one(s) we find.
		 * This is not optimal, and may require some inspection of nce
		 * & its ncec to be better.
		 */
		deadman = nce;
		nce = list_prev(&ill->ill_nce, nce);

		/* nce_delete() requires caller holds... */
		nce_refhold(deadman);
		nce_delete(deadman);	/* Bumps down ill_mcast_nces. */

		/* Link the dead ones singly, still refheld... */
		list_insert_tail(graveyard, deadman);
		reclaimed++;
	}

	if (reclaimed != reclaim_count) {
		/* We didn't have enough to reach reclaim_count. Why?!? */
		DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
		    uint_t, reclaimed, uint_t, reclaim_count);

		/* In case for some REALLY weird reason we found none! */
		too_many = (reclaimed == 0);
	} else {
		/* Reclaimed the full amount; the pending insert can go. */
		too_many = B_FALSE;
	}

	return (too_many);
}
1110 +
/*
 * ncec_walk() callback: reap a multicast ncec on 'arg' (an ill_t *) whose
 * only remaining references are the walk's hold and the ndp_g_hash entry.
 */
static void
ncec_mcast_reap_one(ncec_t *ncec, void *arg)
{
	boolean_t reapit;
	ill_t *ill = (ill_t *)arg;

	/* Obvious no-lock-needed checks... */
	if (ncec == NULL || ncec->ncec_ill != ill ||
	    (ncec->ncec_flags & NCE_F_MCAST) == 0)
		return;

	mutex_enter(&ncec->ncec_lock);
	/*
	 * It's refheld by the walk infrastructure. It has one reference for
	 * being in the ndp_g_hash, and if an nce_t exists, that's one more.
	 * We want ones without an nce_t, so 2 is the magic number. If it's
	 * LESS than 2, we have much bigger problems anyway.
	 */
	ASSERT(ncec->ncec_refcnt >= 2);
	reapit = (ncec->ncec_refcnt == 2);
	mutex_exit(&ncec->ncec_lock);

	if (reapit) {
		/* Orphaned multicast ncec (no nce_t): unlink and free. */
		IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
		ncec_delete(ncec);
	}
}
1138 +
1139 +/*
1140 + * Attempt to reap stray multicast ncec_t structures left in the wake of
1141 + * nce_graveyard_free(). This is a taskq servicing routine, as it's well
1142 + * outside any netstack-global locks being held - ndp_g_lock in this case. We
1143 + * have a reference hold on the ill, which will prevent any unplumbing races.
1144 + */
static void
ncec_mcast_reap(void *arg)
{
	ill_t *ill = (ill_t *)arg;

	IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
	ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
	mutex_enter(&ill->ill_lock);
	/* Allow nce_graveyard_free() to schedule another cleanup pass. */
	ill->ill_mcast_ncec_cleanup = B_FALSE;
	/*
	 * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
	 * below for why.
	 */
	ill->ill_refcnt--;
	if (ill->ill_refcnt == 0)
		ipif_ill_refrele_tail(ill);	/* Drops ill_lock. */
	else
		mutex_exit(&ill->ill_lock);
}
1164 +
1165 +/*
1166 + * Free a list (including handling an empty list or NULL list) of
1167 + * reference-held NCEs that were reaped from a nce_too_many_mcast()
1168 + * call. Separate because the caller must have dropped ndp_g_lock first.
1169 + *
1170 + * This also schedules a taskq task to unlink underlying NCECs from the
1171 + * ndp_g_hash, which are protected by ndp_g_lock.
1172 + */
static void
nce_graveyard_free(list_t *graveyard)
{
	nce_t *deadman, *current;
	ill_t *ill;
	boolean_t doit;

	if (graveyard == NULL)
		return;

	current = list_head(graveyard);
	if (current == NULL) {
		/* Empty graveyard: nothing to release or schedule. */
		list_destroy(graveyard);
		return;
	}

	ill = current->nce_ill;
	/*
	 * Normally one should ill_refhold(ill) here. There's no _notr()
	 * variant like there is for ire_t, dce_t, or even ncec_t, but this is
	 * the ONLY case that'll break the mh_trace that IP debugging uses for
	 * reference counts (i.e. they assume same thread releases as
	 * holds). Instead, we inline ill_refhold() here. We must do the same
	 * in the release done by the ncec_mcast_reap() above.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_refcnt++;
	mutex_exit(&ill->ill_lock);

	/* Release our hold on each reaped nce_t; all share the same ill. */
	while (current != NULL) {
		ASSERT3P(ill, ==, current->nce_ill);
		deadman = current;
		current = list_next(graveyard, deadman);
		list_remove(graveyard, deadman);
		ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
		    0);
		nce_refrele(deadman);
	}
	list_destroy(graveyard);

	/* Claim the ncec-cleanup duty unless a pass is already pending. */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_mcast_ncec_cleanup)
		doit = B_FALSE;
	else {
		ill->ill_mcast_ncec_cleanup = B_TRUE;
		doit = B_TRUE;
	}
	mutex_exit(&ill->ill_lock);
	/*
	 * NOTE(review): taskq_dispatch() result is compared against NULL;
	 * TASKQID_INVALID would be the clearer spelling -- confirm.
	 */
	if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
	    ill, TQ_NOSLEEP) == NULL) {
		mutex_enter(&ill->ill_lock);
		if (doit) {
			IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
			ill->ill_mcast_ncec_cleanup = B_FALSE;
		}
		/* There's no _notr() for ill_refrele(), so inline it here. */
		ill->ill_refcnt--;
		if (ill->ill_refcnt == 0)
			ipif_ill_refrele_tail(ill);	/* Drops ill_lock */
		else
			mutex_exit(&ill->ill_lock);
	}
}
1236 +
1237 +/*
1008 1238 * For each interface an entry is added for the unspecified multicast group.
1009 1239 * Here that mapping is used to form the multicast cache entry for a particular
1010 1240 * multicast destination.
1011 1241 */
static int
nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
    uint16_t flags, nce_t **newnce)
{
	uchar_t *hw_addr;
	int err = 0;
	ip_stack_t *ipst = ill->ill_ipst;
	nce_t *nce;

	ASSERT(ill != NULL);
	ASSERT(ill->ill_isv6);
	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));

	/* Serialize lookups/inserts into the v6 ndp table. */
	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	nce = nce_lookup_addr(ill, dst);
	if (nce != NULL) {
		/* Entry already exists; return the refheld nce. */
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		goto done;
	}
	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * For IRE_IF_RESOLVER a hardware mapping can be
		 * generated.
		 */
		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
		if (hw_addr == NULL) {
			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
			return (ENOMEM);
		}
		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
	} else {
		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
		hw_addr = NULL;
	}
	ASSERT((flags & NCE_F_MCAST) != 0);
	ASSERT((flags & NCE_F_NONUD) != 0);
	/* nce_state will be computed by nce_add_common() */
	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
	    ND_UNCHANGED, &nce);
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	/*
	 * nce may be NULL even on success -- presumably when the multicast
	 * NCE cap prevented insertion; treat that as ENOMEM.  TODO confirm
	 * against nce_add_common().
	 */
	if (err == 0)
		err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
	if (hw_addr != NULL)
		kmem_free(hw_addr, ill->ill_nd_lla_len);
	if (err != 0) {
		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
		return (err);
	}
done:
	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
	/* Hand the reference to the caller, or drop it. */
	if (newnce != NULL)
		*newnce = nce;
	else
		nce_refrele(nce);
	return (0);
}
1068 1298
1069 1299 /*
1070 1300 * Return the link layer address, and any flags of a ncec.
1071 1301 */
1072 1302 int
1073 1303 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1074 1304 {
1075 1305 ncec_t *ncec;
1076 1306 in6_addr_t *addr;
1077 1307 sin6_t *sin6;
1078 1308
1079 1309 ASSERT(ill != NULL && ill->ill_isv6);
1080 1310 sin6 = (sin6_t *)&lnr->lnr_addr;
1081 1311 addr = &sin6->sin6_addr;
1082 1312
1083 1313 /*
1084 1314 * NOTE: if the ill is an IPMP interface, then match against the whole
1085 1315 * illgrp. This e.g. allows in.ndpd to retrieve the link layer
1086 1316 * addresses for the data addresses on an IPMP interface even though
1087 1317 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1088 1318 */
1089 1319 ncec = ncec_lookup_illgrp_v6(ill, addr);
1090 1320 if (ncec == NULL)
1091 1321 return (ESRCH);
1092 1322 /* If no link layer address is available yet, return ESRCH */
1093 1323 if (!NCE_ISREACHABLE(ncec)) {
1094 1324 ncec_refrele(ncec);
1095 1325 return (ESRCH);
1096 1326 }
1097 1327 lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1098 1328 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1099 1329 lnr->lnr_hdw_len);
1100 1330 if (ncec->ncec_flags & NCE_F_ISROUTER)
1101 1331 lnr->lnr_flags = NDF_ISROUTER_ON;
1102 1332 if (ncec->ncec_flags & NCE_F_ANYCAST)
1103 1333 lnr->lnr_flags |= NDF_ANYCAST_ON;
1104 1334 if (ncec->ncec_flags & NCE_F_STATIC)
1105 1335 lnr->lnr_flags |= NDF_STATIC;
1106 1336 ncec_refrele(ncec);
1107 1337 return (0);
1108 1338 }
1109 1339
1110 1340 /*
1111 1341 * Finish setting up the Enable/Disable multicast for the driver.
1112 1342 */
1113 1343 mblk_t *
1114 1344 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1115 1345 uint32_t hw_addr_offset, mblk_t *mp)
1116 1346 {
1117 1347 uchar_t *hw_addr;
1118 1348 ipaddr_t v4group;
1119 1349 uchar_t *addr;
1120 1350
1121 1351 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1122 1352 if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1123 1353 IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1124 1354
1125 1355 ASSERT(CLASSD(v4group));
1126 1356 ASSERT(!(ill->ill_isv6));
1127 1357
1128 1358 addr = (uchar_t *)&v4group;
1129 1359 } else {
1130 1360 ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1131 1361 ASSERT(ill->ill_isv6);
1132 1362
1133 1363 addr = (uchar_t *)v6group;
1134 1364 }
1135 1365 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1136 1366 if (hw_addr == NULL) {
1137 1367 ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1138 1368 freemsg(mp);
1139 1369 return (NULL);
1140 1370 }
1141 1371
1142 1372 ip_mcast_mapping(ill, addr, hw_addr);
1143 1373 return (mp);
1144 1374 }
1145 1375
1146 1376 void
1147 1377 ip_ndp_resolve(ncec_t *ncec)
1148 1378 {
1149 1379 in_addr_t sender4 = INADDR_ANY;
1150 1380 in6_addr_t sender6 = ipv6_all_zeros;
1151 1381 ill_t *src_ill;
1152 1382 uint32_t ms;
1153 1383
1154 1384 src_ill = nce_resolve_src(ncec, &sender6);
1155 1385 if (src_ill == NULL) {
1156 1386 /* Make sure we try again later */
1157 1387 ms = ncec->ncec_ill->ill_reachable_retrans_time;
1158 1388 nce_restart_timer(ncec, (clock_t)ms);
1159 1389 return;
1160 1390 }
1161 1391 if (ncec->ncec_ipversion == IPV4_VERSION)
1162 1392 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1163 1393 mutex_enter(&ncec->ncec_lock);
1164 1394 if (ncec->ncec_ipversion == IPV6_VERSION)
1165 1395 ms = ndp_solicit(ncec, sender6, src_ill);
1166 1396 else
1167 1397 ms = arp_request(ncec, sender4, src_ill);
1168 1398 mutex_exit(&ncec->ncec_lock);
1169 1399 if (ms == 0) {
1170 1400 if (ncec->ncec_state != ND_REACHABLE) {
1171 1401 if (ncec->ncec_ipversion == IPV6_VERSION)
1172 1402 ndp_resolv_failed(ncec);
1173 1403 else
1174 1404 arp_resolv_failed(ncec);
1175 1405 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1176 1406 nce_make_unreachable(ncec);
1177 1407 ncec_delete(ncec);
1178 1408 }
1179 1409 } else {
1180 1410 nce_restart_timer(ncec, (clock_t)ms);
1181 1411 }
1182 1412 done:
1183 1413 ill_refrele(src_ill);
1184 1414 }
1185 1415
1186 1416 /*
1187 1417 * Send an IPv6 neighbor solicitation.
1188 1418 * Returns number of milliseconds after which we should either rexmit or abort.
1189 1419 * Return of zero means we should abort.
1190 1420 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1191 1421 * The optional source address is used as a hint to ndp_solicit for
1192 1422 * which source to use in the packet.
1193 1423 *
1194 1424 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1195 1425 * the packet.
1196 1426 */
uint32_t
ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
{
	in6_addr_t dst;
	boolean_t dropped = B_FALSE;

	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* No retransmits left: tell the caller to abort. */
	if (ncec->ncec_rcnt == 0)
		return (0);

	dst = ncec->ncec_addr;
	ncec->ncec_rcnt--;
	/* Transmit without holding ncec_lock; reacquire for the caller. */
	mutex_exit(&ncec->ncec_lock);
	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
	    ill->ill_phys_addr_length, &src, &dst, 0);
	mutex_enter(&ncec->ncec_lock);
	/* The solicitation was not sent; restore the retransmit credit. */
	if (dropped)
		ncec->ncec_rcnt++;
	return (ncec->ncec_ill->ill_reachable_retrans_time);
}
1219 1449
1220 1450 /*
1221 1451 * Attempt to recover an address on an interface that's been marked as a
1222 1452 * duplicate. Because NCEs are destroyed when the interface goes down, there's
1223 1453 * no easy way to just probe the address and have the right thing happen if
1224 1454 * it's no longer in use. Instead, we just bring it up normally and allow the
1225 1455 * regular interface start-up logic to probe for a remaining duplicate and take
1226 1456 * us back down if necessary.
1227 1457 * Neither DHCP nor temporary addresses arrive here; they're excluded by
1228 1458 * ip_ndp_excl.
1229 1459 */
/* ARGSUSED */
void
ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t *ill = rq->q_ptr;
	ipif_t *ipif;
	/* mp carries only the address being recovered (v4 or v6). */
	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
	boolean_t addr_equal;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		/*
		 * We do not support recovery of proxy ARP'd interfaces,
		 * because the system lacks a complete proxy ARP mechanism.
		 */
		if (ill->ill_isv6) {
			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
			    addr6);
		} else {
			addr_equal = (ipif->ipif_lcl_addr == *addr4);
		}

		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
			continue;

		/*
		 * If we have already recovered or if the interface is going
		 * away, then ignore.
		 */
		mutex_enter(&ill->ill_lock);
		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
			mutex_exit(&ill->ill_lock);
			continue;
		}

		/* Clear the duplicate state under ill_lock. */
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ill->ill_ipif_dup_count--;
		mutex_exit(&ill->ill_lock);
		ipif->ipif_was_dup = B_TRUE;

		/*
		 * Bring the ipif up normally; duplicate detection during
		 * start-up takes it back down if the conflict persists.
		 */
		if (ill->ill_isv6) {
			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
			(void) ipif_up_done_v6(ipif);
		} else {
			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
			    EINPROGRESS);
			(void) ipif_up_done(ipif);
		}
	}
	freeb(mp);
}
1282 1512
1283 1513 /*
1284 1514 * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1285 1515 * As long as someone else holds the address, the interface will stay down.
1286 1516 * When that conflict goes away, the interface is brought back up. This is
1287 1517 * done so that accidental shutdowns of addresses aren't made permanent. Your
1288 1518 * server will recover from a failure.
1289 1519 *
1290 1520 * For DHCP and temporary addresses, recovery is not done in the kernel.
1291 1521 * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1292 1522 *
1293 1523 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1294 1524 */
1295 1525 void
1296 1526 ipif_dup_recovery(void *arg)
1297 1527 {
1298 1528 ipif_t *ipif = arg;
1299 1529
1300 1530 ipif->ipif_recovery_id = 0;
1301 1531 if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1302 1532 return;
1303 1533
1304 1534 /*
1305 1535 * No lock, because this is just an optimization.
1306 1536 */
1307 1537 if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1308 1538 return;
1309 1539
1310 1540 /* If the link is down, we'll retry this later */
1311 1541 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1312 1542 return;
1313 1543
1314 1544 ipif_do_recovery(ipif);
1315 1545 }
1316 1546
1317 1547 /*
1318 1548 * Perform interface recovery by forcing the duplicate interfaces up and
1319 1549 * allowing the system to determine which ones should stay up.
1320 1550 *
1321 1551 * Called both by recovery timer expiry and link-up notification.
1322 1552 */
void
ipif_do_recovery(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	mblk_t *mp;
	ip_stack_t *ipst = ill->ill_ipst;
	size_t mp_size;

	/* The message carries only the address being recovered. */
	if (ipif->ipif_isv6)
		mp_size = sizeof (ipif->ipif_v6lcl_addr);
	else
		mp_size = sizeof (ipif->ipif_lcl_addr);
	mp = allocb(mp_size, BPRI_MED);
	if (mp == NULL) {
		/* Allocation failed: rearm the recovery timer and retry. */
		mutex_enter(&ill->ill_lock);
		if (ipst->ips_ip_dup_recovery > 0 &&
		    ipif->ipif_recovery_id == 0 &&
		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
		}
		mutex_exit(&ill->ill_lock);
	} else {
		/*
		 * A recovery timer may still be running if we got here from
		 * ill_restart_dad(); cancel that timer.
		 */
		if (ipif->ipif_recovery_id != 0)
			(void) untimeout(ipif->ipif_recovery_id);
		ipif->ipif_recovery_id = 0;

		if (ipif->ipif_isv6) {
			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
			    sizeof (ipif->ipif_v6lcl_addr));
		} else {
			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
			    sizeof (ipif->ipif_lcl_addr));
		}
		/* ip_addr_recover() consumes mp and the ill reference. */
		ill_refhold(ill);
		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
		    B_FALSE);
	}
}
1366 1596
1367 1597 /*
1368 1598 * Find the MAC and IP addresses in an NA/NS message.
1369 1599 */
1370 1600 static void
1371 1601 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1372 1602 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1373 1603 {
1374 1604 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1375 1605 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1376 1606 uchar_t *addr;
1377 1607 int alen;
1378 1608
1379 1609 /* icmp_inbound_v6 ensures this */
1380 1610 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1381 1611
1382 1612 addr = ira->ira_l2src;
1383 1613 alen = ill->ill_phys_addr_length;
1384 1614 if (alen > 0) {
1385 1615 *haddr = addr;
1386 1616 *haddrlenp = alen;
1387 1617 } else {
1388 1618 *haddr = NULL;
1389 1619 *haddrlenp = 0;
1390 1620 }
1391 1621
1392 1622 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1393 1623 *targp = ns->nd_ns_target;
1394 1624 }
1395 1625
1396 1626 /*
1397 1627 * This is for exclusive changes due to NDP duplicate address detection
1398 1628 * failure.
1399 1629 */
1400 1630 /* ARGSUSED */
1401 1631 static void
1402 1632 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1403 1633 {
1404 1634 ill_t *ill = rq->q_ptr;
1405 1635 ipif_t *ipif;
1406 1636 uchar_t *haddr;
1407 1637 uint_t haddrlen;
1408 1638 ip_stack_t *ipst = ill->ill_ipst;
1409 1639 in6_addr_t targ;
1410 1640 ip_recv_attr_t iras;
1411 1641 mblk_t *attrmp;
1412 1642
1413 1643 attrmp = mp;
1414 1644 mp = mp->b_cont;
1415 1645 attrmp->b_cont = NULL;
1416 1646 if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1417 1647 /* The ill or ip_stack_t disappeared on us */
1418 1648 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1419 1649 ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1420 1650 freemsg(mp);
1421 1651 ira_cleanup(&iras, B_TRUE);
1422 1652 return;
1423 1653 }
1424 1654
1425 1655 ASSERT(ill == iras.ira_rill);
1426 1656
1427 1657 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1428 1658 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1429 1659 /*
1430 1660 * Ignore conflicts generated by misbehaving switches that
1431 1661 * just reflect our own messages back to us. For IPMP, we may
1432 1662 * see reflections across any ill in the illgrp.
1433 1663 *
1434 1664 * RFC2462 and revisions tried to detect both the case
1435 1665 * when a statically configured IPv6 address is a duplicate,
1436 1666 * and the case when the L2 address itself is a duplicate. The
1437 1667 * later is important because, with stateles address autoconf,
1438 1668 * if the L2 address is a duplicate, the resulting IPv6
1439 1669 * address(es) would also be duplicates. We rely on DAD of the
1440 1670 * IPv6 address itself to detect the latter case.
1441 1671 */
1442 1672 /* For an under ill_grp can change under lock */
1443 1673 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1444 1674 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1445 1675 IS_UNDER_IPMP(ill) &&
1446 1676 ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1447 1677 haddrlen) != NULL) {
1448 1678 rw_exit(&ipst->ips_ill_g_lock);
1449 1679 goto ignore_conflict;
1450 1680 }
1451 1681 rw_exit(&ipst->ips_ill_g_lock);
1452 1682 }
1453 1683
1454 1684 /*
1455 1685 * Look up the appropriate ipif.
1456 1686 */
1457 1687 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1458 1688 if (ipif == NULL)
1459 1689 goto ignore_conflict;
1460 1690
1461 1691 /* Reload the ill to match the ipif */
1462 1692 ill = ipif->ipif_ill;
1463 1693
1464 1694 /* If it's already duplicate or ineligible, then don't do anything. */
1465 1695 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1466 1696 ipif_refrele(ipif);
1467 1697 goto ignore_conflict;
1468 1698 }
1469 1699
1470 1700 /*
1471 1701 * If this is a failure during duplicate recovery, then don't
1472 1702 * complain. It may take a long time to recover.
1473 1703 */
1474 1704 if (!ipif->ipif_was_dup) {
1475 1705 char ibuf[LIFNAMSIZ];
1476 1706 char hbuf[MAC_STR_LEN];
1477 1707 char sbuf[INET6_ADDRSTRLEN];
1478 1708
1479 1709 ipif_get_name(ipif, ibuf, sizeof (ibuf));
1480 1710 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1481 1711 " disabled", ibuf,
1482 1712 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1483 1713 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1484 1714 }
1485 1715 mutex_enter(&ill->ill_lock);
1486 1716 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1487 1717 ipif->ipif_flags |= IPIF_DUPLICATE;
1488 1718 ill->ill_ipif_dup_count++;
1489 1719 mutex_exit(&ill->ill_lock);
1490 1720 (void) ipif_down(ipif, NULL, NULL);
1491 1721 (void) ipif_down_tail(ipif);
1492 1722 mutex_enter(&ill->ill_lock);
1493 1723 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1494 1724 ill->ill_net_type == IRE_IF_RESOLVER &&
1495 1725 !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1496 1726 ipst->ips_ip_dup_recovery > 0) {
1497 1727 ASSERT(ipif->ipif_recovery_id == 0);
1498 1728 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1499 1729 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1500 1730 }
1501 1731 mutex_exit(&ill->ill_lock);
1502 1732 ipif_refrele(ipif);
1503 1733
1504 1734 ignore_conflict:
1505 1735 freemsg(mp);
1506 1736 ira_cleanup(&iras, B_TRUE);
1507 1737 }
1508 1738
1509 1739 /*
1510 1740 * Handle failure by tearing down the ipifs with the specified address. Note
1511 1741 * that tearing down the ipif also means deleting the ncec through ipif_down, so
1512 1742 * it's not possible to do recovery by just restarting the ncec timer. Instead,
1513 1743 * we start a timer on the ipif.
1514 1744 * Caller has to free mp;
1515 1745 */
1516 1746 static void
1517 1747 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1518 1748 {
1519 1749 const uchar_t *haddr;
1520 1750 ill_t *ill = ira->ira_rill;
1521 1751
1522 1752 /*
1523 1753 * Ignore conflicts generated by misbehaving switches that just
1524 1754 * reflect our own messages back to us.
1525 1755 */
1526 1756
1527 1757 /* icmp_inbound_v6 ensures this */
1528 1758 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1529 1759 haddr = ira->ira_l2src;
1530 1760 if (haddr != NULL &&
1531 1761 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1532 1762 return;
1533 1763 }
1534 1764
1535 1765 if ((mp = copymsg(mp)) != NULL) {
1536 1766 mblk_t *attrmp;
1537 1767
1538 1768 attrmp = ip_recv_attr_to_mblk(ira);
1539 1769 if (attrmp == NULL) {
1540 1770 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1541 1771 ip_drop_input("ipIfStatsInDiscards", mp, ill);
1542 1772 freemsg(mp);
1543 1773 } else {
1544 1774 ASSERT(attrmp->b_cont == NULL);
1545 1775 attrmp->b_cont = mp;
1546 1776 mp = attrmp;
1547 1777 ill_refhold(ill);
1548 1778 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1549 1779 B_FALSE);
1550 1780 }
1551 1781 }
1552 1782 }
1553 1783
1554 1784 /*
1555 1785 * Handle a discovered conflict: some other system is advertising that it owns
1556 1786 * one of our IP addresses. We need to defend ourselves, or just shut down the
1557 1787 * interface.
1558 1788 *
1559 1789 * Handles both IPv4 and IPv6
1560 1790 */
boolean_t
ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
{
	ipif_t *ipif;
	clock_t now;
	uint_t maxdefense;
	uint_t defs;
	ill_t *ill = ira->ira_ill;
	ip_stack_t *ipst = ill->ill_ipst;
	uint32_t elapsed;
	boolean_t isv6 = ill->ill_isv6;
	ipaddr_t ncec_addr;

	/* Find the ipif (if any) that owns the conflicted address. */
	if (isv6) {
		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
		    ipst);
	} else {
		if (arp_no_defense) {
			/*
			 * Yes, there is a conflict, but no, we do not
			 * defend ourself.
			 */
			return (B_TRUE);
		}
		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
		    ipst);
	}
	/* Not one of our own addresses: nothing to defend. */
	if (ipif == NULL)
		return (B_FALSE);

	/*
	 * First, figure out if this address is disposable.
	 */
	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
		maxdefense = ipst->ips_ip_max_temp_defend;
	else
		maxdefense = ipst->ips_ip_max_defend;

	/*
	 * Now figure out how many times we've defended ourselves. Ignore
	 * defenses that happened long in the past.
	 */
	now = ddi_get_lbolt();
	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
	mutex_enter(&ncec->ncec_lock);
	if ((defs = ncec->ncec_defense_count) > 0 &&
	    elapsed > ipst->ips_ip_defend_interval) {
		/*
		 * ip_defend_interval has elapsed.
		 * reset the defense count.
		 */
		ncec->ncec_defense_count = defs = 0;
	}
	ncec->ncec_defense_count++;
	ncec->ncec_last_time_defended = now;
	mutex_exit(&ncec->ncec_lock);
	ipif_refrele(ipif);

	/*
	 * If we've defended ourselves too many times already, then give up and
	 * tear down the interface(s) using this address.
	 * Otherwise, caller has to defend by sending out an announce.
	 */
	if (defs >= maxdefense) {
		if (isv6)
			ndp_failure(mp, ira);
		else
			arp_failure(mp, ira);
	} else {
		return (B_TRUE);	/* caller must defend this address */
	}
	return (B_FALSE);
}
1635 1865
1636 1866 /*
1637 1867 * Handle reception of Neighbor Solicitation messages.
1638 1868 */
1639 1869 static void
1640 1870 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1641 1871 {
1642 1872 ill_t *ill = ira->ira_ill, *under_ill;
1643 1873 nd_neighbor_solicit_t *ns;
1644 1874 uint32_t hlen = ill->ill_phys_addr_length;
1645 1875 uchar_t *haddr = NULL;
1646 1876 icmp6_t *icmp_nd;
1647 1877 ip6_t *ip6h;
1648 1878 ncec_t *our_ncec = NULL;
1649 1879 in6_addr_t target;
1650 1880 in6_addr_t src;
1651 1881 int len;
1652 1882 int flag = 0;
1653 1883 nd_opt_hdr_t *opt = NULL;
1654 1884 boolean_t bad_solicit = B_FALSE;
1655 1885 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1656 1886 boolean_t need_ill_refrele = B_FALSE;
1657 1887
1658 1888 ip6h = (ip6_t *)mp->b_rptr;
1659 1889 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1660 1890 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1661 1891 src = ip6h->ip6_src;
1662 1892 ns = (nd_neighbor_solicit_t *)icmp_nd;
1663 1893 target = ns->nd_ns_target;
1664 1894 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1665 1895 IN6_IS_ADDR_LOOPBACK(&target)) {
1666 1896 if (ip_debug > 2) {
1667 1897 /* ip1dbg */
1668 1898 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1669 1899 AF_INET6, &target);
1670 1900 }
1671 1901 bad_solicit = B_TRUE;
1672 1902 goto done;
1673 1903 }
1674 1904 if (len > sizeof (nd_neighbor_solicit_t)) {
1675 1905 /* Options present */
1676 1906 opt = (nd_opt_hdr_t *)&ns[1];
1677 1907 len -= sizeof (nd_neighbor_solicit_t);
1678 1908 if (!ndp_verify_optlen(opt, len)) {
1679 1909 ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1680 1910 bad_solicit = B_TRUE;
1681 1911 goto done;
1682 1912 }
1683 1913 }
1684 1914 if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1685 1915 /* Check to see if this is a valid DAD solicitation */
1686 1916 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1687 1917 if (ip_debug > 2) {
1688 1918 /* ip1dbg */
1689 1919 pr_addr_dbg("ndp_input_solicit: IPv6 "
1690 1920 "Destination is not solicited node "
1691 1921 "multicast %s\n", AF_INET6,
1692 1922 &ip6h->ip6_dst);
1693 1923 }
1694 1924 bad_solicit = B_TRUE;
1695 1925 goto done;
1696 1926 }
1697 1927 }
1698 1928
1699 1929 /*
1700 1930 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1701 1931 * received this packet if it's multicast) is not the ill tied to
1702 1932 * e.g. the IPMP ill's data link-local. So we match across the illgrp
1703 1933 * to ensure we find the associated NCE.
1704 1934 */
1705 1935 our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1706 1936 /*
1707 1937 * If this is a valid Solicitation for an address we are publishing,
1708 1938 * then a PUBLISH entry should exist in the cache
1709 1939 */
1710 1940 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1711 1941 ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1712 1942 "ifname=%s ", ill->ill_name));
1713 1943 if (ip_debug > 2) {
1714 1944 /* ip1dbg */
1715 1945 pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1716 1946 }
1717 1947 if (our_ncec == NULL)
1718 1948 bad_solicit = B_TRUE;
1719 1949 goto done;
1720 1950 }
1721 1951
1722 1952 /* At this point we should have a verified NS per spec */
1723 1953 if (opt != NULL) {
1724 1954 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1725 1955 if (opt != NULL) {
1726 1956 haddr = (uchar_t *)&opt[1];
1727 1957 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1728 1958 hlen == 0) {
1729 1959 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1730 1960 bad_solicit = B_TRUE;
1731 1961 goto done;
1732 1962 }
1733 1963 }
1734 1964 }
1735 1965
1736 1966 /* If sending directly to peer, set the unicast flag */
1737 1967 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1738 1968 flag |= NDP_UNICAST;
1739 1969
1740 1970 /*
1741 1971 * Create/update the entry for the soliciting node on the ipmp_ill.
1742 1972 * or respond to outstanding queries, don't if
1743 1973 * the source is unspecified address.
1744 1974 */
1745 1975 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1746 1976 int err;
1747 1977 nce_t *nnce;
1748 1978
1749 1979 ASSERT(ill->ill_isv6);
1750 1980 /*
1751 1981 * Regular solicitations *must* include the Source Link-Layer
1752 1982 * Address option. Ignore messages that do not.
1753 1983 */
1754 1984 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1755 1985 ip1dbg(("ndp_input_solicit: source link-layer address "
1756 1986 "option missing with a specified source.\n"));
1757 1987 bad_solicit = B_TRUE;
1758 1988 goto done;
1759 1989 }
1760 1990
1761 1991 /*
1762 1992 * This is a regular solicitation. If we're still in the
1763 1993 * process of verifying the address, then don't respond at all
1764 1994 * and don't keep track of the sender.
1765 1995 */
1766 1996 if (our_ncec->ncec_state == ND_PROBE)
1767 1997 goto done;
1768 1998
1769 1999 /*
1770 2000 * If the solicitation doesn't have sender hardware address
1771 2001 * (legal for unicast solicitation), then process without
1772 2002 * installing the return NCE. Either we already know it, or
1773 2003 * we'll be forced to look it up when (and if) we reply to the
1774 2004 * packet.
1775 2005 */
1776 2006 if (haddr == NULL)
1777 2007 goto no_source;
1778 2008
1779 2009 under_ill = ill;
1780 2010 if (IS_UNDER_IPMP(under_ill)) {
1781 2011 ill = ipmp_ill_hold_ipmp_ill(under_ill);
1782 2012 if (ill == NULL)
1783 2013 ill = under_ill;
1784 2014 else
1785 2015 need_ill_refrele = B_TRUE;
1786 2016 }
1787 2017 err = nce_lookup_then_add_v6(ill,
1788 2018 haddr, hlen,
1789 2019 &src, /* Soliciting nodes address */
1790 2020 0,
1791 2021 ND_STALE,
1792 2022 &nnce);
1793 2023
1794 2024 if (need_ill_refrele) {
1795 2025 ill_refrele(ill);
1796 2026 ill = under_ill;
1797 2027 need_ill_refrele = B_FALSE;
1798 2028 }
1799 2029 switch (err) {
1800 2030 case 0:
1801 2031 /* done with this entry */
1802 2032 nce_refrele(nnce);
1803 2033 break;
1804 2034 case EEXIST:
1805 2035 /*
1806 2036 * B_FALSE indicates this is not an an advertisement.
1807 2037 */
1808 2038 nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1809 2039 nce_refrele(nnce);
1810 2040 break;
1811 2041 default:
1812 2042 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1813 2043 err));
1814 2044 goto done;
1815 2045 }
1816 2046 no_source:
1817 2047 flag |= NDP_SOLICITED;
1818 2048 } else {
1819 2049 /*
1820 2050 * No source link layer address option should be present in a
1821 2051 * valid DAD request.
1822 2052 */
1823 2053 if (haddr != NULL) {
1824 2054 ip1dbg(("ndp_input_solicit: source link-layer address "
1825 2055 "option present with an unspecified source.\n"));
1826 2056 bad_solicit = B_TRUE;
1827 2057 goto done;
1828 2058 }
1829 2059 if (our_ncec->ncec_state == ND_PROBE) {
1830 2060 /*
1831 2061 * Internally looped-back probes will have
1832 2062 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1833 2063 * transmissions.
1834 2064 */
1835 2065 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1836 2066 /*
1837 2067 * If someone else is probing our address, then
1838 2068 * we've crossed wires. Declare failure.
1839 2069 */
1840 2070 ndp_failure(mp, ira);
1841 2071 }
1842 2072 goto done;
1843 2073 }
1844 2074 /*
1845 2075 * This is a DAD probe. Multicast the advertisement to the
1846 2076 * all-nodes address.
1847 2077 */
1848 2078 src = ipv6_all_hosts_mcast;
1849 2079 }
1850 2080 flag |= nce_advert_flags(our_ncec);
1851 2081 (void) ndp_xmit(ill,
1852 2082 ND_NEIGHBOR_ADVERT,
1853 2083 our_ncec->ncec_lladdr,
1854 2084 our_ncec->ncec_lladdr_length,
1855 2085 &target, /* Source and target of the advertisement pkt */
1856 2086 &src, /* IP Destination (source of original pkt) */
1857 2087 flag);
1858 2088 done:
1859 2089 if (bad_solicit)
1860 2090 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1861 2091 if (our_ncec != NULL)
1862 2092 ncec_refrele(our_ncec);
1863 2093 }
1864 2094
1865 2095 /*
1866 2096 * Handle reception of Neighbor Solicitation messages
1867 2097 */
1868 2098 void
1869 2099 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1870 2100 {
1871 2101 ill_t *ill = ira->ira_ill;
1872 2102 nd_neighbor_advert_t *na;
1873 2103 uint32_t hlen = ill->ill_phys_addr_length;
1874 2104 uchar_t *haddr = NULL;
1875 2105 icmp6_t *icmp_nd;
1876 2106 ip6_t *ip6h;
1877 2107 ncec_t *dst_ncec = NULL;
1878 2108 in6_addr_t target;
1879 2109 nd_opt_hdr_t *opt = NULL;
1880 2110 int len;
1881 2111 ip_stack_t *ipst = ill->ill_ipst;
1882 2112 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1883 2113
1884 2114 ip6h = (ip6_t *)mp->b_rptr;
1885 2115 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1886 2116 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1887 2117 na = (nd_neighbor_advert_t *)icmp_nd;
1888 2118
1889 2119 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1890 2120 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1891 2121 ip1dbg(("ndp_input_advert: Target is multicast but the "
1892 2122 "solicited flag is not zero\n"));
1893 2123 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1894 2124 return;
1895 2125 }
1896 2126 target = na->nd_na_target;
1897 2127 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1898 2128 IN6_IS_ADDR_LOOPBACK(&target)) {
1899 2129 if (ip_debug > 2) {
1900 2130 /* ip1dbg */
1901 2131 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1902 2132 AF_INET6, &target);
1903 2133 }
1904 2134 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1905 2135 return;
1906 2136 }
1907 2137 if (len > sizeof (nd_neighbor_advert_t)) {
1908 2138 opt = (nd_opt_hdr_t *)&na[1];
1909 2139 if (!ndp_verify_optlen(opt,
1910 2140 len - sizeof (nd_neighbor_advert_t))) {
1911 2141 ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1912 2142 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1913 2143 return;
1914 2144 }
1915 2145 /* At this point we have a verified NA per spec */
1916 2146 len -= sizeof (nd_neighbor_advert_t);
1917 2147 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1918 2148 if (opt != NULL) {
1919 2149 haddr = (uchar_t *)&opt[1];
1920 2150 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1921 2151 hlen == 0) {
1922 2152 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1923 2153 BUMP_MIB(mib,
1924 2154 ipv6IfIcmpInBadNeighborAdvertisements);
1925 2155 return;
1926 2156 }
1927 2157 }
1928 2158 }
1929 2159
1930 2160 /*
1931 2161 * NOTE: we match across the illgrp since we need to do DAD for all of
1932 2162 * our local addresses, and those are spread across all the active
1933 2163 * ills in the group.
1934 2164 */
1935 2165 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1936 2166 return;
1937 2167
1938 2168 if (NCE_PUBLISH(dst_ncec)) {
1939 2169 /*
1940 2170 * Someone just advertised an addresses that we publish. First,
1941 2171 * check it it was us -- if so, we can safely ignore it.
1942 2172 * We don't get the haddr from the ira_l2src because, in the
1943 2173 * case that the packet originated from us, on an IPMP group,
1944 2174 * the ira_l2src may would be the link-layer address of the
1945 2175 * cast_ill used to send the packet, which may not be the same
1946 2176 * as the dst_ncec->ncec_lladdr of the address.
1947 2177 */
1948 2178 if (haddr != NULL) {
1949 2179 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1950 2180 goto out;
1951 2181
1952 2182 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1953 2183 goto out; /* from us -- no conflict */
1954 2184
1955 2185 /*
1956 2186 * If we're in an IPMP group, check if this is an echo
1957 2187 * from another ill in the group. Use the double-
1958 2188 * checked locking pattern to avoid grabbing
1959 2189 * ill_g_lock in the non-IPMP case.
1960 2190 */
1961 2191 if (IS_UNDER_IPMP(ill)) {
1962 2192 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1963 2193 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1964 2194 ill->ill_grp, haddr, hlen) != NULL) {
1965 2195 rw_exit(&ipst->ips_ill_g_lock);
1966 2196 goto out;
1967 2197 }
1968 2198 rw_exit(&ipst->ips_ill_g_lock);
1969 2199 }
1970 2200 }
1971 2201
1972 2202 /*
1973 2203 * This appears to be a real conflict. If we're trying to
1974 2204 * configure this NCE (ND_PROBE), then shut it down.
1975 2205 * Otherwise, handle the discovered conflict.
1976 2206 */
1977 2207 if (dst_ncec->ncec_state == ND_PROBE) {
1978 2208 ndp_failure(mp, ira);
1979 2209 } else {
1980 2210 if (ip_nce_conflict(mp, ira, dst_ncec)) {
1981 2211 char hbuf[MAC_STR_LEN];
1982 2212 char sbuf[INET6_ADDRSTRLEN];
1983 2213
1984 2214 cmn_err(CE_WARN,
1985 2215 "node '%s' is using %s on %s",
1986 2216 inet_ntop(AF_INET6, &target, sbuf,
1987 2217 sizeof (sbuf)),
1988 2218 haddr == NULL ? "<none>" :
1989 2219 mac_colon_addr(haddr, hlen, hbuf,
1990 2220 sizeof (hbuf)), ill->ill_name);
1991 2221 /*
1992 2222 * RFC 4862, Section 5.4.4 does not mandate
1993 2223 * any specific behavior when an NA matches
1994 2224 * a non-tentative address assigned to the
1995 2225 * receiver. We make the choice of defending
1996 2226 * our address, based on the assumption that
1997 2227 * the sender has not detected the Duplicate.
1998 2228 *
1999 2229 * ncec_last_time_defended has been adjusted
2000 2230 * in ip_nce_conflict()
2001 2231 */
2002 2232 (void) ndp_announce(dst_ncec);
2003 2233 }
2004 2234 }
2005 2235 } else {
2006 2236 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2007 2237 dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2008 2238
2009 2239 /* B_TRUE indicates this an advertisement */
2010 2240 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2011 2241 }
2012 2242 out:
2013 2243 ncec_refrele(dst_ncec);
2014 2244 }
2015 2245
2016 2246 /*
2017 2247 * Process NDP neighbor solicitation/advertisement messages.
2018 2248 * The checksum has already checked o.k before reaching here.
2019 2249 * Information about the datalink header is contained in ira_l2src, but
2020 2250 * that should be ignored for loopback packets.
2021 2251 */
2022 2252 void
2023 2253 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2024 2254 {
2025 2255 ill_t *ill = ira->ira_rill;
2026 2256 icmp6_t *icmp_nd;
2027 2257 ip6_t *ip6h;
2028 2258 int len;
2029 2259 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
2030 2260 ill_t *orig_ill = NULL;
2031 2261
2032 2262 /*
2033 2263 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2034 2264 * and make it be the IPMP upper so avoid being confused by a packet
2035 2265 * addressed to a unicast address on a different ill.
2036 2266 */
2037 2267 if (IS_UNDER_IPMP(ill)) {
2038 2268 orig_ill = ill;
2039 2269 ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2040 2270 if (ill == NULL) {
2041 2271 ill = orig_ill;
2042 2272 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2043 2273 ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2044 2274 mp, ill);
2045 2275 freemsg(mp);
2046 2276 return;
2047 2277 }
2048 2278 ASSERT(ill != orig_ill);
2049 2279 orig_ill = ira->ira_ill;
2050 2280 ira->ira_ill = ill;
2051 2281 mib = ill->ill_icmp6_mib;
2052 2282 }
2053 2283 if (!pullupmsg(mp, -1)) {
2054 2284 ip1dbg(("ndp_input: pullupmsg failed\n"));
2055 2285 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2056 2286 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2057 2287 goto done;
2058 2288 }
2059 2289 ip6h = (ip6_t *)mp->b_rptr;
2060 2290 if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2061 2291 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2062 2292 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2063 2293 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2064 2294 goto done;
2065 2295 }
2066 2296 /*
2067 2297 * NDP does not accept any extension headers between the
2068 2298 * IP header and the ICMP header since e.g. a routing
2069 2299 * header could be dangerous.
2070 2300 * This assumes that any AH or ESP headers are removed
2071 2301 * by ip prior to passing the packet to ndp_input.
2072 2302 */
2073 2303 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2074 2304 ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2075 2305 ip6h->ip6_nxt));
2076 2306 ip_drop_input("Wrong next header", mp, ill);
2077 2307 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2078 2308 goto done;
2079 2309 }
2080 2310 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2081 2311 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2082 2312 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2083 2313 if (icmp_nd->icmp6_code != 0) {
2084 2314 ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2085 2315 ip_drop_input("code non-zero", mp, ill);
2086 2316 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2087 2317 goto done;
2088 2318 }
2089 2319 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2090 2320 /*
2091 2321 * Make sure packet length is large enough for either
2092 2322 * a NS or a NA icmp packet.
2093 2323 */
2094 2324 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2095 2325 ip1dbg(("ndp_input: packet too short\n"));
2096 2326 ip_drop_input("packet too short", mp, ill);
2097 2327 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2098 2328 goto done;
2099 2329 }
2100 2330 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2101 2331 ndp_input_solicit(mp, ira);
2102 2332 } else {
2103 2333 ndp_input_advert(mp, ira);
2104 2334 }
2105 2335 done:
2106 2336 freemsg(mp);
2107 2337 if (orig_ill != NULL) {
2108 2338 ill_refrele(ill);
2109 2339 ira->ira_ill = orig_ill;
2110 2340 }
2111 2341 }
2112 2342
2113 2343 /*
2114 2344 * ndp_xmit is called to form and transmit a ND solicitation or
2115 2345 * advertisement ICMP packet.
2116 2346 *
2117 2347 * If the source address is unspecified and this isn't a probe (used for
2118 2348 * duplicate address detection), an appropriate source address and link layer
2119 2349 * address will be chosen here. The link layer address option is included if
2120 2350 * the source is specified (i.e., all non-probe packets), and omitted (per the
2121 2351 * specification) otherwise.
2122 2352 *
2123 2353 * It returns B_FALSE only if it does a successful put() to the
2124 2354 * corresponding ill's ill_wq otherwise returns B_TRUE.
2125 2355 */
2126 2356 static boolean_t
2127 2357 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2128 2358 const in6_addr_t *sender, const in6_addr_t *target, int flag)
2129 2359 {
2130 2360 uint32_t len;
2131 2361 icmp6_t *icmp6;
2132 2362 mblk_t *mp;
2133 2363 ip6_t *ip6h;
2134 2364 nd_opt_hdr_t *opt;
2135 2365 uint_t plen;
2136 2366 zoneid_t zoneid = GLOBAL_ZONEID;
2137 2367 ill_t *hwaddr_ill = ill;
2138 2368 ip_xmit_attr_t ixas;
2139 2369 ip_stack_t *ipst = ill->ill_ipst;
2140 2370 boolean_t need_refrele = B_FALSE;
2141 2371 boolean_t probe = B_FALSE;
2142 2372
2143 2373 if (IS_UNDER_IPMP(ill)) {
2144 2374 probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2145 2375 /*
2146 2376 * We send non-probe packets on the upper IPMP interface.
2147 2377 * ip_output_simple() will use cast_ill for sending any
2148 2378 * multicast packets. Note that we can't follow the same
2149 2379 * logic for probe packets because all interfaces in the ipmp
2150 2380 * group may have failed, so that we really want to only try
2151 2381 * to send the ND packet on the ill corresponding to the src
2152 2382 * address.
2153 2383 */
2154 2384 if (!probe) {
2155 2385 ill = ipmp_ill_hold_ipmp_ill(ill);
2156 2386 if (ill != NULL)
2157 2387 need_refrele = B_TRUE;
2158 2388 else
2159 2389 ill = hwaddr_ill;
2160 2390 }
2161 2391 }
2162 2392
2163 2393 /*
2164 2394 * If we have a unspecified source(sender) address, select a
2165 2395 * proper source address for the solicitation here itself so
2166 2396 * that we can initialize the h/w address correctly.
2167 2397 *
2168 2398 * If the sender is specified then we use this address in order
2169 2399 * to lookup the zoneid before calling ip_output_v6(). This is to
2170 2400 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2171 2401 * by IP (we cannot guarantee that the global zone has an interface
2172 2402 * route to the destination).
2173 2403 *
2174 2404 * Note that the NA never comes here with the unspecified source
2175 2405 * address.
2176 2406 */
2177 2407
2178 2408 /*
2179 2409 * Probes will have unspec src at this point.
2180 2410 */
2181 2411 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2182 2412 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2183 2413 /*
2184 2414 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2185 2415 * ALL_ZONES if it cannot find a matching ipif for the address
2186 2416 * we are trying to use. In this case we err on the side of
2187 2417 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2188 2418 */
2189 2419 if (zoneid == ALL_ZONES)
2190 2420 zoneid = GLOBAL_ZONEID;
2191 2421 }
2192 2422
2193 2423 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2194 2424 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2195 2425 mp = allocb(len, BPRI_LO);
2196 2426 if (mp == NULL) {
2197 2427 if (need_refrele)
2198 2428 ill_refrele(ill);
2199 2429 return (B_TRUE);
2200 2430 }
2201 2431
2202 2432 bzero((char *)mp->b_rptr, len);
2203 2433 mp->b_wptr = mp->b_rptr + len;
2204 2434
2205 2435 bzero(&ixas, sizeof (ixas));
2206 2436 ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2207 2437
2208 2438 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2209 2439 ixas.ixa_ipst = ipst;
2210 2440 ixas.ixa_cred = kcred;
2211 2441 ixas.ixa_cpid = NOPID;
2212 2442 ixas.ixa_tsl = NULL;
2213 2443 ixas.ixa_zoneid = zoneid;
2214 2444
2215 2445 ip6h = (ip6_t *)mp->b_rptr;
2216 2446 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2217 2447 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2218 2448 ip6h->ip6_nxt = IPPROTO_ICMPV6;
2219 2449 ip6h->ip6_hops = IPV6_MAX_HOPS;
2220 2450 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2221 2451 ip6h->ip6_dst = *target;
2222 2452 icmp6 = (icmp6_t *)&ip6h[1];
2223 2453
2224 2454 if (hw_addr_len != 0) {
2225 2455 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2226 2456 sizeof (nd_neighbor_advert_t));
2227 2457 } else {
2228 2458 opt = NULL;
2229 2459 }
2230 2460 if (operation == ND_NEIGHBOR_SOLICIT) {
2231 2461 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2232 2462
2233 2463 if (opt != NULL && !(flag & NDP_PROBE)) {
2234 2464 /*
2235 2465 * Note that we don't send out SLLA for ND probes
2236 2466 * per RFC 4862, even though we do send out the src
2237 2467 * haddr for IPv4 DAD probes, even though both IPv4
2238 2468 * and IPv6 go out with the unspecified/INADDR_ANY
2239 2469 * src IP addr.
2240 2470 */
2241 2471 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2242 2472 }
2243 2473 ip6h->ip6_src = *sender;
2244 2474 ns->nd_ns_target = *target;
2245 2475 if (!(flag & NDP_UNICAST)) {
2246 2476 /* Form multicast address of the target */
2247 2477 ip6h->ip6_dst = ipv6_solicited_node_mcast;
2248 2478 ip6h->ip6_dst.s6_addr32[3] |=
2249 2479 ns->nd_ns_target.s6_addr32[3];
2250 2480 }
2251 2481 } else {
2252 2482 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2253 2483
2254 2484 ASSERT(!(flag & NDP_PROBE));
2255 2485 if (opt != NULL)
2256 2486 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2257 2487 ip6h->ip6_src = *sender;
2258 2488 na->nd_na_target = *sender;
2259 2489 if (flag & NDP_ISROUTER)
2260 2490 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2261 2491 if (flag & NDP_SOLICITED)
2262 2492 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2263 2493 if (flag & NDP_ORIDE)
2264 2494 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2265 2495 }
2266 2496
2267 2497 if (!(flag & NDP_PROBE)) {
2268 2498 if (hw_addr != NULL && opt != NULL) {
2269 2499 /* Fill in link layer address and option len */
2270 2500 opt->nd_opt_len = (uint8_t)plen;
2271 2501 bcopy(hw_addr, &opt[1], hw_addr_len);
2272 2502 }
2273 2503 }
2274 2504 if (opt != NULL && opt->nd_opt_type == 0) {
2275 2505 /* If there's no link layer address option, then strip it. */
2276 2506 len -= plen * 8;
2277 2507 mp->b_wptr = mp->b_rptr + len;
2278 2508 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2279 2509 }
2280 2510
2281 2511 icmp6->icmp6_type = (uint8_t)operation;
2282 2512 icmp6->icmp6_code = 0;
2283 2513 /*
2284 2514 * Prepare for checksum by putting icmp length in the icmp
2285 2515 * checksum field. The checksum is calculated in ip_output.c.
2286 2516 */
2287 2517 icmp6->icmp6_cksum = ip6h->ip6_plen;
2288 2518
2289 2519 (void) ip_output_simple(mp, &ixas);
2290 2520 ixa_cleanup(&ixas);
2291 2521 if (need_refrele)
2292 2522 ill_refrele(ill);
2293 2523 return (B_FALSE);
2294 2524 }
2295 2525
2296 2526 /*
2297 2527 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
2298 2528 * The datapath uses this as an indication that there
2299 2529 * is a problem (as opposed to a NCE that was just
2300 2530 * reclaimed due to lack of memory.
2301 2531 * Note that static ARP entries never become unreachable.
2302 2532 */
2303 2533 void
2304 2534 nce_make_unreachable(ncec_t *ncec)
2305 2535 {
2306 2536 mutex_enter(&ncec->ncec_lock);
2307 2537 ncec->ncec_state = ND_UNREACHABLE;
2308 2538 mutex_exit(&ncec->ncec_lock);
2309 2539 }
2310 2540
2311 2541 /*
2312 2542 * NCE retransmit timer. Common to IPv4 and IPv6.
2313 2543 * This timer goes off when:
2314 2544 * a. It is time to retransmit a resolution for resolver.
2315 2545 * b. It is time to send reachability probes.
2316 2546 */
2317 2547 void
2318 2548 nce_timer(void *arg)
2319 2549 {
2320 2550 ncec_t *ncec = arg;
2321 2551 ill_t *ill = ncec->ncec_ill, *src_ill;
2322 2552 char addrbuf[INET6_ADDRSTRLEN];
2323 2553 boolean_t dropped = B_FALSE;
2324 2554 ip_stack_t *ipst = ncec->ncec_ipst;
2325 2555 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2326 2556 in_addr_t sender4 = INADDR_ANY;
2327 2557 in6_addr_t sender6 = ipv6_all_zeros;
2328 2558
2329 2559 /*
2330 2560 * The timer has to be cancelled by ncec_delete before doing the final
2331 2561 * refrele. So the NCE is guaranteed to exist when the timer runs
2332 2562 * until it clears the timeout_id. Before clearing the timeout_id
2333 2563 * bump up the refcnt so that we can continue to use the ncec
2334 2564 */
2335 2565 ASSERT(ncec != NULL);
2336 2566 mutex_enter(&ncec->ncec_lock);
2337 2567 ncec_refhold_locked(ncec);
2338 2568 ncec->ncec_timeout_id = 0;
2339 2569 mutex_exit(&ncec->ncec_lock);
2340 2570
2341 2571 src_ill = nce_resolve_src(ncec, &sender6);
2342 2572 /* if we could not find a sender address, return */
2343 2573 if (src_ill == NULL) {
2344 2574 if (!isv6) {
2345 2575 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2346 2576 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2347 2577 &sender4, addrbuf, sizeof (addrbuf))));
2348 2578 } else {
2349 2579 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2350 2580 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2351 2581 }
2352 2582 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2353 2583 ncec_refrele(ncec);
2354 2584 return;
2355 2585 }
2356 2586 if (!isv6)
2357 2587 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2358 2588
2359 2589 mutex_enter(&ncec->ncec_lock);
2360 2590 /*
2361 2591 * Check the reachability state.
2362 2592 */
2363 2593 switch (ncec->ncec_state) {
2364 2594 case ND_DELAY:
2365 2595 ASSERT(ncec->ncec_lladdr != NULL);
2366 2596 ncec->ncec_state = ND_PROBE;
2367 2597 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2368 2598 if (isv6) {
2369 2599 mutex_exit(&ncec->ncec_lock);
2370 2600 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2371 2601 src_ill->ill_phys_addr,
2372 2602 src_ill->ill_phys_addr_length,
2373 2603 &sender6, &ncec->ncec_addr,
2374 2604 NDP_UNICAST);
2375 2605 } else {
2376 2606 dropped = (arp_request(ncec, sender4, src_ill) == 0);
2377 2607 mutex_exit(&ncec->ncec_lock);
2378 2608 }
2379 2609 if (!dropped) {
2380 2610 mutex_enter(&ncec->ncec_lock);
2381 2611 ncec->ncec_pcnt--;
2382 2612 mutex_exit(&ncec->ncec_lock);
2383 2613 }
2384 2614 if (ip_debug > 3) {
2385 2615 /* ip2dbg */
2386 2616 pr_addr_dbg("nce_timer: state for %s changed "
2387 2617 "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2388 2618 }
2389 2619 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2390 2620 break;
2391 2621 case ND_PROBE:
2392 2622 /* must be retransmit timer */
2393 2623 ASSERT(ncec->ncec_pcnt >= -1);
2394 2624 if (ncec->ncec_pcnt > 0) {
2395 2625 /*
2396 2626 * As per RFC2461, the ncec gets deleted after
2397 2627 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2398 2628 * Note that the first unicast solicitation is sent
2399 2629 * during the DELAY state.
2400 2630 */
2401 2631 ip2dbg(("nce_timer: pcount=%x dst %s\n",
2402 2632 ncec->ncec_pcnt,
2403 2633 inet_ntop((isv6? AF_INET6 : AF_INET),
2404 2634 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2405 2635 if (NCE_PUBLISH(ncec)) {
2406 2636 mutex_exit(&ncec->ncec_lock);
2407 2637 /*
2408 2638 * send out a probe; note that src_ill
2409 2639 * is ignored by nce_dad() for all
2410 2640 * DAD message types other than IPv6
2411 2641 * unicast probes
2412 2642 */
2413 2643 nce_dad(ncec, src_ill, B_TRUE);
2414 2644 } else {
2415 2645 ASSERT(src_ill != NULL);
2416 2646 if (isv6) {
2417 2647 mutex_exit(&ncec->ncec_lock);
2418 2648 dropped = ndp_xmit(src_ill,
2419 2649 ND_NEIGHBOR_SOLICIT,
2420 2650 src_ill->ill_phys_addr,
2421 2651 src_ill->ill_phys_addr_length,
2422 2652 &sender6, &ncec->ncec_addr,
2423 2653 NDP_UNICAST);
2424 2654 } else {
2425 2655 /*
2426 2656 * since the nce is REACHABLE,
2427 2657 * the ARP request will be sent out
2428 2658 * as a link-layer unicast.
2429 2659 */
2430 2660 dropped = (arp_request(ncec, sender4,
2431 2661 src_ill) == 0);
2432 2662 mutex_exit(&ncec->ncec_lock);
2433 2663 }
2434 2664 if (!dropped) {
2435 2665 mutex_enter(&ncec->ncec_lock);
2436 2666 ncec->ncec_pcnt--;
2437 2667 mutex_exit(&ncec->ncec_lock);
2438 2668 }
2439 2669 nce_restart_timer(ncec,
2440 2670 ill->ill_reachable_retrans_time);
2441 2671 }
2442 2672 } else if (ncec->ncec_pcnt < 0) {
2443 2673 /* No hope, delete the ncec */
2444 2674 /* Tell datapath it went bad */
2445 2675 ncec->ncec_state = ND_UNREACHABLE;
2446 2676 mutex_exit(&ncec->ncec_lock);
2447 2677 if (ip_debug > 2) {
2448 2678 /* ip1dbg */
2449 2679 pr_addr_dbg("nce_timer: Delete NCE for"
2450 2680 " dst %s\n", (isv6? AF_INET6: AF_INET),
2451 2681 &ncec->ncec_addr);
2452 2682 }
2453 2683 /* if static ARP can't delete. */
2454 2684 if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2455 2685 ncec_delete(ncec);
2456 2686
2457 2687 } else if (!NCE_PUBLISH(ncec)) {
2458 2688 /*
2459 2689 * Probe count is 0 for a dynamic entry (one that we
2460 2690 * ourselves are not publishing). We should never get
2461 2691 * here if NONUD was requested, hence the ASSERT below.
2462 2692 */
2463 2693 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2464 2694 ip2dbg(("nce_timer: pcount=%x dst %s\n",
2465 2695 ncec->ncec_pcnt, inet_ntop(AF_INET6,
2466 2696 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2467 2697 ncec->ncec_pcnt--;
2468 2698 mutex_exit(&ncec->ncec_lock);
2469 2699 /* Wait one interval before killing */
2470 2700 nce_restart_timer(ncec,
2471 2701 ill->ill_reachable_retrans_time);
2472 2702 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2473 2703 ipif_t *ipif;
2474 2704 ipaddr_t ncec_addr;
2475 2705
2476 2706 /*
2477 2707 * We're done probing, and we can now declare this
2478 2708 * address to be usable. Let IP know that it's ok to
2479 2709 * use.
2480 2710 */
2481 2711 ncec->ncec_state = ND_REACHABLE;
2482 2712 ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2483 2713 mutex_exit(&ncec->ncec_lock);
2484 2714 if (isv6) {
2485 2715 ipif = ipif_lookup_addr_exact_v6(
2486 2716 &ncec->ncec_addr, ill, ipst);
2487 2717 } else {
2488 2718 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2489 2719 ncec_addr);
2490 2720 ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2491 2721 ipst);
2492 2722 }
2493 2723 if (ipif != NULL) {
2494 2724 if (ipif->ipif_was_dup) {
2495 2725 char ibuf[LIFNAMSIZ];
2496 2726 char sbuf[INET6_ADDRSTRLEN];
2497 2727
2498 2728 ipif->ipif_was_dup = B_FALSE;
2499 2729 (void) inet_ntop(AF_INET6,
2500 2730 &ipif->ipif_v6lcl_addr,
2501 2731 sbuf, sizeof (sbuf));
2502 2732 ipif_get_name(ipif, ibuf,
2503 2733 sizeof (ibuf));
2504 2734 cmn_err(CE_NOTE, "recovered address "
2505 2735 "%s on %s", sbuf, ibuf);
2506 2736 }
2507 2737 if ((ipif->ipif_flags & IPIF_UP) &&
2508 2738 !ipif->ipif_addr_ready)
2509 2739 ipif_up_notify(ipif);
2510 2740 ipif->ipif_addr_ready = 1;
2511 2741 ipif_refrele(ipif);
2512 2742 }
2513 2743 if (!isv6 && arp_no_defense)
2514 2744 break;
2515 2745 /* Begin defending our new address */
2516 2746 if (ncec->ncec_unsolicit_count > 0) {
2517 2747 ncec->ncec_unsolicit_count--;
2518 2748 if (isv6) {
2519 2749 dropped = ndp_announce(ncec);
2520 2750 } else {
2521 2751 dropped = arp_announce(ncec);
2522 2752 }
2523 2753
2524 2754 if (dropped)
2525 2755 ncec->ncec_unsolicit_count++;
2526 2756 else
2527 2757 ncec->ncec_last_time_defended =
2528 2758 ddi_get_lbolt();
2529 2759 }
2530 2760 if (ncec->ncec_unsolicit_count > 0) {
2531 2761 nce_restart_timer(ncec,
2532 2762 ANNOUNCE_INTERVAL(isv6));
2533 2763 } else if (DEFENSE_INTERVAL(isv6) != 0) {
2534 2764 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2535 2765 }
2536 2766 } else {
2537 2767 /*
2538 2768 * This is an address we're probing to be our own, but
2539 2769 * the ill is down. Wait until it comes back before
2540 2770 * doing anything, but switch to reachable state so
2541 2771 * that the restart will work.
2542 2772 */
2543 2773 ncec->ncec_state = ND_REACHABLE;
2544 2774 mutex_exit(&ncec->ncec_lock);
2545 2775 }
2546 2776 break;
2547 2777 case ND_INCOMPLETE: {
2548 2778 mblk_t *mp, *nextmp;
2549 2779 mblk_t **prevmpp;
2550 2780
2551 2781 /*
2552 2782 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2553 2783 * for any IPMP probe packets, and toss them. IPMP probe
2554 2784 * packets will always be at the head of ncec_qd_mp, so that
2555 2785 * we can stop at the first queued ND packet that is
2556 2786 * not a probe packet.
2557 2787 */
2558 2788 prevmpp = &ncec->ncec_qd_mp;
2559 2789 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2560 2790 nextmp = mp->b_next;
2561 2791
2562 2792 if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2563 2793 inet_freemsg(mp);
2564 2794 ncec->ncec_nprobes--;
2565 2795 *prevmpp = nextmp;
2566 2796 } else {
2567 2797 prevmpp = &mp->b_next;
2568 2798 }
2569 2799 }
2570 2800
2571 2801 /*
2572 2802 * Must be resolver's retransmit timer.
2573 2803 */
2574 2804 mutex_exit(&ncec->ncec_lock);
2575 2805 ip_ndp_resolve(ncec);
2576 2806 break;
2577 2807 }
2578 2808 case ND_REACHABLE:
2579 2809 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2580 2810 ncec->ncec_unsolicit_count != 0) ||
2581 2811 (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2582 2812 if (ncec->ncec_unsolicit_count > 0) {
2583 2813 ncec->ncec_unsolicit_count--;
2584 2814 mutex_exit(&ncec->ncec_lock);
2585 2815 /*
2586 2816 * When we get to zero announcements left,
2587 2817 * switch to address defense
2588 2818 */
2589 2819 } else {
2590 2820 boolean_t rate_limit;
2591 2821
2592 2822 mutex_exit(&ncec->ncec_lock);
2593 2823 rate_limit = ill_defend_rate_limit(ill, ncec);
2594 2824 if (rate_limit) {
2595 2825 nce_restart_timer(ncec,
2596 2826 DEFENSE_INTERVAL(isv6));
2597 2827 break;
2598 2828 }
2599 2829 }
2600 2830 if (isv6) {
2601 2831 dropped = ndp_announce(ncec);
2602 2832 } else {
2603 2833 dropped = arp_announce(ncec);
2604 2834 }
2605 2835 mutex_enter(&ncec->ncec_lock);
2606 2836 if (dropped) {
2607 2837 ncec->ncec_unsolicit_count++;
2608 2838 } else {
2609 2839 ncec->ncec_last_time_defended =
2610 2840 ddi_get_lbolt();
2611 2841 }
2612 2842 mutex_exit(&ncec->ncec_lock);
2613 2843 if (ncec->ncec_unsolicit_count != 0) {
2614 2844 nce_restart_timer(ncec,
2615 2845 ANNOUNCE_INTERVAL(isv6));
2616 2846 } else {
2617 2847 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2618 2848 }
2619 2849 } else {
2620 2850 mutex_exit(&ncec->ncec_lock);
2621 2851 }
2622 2852 break;
2623 2853 default:
2624 2854 mutex_exit(&ncec->ncec_lock);
2625 2855 break;
2626 2856 }
2627 2857 done:
2628 2858 ncec_refrele(ncec);
2629 2859 ill_refrele(src_ill);
2630 2860 }
2631 2861
2632 2862 /*
2633 2863 * Set a link layer address from the ll_addr passed in.
2634 2864 * Copy SAP from ill.
2635 2865 */
2636 2866 static void
2637 2867 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2638 2868 {
2639 2869 ill_t *ill = ncec->ncec_ill;
2640 2870
2641 2871 ASSERT(ll_addr != NULL);
2642 2872 if (ill->ill_phys_addr_length > 0) {
2643 2873 /*
2644 2874 * The bcopy() below used to be called for the physical address
2645 2875 * length rather than the link layer address length. For
2646 2876 * ethernet and many other media, the phys_addr and lla are
2647 2877 * identical.
2648 2878 *
2649 2879 * The phys_addr and lla may not be the same for devices that
2650 2880 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2651 2881 * no known instances of these.
2652 2882 *
2653 2883 * For PPP or other interfaces with a zero length
2654 2884 * physical address, don't do anything here.
2655 2885 * The bcopy() with a zero phys_addr length was previously
2656 2886 * a no-op for interfaces with a zero-length physical address.
2657 2887 * Using the lla for them would change the way they operate.
2658 2888 * Doing nothing in such cases preserves expected behavior.
2659 2889 */
2660 2890 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2661 2891 }
2662 2892 }
2663 2893
2664 2894 boolean_t
2665 2895 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2666 2896 uint32_t ll_addr_len)
2667 2897 {
2668 2898 ASSERT(ncec->ncec_lladdr != NULL);
2669 2899 if (ll_addr == NULL)
2670 2900 return (B_FALSE);
2671 2901 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2672 2902 return (B_TRUE);
2673 2903 return (B_FALSE);
2674 2904 }
2675 2905
2676 2906 /*
2677 2907 * Updates the link layer address or the reachability state of
2678 2908 * a cache entry. Reset probe counter if needed.
2679 2909 */
2680 2910 void
2681 2911 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2682 2912 {
2683 2913 ill_t *ill = ncec->ncec_ill;
2684 2914 boolean_t need_stop_timer = B_FALSE;
2685 2915 boolean_t need_fastpath_update = B_FALSE;
2686 2916 nce_t *nce = NULL;
2687 2917 timeout_id_t tid;
2688 2918
2689 2919 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2690 2920 /*
2691 2921 * If this interface does not do NUD, there is no point
2692 2922 * in allowing an update to the cache entry. Although
2693 2923 * we will respond to NS.
2694 2924 * The only time we accept an update for a resolver when
2695 2925 * NUD is turned off is when it has just been created.
2696 2926 * Non-Resolvers will always be created as REACHABLE.
2697 2927 */
2698 2928 if (new_state != ND_UNCHANGED) {
2699 2929 if ((ncec->ncec_flags & NCE_F_NONUD) &&
2700 2930 (ncec->ncec_state != ND_INCOMPLETE))
2701 2931 return;
2702 2932 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2703 2933 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2704 2934 need_stop_timer = B_TRUE;
2705 2935 if (new_state == ND_REACHABLE)
2706 2936 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2707 2937 else {
2708 2938 /* We force NUD in this case */
2709 2939 ncec->ncec_last = 0;
2710 2940 }
2711 2941 ncec->ncec_state = new_state;
2712 2942 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2713 2943 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2714 2944 new_state == ND_INCOMPLETE);
2715 2945 }
2716 2946 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2717 2947 tid = ncec->ncec_timeout_id;
2718 2948 ncec->ncec_timeout_id = 0;
2719 2949 }
2720 2950 /*
2721 2951 * Re-trigger fastpath probe and
2722 2952 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2723 2953 * whatever packets that happens to be transmitting at the time.
2724 2954 */
2725 2955 if (new_ll_addr != NULL) {
2726 2956 bcopy(new_ll_addr, ncec->ncec_lladdr,
2727 2957 ill->ill_phys_addr_length);
2728 2958 need_fastpath_update = B_TRUE;
2729 2959 }
2730 2960 mutex_exit(&ncec->ncec_lock);
2731 2961 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2732 2962 if (tid != 0)
2733 2963 (void) untimeout(tid);
2734 2964 }
2735 2965 if (need_fastpath_update) {
2736 2966 /*
2737 2967 		 * Delete any existing dlur_mp and fp_mp information.
2738 2968 * For IPMP interfaces, all underlying ill's must be checked
2739 2969 * and purged.
2740 2970 */
2741 2971 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2742 2972 /*
2743 2973 * add the new dlur_mp and fp_mp
2744 2974 */
2745 2975 nce = nce_fastpath(ncec, B_TRUE, NULL);
2746 2976 if (nce != NULL)
2747 2977 nce_refrele(nce);
2748 2978 }
2749 2979 mutex_enter(&ncec->ncec_lock);
2750 2980 }
2751 2981
2752 2982 static void
2753 2983 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2754 2984 {
2755 2985 uint_t count = 0;
2756 2986 mblk_t **mpp, *tmp;
2757 2987
2758 2988 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2759 2989
2760 2990 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2761 2991 if (++count > ncec->ncec_ill->ill_max_buf) {
2762 2992 tmp = ncec->ncec_qd_mp->b_next;
2763 2993 ncec->ncec_qd_mp->b_next = NULL;
2764 2994 /*
2765 2995 * if we never create data addrs on the under_ill
2766 2996 * does this matter?
2767 2997 */
2768 2998 BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2769 2999 ipIfStatsOutDiscards);
2770 3000 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2771 3001 ncec->ncec_ill);
2772 3002 freemsg(ncec->ncec_qd_mp);
2773 3003 ncec->ncec_qd_mp = tmp;
2774 3004 }
2775 3005 }
2776 3006
2777 3007 if (head_insert) {
2778 3008 ncec->ncec_nprobes++;
2779 3009 mp->b_next = ncec->ncec_qd_mp;
2780 3010 ncec->ncec_qd_mp = mp;
2781 3011 } else {
2782 3012 *mpp = mp;
2783 3013 }
2784 3014 }
2785 3015
2786 3016 /*
2787 3017 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2788 3018 * queued at the head or tail of the queue based on the input argument
2789 3019 * 'head_insert'. The caller should specify this argument as B_TRUE if this
2790 3020 * packet is an IPMP probe packet, in which case the following happens:
2791 3021 *
2792 3022 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal
2793 3023 * (non-ipmp_probe) load-spreading case where the source address of the ND
2794 3024 * packet is not tied to ncec_ill. If the ill bound to the source address
2795 3025 * cannot receive, the response to the ND packet will not be received.
2796 3026 * However, if ND packets for ncec_ill's probes are queued behind that ND
2797 3027 * packet, those probes will also fail to be sent, and thus in.mpathd will
2798 3028 * erroneously conclude that ncec_ill has also failed.
2799 3029 *
2800 3030 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
2801 3031 * the first attempt. This ensures that ND problems do not manifest as
2802 3032 * probe RTT spikes.
2803 3033 *
2804 3034 * We achieve this by inserting ipmp_probe() packets at the head of the
2805 3035 * nce_queue.
2806 3036 *
2807 3037 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2808 3038 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2809 3039 */
2810 3040 void
2811 3041 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2812 3042 {
2813 3043 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2814 3044 nce_queue_mp_common(ncec, mp, head_insert);
2815 3045 }
2816 3046
2817 3047 /*
2818 3048 * Called when address resolution failed due to a timeout.
2819 3049 * Send an ICMP unreachable in response to all queued packets.
2820 3050 */
2821 3051 void
2822 3052 ndp_resolv_failed(ncec_t *ncec)
2823 3053 {
2824 3054 mblk_t *mp, *nxt_mp;
2825 3055 char buf[INET6_ADDRSTRLEN];
2826 3056 ill_t *ill = ncec->ncec_ill;
2827 3057 ip_recv_attr_t iras;
2828 3058
2829 3059 bzero(&iras, sizeof (iras));
2830 3060 iras.ira_flags = 0;
2831 3061 /*
2832 3062 * we are setting the ira_rill to the ipmp_ill (instead of
2833 3063 * the actual ill on which the packet was received), but this
2834 3064 * is ok because we don't actually need the real ira_rill.
2835 3065 * is ok because we don't actually need the real ira_rill
2836 3066 */
2837 3067 iras.ira_ill = iras.ira_rill = ill;
2838 3068 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2839 3069 iras.ira_rifindex = iras.ira_ruifindex;
2840 3070
2841 3071 ip1dbg(("ndp_resolv_failed: dst %s\n",
2842 3072 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2843 3073 mutex_enter(&ncec->ncec_lock);
2844 3074 mp = ncec->ncec_qd_mp;
2845 3075 ncec->ncec_qd_mp = NULL;
2846 3076 ncec->ncec_nprobes = 0;
2847 3077 mutex_exit(&ncec->ncec_lock);
2848 3078 while (mp != NULL) {
2849 3079 nxt_mp = mp->b_next;
2850 3080 mp->b_next = NULL;
2851 3081
2852 3082 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2853 3083 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2854 3084 mp, ill);
2855 3085 icmp_unreachable_v6(mp,
2856 3086 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2857 3087 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2858 3088 mp = nxt_mp;
2859 3089 }
2860 3090 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2861 3091 }
2862 3092
2863 3093 /*
2864 3094 * Handle the completion of NDP and ARP resolution.
2865 3095 */
2866 3096 void
2867 3097 nce_resolv_ok(ncec_t *ncec)
2868 3098 {
2869 3099 mblk_t *mp;
2870 3100 uint_t pkt_len;
2871 3101 iaflags_t ixaflags = IXAF_NO_TRACE;
2872 3102 nce_t *nce;
2873 3103 ill_t *ill = ncec->ncec_ill;
2874 3104 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2875 3105 ip_stack_t *ipst = ill->ill_ipst;
2876 3106
2877 3107 if (IS_IPMP(ncec->ncec_ill)) {
2878 3108 nce_resolv_ipmp_ok(ncec);
2879 3109 return;
2880 3110 }
2881 3111 /* non IPMP case */
2882 3112
2883 3113 mutex_enter(&ncec->ncec_lock);
2884 3114 ASSERT(ncec->ncec_nprobes == 0);
2885 3115 mp = ncec->ncec_qd_mp;
2886 3116 ncec->ncec_qd_mp = NULL;
2887 3117 mutex_exit(&ncec->ncec_lock);
2888 3118
2889 3119 while (mp != NULL) {
2890 3120 mblk_t *nxt_mp;
2891 3121
2892 3122 if (ill->ill_isv6) {
2893 3123 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2894 3124
2895 3125 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2896 3126 } else {
2897 3127 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2898 3128
2899 3129 ixaflags |= IXAF_IS_IPV4;
2900 3130 pkt_len = ntohs(ipha->ipha_length);
2901 3131 }
2902 3132 nxt_mp = mp->b_next;
2903 3133 mp->b_next = NULL;
2904 3134 /*
2905 3135 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2906 3136 * longer available, but it's ok to drop this flag because TCP
2907 3137 * has its own flow-control in effect, so TCP packets
2908 3138 * are not likely to get here when flow-control is in effect.
2909 3139 */
2910 3140 mutex_enter(&ill->ill_lock);
2911 3141 nce = nce_lookup(ill, &ncec->ncec_addr);
2912 3142 mutex_exit(&ill->ill_lock);
2913 3143
2914 3144 if (nce == NULL) {
2915 3145 if (isv6) {
2916 3146 BUMP_MIB(&ipst->ips_ip6_mib,
2917 3147 ipIfStatsOutDiscards);
2918 3148 } else {
2919 3149 BUMP_MIB(&ipst->ips_ip_mib,
2920 3150 ipIfStatsOutDiscards);
2921 3151 }
2922 3152 ip_drop_output("ipIfStatsOutDiscards - no nce",
2923 3153 mp, NULL);
2924 3154 freemsg(mp);
2925 3155 } else {
2926 3156 /*
2927 3157 * We don't know the zoneid, but
2928 3158 * ip_xmit does not care since IXAF_NO_TRACE
2929 3159 * is set. (We traced the packet the first
2930 3160 * time through ip_xmit.)
2931 3161 */
2932 3162 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2933 3163 ALL_ZONES, 0, NULL);
2934 3164 nce_refrele(nce);
2935 3165 }
2936 3166 mp = nxt_mp;
2937 3167 }
2938 3168
2939 3169 ncec_cb_dispatch(ncec); /* complete callbacks */
2940 3170 }
2941 3171
2942 3172 /*
2943 3173 * Called by SIOCSNDP* ioctl to add/change an ncec entry
2944 3174 * and the corresponding attributes.
2945 3175 * Disallow states other than ND_REACHABLE or ND_STALE.
2946 3176 */
2947 3177 int
2948 3178 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2949 3179 {
2950 3180 sin6_t *sin6;
2951 3181 in6_addr_t *addr;
2952 3182 ncec_t *ncec;
2953 3183 nce_t *nce;
2954 3184 int err = 0;
2955 3185 uint16_t new_flags = 0;
2956 3186 uint16_t old_flags = 0;
2957 3187 int inflags = lnr->lnr_flags;
2958 3188 ip_stack_t *ipst = ill->ill_ipst;
2959 3189 boolean_t do_postprocess = B_FALSE;
2960 3190
2961 3191 ASSERT(ill->ill_isv6);
2962 3192 if ((lnr->lnr_state_create != ND_REACHABLE) &&
2963 3193 (lnr->lnr_state_create != ND_STALE))
2964 3194 return (EINVAL);
2965 3195
2966 3196 sin6 = (sin6_t *)&lnr->lnr_addr;
2967 3197 addr = &sin6->sin6_addr;
2968 3198
2969 3199 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2970 3200 ASSERT(!IS_UNDER_IPMP(ill));
2971 3201 nce = nce_lookup_addr(ill, addr);
2972 3202 if (nce != NULL)
2973 3203 new_flags = nce->nce_common->ncec_flags;
2974 3204
2975 3205 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2976 3206 case NDF_ISROUTER_ON:
2977 3207 new_flags |= NCE_F_ISROUTER;
2978 3208 break;
2979 3209 case NDF_ISROUTER_OFF:
2980 3210 new_flags &= ~NCE_F_ISROUTER;
2981 3211 break;
2982 3212 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2983 3213 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2984 3214 if (nce != NULL)
2985 3215 nce_refrele(nce);
2986 3216 return (EINVAL);
2987 3217 }
2988 3218 if (inflags & NDF_STATIC)
2989 3219 new_flags |= NCE_F_STATIC;
2990 3220
2991 3221 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2992 3222 case NDF_ANYCAST_ON:
2993 3223 new_flags |= NCE_F_ANYCAST;
2994 3224 break;
2995 3225 case NDF_ANYCAST_OFF:
2996 3226 new_flags &= ~NCE_F_ANYCAST;
2997 3227 break;
2998 3228 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2999 3229 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3000 3230 if (nce != NULL)
3001 3231 nce_refrele(nce);
3002 3232 return (EINVAL);
3003 3233 }
3004 3234
3005 3235 if (nce == NULL) {
3006 3236 err = nce_add_v6(ill,
3007 3237 (uchar_t *)lnr->lnr_hdw_addr,
3008 3238 ill->ill_phys_addr_length,
3009 3239 addr,
3010 3240 new_flags,
3011 3241 lnr->lnr_state_create,
3012 3242 &nce);
3013 3243 if (err != 0) {
3014 3244 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3015 3245 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3016 3246 return (err);
3017 3247 } else {
3018 3248 do_postprocess = B_TRUE;
3019 3249 }
3020 3250 }
3021 3251 ncec = nce->nce_common;
3022 3252 old_flags = ncec->ncec_flags;
3023 3253 if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3024 3254 ncec_router_to_host(ncec);
3025 3255 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3026 3256 if (do_postprocess)
3027 3257 err = nce_add_v6_postprocess(nce);
3028 3258 nce_refrele(nce);
3029 3259 return (0);
3030 3260 }
3031 3261 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3032 3262
3033 3263 if (do_postprocess)
3034 3264 err = nce_add_v6_postprocess(nce);
3035 3265 /*
3036 3266 * err cannot be anything other than 0 because we don't support
3037 3267 * proxy arp of static addresses.
3038 3268 */
3039 3269 ASSERT(err == 0);
3040 3270
3041 3271 mutex_enter(&ncec->ncec_lock);
3042 3272 ncec->ncec_flags = new_flags;
3043 3273 mutex_exit(&ncec->ncec_lock);
3044 3274 /*
3045 3275 * Note that we ignore the state at this point, which
3046 3276 * should be either STALE or REACHABLE. Instead we let
3047 3277 * the link layer address passed in to determine the state
3048 3278 * much like incoming packets.
3049 3279 */
3050 3280 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3051 3281 nce_refrele(nce);
3052 3282 return (0);
3053 3283 }
3054 3284
3055 3285 /*
3056 3286 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3057 3287 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3058 3288 * be held to ensure that they are in the same group.
3059 3289 */
3060 3290 static nce_t *
3061 3291 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3062 3292 {
3063 3293
3064 3294 nce_t *nce;
3065 3295
3066 3296 nce = nce_ill_lookup_then_add(ill, ncec);
3067 3297
3068 3298 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3069 3299 return (nce);
3070 3300
3071 3301 /*
3072 3302 * hold the ncec_lock to synchronize with nce_update() so that,
3073 3303 * at the end of this function, the contents of nce_dlur_mp are
3074 3304 * consistent with ncec->ncec_lladdr, even though some intermediate
3075 3305 * packet may have been sent out with a mangled address, which would
3076 3306 * only be a transient condition.
3077 3307 */
3078 3308 mutex_enter(&ncec->ncec_lock);
3079 3309 if (ncec->ncec_lladdr != NULL) {
3080 3310 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3081 3311 NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3082 3312 } else {
3083 3313 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3084 3314 ill->ill_sap_length);
3085 3315 }
3086 3316 mutex_exit(&ncec->ncec_lock);
3087 3317 return (nce);
3088 3318 }
3089 3319
3090 3320 /*
3091 3321 * we make nce_fp_mp have an M_DATA prepend.
3092 3322 * The caller ensures there is hold on ncec for this function.
|
↓ open down ↓ |
2029 lines elided |
↑ open up ↑ |
3093 3323 * Note that since ill_fastpath_probe() copies the mblk there is
3094 3324 * no need to hold the nce or ncec beyond this function.
3095 3325 *
3096 3326 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3097 3327 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3098 3328 * and will be returned back by this function, so that no extra nce_refrele
3099 3329 * is required for the caller. The calls from nce_add_common() use this
3100 3330 * method. All other callers (that pass in NULL ncec_nce) will have to do a
3101 3331 * nce_refrele of the returned nce (when it is non-null).
3102 3332 */
3103 -nce_t *
3333 +static nce_t *
3104 3334 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3105 3335 {
3106 3336 nce_t *nce;
3107 3337 ill_t *ill = ncec->ncec_ill;
3108 3338
3109 3339 ASSERT(ill != NULL);
3110 3340
3111 3341 if (IS_IPMP(ill) && trigger_fp_req) {
3112 3342 trigger_fp_req = B_FALSE;
3113 3343 ipmp_ncec_refresh_nce(ncec);
3114 3344 }
3115 3345
3116 3346 /*
3117 3347 * If the caller already has the nce corresponding to the ill, use
3118 3348 * that one. Otherwise we have to lookup/add the nce. Calls from
3119 3349 * nce_add_common() fall in the former category, and have just done
3120 3350 * the nce lookup/add that can be reused.
3121 3351 */
3122 3352 if (ncec_nce == NULL)
3123 3353 nce = nce_fastpath_create(ill, ncec);
3124 3354 else
3125 3355 nce = ncec_nce;
3126 3356
3127 3357 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3128 3358 return (nce);
3129 3359
3130 3360 if (trigger_fp_req)
3131 3361 nce_fastpath_trigger(nce);
3132 3362 return (nce);
3133 3363 }
3134 3364
3135 3365 /*
3136 3366 * Trigger fastpath on nce. No locks may be held.
3137 3367 */
3138 3368 static void
3139 3369 nce_fastpath_trigger(nce_t *nce)
3140 3370 {
3141 3371 int res;
3142 3372 ill_t *ill = nce->nce_ill;
3143 3373 ncec_t *ncec = nce->nce_common;
3144 3374
3145 3375 res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3146 3376 /*
3147 3377 * EAGAIN is an indication of a transient error
3148 3378 * i.e. allocation failure etc. leave the ncec in the list it
3149 3379 * will be updated when another probe happens for another ire
3150 3380 * if not it will be taken out of the list when the ire is
|
↓ open down ↓ |
37 lines elided |
↑ open up ↑ |
3151 3381 * deleted.
3152 3382 */
3153 3383 if (res != 0 && res != EAGAIN && res != ENOTSUP)
3154 3384 nce_fastpath_list_delete(ill, ncec, NULL);
3155 3385 }
3156 3386
3157 3387 /*
3158 3388 * Add ncec to the nce fastpath list on ill.
3159 3389 */
3160 3390 static nce_t *
3161 -nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3391 +nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
3162 3392 {
3163 3393 nce_t *nce = NULL;
3164 3394
3165 3395 ASSERT(MUTEX_HELD(&ill->ill_lock));
3166 3396 /*
3167 3397 * Atomically ensure that the ill is not CONDEMNED and is not going
3168 3398 * down, before adding the NCE.
3169 3399 */
3170 3400 if (ill->ill_state_flags & ILL_CONDEMNED)
3171 3401 return (NULL);
3172 3402 mutex_enter(&ncec->ncec_lock);
3173 3403 /*
3174 3404 * if ncec has not been deleted and
3175 3405 * is not already in the list add it.
3176 3406 */
3177 3407 if (!NCE_ISCONDEMNED(ncec)) {
3178 3408 nce = nce_lookup(ill, &ncec->ncec_addr);
3179 3409 if (nce != NULL)
3180 3410 goto done;
3181 - nce = nce_add(ill, ncec);
3411 + nce = nce_add(ill, ncec, graveyard);
3182 3412 }
3183 3413 done:
3184 3414 mutex_exit(&ncec->ncec_lock);
3185 3415 return (nce);
3186 3416 }
3187 3417
3188 -nce_t *
3418 +static nce_t *
3189 3419 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3190 3420 {
3191 3421 nce_t *nce;
3422 + list_t graveyard;
3192 3423
3424 + list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3193 3425 mutex_enter(&ill->ill_lock);
3194 - nce = nce_ill_lookup_then_add_locked(ill, ncec);
3426 + nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
3195 3427 mutex_exit(&ill->ill_lock);
3428 + nce_graveyard_free(&graveyard);
3196 3429 return (nce);
3197 3430 }
3198 3431
3199 3432
3200 3433 /*
3201 3434 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3202 3435 * nce is added to the 'dead' list, and the caller must nce_refrele() the
3203 3436 * entry after all locks have been dropped.
3204 3437 */
3205 3438 void
3206 3439 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3207 3440 {
3208 3441 nce_t *nce;
3209 3442
3210 3443 ASSERT(ill != NULL);
3211 3444
3212 3445 /* delete any nces referencing the ncec from underlying ills */
3213 3446 if (IS_IPMP(ill))
3214 3447 ipmp_ncec_delete_nce(ncec);
3215 3448
3216 3449 /* now the ill itself */
3217 3450 mutex_enter(&ill->ill_lock);
3218 3451 for (nce = list_head(&ill->ill_nce); nce != NULL;
3219 3452 nce = list_next(&ill->ill_nce, nce)) {
3220 3453 if (nce->nce_common == ncec) {
3221 3454 nce_refhold(nce);
3222 3455 nce_delete(nce);
3223 3456 break;
3224 3457 }
3225 3458 }
3226 3459 mutex_exit(&ill->ill_lock);
3227 3460 if (nce != NULL) {
3228 3461 if (dead == NULL)
3229 3462 nce_refrele(nce);
3230 3463 else
3231 3464 list_insert_tail(dead, nce);
3232 3465 }
3233 3466 }
3234 3467
3235 3468 /*
|
↓ open down ↓ |
30 lines elided |
↑ open up ↑ |
3236 3469 * when the fastpath response does not fit in the datab
3237 3470 * associated with the existing nce_fp_mp, we delete and
3238 3471 * add the nce to retrigger fastpath based on the information
3239 3472 * in the ncec_t.
3240 3473 */
3241 3474 static nce_t *
3242 3475 nce_delete_then_add(nce_t *nce)
3243 3476 {
3244 3477 ill_t *ill = nce->nce_ill;
3245 3478 nce_t *newnce = NULL;
3479 + list_t graveyard;
3246 3480
3481 + list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3247 3482 ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3248 3483 (void *)nce, ill->ill_name));
3249 3484 mutex_enter(&ill->ill_lock);
3250 3485 mutex_enter(&nce->nce_common->ncec_lock);
3251 3486 nce_delete(nce);
3252 3487 /*
3253 3488 * Make sure that ncec is not condemned before adding. We hold the
3254 3489 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3255 3490 * ipmp_ncec_delete_nce()
3256 3491 */
3257 3492 if (!NCE_ISCONDEMNED(nce->nce_common))
3258 - newnce = nce_add(ill, nce->nce_common);
3493 + newnce = nce_add(ill, nce->nce_common, &graveyard);
3259 3494 mutex_exit(&nce->nce_common->ncec_lock);
3260 3495 mutex_exit(&ill->ill_lock);
3496 + nce_graveyard_free(&graveyard);
3261 3497 nce_refrele(nce);
3262 3498 return (newnce); /* could be null if nomem */
3263 3499 }
3264 3500
3265 3501 typedef struct nce_fp_match_s {
3266 3502 nce_t *nce_fp_match_res;
3267 3503 mblk_t *nce_fp_match_ack_mp;
3268 3504 } nce_fp_match_t;
3269 3505
3270 3506 /* ARGSUSED */
3271 3507 static int
3272 3508 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3273 3509 {
3274 3510 nce_fp_match_t *nce_fp_marg = arg;
3275 3511 ncec_t *ncec = nce->nce_common;
3276 3512 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp;
3277 3513 uchar_t *mp_rptr, *ud_mp_rptr;
3278 3514 mblk_t *ud_mp = nce->nce_dlur_mp;
3279 3515 ptrdiff_t cmplen;
3280 3516
3281 3517 /*
3282 3518 * mp is the mp associated with the fastpath ack.
3283 3519 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3284 3520 * under consideration. If the contents match, then the
3285 3521 * fastpath ack is used to update the nce.
3286 3522 */
3287 3523 if (ud_mp == NULL)
3288 3524 return (0);
3289 3525 mp_rptr = mp->b_rptr;
3290 3526 cmplen = mp->b_wptr - mp_rptr;
3291 3527 ASSERT(cmplen >= 0);
3292 3528
3293 3529 ud_mp_rptr = ud_mp->b_rptr;
3294 3530 /*
3295 3531 * The ncec is locked here to prevent any other threads from accessing
3296 3532 * and changing nce_dlur_mp when the address becomes resolved to an
3297 3533 * lla while we're in the middle of looking at and comparing the
3298 3534 * hardware address (lla). It is also locked to prevent multiple
3299 3535 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3300 3536 * time.
3301 3537 */
3302 3538 mutex_enter(&ncec->ncec_lock);
3303 3539 if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3304 3540 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3305 3541 nce_fp_marg->nce_fp_match_res = nce;
3306 3542 mutex_exit(&ncec->ncec_lock);
3307 3543 nce_refhold(nce);
3308 3544 return (1);
3309 3545 }
3310 3546 mutex_exit(&ncec->ncec_lock);
3311 3547 return (0);
3312 3548 }
3313 3549
3314 3550 /*
3315 3551 * Update all NCE's that are not in fastpath mode and
3316 3552 * have an nce_fp_mp that matches mp. mp->b_cont contains
3317 3553 * the fastpath header.
3318 3554 *
3319 3555 * This function returns nothing; the matching nce, if any, is updated in place.
3320 3556 */
3321 3557 void
3322 3558 nce_fastpath_update(ill_t *ill, mblk_t *mp)
3323 3559 {
3324 3560 nce_fp_match_t nce_fp_marg;
3325 3561 nce_t *nce;
3326 3562 mblk_t *nce_fp_mp, *fp_mp;
3327 3563
3328 3564 nce_fp_marg.nce_fp_match_res = NULL;
3329 3565 nce_fp_marg.nce_fp_match_ack_mp = mp;
3330 3566
3331 3567 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3332 3568
3333 3569 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3334 3570 return;
3335 3571
3336 3572 mutex_enter(&nce->nce_lock);
3337 3573 nce_fp_mp = nce->nce_fp_mp;
3338 3574
3339 3575 if (nce_fp_mp != NULL) {
3340 3576 fp_mp = mp->b_cont;
3341 3577 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3342 3578 nce_fp_mp->b_datap->db_lim) {
3343 3579 mutex_exit(&nce->nce_lock);
3344 3580 nce = nce_delete_then_add(nce);
3345 3581 if (nce == NULL) {
3346 3582 return;
3347 3583 }
3348 3584 mutex_enter(&nce->nce_lock);
3349 3585 nce_fp_mp = nce->nce_fp_mp;
3350 3586 }
3351 3587 }
3352 3588
3353 3589 /* Matched - install mp as the fastpath mp */
3354 3590 if (nce_fp_mp == NULL) {
3355 3591 fp_mp = dupb(mp->b_cont);
3356 3592 nce->nce_fp_mp = fp_mp;
3357 3593 } else {
3358 3594 fp_mp = mp->b_cont;
3359 3595 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3360 3596 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3361 3597 + MBLKL(fp_mp);
3362 3598 }
3363 3599 mutex_exit(&nce->nce_lock);
3364 3600 nce_refrele(nce);
3365 3601 }
3366 3602
3367 3603 /*
3368 3604 * Return a pointer to a given option in the packet.
3369 3605 * Assumes that option part of the packet have already been validated.
3370 3606 */
3371 3607 nd_opt_hdr_t *
3372 3608 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3373 3609 {
3374 3610 while (optlen > 0) {
3375 3611 if (opt->nd_opt_type == opt_type)
3376 3612 return (opt);
3377 3613 optlen -= 8 * opt->nd_opt_len;
3378 3614 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3379 3615 }
3380 3616 return (NULL);
3381 3617 }
3382 3618
3383 3619 /*
3384 3620 * Verify all option lengths present are > 0, also check to see
3385 3621 * if the option lengths and packet length are consistent.
3386 3622 */
3387 3623 boolean_t
3388 3624 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3389 3625 {
3390 3626 ASSERT(opt != NULL);
3391 3627 while (optlen > 0) {
3392 3628 if (opt->nd_opt_len == 0)
3393 3629 return (B_FALSE);
3394 3630 optlen -= 8 * opt->nd_opt_len;
3395 3631 if (optlen < 0)
3396 3632 return (B_FALSE);
3397 3633 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3398 3634 }
3399 3635 return (B_TRUE);
3400 3636 }
3401 3637
3402 3638 /*
3403 3639 * ncec_walk function.
3404 3640 * Free a fraction of the NCE cache entries.
3405 3641 *
3406 3642 * A possible optimization here would be to use ncec_last where possible, and
3407 3643 * delete the least-frequently used entry, which would require more complex
3408 3644 * computation as we walk through the ncec's (e.g., track ncec entries by
3409 3645 * order of ncec_last and/or maintain state)
3410 3646 */
3411 3647 static void
3412 3648 ncec_cache_reclaim(ncec_t *ncec, void *arg)
3413 3649 {
3414 3650 ip_stack_t *ipst = ncec->ncec_ipst;
3415 3651 uint_t fraction = *(uint_t *)arg;
3416 3652 uint_t rand;
3417 3653
3418 3654 if ((ncec->ncec_flags &
3419 3655 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3420 3656 return;
3421 3657 }
3422 3658
3423 3659 rand = (uint_t)ddi_get_lbolt() +
3424 3660 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3425 3661 if ((rand/fraction)*fraction == rand) {
3426 3662 IP_STAT(ipst, ip_nce_reclaim_deleted);
3427 3663 ncec_delete(ncec);
3428 3664 }
3429 3665 }
3430 3666
3431 3667 /*
3432 3668 * kmem_cache callback to free up memory.
3433 3669 *
3434 3670 * For now we just delete a fixed fraction.
3435 3671 */
3436 3672 static void
3437 3673 ip_nce_reclaim_stack(ip_stack_t *ipst)
3438 3674 {
3439 3675 uint_t fraction = ipst->ips_ip_nce_reclaim_fraction;
3440 3676
3441 3677 IP_STAT(ipst, ip_nce_reclaim_calls);
3442 3678
3443 3679 ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);
3444 3680
3445 3681 /*
3446 3682 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3447 3683 * Get them to update any stale references to drop any refholds they
3448 3684 * have.
3449 3685 */
3450 3686 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3451 3687 }
3452 3688
3453 3689 /*
3454 3690 * Called by the memory allocator subsystem directly, when the system
3455 3691 * is running low on memory.
3456 3692 */
3457 3693 /* ARGSUSED */
3458 3694 void
3459 3695 ip_nce_reclaim(void *args)
3460 3696 {
3461 3697 netstack_handle_t nh;
3462 3698 netstack_t *ns;
3463 3699 ip_stack_t *ipst;
3464 3700
3465 3701 netstack_next_init(&nh);
3466 3702 while ((ns = netstack_next(&nh)) != NULL) {
3467 3703 /*
3468 3704 * netstack_next() can return a netstack_t with a NULL
3469 3705 * netstack_ip at boot time.
3470 3706 */
3471 3707 if ((ipst = ns->netstack_ip) == NULL) {
3472 3708 netstack_rele(ns);
3473 3709 continue;
3474 3710 }
3475 3711 ip_nce_reclaim_stack(ipst);
3476 3712 netstack_rele(ns);
3477 3713 }
3478 3714 netstack_next_fini(&nh);
3479 3715 }
3480 3716
3481 3717 #ifdef DEBUG
3482 3718 void
3483 3719 ncec_trace_ref(ncec_t *ncec)
3484 3720 {
3485 3721 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3486 3722
3487 3723 if (ncec->ncec_trace_disable)
3488 3724 return;
3489 3725
3490 3726 if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3491 3727 ncec->ncec_trace_disable = B_TRUE;
3492 3728 ncec_trace_cleanup(ncec);
3493 3729 }
3494 3730 }
3495 3731
3496 3732 void
3497 3733 ncec_untrace_ref(ncec_t *ncec)
3498 3734 {
3499 3735 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3500 3736
3501 3737 if (!ncec->ncec_trace_disable)
3502 3738 th_trace_unref(ncec);
3503 3739 }
3504 3740
/* Discard any accumulated refhold traces for this ncec. */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
3510 3746 #endif
3511 3747
3512 3748 /*
3513 3749 * Called when address resolution fails due to a timeout.
3514 3750 * Send an ICMP unreachable in response to all queued packets.
3515 3751 */
3516 3752 void
3517 3753 arp_resolv_failed(ncec_t *ncec)
3518 3754 {
3519 3755 mblk_t *mp, *nxt_mp;
3520 3756 char buf[INET6_ADDRSTRLEN];
3521 3757 struct in_addr ipv4addr;
3522 3758 ill_t *ill = ncec->ncec_ill;
3523 3759 ip_stack_t *ipst = ncec->ncec_ipst;
3524 3760 ip_recv_attr_t iras;
3525 3761
3526 3762 bzero(&iras, sizeof (iras));
3527 3763 iras.ira_flags = IRAF_IS_IPV4;
3528 3764 /*
3529 3765 * we are setting the ira_rill to the ipmp_ill (instead of
3530 3766 * the actual ill on which the packet was received), but this
3531 3767 * is ok because we don't actually need the real ira_rill.
3532 3768 * to send the icmp unreachable to the sender.
3533 3769 */
3534 3770 iras.ira_ill = iras.ira_rill = ill;
3535 3771 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3536 3772 iras.ira_rifindex = iras.ira_ruifindex;
3537 3773
3538 3774 IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3539 3775 ip3dbg(("arp_resolv_failed: dst %s\n",
3540 3776 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3541 3777 mutex_enter(&ncec->ncec_lock);
3542 3778 mp = ncec->ncec_qd_mp;
3543 3779 ncec->ncec_qd_mp = NULL;
3544 3780 ncec->ncec_nprobes = 0;
3545 3781 mutex_exit(&ncec->ncec_lock);
3546 3782 while (mp != NULL) {
3547 3783 nxt_mp = mp->b_next;
3548 3784 mp->b_next = NULL;
3549 3785
3550 3786 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3551 3787 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3552 3788 mp, ill);
3553 3789 if (ipst->ips_ip_arp_icmp_error) {
3554 3790 ip3dbg(("arp_resolv_failed: "
3555 3791 "Calling icmp_unreachable\n"));
3556 3792 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3557 3793 } else {
3558 3794 freemsg(mp);
3559 3795 }
3560 3796 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3561 3797 mp = nxt_mp;
3562 3798 }
3563 3799 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3564 3800 }
3565 3801
3566 3802 /*
3567 3803 * if ill is an under_ill, translate it to the ipmp_ill and add the
3568 3804 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3569 3805 * one on the underlying in_ill) will be created for the
3570 3806 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3571 3807 */
3572 3808 int
3573 3809 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3574 3810 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3575 3811 {
3576 3812 int err;
3577 3813 in6_addr_t addr6;
3578 3814 ip_stack_t *ipst = ill->ill_ipst;
3579 3815 nce_t *nce, *upper_nce = NULL;
3580 3816 ill_t *in_ill = ill, *under = NULL;
3581 3817 boolean_t need_ill_refrele = B_FALSE;
3582 3818
3583 3819 if (flags & NCE_F_MCAST) {
3584 3820 /*
3585 3821 * hw_addr will be figured out in nce_set_multicast_v4;
3586 3822 * caller needs to pass in the cast_ill for ipmp
3587 3823 */
3588 3824 ASSERT(hw_addr == NULL);
3589 3825 ASSERT(!IS_IPMP(ill));
3590 3826 err = nce_set_multicast_v4(ill, addr, flags, newnce);
3591 3827 return (err);
3592 3828 }
3593 3829
3594 3830 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3595 3831 ill = ipmp_ill_hold_ipmp_ill(ill);
3596 3832 if (ill == NULL)
3597 3833 return (ENXIO);
3598 3834 need_ill_refrele = B_TRUE;
3599 3835 }
3600 3836 if ((flags & NCE_F_BCAST) != 0) {
3601 3837 /*
3602 3838 * IPv4 broadcast ncec: compute the hwaddr.
3603 3839 */
3604 3840 if (IS_IPMP(ill)) {
3605 3841 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3606 3842 if (under == NULL) {
3607 3843 if (need_ill_refrele)
3608 3844 ill_refrele(ill);
3609 3845 return (ENETDOWN);
3610 3846 }
3611 3847 hw_addr = under->ill_bcast_mp->b_rptr +
3612 3848 NCE_LL_ADDR_OFFSET(under);
3613 3849 hw_addr_len = under->ill_phys_addr_length;
3614 3850 } else {
3615 3851 hw_addr = ill->ill_bcast_mp->b_rptr +
3616 3852 NCE_LL_ADDR_OFFSET(ill),
3617 3853 hw_addr_len = ill->ill_phys_addr_length;
3618 3854 }
3619 3855 }
3620 3856
3621 3857 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3622 3858 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3623 3859 nce = nce_lookup_addr(ill, &addr6);
3624 3860 if (nce == NULL) {
3625 3861 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3626 3862 state, &nce);
3627 3863 } else {
3628 3864 err = EEXIST;
3629 3865 }
3630 3866 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3631 3867 if (err == 0)
3632 3868 err = nce_add_v4_postprocess(nce);
3633 3869
3634 3870 if (in_ill != ill && nce != NULL) {
3635 3871 nce_t *under_nce = NULL;
3636 3872
3637 3873 /*
3638 3874 * in_ill was the under_ill. Try to create the under_nce.
3639 3875 * Hold the ill_g_lock to prevent changes to group membership
3640 3876 * until we are done.
3641 3877 */
3642 3878 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3643 3879 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3644 3880 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3645 3881 ill_t *, ill);
3646 3882 rw_exit(&ipst->ips_ill_g_lock);
3647 3883 err = ENXIO;
3648 3884 nce_refrele(nce);
3649 3885 nce = NULL;
3650 3886 goto bail;
3651 3887 }
3652 3888 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3653 3889 if (under_nce == NULL) {
3654 3890 rw_exit(&ipst->ips_ill_g_lock);
3655 3891 err = EINVAL;
3656 3892 nce_refrele(nce);
3657 3893 nce = NULL;
3658 3894 goto bail;
3659 3895 }
3660 3896 rw_exit(&ipst->ips_ill_g_lock);
3661 3897 upper_nce = nce;
3662 3898 nce = under_nce; /* will be returned to caller */
3663 3899 if (NCE_ISREACHABLE(nce->nce_common))
3664 3900 nce_fastpath_trigger(under_nce);
3665 3901 }
3666 3902 if (nce != NULL) {
3667 3903 if (newnce != NULL)
3668 3904 *newnce = nce;
3669 3905 else
3670 3906 nce_refrele(nce);
3671 3907 }
3672 3908 bail:
3673 3909 if (under != NULL)
3674 3910 ill_refrele(under);
3675 3911 if (upper_nce != NULL)
3676 3912 nce_refrele(upper_nce);
3677 3913 if (need_ill_refrele)
3678 3914 ill_refrele(ill);
3679 3915
3680 3916 return (err);
3681 3917 }
3682 3918
3683 3919 /*
3684 3920 * NDP Cache Entry creation routine for IPv4.
3685 3921 * This routine must always be called with ndp4->ndp_g_lock held.
3686 3922 * Prior to return, ncec_refcnt is incremented.
3687 3923 *
3688 3924 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3689 3925 * are always added pointing at the ipmp_ill. Thus, when the ill passed
3690 3926 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3691 3927 * entries will be created, both pointing at the same ncec_t. The nce_t
3692 3928 * entries will have their nce_ill set to the ipmp_ill and the under_ill
3693 3929 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3694 3930 * Local addresses are always created on the ill passed to nce_add_v4.
3695 3931 */
3696 3932 int
3697 3933 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3698 3934 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3699 3935 {
3700 3936 int err;
3701 3937 boolean_t is_multicast = (flags & NCE_F_MCAST);
3702 3938 struct in6_addr addr6;
3703 3939 nce_t *nce;
3704 3940
3705 3941 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3706 3942 ASSERT(!ill->ill_isv6);
3707 3943 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3708 3944
3709 3945 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3710 3946 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3711 3947 &nce);
3712 3948 ASSERT(newnce != NULL);
3713 3949 *newnce = nce;
3714 3950 return (err);
3715 3951 }
3716 3952
3717 3953 /*
3718 3954 * Post-processing routine to be executed after nce_add_v4(). This function
3719 3955 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3720 3956 * and must be called without any locks held.
3721 3957 *
3722 3958 * Always returns 0, but we return an int to keep this symmetric with the
3723 3959 * IPv6 counter-part.
3724 3960 */
3725 3961 int
3726 3962 nce_add_v4_postprocess(nce_t *nce)
3727 3963 {
3728 3964 ncec_t *ncec = nce->nce_common;
3729 3965 uint16_t flags = ncec->ncec_flags;
3730 3966 boolean_t ndp_need_dad = B_FALSE;
3731 3967 boolean_t dropped;
3732 3968 clock_t delay;
3733 3969 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst;
3734 3970 uchar_t *hw_addr = ncec->ncec_lladdr;
3735 3971 boolean_t trigger_fastpath = B_TRUE;
3736 3972
3737 3973 /*
3738 3974 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3739 3975 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3740 3976 * We call nce_fastpath from nce_update if the link layer address of
3741 3977 * the peer changes from nce_update
3742 3978 */
3743 3979 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3744 3980 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3745 3981 trigger_fastpath = B_FALSE;
3746 3982
3747 3983 if (trigger_fastpath)
3748 3984 nce_fastpath_trigger(nce);
3749 3985
3750 3986 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3751 3987 /*
3752 3988 * Either the caller (by passing in ND_PROBE)
3753 3989 * or nce_add_common() (by the internally computed state
3754 3990 * based on ncec_addr and ill_net_type) has determined
3755 3991 * that this unicast entry needs DAD. Trigger DAD.
3756 3992 */
3757 3993 ndp_need_dad = B_TRUE;
3758 3994 } else if (flags & NCE_F_UNSOL_ADV) {
3759 3995 /*
3760 3996 * We account for the transmit below by assigning one
3761 3997 * less than the ndd variable. Subsequent decrements
3762 3998 * are done in nce_timer.
3763 3999 */
3764 4000 mutex_enter(&ncec->ncec_lock);
3765 4001 ncec->ncec_unsolicit_count =
3766 4002 ipst->ips_ip_arp_publish_count - 1;
3767 4003 mutex_exit(&ncec->ncec_lock);
3768 4004 dropped = arp_announce(ncec);
3769 4005 mutex_enter(&ncec->ncec_lock);
3770 4006 if (dropped)
3771 4007 ncec->ncec_unsolicit_count++;
3772 4008 else
3773 4009 ncec->ncec_last_time_defended = ddi_get_lbolt();
3774 4010 if (ncec->ncec_unsolicit_count != 0) {
3775 4011 nce_start_timer(ncec,
3776 4012 ipst->ips_ip_arp_publish_interval);
3777 4013 }
3778 4014 mutex_exit(&ncec->ncec_lock);
3779 4015 }
3780 4016
3781 4017 /*
3782 4018 * If ncec_xmit_interval is 0, user has configured us to send the first
3783 4019 * probe right away. Do so, and set up for the subsequent probes.
3784 4020 */
3785 4021 if (ndp_need_dad) {
3786 4022 mutex_enter(&ncec->ncec_lock);
3787 4023 if (ncec->ncec_pcnt == 0) {
3788 4024 /*
3789 4025 * DAD probes and announce can be
3790 4026 * administratively disabled by setting the
3791 4027 * probe_count to zero. Restart the timer in
3792 4028 * this case to mark the ipif as ready.
3793 4029 */
3794 4030 ncec->ncec_unsolicit_count = 0;
3795 4031 mutex_exit(&ncec->ncec_lock);
3796 4032 nce_restart_timer(ncec, 0);
3797 4033 } else {
3798 4034 mutex_exit(&ncec->ncec_lock);
3799 4035 delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3800 4036 ipst->ips_arp_probe_delay :
3801 4037 ipst->ips_arp_fastprobe_delay);
3802 4038 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3803 4039 }
3804 4040 }
3805 4041 return (0);
3806 4042 }
3807 4043
3808 4044 /*
3809 4045 * ncec_walk routine to update all entries that have a given destination or
3810 4046 * gateway address and cached link layer (MAC) address. This is used when ARP
3811 4047 * informs us that a network-to-link-layer mapping may have changed.
3812 4048 */
3813 4049 void
3814 4050 nce_update_hw_changed(ncec_t *ncec, void *arg)
3815 4051 {
3816 4052 nce_hw_map_t *hwm = arg;
3817 4053 ipaddr_t ncec_addr;
3818 4054
3819 4055 if (ncec->ncec_state != ND_REACHABLE)
3820 4056 return;
3821 4057
3822 4058 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3823 4059 if (ncec_addr != hwm->hwm_addr)
3824 4060 return;
3825 4061
3826 4062 mutex_enter(&ncec->ncec_lock);
3827 4063 if (hwm->hwm_flags != 0)
3828 4064 ncec->ncec_flags = hwm->hwm_flags;
3829 4065 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3830 4066 mutex_exit(&ncec->ncec_lock);
3831 4067 }
3832 4068
3833 4069 void
3834 4070 ncec_refhold(ncec_t *ncec)
3835 4071 {
3836 4072 mutex_enter(&(ncec)->ncec_lock);
3837 4073 (ncec)->ncec_refcnt++;
3838 4074 ASSERT((ncec)->ncec_refcnt != 0);
3839 4075 #ifdef DEBUG
3840 4076 ncec_trace_ref(ncec);
3841 4077 #endif
3842 4078 mutex_exit(&(ncec)->ncec_lock);
3843 4079 }
3844 4080
3845 4081 void
3846 4082 ncec_refhold_notr(ncec_t *ncec)
3847 4083 {
3848 4084 mutex_enter(&(ncec)->ncec_lock);
3849 4085 (ncec)->ncec_refcnt++;
3850 4086 ASSERT((ncec)->ncec_refcnt != 0);
3851 4087 mutex_exit(&(ncec)->ncec_lock);
3852 4088 }
3853 4089
3854 4090 static void
3855 4091 ncec_refhold_locked(ncec_t *ncec)
3856 4092 {
3857 4093 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3858 4094 (ncec)->ncec_refcnt++;
3859 4095 #ifdef DEBUG
3860 4096 ncec_trace_ref(ncec);
3861 4097 #endif
3862 4098 }
3863 4099
3864 4100 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3865 4101 void
3866 4102 ncec_refrele(ncec_t *ncec)
3867 4103 {
3868 4104 mutex_enter(&(ncec)->ncec_lock);
3869 4105 #ifdef DEBUG
3870 4106 ncec_untrace_ref(ncec);
3871 4107 #endif
3872 4108 ASSERT((ncec)->ncec_refcnt != 0);
3873 4109 if (--(ncec)->ncec_refcnt == 0) {
3874 4110 ncec_inactive(ncec);
3875 4111 } else {
3876 4112 mutex_exit(&(ncec)->ncec_lock);
3877 4113 }
3878 4114 }
3879 4115
3880 4116 void
3881 4117 ncec_refrele_notr(ncec_t *ncec)
3882 4118 {
3883 4119 mutex_enter(&(ncec)->ncec_lock);
3884 4120 ASSERT((ncec)->ncec_refcnt != 0);
3885 4121 if (--(ncec)->ncec_refcnt == 0) {
3886 4122 ncec_inactive(ncec);
3887 4123 } else {
3888 4124 mutex_exit(&(ncec)->ncec_lock);
3889 4125 }
3890 4126 }
3891 4127
3892 4128 /*
3893 4129 * Common to IPv4 and IPv6.
3894 4130 */
3895 4131 void
3896 4132 nce_restart_timer(ncec_t *ncec, uint_t ms)
3897 4133 {
3898 4134 timeout_id_t tid;
3899 4135
3900 4136 ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3901 4137
3902 4138 /* First cancel any running timer */
3903 4139 mutex_enter(&ncec->ncec_lock);
3904 4140 tid = ncec->ncec_timeout_id;
3905 4141 ncec->ncec_timeout_id = 0;
3906 4142 if (tid != 0) {
3907 4143 mutex_exit(&ncec->ncec_lock);
3908 4144 (void) untimeout(tid);
3909 4145 mutex_enter(&ncec->ncec_lock);
3910 4146 }
3911 4147
3912 4148 /* Restart timer */
3913 4149 nce_start_timer(ncec, ms);
3914 4150 mutex_exit(&ncec->ncec_lock);
3915 4151 }
3916 4152
3917 4153 static void
3918 4154 nce_start_timer(ncec_t *ncec, uint_t ms)
3919 4155 {
3920 4156 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3921 4157 /*
3922 4158 * Don't start the timer if the ncec has been deleted, or if the timer
3923 4159 * is already running
3924 4160 */
3925 4161 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3926 4162 ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3927 4163 MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3928 4164 }
3929 4165 }
3930 4166
3931 4167 int
3932 4168 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3933 4169 uint16_t flags, nce_t **newnce)
3934 4170 {
3935 4171 uchar_t *hw_addr;
3936 4172 int err = 0;
3937 4173 ip_stack_t *ipst = ill->ill_ipst;
3938 4174 in6_addr_t dst6;
3939 4175 nce_t *nce;
3940 4176
3941 4177 ASSERT(!ill->ill_isv6);
3942 4178
3943 4179 IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3944 4180 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3945 4181 if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3946 4182 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3947 4183 goto done;
3948 4184 }
3949 4185 if (ill->ill_net_type == IRE_IF_RESOLVER) {
3950 4186 /*
3951 4187 * For IRE_IF_RESOLVER a hardware mapping can be
3952 4188 * generated, for IRE_IF_NORESOLVER, resolution cookie
3953 4189 * in the ill is copied in nce_add_v4().
3954 4190 */
3955 4191 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3956 4192 if (hw_addr == NULL) {
3957 4193 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3958 4194 return (ENOMEM);
3959 4195 }
3960 4196 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3961 4197 } else {
3962 4198 /*
3963 4199 * IRE_IF_NORESOLVER type simply copies the resolution
3964 4200 * cookie passed in. So no hw_addr is needed.
|
↓ open down ↓ |
694 lines elided |
↑ open up ↑ |
3965 4201 */
3966 4202 hw_addr = NULL;
3967 4203 }
3968 4204 ASSERT(flags & NCE_F_MCAST);
3969 4205 ASSERT(flags & NCE_F_NONUD);
3970 4206 /* nce_state will be computed by nce_add_common() */
3971 4207 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3972 4208 ND_UNCHANGED, &nce);
3973 4209 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3974 4210 if (err == 0)
3975 - err = nce_add_v4_postprocess(nce);
4211 + err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
3976 4212 if (hw_addr != NULL)
3977 4213 kmem_free(hw_addr, ill->ill_phys_addr_length);
3978 4214 if (err != 0) {
3979 4215 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3980 4216 return (err);
3981 4217 }
3982 4218 done:
3983 4219 if (newnce != NULL)
3984 4220 *newnce = nce;
3985 4221 else
3986 4222 nce_refrele(nce);
3987 4223 return (0);
3988 4224 }
3989 4225
/*
 * This is used when scanning for "old" (least recently broadcast) NCEs. We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 */
#define	NCE_RESCHED_LIST_LEN	8

typedef struct {
	ill_t		*ncert_ill;	/* ill whose nces are being scanned */
	uint_t		ncert_num;	/* valid entries in ncert_nces[] */
	ncec_t		*ncert_nces[NCE_RESCHED_LIST_LEN]; /* refheld batch */
} nce_resched_t;
4002 4238
/*
 * Pick the longest waiting NCEs for defense.
 *
 * nce_walk_common() callback: accumulate, in ncert->ncert_nces[], the
 * NCE_RESCHED_LIST_LEN eligible entries with the oldest
 * ncec_last_time_defended.  Each stored entry is refheld; the caller
 * (via ill_defend_rate_limit()) is responsible for the refrele.
 */
/* ARGSUSED */
static int
ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
{
	nce_resched_t *ncert = arg;
	ncec_t **ncecs;
	ncec_t **ncec_max;
	ncec_t *ncec_temp;
	ncec_t *ncec = nce->nce_common;

	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
	/*
	 * Only reachable entries that are ready for announcement are eligible.
	 */
	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
		return (0);
	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
		/* batch not yet full: just append */
		ncec_refhold(ncec);
		ncert->ncert_nces[ncert->ncert_num++] = ncec;
	} else {
		/*
		 * Batch is full: swap this entry into the slot of any
		 * younger (more recently defended) entry, pushing the
		 * displaced entry along; whatever is youngest at the end
		 * gets released.
		 */
		ncecs = ncert->ncert_nces;
		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
		ncec_refhold(ncec);
		for (; ncecs < ncec_max; ncecs++) {
			/* NOTE(review): this looks like it was meant to
			 * check *ncecs; ncec can't be NULL here. */
			ASSERT(ncec != NULL);
			if ((*ncecs)->ncec_last_time_defended >
			    ncec->ncec_last_time_defended) {
				ncec_temp = *ncecs;
				*ncecs = ncec;
				ncec = ncec_temp;
			}
		}
		ncec_refrele(ncec);
	}
	return (0);
}
4042 4278
/*
 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this
 * doesn't happen very often (if at all), and thus it needn't be highly
 * optimized. (Note, though, that it's actually O(N) complexity, because the
 * outer loop is bounded by a constant rather than by the length of the list.)
 */
static void
nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
{
	ncec_t		*ncec;
	ip_stack_t	*ipst = ill->ill_ipst;
	uint_t		i, defend_rate;

	/* reset the per-period counter, remembering its previous value */
	i = ill->ill_defend_count;
	ill->ill_defend_count = 0;
	if (ill->ill_isv6)
		defend_rate = ipst->ips_ndp_defend_rate;
	else
		defend_rate = ipst->ips_arp_defend_rate;
	/* If none could be sitting around, then don't reschedule */
	if (i < defend_rate) {
		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
		return;
	}
	ncert->ncert_ill = ill;
	/* mark the oldest entries NCE_F_DELAYED until the quota is used up */
	while (ill->ill_defend_count < defend_rate) {
		nce_walk_common(ill, ncec_reschedule, ncert);
		for (i = 0; i < ncert->ncert_num; i++) {

			ncec = ncert->ncert_nces[i];
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_flags |= NCE_F_DELAYED;
			mutex_exit(&ncec->ncec_lock);
			/*
			 * we plan to schedule this ncec, so incr the
			 * defend_count in anticipation.
			 */
			if (++ill->ill_defend_count >= defend_rate)
				break;
		}
		/* a short batch means the walk found nothing more to add */
		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
			break;
	}
}
4087 4323
/*
 * Check if the current rate-limiting parameters permit the sending
 * of another address defense announcement for both IPv4 and IPv6.
 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
 * permitted), and B_FALSE otherwise. The `defend_rate' parameter
 * determines how many address defense announcements are permitted
 * in any `defend_period' interval.
 */
static boolean_t
ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
{
	clock_t now = ddi_get_lbolt();
	ip_stack_t *ipst = ill->ill_ipst;
	clock_t start = ill->ill_defend_start;
	uint32_t elapsed, defend_period, defend_rate;
	nce_resched_t ncert;
	boolean_t ret;
	int i;

	if (ill->ill_isv6) {
		defend_period = ipst->ips_ndp_defend_period;
		defend_rate = ipst->ips_ndp_defend_rate;
	} else {
		defend_period = ipst->ips_arp_defend_period;
		defend_rate = ipst->ips_arp_defend_rate;
	}
	/* a rate of zero disables unbidden defense announcements entirely */
	if (defend_rate == 0)
		return (B_TRUE);
	bzero(&ncert, sizeof (ncert));
	mutex_enter(&ill->ill_lock);
	if (start > 0) {
		elapsed = now - start;
		if (elapsed > SEC_TO_TICK(defend_period)) {
			/* period expired: start a new one */
			ill->ill_defend_start = now;
			/*
			 * nce_ill_reschedule will attempt to
			 * prevent starvation by rescheduling the
			 * oldest entries, which are marked with
			 * the NCE_F_DELAYED flag.
			 */
			nce_ill_reschedule(ill, &ncert);
		}
	} else {
		ill->ill_defend_start = now;
	}
	ASSERT(ill->ill_defend_count <= defend_rate);
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_flags & NCE_F_DELAYED) {
		/*
		 * This ncec was rescheduled as one of the really old
		 * entries needing on-going defense. The
		 * ill_defend_count was already incremented in
		 * nce_ill_reschedule. Go ahead and send the announce.
		 */
		ncec->ncec_flags &= ~NCE_F_DELAYED;
		mutex_exit(&ncec->ncec_lock);
		ret = B_FALSE;
		goto done;
	}
	mutex_exit(&ncec->ncec_lock);
	/* consume one slot of this period's quota, if any remain */
	if (ill->ill_defend_count < defend_rate)
		ill->ill_defend_count++;
	if (ill->ill_defend_count == defend_rate) {
		/*
		 * we are no longer allowed to send unbidden defense
		 * messages. Wait for rescheduling.
		 */
		ret = B_TRUE;
	} else {
		ret = B_FALSE;
	}
done:
	mutex_exit(&ill->ill_lock);
	/*
	 * After all the locks have been dropped we can restart nce timer,
	 * and refrele the delayed ncecs
	 */
	for (i = 0; i < ncert.ncert_num; i++) {
		clock_t xmit_interval;
		ncec_t *tmp;

		tmp = ncert.ncert_nces[i];
		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
		    B_FALSE);
		nce_restart_timer(tmp, xmit_interval);
		ncec_refrele(tmp);
	}
	return (ret);
}
4177 4413
4178 4414 boolean_t
4179 4415 ndp_announce(ncec_t *ncec)
4180 4416 {
4181 4417 return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4182 4418 ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4183 4419 nce_advert_flags(ncec)));
4184 4420 }
4185 4421
/*
 * Choose a source address for the NS/ARP probe transmitted on behalf of
 * `ncec', writing it into *src (which must be :: on entry), and return a
 * refheld ill to transmit on (for IPMP, the bound/under ill), or NULL if
 * no usable source is available yet (e.g. still waiting on DAD).
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t *mp;
	in6_addr_t src6;
	ipaddr_t src4;
	ill_t *ill = ncec->ncec_ill;
	ill_t *src_ill = NULL;
	ipif_t *ipif = NULL;
	boolean_t is_myaddr = NCE_MYADDR(ncec);
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src6 = *src;
	if (is_myaddr) {
		/* DAD for one of our own addresses: probe from that address */
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t *ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t *ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses. Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6] below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface. We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		/* for IPMP, retry source selection on the cast ill */
		if (ipif == NULL && IS_IPMP(ill)) {
			ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		/* hand back a refheld transmit ill derived from the ipif */
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}
4328 4564
4329 4565 void
4330 4566 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4331 4567 uchar_t *hwaddr, int hwaddr_len, int flags)
4332 4568 {
4333 4569 ill_t *ill;
4334 4570 ncec_t *ncec;
4335 4571 nce_t *nce;
4336 4572 uint16_t new_state;
4337 4573
4338 4574 ill = (ipif ? ipif->ipif_ill : NULL);
4339 4575 if (ill != NULL) {
4340 4576 /*
4341 4577 * only one ncec is possible
4342 4578 */
4343 4579 nce = nce_lookup_v4(ill, addr);
4344 4580 if (nce != NULL) {
4345 4581 ncec = nce->nce_common;
4346 4582 mutex_enter(&ncec->ncec_lock);
4347 4583 if (NCE_ISREACHABLE(ncec))
4348 4584 new_state = ND_UNCHANGED;
4349 4585 else
4350 4586 new_state = ND_STALE;
4351 4587 ncec->ncec_flags = flags;
4352 4588 nce_update(ncec, new_state, hwaddr);
4353 4589 mutex_exit(&ncec->ncec_lock);
4354 4590 nce_refrele(nce);
4355 4591 return;
4356 4592 }
4357 4593 } else {
4358 4594 /*
4359 4595 * ill is wildcard; clean up all ncec's and ire's
4360 4596 * that match on addr.
4361 4597 */
4362 4598 nce_hw_map_t hwm;
4363 4599
4364 4600 hwm.hwm_addr = *addr;
4365 4601 hwm.hwm_hwlen = hwaddr_len;
4366 4602 hwm.hwm_hwaddr = hwaddr;
4367 4603 hwm.hwm_flags = flags;
4368 4604
4369 4605 ncec_walk_common(ipst->ips_ndp4, NULL,
4370 4606 nce_update_hw_changed, &hwm, B_TRUE);
4371 4607 }
4372 4608 }
4373 4609
/*
 * Common function to add ncec entries.
 * we always add the ncec with ncec_ill == ill, and always create
 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
 * ncec is !reachable.
 *
 * When the caller passes in an nce_state of ND_UNCHANGED,
 * nce_add_common() will determine the state of the created nce based
 * on the ill_net_type and nce_flags used. Otherwise, the nce will
 * be created with state set to the passed in nce_state.
 *
 * Returns 0 with *retnce set on success (caller must still trigger
 * fastpath on *retnce), or an errno on failure.  Caller must hold
 * the per-family ndp_g_lock.
 */
static int
nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
{
	static	ncec_t		nce_nil;
	uchar_t		*template = NULL;
	int		err;
	ncec_t		*ncec;
	ncec_t		**ncep;
	ip_stack_t	*ipst = ill->ill_ipst;
	uint16_t	state;
	boolean_t	fastprobe = B_FALSE;
	struct ndp_g_s	*ndp;
	nce_t		*nce = NULL;
	list_t		graveyard;	/* mcast nces reaped by nce_add_impl() */
	mblk_t		*dlur_mp = NULL;

	if (ill->ill_isv6)
		ndp = ill->ill_ipst->ips_ndp6;
	else
		ndp = ill->ill_ipst->ips_ndp4;

	*retnce = NULL;

	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));

	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
		ip0dbg(("nce_add_common: no addr\n"));
		return (EINVAL);
	}
	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
		return (EINVAL);
	}

	/* Locate the hash bucket for this (v4-mapped or v6) address. */
	if (ill->ill_isv6) {
		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
	} else {
		ipaddr_t v4addr;

		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
	}

	/*
	 * The caller has ensured that there is no nce on ill, but there could
	 * still be an nce_common_t for the address, so that we find existing
	 * ncec_t structures first, and atomically add a new nce_t if
	 * one is found. The ndp_g_lock ensures that we don't cross threads
	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
	 * compare for matches across the illgrp because this function is
	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
	 * appropriate.
	 */
	ncec = *ncep;
	for (; ncec != NULL; ncec = ncec->ncec_next) {
		if (ncec->ncec_ill == ill) {
			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
				/*
				 * We should never find *retnce to be
				 * MYADDR, since the caller may then
				 * incorrectly restart a DAD timer that's
				 * already running. However, if we are in
				 * forwarding mode, and the interface is
				 * moving in/out of groups, the data
				 * path ire lookup (e.g., ire_revalidate_nce)
				 * may have determined that some destination
				 * is offlink while the control path is adding
				 * that address as a local address.
				 * Recover from this case by failing the
				 * lookup
				 */
				if (NCE_MYADDR(ncec))
					return (ENXIO);
				*retnce = nce_ill_lookup_then_add(ill, ncec);
				if (*retnce != NULL)
					break;
			}
		}
	}
	if (*retnce != NULL) /* caller must trigger fastpath on nce */
		return (0);

	/* No existing ncec: build a fresh one. */
	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
	if (ncec == NULL)
		return (ENOMEM);
	*ncec = nce_nil;
	ncec->ncec_ill = ill;
	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
	ncec->ncec_flags = flags;
	ncec->ncec_ipst = ipst;	/* No netstack_hold */

	if (!ill->ill_isv6) {
		ipaddr_t addr4;

		/*
		 * DAD probe interval and probe count are set based on
		 * fast/slow probe settings. If the underlying link doesn't
		 * have reliably up/down notifications or if we're working
		 * with IPv4 169.254.0.0/16 Link Local Address space, then
		 * don't use the fast timers. Otherwise, use them.
		 */
		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
			fastprobe = B_TRUE;
		} else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
		    !IS_IPV4_LL_SPACE(&addr4)) {
			ill_t *hwaddr_ill;

			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
			    hw_addr_len);
			if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
				fastprobe = B_TRUE;
		}
		if (fastprobe) {
			ncec->ncec_xmit_interval =
			    ipst->ips_arp_fastprobe_interval;
			ncec->ncec_pcnt =
			    ipst->ips_arp_fastprobe_count;
			ncec->ncec_flags |= NCE_F_FAST;
		} else {
			ncec->ncec_xmit_interval =
			    ipst->ips_arp_probe_interval;
			ncec->ncec_pcnt =
			    ipst->ips_arp_probe_count;
		}
		if (NCE_PUBLISH(ncec)) {
			ncec->ncec_unsolicit_count =
			    ipst->ips_ip_arp_publish_count;
		}
	} else {
		/*
		 * probe interval is constant: ILL_PROBE_INTERVAL
		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
		 */
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		if (NCE_PUBLISH(ncec)) {
			ncec->ncec_unsolicit_count =
			    ipst->ips_ip_ndp_unsolicit_count;
		}
	}
	ncec->ncec_rcnt = ill->ill_xmit_count;
	ncec->ncec_addr = *addr;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_refcnt = 1;	/* for ncec getting created */
	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
	ncec->ncec_trace_disable = B_FALSE;

	/*
	 * ncec_lladdr holds link layer address
	 */
	if (hw_addr_len > 0) {
		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
		if (template == NULL) {
			err = ENOMEM;
			goto err_ret;
		}
		ncec->ncec_lladdr = template;
		ncec->ncec_lladdr_length = hw_addr_len;
		bzero(ncec->ncec_lladdr, hw_addr_len);
	}
	if ((flags & NCE_F_BCAST) != 0) {
		state = ND_REACHABLE;
		ASSERT(hw_addr_len > 0);
	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
		state = ND_INITIAL;
	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
		/*
		 * NORESOLVER entries are always created in the REACHABLE
		 * state.
		 */
		state = ND_REACHABLE;
		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
		    ill->ill_mactype != DL_IPV4 &&
		    ill->ill_mactype != DL_6TO4) {
			/*
			 * We create a nce_res_mp with the IP nexthop address
			 * as the destination address if the physical length
			 * is exactly 4 bytes for point-to-multipoint links
			 * that do their own resolution from IP to link-layer
			 * address (e.g. IP over X.25).
			 */
			bcopy((uchar_t *)addr,
			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
		}
		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
		    ill->ill_mactype != DL_IPV6) {
			/*
			 * We create a nce_res_mp with the IP nexthop address
			 * as the destination address if the physical length
			 * is exactly 16 bytes for point-to-multipoint links
			 * that do their own resolution from IP to link-layer
			 * address.
			 */
			bcopy((uchar_t *)addr,
			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
		}
		/*
		 * Since NUD is not part of the base IPv4 protocol definition,
		 * IPv4 neighbor entries on NORESOLVER interfaces will never
		 * age, and are marked NCE_F_NONUD.
		 */
		if (!ill->ill_isv6)
			ncec->ncec_flags |= NCE_F_NONUD;
	} else if (ill->ill_net_type == IRE_LOOPBACK) {
		state = ND_REACHABLE;
	}

	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
		/*
		 * We are adding an ncec with a deterministic hw_addr,
		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
		 *
		 * if we are adding a unicast ncec for the local address
		 * it would be REACHABLE; we would be adding a ND_STALE entry
		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
		 * addresses are added in PROBE to trigger DAD.
		 */
		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
		    ill->ill_net_type == IRE_IF_NORESOLVER)
			state = ND_REACHABLE;
		else if (!NCE_PUBLISH(ncec))
			state = ND_STALE;
		else
			state = ND_PROBE;
		if (hw_addr != NULL)
			nce_set_ll(ncec, hw_addr);
	}
	/* caller overrides internally computed state */
	if (nce_state != ND_UNCHANGED)
		state = nce_state;

	/*
	 * NOTE(review): if none of the branches above assigned `state' and
	 * nce_state == ND_UNCHANGED, `state' would be used uninitialized
	 * below.  Presumably unreachable for the ill_net_type values in
	 * use -- confirm.
	 */
	if (state == ND_PROBE)
		ncec->ncec_flags |= NCE_F_UNVERIFIED;

	ncec->ncec_state = state;

	if (state == ND_REACHABLE) {
		ncec->ncec_last = ncec->ncec_init_time =
		    TICK_TO_MSEC(ddi_get_lbolt64());
	} else {
		ncec->ncec_last = 0;
		if (state == ND_INITIAL)
			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
	}
	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
	    offsetof(ncec_cb_t, ncec_cb_node));
	/*
	 * have all the memory allocations out of the way before taking locks
	 * and adding the nce.
	 */
	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
	if (nce == NULL) {
		err = ENOMEM;
		goto err_ret;
	}
	if (ncec->ncec_lladdr != NULL ||
	    ill->ill_net_type == IRE_IF_NORESOLVER) {
		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
		    ill->ill_phys_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		if (dlur_mp == NULL) {
			err = ENOMEM;
			goto err_ret;
		}
	}

	/*
	 * Atomically ensure that the ill is not CONDEMNED, before
	 * adding the NCE.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		err = EINVAL;
		goto err_ret;
	}
	if (!NCE_MYADDR(ncec) &&
	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
		mutex_exit(&ill->ill_lock);
		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
		err = EINVAL;
		goto err_ret;
	}
	/*
	 * Acquire the ncec_lock even before adding the ncec to the list
	 * so that it cannot get deleted after the ncec is added, but
	 * before we add the nce.
	 */
	mutex_enter(&ncec->ncec_lock);
	if ((ncec->ncec_next = *ncep) != NULL)
		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
	*ncep = ncec;
	ncec->ncec_ptpn = ncep;

	/* Bump up the number of ncec's referencing this ill */
	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
	    (char *), "ncec", (void *), ncec);
	ill->ill_ncec_cnt++;
	/*
	 * Since we hold the ncec_lock at this time, the ncec cannot be
	 * condemned, and we can safely add the nce.
	 */
	/*
	 * nce_add_impl() may refuse a multicast nce (per-ill limit) and
	 * may move reaped multicast nces onto `graveyard'; those must be
	 * freed only after both locks are dropped.
	 */
	list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
	mutex_exit(&ncec->ncec_lock);
	mutex_exit(&ill->ill_lock);
	nce_graveyard_free(&graveyard);

	/* caller must trigger fastpath on *retnce */
	return (0);

err_ret:
	if (ncec != NULL)
		kmem_cache_free(ncec_cache, ncec);
	if (nce != NULL)
		kmem_cache_free(nce_cache, nce);
	freemsg(dlur_mp);
	if (template != NULL)
		kmem_free(template, ill->ill_phys_addr_length);
	return (err);
}
4706 4945
4707 4946 /*
4708 4947 * take a ref on the nce
4709 4948 */
4710 4949 void
4711 4950 nce_refhold(nce_t *nce)
4712 4951 {
4713 4952 mutex_enter(&nce->nce_lock);
4714 4953 nce->nce_refcnt++;
4715 4954 ASSERT((nce)->nce_refcnt != 0);
4716 4955 mutex_exit(&nce->nce_lock);
4717 4956 }
4718 4957
4719 4958 /*
4720 4959 * release a ref on the nce; In general, this
4721 4960 * cannot be called with locks held because nce_inactive
4722 4961 * may result in nce_inactive which will take the ill_lock,
4723 4962 * do ipif_ill_refrele_tail etc. Thus the one exception
4724 4963 * where this can be called with locks held is when the caller
4725 4964 * is certain that the nce_refcnt is sufficient to prevent
4726 4965 * the invocation of nce_inactive.
4727 4966 */
4728 4967 void
4729 4968 nce_refrele(nce_t *nce)
4730 4969 {
4731 4970 ASSERT((nce)->nce_refcnt != 0);
4732 4971 mutex_enter(&nce->nce_lock);
4733 4972 if (--nce->nce_refcnt == 0)
4734 4973 nce_inactive(nce); /* destroys the mutex */
4735 4974 else
4736 4975 mutex_exit(&nce->nce_lock);
4737 4976 }
4738 4977
/*
 * free the nce after all refs have gone away.
 *
 * Releases the nce's hold on its ncec, frees the queued fastpath/dlur
 * mblks, drops the ill's nce count, and finally destroys the nce itself.
 * Called (only) from nce_refrele() with nce_lock held and refcnt == 0.
 */
static void
nce_inactive(nce_t *nce)
{
	ill_t *ill = nce->nce_ill;

	ASSERT(nce->nce_refcnt == 0);

	/* drop this nce's hold on the shared ncec */
	ncec_refrele_notr(nce->nce_common);
	nce->nce_common = NULL;
	freemsg(nce->nce_fp_mp);
	freemsg(nce->nce_dlur_mp);

	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt--;
	nce->nce_ill = NULL;
	/*
	 * If the number of ncec's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&nce->nce_lock);
	kmem_cache_free(nce_cache, nce);
}
4774 5013
/*
 * Add an nce to the ill_nce list.
 *
 * Adding multicast NCEs is subject to a per-ill limit. This function returns
 * NULL if that's the case, and it may reap a number of multicast nces.
 * Callers (and upstack) must be able to cope with NULL returns.
 *
 * Reaped nces are placed on `graveyard'; the caller must free them (via
 * nce_graveyard_free()) after dropping the ill_lock.  On success, the
 * returned nce carries two references: one for the calling thread and
 * one for its membership on the ill_nce list.
 */
static nce_t *
nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
    list_t *graveyard)
{
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
		/* enforce the per-ill multicast nce limit */
		if (nce_too_many_mcast(ill, graveyard)) {
			kmem_cache_free(nce_cache, nce);
			return (NULL);
		}
		ill->ill_mcast_nces++;
	}

	bzero(nce, sizeof (*nce));
	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
	nce->nce_common = ncec;
	nce->nce_addr = ncec->ncec_addr;
	nce->nce_ill = ill;
	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt++;

	nce->nce_refcnt = 1;	/* for the thread */
	ncec->ncec_refcnt++;	/* want ncec_refhold_locked_notr(ncec) */
	nce->nce_dlur_mp = dlur_mp;

	/* add nce to the ill's fastpath list. */
	nce->nce_refcnt++;	/* for the list */
	list_insert_head(&ill->ill_nce, nce);
	return (nce);
}
4799 5053
/*
 * Allocate an nce (and, when needed, its DLPI unitdata-request mblk) and
 * add it to ill's nce list via nce_add_impl().  Returns NULL on allocation
 * failure or when the per-ill multicast nce limit is hit; reaped multicast
 * nces, if any, are placed on `graveyard' for the caller to free.
 */
static nce_t *
nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
{
	nce_t	*nce;
	mblk_t	*dlur_mp = NULL;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
	if (nce == NULL)
		return (NULL);
	if (ncec->ncec_lladdr != NULL ||
	    ill->ill_net_type == IRE_IF_NORESOLVER) {
		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
		    ill->ill_phys_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		if (dlur_mp == NULL) {
			kmem_cache_free(nce_cache, nce);
			return (NULL);
		}
	}
	/*
	 * If nce_add_impl() returns NULL due to multicast limiting, the
	 * caller will (correctly) assume ENOMEM.
	 */
	return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
}
4824 5082
4825 5083 /*
4826 5084 * remove the nce from the ill_faspath list
4827 5085 */
4828 5086 void
4829 5087 nce_delete(nce_t *nce)
4830 5088 {
4831 5089 ill_t *ill = nce->nce_ill;
4832 5090
4833 5091 ASSERT(MUTEX_HELD(&ill->ill_lock));
4834 5092
4835 5093 mutex_enter(&nce->nce_lock);
|
↓ open down ↓ |
3 lines elided |
↑ open up ↑ |
4836 5094 if (nce->nce_is_condemned) {
4837 5095 /*
4838 5096 * some other thread has removed this nce from the ill_nce list
4839 5097 */
4840 5098 mutex_exit(&nce->nce_lock);
4841 5099 return;
4842 5100 }
4843 5101 nce->nce_is_condemned = B_TRUE;
4844 5102 mutex_exit(&nce->nce_lock);
4845 5103
5104 + /* Update the count of multicast NCEs. */
5105 + if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
5106 + ill->ill_mcast_nces--;
5107 +
4846 5108 list_remove(&ill->ill_nce, nce);
4847 5109 /*
4848 5110 * even though we are holding the ill_lock, it is ok to
4849 5111 * call nce_refrele here because we know that we should have
4850 5112 * at least 2 refs on the nce: one for the thread, and one
4851 5113 * for the list. The refrele below will release the one for
4852 5114 * the list.
4853 5115 */
4854 5116 nce_refrele(nce);
4855 5117 }
4856 5118
4857 5119 nce_t *
4858 5120 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4859 5121 {
4860 5122 nce_t *nce = NULL;
4861 5123
4862 5124 ASSERT(ill != NULL);
4863 5125 ASSERT(MUTEX_HELD(&ill->ill_lock));
4864 5126
4865 5127 for (nce = list_head(&ill->ill_nce); nce != NULL;
4866 5128 nce = list_next(&ill->ill_nce, nce)) {
4867 5129 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4868 5130 break;
4869 5131 }
4870 5132
4871 5133 /*
4872 5134 * if we found the nce on the ill_nce list while holding
4873 5135 * the ill_lock, then it cannot be condemned yet.
4874 5136 */
4875 5137 if (nce != NULL) {
4876 5138 ASSERT(!nce->nce_is_condemned);
4877 5139 nce_refhold(nce);
4878 5140 }
4879 5141 return (nce);
4880 5142 }
4881 5143
4882 5144 /*
4883 5145 * Walk the ill_nce list on ill. The callback function func() cannot perform
4884 5146 * any destructive actions.
4885 5147 */
4886 5148 static void
4887 5149 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4888 5150 {
4889 5151 nce_t *nce = NULL, *nce_next;
4890 5152
4891 5153 ASSERT(MUTEX_HELD(&ill->ill_lock));
4892 5154 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4893 5155 nce_next = list_next(&ill->ill_nce, nce);
4894 5156 if (func(ill, nce, arg) != 0)
4895 5157 break;
4896 5158 nce = nce_next;
4897 5159 }
4898 5160 }
4899 5161
/*
 * Locked wrapper around nce_walk_common(): walk ill's nce list with
 * ill_lock held.  func() must be non-destructive; see nce_walk_common().
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}
4907 5169
4908 5170 void
4909 5171 nce_flush(ill_t *ill, boolean_t flushall)
4910 5172 {
4911 5173 nce_t *nce, *nce_next;
4912 5174 list_t dead;
4913 5175
4914 5176 list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4915 5177 mutex_enter(&ill->ill_lock);
4916 5178 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4917 5179 nce_next = list_next(&ill->ill_nce, nce);
4918 5180 if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4919 5181 nce = nce_next;
4920 5182 continue;
4921 5183 }
4922 5184 /*
4923 5185 * nce_delete requires that the caller should either not
4924 5186 * be holding locks, or should hold a ref to ensure that
4925 5187 * we wont hit ncec_inactive. So take a ref and clean up
4926 5188 * after the list is flushed.
4927 5189 */
4928 5190 nce_refhold(nce);
4929 5191 nce_delete(nce);
4930 5192 list_insert_tail(&dead, nce);
4931 5193 nce = nce_next;
4932 5194 }
4933 5195 mutex_exit(&ill->ill_lock);
4934 5196 while ((nce = list_head(&dead)) != NULL) {
4935 5197 list_remove(&dead, nce);
4936 5198 nce_refrele(nce);
4937 5199 }
4938 5200 ASSERT(list_is_empty(&dead));
4939 5201 list_destroy(&dead);
4940 5202 }
4941 5203
4942 5204 /* Return an interval that is anywhere in the [1 .. intv] range */
4943 5205 static clock_t
4944 5206 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4945 5207 {
4946 5208 clock_t rnd, frac;
4947 5209
4948 5210 (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4949 5211 /* Note that clock_t is signed; must chop off bits */
4950 5212 rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4951 5213 if (initial_time) {
4952 5214 if (intv <= 0)
4953 5215 intv = 1;
4954 5216 else
4955 5217 intv = (rnd % intv) + 1;
4956 5218 } else {
4957 5219 /* Compute 'frac' as 20% of the configured interval */
4958 5220 if ((frac = intv / 5) <= 1)
4959 5221 frac = 2;
4960 5222 /* Set intv randomly in the range [intv-frac .. intv+frac] */
4961 5223 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4962 5224 intv = 1;
4963 5225 }
4964 5226 return (intv);
4965 5227 }
4966 5228
/*
 * Address resolution completed on an IPMP meta-interface: drain the
 * packets queued on the ncec.  For each queued packet, pick a suitable
 * underlying (cast) ill -- from the packet's source address when it maps
 * to a non-IPMP ipif, otherwise via the IPMP transmit rotor -- create an
 * under-nce on it, and transmit.  Packets for which no under-ill or
 * under-nce can be found are dropped and counted in ipIfStatsOutDiscards.
 * Finishes by dispatching any completion callbacks on the ncec.
 */
void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
	mblk_t *mp;
	uint_t pkt_len;
	iaflags_t ixaflags = IXAF_NO_TRACE;
	nce_t *under_nce;
	ill_t *ill = ncec->ncec_ill;
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ipif_t *src_ipif = NULL;
	ip_stack_t *ipst = ill->ill_ipst;
	ill_t *send_ill;
	uint_t nprobes;

	ASSERT(IS_IPMP(ill));

	/* atomically take over the queued chain and the probe count */
	mutex_enter(&ncec->ncec_lock);
	nprobes = ncec->ncec_nprobes;
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		if (isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
			    ill, ALL_ZONES, ipst);
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
			    ill, ALL_ZONES, ipst);
		}

		/*
		 * find a new nce based on an under_ill. The first IPMP probe
		 * packet gets queued, so we could still find a src_ipif that
		 * matches an IPMP test address.
		 */
		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
			/*
			 * if src_ipif is null, this could be either a
			 * forwarded packet or a probe whose src got deleted.
			 * We identify the former case by looking for the
			 * ncec_nprobes: the first ncec_nprobes packets are
			 * probes;
			 */
			if (src_ipif == NULL && nprobes > 0)
				goto drop_pkt;

			/*
			 * For forwarded packets, we use the ipmp rotor
			 * to find send_ill.
			 */
			send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
			    B_TRUE);
		} else {
			send_ill = src_ipif->ipif_ill;
			ill_refhold(send_ill);
		}

		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
		    (ncec_t *), ncec, (ipif_t *),
		    src_ipif, (ill_t *), send_ill);

		if (send_ill == NULL) {
			if (src_ipif != NULL)
				ipif_refrele(src_ipif);
			goto drop_pkt;
		}
		/* create an under_nce on send_ill */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
			under_nce = nce_fastpath_create(send_ill, ncec);
		else
			under_nce = NULL;
		rw_exit(&ipst->ips_ill_g_lock);
		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
			nce_fastpath_trigger(under_nce);

		ill_refrele(send_ill);
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);

		if (under_nce != NULL) {
			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(under_nce);
			if (nprobes > 0)
				nprobes--;
			mp = nxt_mp;
			continue;
		}
drop_pkt:
		if (isv6) {
			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
		} else {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		}
		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
		freemsg(mp);
		if (nprobes > 0)
			nprobes--;
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec); /* complete callbacks */
}
|
↓ open down ↓ |
227 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX