1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * Copyright (c) 2018, Joyent, Inc.
27 */
28
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/sysmacros.h>
34 #include <sys/errno.h>
35 #include <sys/dlpi.h>
36 #include <sys/socket.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/vtrace.h>
42 #include <sys/kmem.h>
43 #include <sys/zone.h>
44 #include <sys/ethernet.h>
45 #include <sys/sdt.h>
46 #include <sys/mac.h>
47
48 #include <net/if.h>
49 #include <net/if_types.h>
50 #include <net/if_dl.h>
51 #include <net/route.h>
52 #include <netinet/in.h>
53 #include <netinet/ip6.h>
54 #include <netinet/icmp6.h>
55
56 #include <inet/common.h>
57 #include <inet/mi.h>
58 #include <inet/mib2.h>
59 #include <inet/nd.h>
60 #include <inet/ip.h>
61 #include <inet/ip_impl.h>
62 #include <inet/ipclassifier.h>
63 #include <inet/ip_if.h>
64 #include <inet/ip_ire.h>
65 #include <inet/ip_rts.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_ndp.h>
68 #include <inet/sctp_ip.h>
69 #include <inet/ip_arp.h>
70 #include <inet/ip2mac_impl.h>
71
72 #define ANNOUNCE_INTERVAL(isv6) \
73 (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
74 ipst->ips_ip_arp_publish_interval)
75
76 #define DEFENSE_INTERVAL(isv6) \
77 (isv6 ? ipst->ips_ndp_defend_interval : \
78 ipst->ips_arp_defend_interval)
79
80 /* Non-tunable probe interval, based on link capabilities */
81 #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)
82
83 /*
84 * The IPv4 Link Local address space is special; we do extra duplicate checking
85 * there, as the entire assignment mechanism rests on random numbers.
86 */
87 #define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \
88 ((uchar_t *)ptr)[1] == 254)
89
90 /*
91 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
92 * in to the ncec*add* functions.
93 *
94 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
95 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
96 * that we will respond to requests for the protocol address.
97 */
98 #define NCE_EXTERNAL_FLAGS_MASK \
99 (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
100 NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
101 NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
102
103 /*
104 * Lock ordering:
105 *
106 * ndp_g_lock -> ill_lock -> ncec_lock
107 *
108 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
109 * ncec_next. ncec_lock protects the contents of the NCE (particularly
110 * ncec_refcnt).
111 */
112
113 static void nce_cleanup_list(ncec_t *ncec);
114 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
115 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
116 ncec_t *);
117 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *);
118 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
119 uint16_t ncec_flags, nce_t **newnce);
120 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
121 uint16_t ncec_flags, nce_t **newnce);
122 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation,
123 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
124 const in6_addr_t *target, int flag);
125 static void ncec_refhold_locked(ncec_t *);
126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
127 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
128 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
129 uint16_t, uint16_t, nce_t **);
130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
131 static nce_t *nce_add(ill_t *, ncec_t *);
132 static void nce_inactive(nce_t *);
133 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
135 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
136 uint16_t, uint16_t, nce_t **);
137 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
138 uint16_t, uint16_t, nce_t **);
139 static int nce_add_v6_postprocess(nce_t *);
140 static int nce_add_v4_postprocess(nce_t *);
141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
143 static void nce_resolv_ipmp_ok(ncec_t *);
144 static void nce_walk_common(ill_t *, pfi_t, void *);
145 static void nce_start_timer(ncec_t *, uint_t);
146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
147 static void nce_fastpath_trigger(nce_t *);
148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
149
150 #ifdef DEBUG
151 static void ncec_trace_cleanup(const ncec_t *);
152 #endif
153
154 #define NCE_HASH_PTR_V4(ipst, addr) \
155 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
156
157 #define NCE_HASH_PTR_V6(ipst, addr) \
158 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
159 NCE_TABLE_SIZE)]))
160
161 extern kmem_cache_t *ncec_cache;
162 extern kmem_cache_t *nce_cache;
163
164 /*
 * Send out an IPv6 (unicast) or IPv4 (broadcast) DAD probe
166 * If src_ill is not null, the ncec_addr is bound to src_ill. The
167 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
168 * the probe is sent on the ncec_ill (in the non-IPMP case) or the
169 * IPMP cast_ill (in the IPMP case).
170 *
171 * Note that the probe interval is based on the src_ill for IPv6, and
172 * the ncec_xmit_interval for IPv4.
173 */
static void
nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
{
	boolean_t dropped;
	uint32_t probe_interval;

	/* DAD is only performed on unicast entries. */
	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
	if (ncec->ncec_ipversion == IPV6_VERSION) {
		/*
		 * IPv6: send a Neighbor Solicitation with the unspecified
		 * source address (NDP_PROBE) out of src_ill; the probe
		 * interval is derived from src_ill's link capabilities.
		 */
		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
		probe_interval = ILL_PROBE_INTERVAL(src_ill);
	} else {
		/* IPv4 DAD delays the initial probe. */
		if (send_probe)
			dropped = arp_probe(ncec);
		else
			dropped = B_TRUE;
		/*
		 * Fuzz the retransmit interval; treat a deliberately
		 * delayed first probe as the "initial" case.
		 */
		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
		    !send_probe);
	}
	/* Only probes that actually made it out count against ncec_pcnt. */
	if (!dropped) {
		mutex_enter(&ncec->ncec_lock);
		ncec->ncec_pcnt--;
		mutex_exit(&ncec->ncec_lock);
	}
	nce_restart_timer(ncec, probe_interval);
}
203
204 /*
205 * Compute default flags to use for an advertisement of this ncec's address.
206 */
207 static int
208 nce_advert_flags(const ncec_t *ncec)
209 {
210 int flag = 0;
211
212 if (ncec->ncec_flags & NCE_F_ISROUTER)
213 flag |= NDP_ISROUTER;
214 if (!(ncec->ncec_flags & NCE_F_ANYCAST))
215 flag |= NDP_ORIDE;
216
217 return (flag);
218 }
219
220 /*
221 * NDP Cache Entry creation routine.
222 * This routine must always be called with ndp6->ndp_g_lock held.
223 */
224 int
225 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
226 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
227 {
228 int err;
229 nce_t *nce;
230
231 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
232 ASSERT(ill != NULL && ill->ill_isv6);
233
234 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
235 &nce);
236 if (err != 0)
237 return (err);
238 ASSERT(newnce != NULL);
239 *newnce = nce;
240 return (err);
241 }
242
243 /*
244 * Post-processing routine to be executed after nce_add_v6(). This function
245 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
246 * and must be called without any locks held.
247 */
int
nce_add_v6_postprocess(nce_t *nce)
{
	ncec_t		*ncec = nce->nce_common;
	boolean_t	dropped = B_FALSE;
	uchar_t		*hw_addr = ncec->ncec_lladdr;
	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
	ill_t		*ill = ncec->ncec_ill;
	int		err = 0;
	uint16_t	flags = ncec->ncec_flags;
	ip_stack_t	*ipst = ill->ill_ipst;
	boolean_t	trigger_fastpath = B_TRUE;

	/*
	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
	 * We call nce_fastpath from nce_update if the link layer address of
	 * the peer changes from nce_update
	 */
	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
		trigger_fastpath = B_FALSE;

	if (trigger_fastpath)
		nce_fastpath_trigger(nce);
	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
		ill_t *hwaddr_ill;
		/*
		 * Unicast entry that needs DAD.  For IPMP, probe out of the
		 * ill in the group that owns this hardware address; otherwise
		 * probe out of ill itself.
		 */
		if (IS_IPMP(ill)) {
			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
			    hw_addr, hw_addr_len);
		} else {
			hwaddr_ill = ill;
		}
		nce_dad(ncec, hwaddr_ill, B_TRUE);
		/* DAD is asynchronous; tell the caller it is underway. */
		err = EINPROGRESS;
	} else if (flags & NCE_F_UNSOL_ADV) {
		/*
		 * We account for the transmit below by assigning one
		 * less than the ndd variable. Subsequent decrements
		 * are done in nce_timer.
		 */
		mutex_enter(&ncec->ncec_lock);
		ncec->ncec_unsolicit_count =
		    ipst->ips_ip_ndp_unsolicit_count - 1;
		mutex_exit(&ncec->ncec_lock);
		/* First unsolicited advert is sent inline here. */
		dropped = ndp_xmit(ill,
		    ND_NEIGHBOR_ADVERT,
		    hw_addr,
		    hw_addr_len,
		    &ncec->ncec_addr, /* Source and target of the adv */
		    &ipv6_all_hosts_mcast, /* Destination of the packet */
		    nce_advert_flags(ncec));
		mutex_enter(&ncec->ncec_lock);
		/* A dropped transmit doesn't consume a count. */
		if (dropped)
			ncec->ncec_unsolicit_count++;
		else
			ncec->ncec_last_time_defended = ddi_get_lbolt();
		/* Remaining adverts are driven by the NCE timer. */
		if (ncec->ncec_unsolicit_count != 0) {
			nce_start_timer(ncec,
			    ipst->ips_ip_ndp_unsolicit_interval);
		}
		mutex_exit(&ncec->ncec_lock);
	}
	return (err);
}
316
317 /*
318 * Atomically lookup and add (if needed) Neighbor Cache information for
319 * an address.
320 *
321 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
322 * are always added pointing at the ipmp_ill. Thus, when the ill passed
323 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
324 * entries will be created, both pointing at the same ncec_t. The nce_t
325 * entries will have their nce_ill set to the ipmp_ill and the under_ill
326 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
327 * Local addresses are always created on the ill passed to nce_add_v6.
328 */
int
nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int		err = 0;
	ip_stack_t	*ipst = ill->ill_ipst;
	nce_t		*nce, *upper_nce = NULL;
	ill_t		*in_ill = ill;
	boolean_t	need_ill_refrele = B_FALSE;

	if (flags & NCE_F_MCAST) {
		/*
		 * hw_addr will be figured out in nce_set_multicast_v6;
		 * caller has to select the cast_ill
		 */
		ASSERT(hw_addr == NULL);
		ASSERT(!IS_IPMP(ill));
		err = nce_set_multicast_v6(ill, addr, flags, newnce);
		return (err);
	}
	ASSERT(ill->ill_isv6);
	/*
	 * Non-local addresses on an under-ill are tracked on the ipmp_ill
	 * (see the block comment above); switch ills and remember to
	 * release the hold on the way out.
	 */
	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
		ill = ipmp_ill_hold_ipmp_ill(ill);
		if (ill == NULL)
			return (ENXIO);
		need_ill_refrele = B_TRUE;
	}

	/* Lookup and add must be atomic under the global v6 NDP lock. */
	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	nce = nce_lookup_addr(ill, addr);
	if (nce == NULL) {
		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
		    &nce);
	} else {
		err = EEXIST;
	}
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	/* Post-processing (fastpath/DAD) must run without locks held. */
	if (err == 0)
		err = nce_add_v6_postprocess(nce);
	if (in_ill != ill && nce != NULL) {
		nce_t *under_nce = NULL;

		/*
		 * in_ill was the under_ill. Try to create the under_nce.
		 * Hold the ill_g_lock to prevent changes to group membership
		 * until we are done.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
			    ill_t *, ill);
			rw_exit(&ipst->ips_ill_g_lock);
			err = ENXIO;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
		if (under_nce == NULL) {
			rw_exit(&ipst->ips_ill_g_lock);
			err = EINVAL;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		rw_exit(&ipst->ips_ill_g_lock);
		upper_nce = nce;
		nce = under_nce; /* will be returned to caller */
		if (NCE_ISREACHABLE(nce->nce_common))
			nce_fastpath_trigger(under_nce);
	}
	/* nce_refrele is deferred until the lock is dropped */
	if (nce != NULL) {
		if (newnce != NULL)
			*newnce = nce;
		else
			nce_refrele(nce);
	}
bail:
	if (upper_nce != NULL)
		nce_refrele(upper_nce);
	if (need_ill_refrele)
		ill_refrele(ill);
	return (err);
}
414
415 /*
416 * Remove all the CONDEMNED nces from the appropriate hash table.
417 * We create a private list of NCEs, these may have ires pointing
418 * to them, so the list will be passed through to clean up dependent
419 * ires and only then we can do ncec_refrele() which can make NCE inactive.
420 */
421 static void
422 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
423 {
424 ncec_t *ncec1;
425 ncec_t **ptpn;
426
427 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
428 ASSERT(ndp->ndp_g_walker == 0);
429 for (; ncec; ncec = ncec1) {
430 ncec1 = ncec->ncec_next;
431 mutex_enter(&ncec->ncec_lock);
432 if (NCE_ISCONDEMNED(ncec)) {
433 ptpn = ncec->ncec_ptpn;
434 ncec1 = ncec->ncec_next;
435 if (ncec1 != NULL)
436 ncec1->ncec_ptpn = ptpn;
437 *ptpn = ncec1;
438 ncec->ncec_ptpn = NULL;
439 ncec->ncec_next = NULL;
440 ncec->ncec_next = *free_nce_list;
441 *free_nce_list = ncec;
442 }
443 mutex_exit(&ncec->ncec_lock);
444 }
445 }
446
447 /*
448 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
449 * will return this NCE. Also no new timeouts will
450 * be started (See nce_restart_timer).
451 * 2. Cancel any currently running timeouts.
452 * 3. If there is an ndp walker, return. The walker will do the cleanup.
453 * This ensures that walkers see a consistent list of NCEs while walking.
454 * 4. Otherwise remove the NCE from the list of NCEs
455 */
void
ncec_delete(ncec_t *ncec)
{
	ncec_t	**ptpn;
	ncec_t	*ncec1;
	int	ipversion = ncec->ncec_ipversion;
	ndp_g_t	*ndp;
	ip_stack_t	*ipst = ncec->ncec_ipst;

	/* Select the v4 or v6 NDP instance this entry is hashed in. */
	if (ipversion == IPV4_VERSION)
		ndp = ipst->ips_ndp4;
	else
		ndp = ipst->ips_ndp6;

	/* Serialize deletes */
	mutex_enter(&ncec->ncec_lock);
	if (NCE_ISCONDEMNED(ncec)) {
		/* Some other thread is doing the delete */
		mutex_exit(&ncec->ncec_lock);
		return;
	}
	/*
	 * Caller has a refhold. Also 1 ref for being in the list. Thus
	 * refcnt has to be >= 2
	 */
	ASSERT(ncec->ncec_refcnt >= 2);
	/* Step 1: condemn, so no new lookups or timeouts see this entry. */
	ncec->ncec_flags |= NCE_F_CONDEMNED;
	mutex_exit(&ncec->ncec_lock);

	/* Count how many condemned ires for kmem_cache callback */
	atomic_inc_32(&ipst->ips_num_nce_condemned);
	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);

	/* Complete any waiting callbacks */
	ncec_cb_dispatch(ncec);

	/*
	 * Step 2: cancel any running timer. Timeout can't be restarted
	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
	 * Passing invalid timeout id is fine.
	 */
	if (ncec->ncec_timeout_id != 0) {
		(void) untimeout(ncec->ncec_timeout_id);
		ncec->ncec_timeout_id = 0;
	}

	mutex_enter(&ndp->ndp_g_lock);
	if (ncec->ncec_ptpn == NULL) {
		/*
		 * The last ndp walker has already removed this ncec from
		 * the list after we marked the ncec CONDEMNED and before
		 * we grabbed the global lock.
		 */
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}
	if (ndp->ndp_g_walker > 0) {
		/*
		 * Step 3: can't unlink while a walker is active.
		 * The walker will clean up.
		 */
		ndp->ndp_g_walker_cleanup = B_TRUE;
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}

	/*
	 * Step 4: now remove the ncec from the list. nce_restart_timer
	 * won't restart the timer since it is marked CONDEMNED.
	 */
	ptpn = ncec->ncec_ptpn;
	ncec1 = ncec->ncec_next;
	if (ncec1 != NULL)
		ncec1->ncec_ptpn = ptpn;
	*ptpn = ncec1;
	ncec->ncec_ptpn = NULL;
	ncec->ncec_next = NULL;
	mutex_exit(&ndp->ndp_g_lock);

	/* Removed from ncec_ptpn/ncec_next list */
	ncec_refrele_notr(ncec);
}
537
/*
 * Final teardown of an ncec whose last reference has been dropped.
 * Called with ncec_lock held; frees queued packets, the lladdr buffer,
 * drops the ill's ncec count, and returns the ncec to its kmem cache.
 */
void
ncec_inactive(ncec_t *ncec)
{
	mblk_t	**mpp;
	ill_t	*ill = ncec->ncec_ill;
	ip_stack_t	*ipst = ncec->ncec_ipst;

	ASSERT(ncec->ncec_refcnt == 0);
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* Count how many condemned nces for kmem_cache callback */
	if (NCE_ISCONDEMNED(ncec))
		atomic_add_32(&ipst->ips_num_nce_condemned, -1);

	/* Free all allocated messages */
	mpp = &ncec->ncec_qd_mp;
	while (*mpp != NULL) {
		mblk_t  *mp;

		mp = *mpp;
		*mpp = mp->b_next;

		inet_freemsg(mp);
	}
	/*
	 * must have been cleaned up in ncec_delete
	 */
	ASSERT(list_is_empty(&ncec->ncec_cb));
	list_destroy(&ncec->ncec_cb);
	/*
	 * free the ncec_lladdr if one was allocated in nce_add_common()
	 */
	if (ncec->ncec_lladdr_length > 0)
		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);

#ifdef DEBUG
	ncec_trace_cleanup(ncec);
#endif

	/* Detach from the ill and drop its per-ill ncec count. */
	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "ncec", (void *), ncec);
	ill->ill_ncec_cnt--;
	ncec->ncec_ill = NULL;
	/*
	 * If the number of ncec's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&ncec->ncec_lock);
	kmem_cache_free(ncec_cache, ncec);
}
597
598 /*
599 * ncec_walk routine. Delete the ncec if it is associated with the ill
600 * that is going away. Always called as a writer.
601 */
602 void
603 ncec_delete_per_ill(ncec_t *ncec, void *arg)
604 {
605 if ((ncec != NULL) && ncec->ncec_ill == arg) {
606 ncec_delete(ncec);
607 }
608 }
609
610 /*
611 * Neighbor Cache cleanup logic for a list of ncec_t entries.
612 */
613 static void
614 nce_cleanup_list(ncec_t *ncec)
615 {
616 ncec_t *ncec_next;
617
618 ASSERT(ncec != NULL);
619 while (ncec != NULL) {
620 ncec_next = ncec->ncec_next;
621 ncec->ncec_next = NULL;
622
623 /*
624 * It is possible for the last ndp walker (this thread)
625 * to come here after ncec_delete has marked the ncec CONDEMNED
626 * and before it has removed the ncec from the fastpath list
627 * or called untimeout. So we need to do it here. It is safe
628 * for both ncec_delete and this thread to do it twice or
629 * even simultaneously since each of the threads has a
630 * reference on the ncec.
631 */
632 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
633 /*
634 * Cancel any running timer. Timeout can't be restarted
635 * since CONDEMNED is set. The ncec_lock can't be
636 * held across untimeout though passing invalid timeout
637 * id is fine.
638 */
639 if (ncec->ncec_timeout_id != 0) {
640 (void) untimeout(ncec->ncec_timeout_id);
641 ncec->ncec_timeout_id = 0;
642 }
643 /* Removed from ncec_ptpn/ncec_next list */
644 ncec_refrele_notr(ncec);
645 ncec = ncec_next;
646 }
647 }
648
649 /*
650 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted.
651 */
boolean_t
nce_restart_dad(ncec_t *ncec)
{
	boolean_t started;
	ill_t *ill, *hwaddr_ill;

	if (ncec == NULL)
		return (B_FALSE);
	ill = ncec->ncec_ill;
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_state == ND_PROBE) {
		/* DAD is already in progress; nothing to do. */
		mutex_exit(&ncec->ncec_lock);
		started = B_TRUE;
	} else if (ncec->ncec_state == ND_REACHABLE) {
		ASSERT(ncec->ncec_lladdr != NULL);
		/* Move back to PROBE and reset the probe budget. */
		ncec->ncec_state = ND_PROBE;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		/*
		 * Slight cheat here: we don't use the initial probe delay
		 * for IPv4 in this obscure case.
		 */
		mutex_exit(&ncec->ncec_lock);
		/* For IPMP, probe out of the ill owning the hw address. */
		if (IS_IPMP(ill)) {
			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
		} else {
			hwaddr_ill = ill;
		}
		nce_dad(ncec, hwaddr_ill, B_TRUE);
		started = B_TRUE;
	} else {
		/* Any other state: DAD cannot be (re)started. */
		mutex_exit(&ncec->ncec_lock);
		started = B_FALSE;
	}
	return (started);
}
688
689 /*
690 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed.
691 * If one is found, the refcnt on the ncec will be incremented.
692 */
693 ncec_t *
694 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
695 {
696 ncec_t *ncec;
697 ip_stack_t *ipst = ill->ill_ipst;
698
699 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
700 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
701
702 /* Get head of v6 hash table */
703 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
704 ncec = ncec_lookup_illgrp(ill, addr, ncec);
705 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
706 rw_exit(&ipst->ips_ill_g_lock);
707 return (ncec);
708 }
709 /*
710 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed.
711 * If one is found, the refcnt on the ncec will be incremented.
712 */
713 ncec_t *
714 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
715 {
716 ncec_t *ncec = NULL;
717 in6_addr_t addr6;
718 ip_stack_t *ipst = ill->ill_ipst;
719
720 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
721 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
722
723 /* Get head of v4 hash table */
724 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
725 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
726 ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
727 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
728 rw_exit(&ipst->ips_ill_g_lock);
729 return (ncec);
730 }
731
732 /*
733 * Cache entry lookup. Try to find an ncec matching the parameters passed.
734 * If an ncec is found, increment the hold count on that ncec.
735 * The caller passes in the start of the appropriate hash table, and must
736 * be holding the appropriate global lock (ndp_g_lock). In addition, since
737 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
738 * must be held as reader.
739 *
740 * This function always matches across the ipmp group.
741 */
742 ncec_t *
743 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
744 {
745 ndp_g_t *ndp;
746 ip_stack_t *ipst = ill->ill_ipst;
747
748 if (ill->ill_isv6)
749 ndp = ipst->ips_ndp6;
750 else
751 ndp = ipst->ips_ndp4;
752
753 ASSERT(ill != NULL);
754 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
755 if (IN6_IS_ADDR_UNSPECIFIED(addr))
756 return (NULL);
757 for (; ncec != NULL; ncec = ncec->ncec_next) {
758 if (ncec->ncec_ill == ill ||
759 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
760 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
761 mutex_enter(&ncec->ncec_lock);
762 if (!NCE_ISCONDEMNED(ncec)) {
763 ncec_refhold_locked(ncec);
764 mutex_exit(&ncec->ncec_lock);
765 break;
766 }
767 mutex_exit(&ncec->ncec_lock);
768 }
769 }
770 }
771 return (ncec);
772 }
773
774 /*
775 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
776 * entries for ill only, i.e., when ill is part of an ipmp group,
777 * nce_lookup_v4 will never try to match across the group.
778 */
779 nce_t *
780 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
781 {
782 nce_t *nce;
783 in6_addr_t addr6;
784 ip_stack_t *ipst = ill->ill_ipst;
785
786 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
787 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
788 nce = nce_lookup_addr(ill, &addr6);
789 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
790 return (nce);
791 }
792
793 /*
794 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
795 * entries for ill only, i.e., when ill is part of an ipmp group,
796 * nce_lookup_v6 will never try to match across the group.
797 */
798 nce_t *
799 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
800 {
801 nce_t *nce;
802 ip_stack_t *ipst = ill->ill_ipst;
803
804 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
805 nce = nce_lookup_addr(ill, addr6);
806 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
807 return (nce);
808 }
809
810 static nce_t *
811 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
812 {
813 nce_t *nce;
814
815 ASSERT(ill != NULL);
816 #ifdef DEBUG
817 if (ill->ill_isv6)
818 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
819 else
820 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
821 #endif
822 mutex_enter(&ill->ill_lock);
823 nce = nce_lookup(ill, addr);
824 mutex_exit(&ill->ill_lock);
825 return (nce);
826 }
827
828
829 /*
830 * Router turned to host. We need to make sure that cached copies of the ncec
831 * are not used for forwarding packets if they were derived from the default
832 * route, and that the default route itself is removed, as required by
833 * section 7.2.5 of RFC 2461.
834 *
835 * Note that the ncec itself probably has valid link-layer information for the
836 * nexthop, so that there is no reason to delete the ncec, as long as the
837 * ISROUTER flag is turned off.
838 */
static void
ncec_router_to_host(ncec_t *ncec)
{
	ire_t	*ire;
	ip_stack_t	*ipst = ncec->ncec_ipst;

	/* Clear routerhood so forwarding stops using this entry. */
	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_flags &= ~NCE_F_ISROUTER;
	mutex_exit(&ncec->ncec_lock);

	/*
	 * Remove the default route whose gateway is this (former)
	 * router, notifying routing sockets of the deletion.
	 */
	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
	if (ire != NULL) {
		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
		ire_delete(ire);
		ire_refrele(ire);
	}
}
858
859 /*
860 * Process passed in parameters either from an incoming packet or via
861 * user ioctl.
862 */
void
nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
{
	ill_t	*ill = ncec->ncec_ill;
	uint32_t hw_addr_len = ill->ill_phys_addr_length;
	boolean_t ll_updated = B_FALSE;
	boolean_t ll_changed;
	nce_t	*nce;

	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
	/*
	 * No updates of link layer address or the neighbor state is
	 * allowed, when the cache is in NONUD state. This still
	 * allows for responding to reachability solicitation.
	 */
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_state == ND_INCOMPLETE) {
		/* Cannot resolve an INCOMPLETE entry without an address. */
		if (hw_addr == NULL) {
			mutex_exit(&ncec->ncec_lock);
			return;
		}
		nce_set_ll(ncec, hw_addr);
		/*
		 * Update ncec state and send the queued packets
		 * back to ip this time ire will be added.
		 */
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(ncec, ND_REACHABLE, NULL);
		} else {
			nce_update(ncec, ND_STALE, NULL);
		}
		mutex_exit(&ncec->ncec_lock);
		/* Now resolved: set up fastpath and release waiters. */
		nce = nce_fastpath(ncec, B_TRUE, NULL);
		nce_resolv_ok(ncec);
		if (nce != NULL)
			nce_refrele(nce);
		return;
	}
	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
	if (!is_adv) {
		/* If this is a SOLICITATION request only */
		if (ll_changed)
			nce_update(ncec, ND_STALE, hw_addr);
		mutex_exit(&ncec->ncec_lock);
		ncec_cb_dispatch(ncec);
		return;
	}
	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
		/*
		 * Non-override advert with a different link-layer
		 * address: don't accept the address.
		 * If in any other state than REACHABLE, ignore.
		 */
		if (ncec->ncec_state == ND_REACHABLE) {
			nce_update(ncec, ND_STALE, NULL);
		}
		mutex_exit(&ncec->ncec_lock);
		ncec_cb_dispatch(ncec);
		return;
	} else {
		/* Override advert (or unchanged address): accept it. */
		if (ll_changed) {
			nce_update(ncec, ND_UNCHANGED, hw_addr);
			ll_updated = B_TRUE;
		}
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(ncec, ND_REACHABLE, NULL);
		} else {
			/* Unsolicited with a new address: now STALE. */
			if (ll_updated) {
				nce_update(ncec, ND_STALE, NULL);
			}
		}
		mutex_exit(&ncec->ncec_lock);
		/*
		 * Router flag cleared for an entry we thought was a
		 * router: demote it (see ncec_router_to_host and
		 * RFC 2461 section 7.2.5).
		 */
		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
		    NCE_F_ISROUTER)) {
			ncec_router_to_host(ncec);
		} else {
			ncec_cb_dispatch(ncec);
		}
	}
}
939
940 /*
941 * Pass arg1 to the cbf supplied, along with each ncec in existence.
942 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
943 * walking the hash list.
944 */
void
ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
    void *arg1, boolean_t trace)
{
	ncec_t	*ncec;
	ncec_t	*ncec1;
	ncec_t	**ncep;
	ncec_t	*free_nce_list = NULL;

	mutex_enter(&ndp->ndp_g_lock);
	/* Prevent ncec_delete from unlink and free of NCE */
	ndp->ndp_g_walker++;
	mutex_exit(&ndp->ndp_g_lock);
	/*
	 * With the walker count raised, entries cannot be unlinked, so
	 * the hash table may be traversed without holding ndp_g_lock.
	 */
	for (ncep = ndp->nce_hash_tbl;
	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
			ncec1 = ncec->ncec_next;
			/* ill == NULL means walk every ill's entries. */
			if (ill == NULL || ncec->ncec_ill == ill) {
				if (trace) {
					ncec_refhold(ncec);
					(*cbf)(ncec, arg1);
					ncec_refrele(ncec);
				} else {
					ncec_refhold_notr(ncec);
					(*cbf)(ncec, arg1);
					ncec_refrele_notr(ncec);
				}
			}
		}
	}
	mutex_enter(&ndp->ndp_g_lock);
	ndp->ndp_g_walker--;
	/*
	 * If we are the last walker and a deletion was deferred to us
	 * (ndp_g_walker_cleanup), collect the condemned entries now.
	 */
	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
		/* Time to delete condemned entries */
		for (ncep = ndp->nce_hash_tbl;
		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
			ncec = *ncep;
			if (ncec != NULL) {
				nce_remove(ndp, ncec, &free_nce_list);
			}
		}
		ndp->ndp_g_walker_cleanup = B_FALSE;
	}

	mutex_exit(&ndp->ndp_g_lock);

	/* Finish off the private list outside ndp_g_lock. */
	if (free_nce_list != NULL) {
		nce_cleanup_list(free_nce_list);
	}
}
995
996 /*
997 * Walk everything.
998 * Note that ill can be NULL hence can't derive the ipst from it.
999 */
1000 void
1001 ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
1002 {
1003 ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1004 ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1005 }
1006
1007 /*
1008 * For each interface an entry is added for the unspecified multicast group.
1009 * Here that mapping is used to form the multicast cache entry for a particular
1010 * multicast destination.
1011 */
1012 static int
1013 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1014 uint16_t flags, nce_t **newnce)
1015 {
1016 uchar_t *hw_addr;
1017 int err = 0;
1018 ip_stack_t *ipst = ill->ill_ipst;
1019 nce_t *nce;
1020
1021 ASSERT(ill != NULL);
1022 ASSERT(ill->ill_isv6);
1023 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1024
1025 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1026 nce = nce_lookup_addr(ill, dst);
1027 if (nce != NULL) {
1028 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1029 goto done;
1030 }
1031 if (ill->ill_net_type == IRE_IF_RESOLVER) {
1032 /*
1033 * For IRE_IF_RESOLVER a hardware mapping can be
1034 * generated.
1035 */
1036 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1037 if (hw_addr == NULL) {
1038 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1039 return (ENOMEM);
1040 }
1041 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1042 } else {
1043 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1044 hw_addr = NULL;
1045 }
1046 ASSERT((flags & NCE_F_MCAST) != 0);
1047 ASSERT((flags & NCE_F_NONUD) != 0);
1048 /* nce_state will be computed by nce_add_common() */
1049 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1050 ND_UNCHANGED, &nce);
1051 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1052 if (err == 0)
1053 err = nce_add_v6_postprocess(nce);
1054 if (hw_addr != NULL)
1055 kmem_free(hw_addr, ill->ill_nd_lla_len);
1056 if (err != 0) {
1057 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1058 return (err);
1059 }
1060 done:
1061 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1062 if (newnce != NULL)
1063 *newnce = nce;
1064 else
1065 nce_refrele(nce);
1066 return (0);
1067 }
1068
1069 /*
1070 * Return the link layer address, and any flags of a ncec.
1071 */
1072 int
1073 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1074 {
1075 ncec_t *ncec;
1076 in6_addr_t *addr;
1077 sin6_t *sin6;
1078
1079 ASSERT(ill != NULL && ill->ill_isv6);
1080 sin6 = (sin6_t *)&lnr->lnr_addr;
1081 addr = &sin6->sin6_addr;
1082
1083 /*
1084 * NOTE: if the ill is an IPMP interface, then match against the whole
1085 * illgrp. This e.g. allows in.ndpd to retrieve the link layer
1086 * addresses for the data addresses on an IPMP interface even though
1087 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1088 */
1089 ncec = ncec_lookup_illgrp_v6(ill, addr);
1090 if (ncec == NULL)
1091 return (ESRCH);
1092 /* If no link layer address is available yet, return ESRCH */
1093 if (!NCE_ISREACHABLE(ncec)) {
1094 ncec_refrele(ncec);
1095 return (ESRCH);
1096 }
1097 lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1098 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1099 lnr->lnr_hdw_len);
1100 if (ncec->ncec_flags & NCE_F_ISROUTER)
1101 lnr->lnr_flags = NDF_ISROUTER_ON;
1102 if (ncec->ncec_flags & NCE_F_ANYCAST)
1103 lnr->lnr_flags |= NDF_ANYCAST_ON;
1104 if (ncec->ncec_flags & NCE_F_STATIC)
1105 lnr->lnr_flags |= NDF_STATIC;
1106 ncec_refrele(ncec);
1107 return (0);
1108 }
1109
1110 /*
1111 * Finish setting up the Enable/Disable multicast for the driver.
1112 */
1113 mblk_t *
1114 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1115 uint32_t hw_addr_offset, mblk_t *mp)
1116 {
1117 uchar_t *hw_addr;
1118 ipaddr_t v4group;
1119 uchar_t *addr;
1120
1121 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1122 if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1123 IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1124
1125 ASSERT(CLASSD(v4group));
1126 ASSERT(!(ill->ill_isv6));
1127
1128 addr = (uchar_t *)&v4group;
1129 } else {
1130 ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1131 ASSERT(ill->ill_isv6);
1132
1133 addr = (uchar_t *)v6group;
1134 }
1135 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1136 if (hw_addr == NULL) {
1137 ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1138 freemsg(mp);
1139 return (NULL);
1140 }
1141
1142 ip_mcast_mapping(ill, addr, hw_addr);
1143 return (mp);
1144 }
1145
/*
 * Drive address resolution for ncec: pick a source ill/address, then
 * transmit a Neighbor Solicitation (IPv6) or ARP request (IPv4).  If the
 * retransmit budget is exhausted (ms == 0) the entry is declared failed
 * and deleted; otherwise the retransmit timer is rearmed.
 */
void
ip_ndp_resolve(ncec_t *ncec)
{
	in_addr_t	sender4 = INADDR_ANY;
	in6_addr_t	sender6 = ipv6_all_zeros;
	ill_t		*src_ill;
	uint32_t	ms;

	src_ill = nce_resolve_src(ncec, &sender6);
	if (src_ill == NULL) {
		/* Make sure we try again later */
		ms = ncec->ncec_ill->ill_reachable_retrans_time;
		nce_restart_timer(ncec, (clock_t)ms);
		return;
	}
	/* For IPv4, the chosen source comes back as a v4-mapped v6 address. */
	if (ncec->ncec_ipversion == IPV4_VERSION)
		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_ipversion == IPV6_VERSION)
		ms = ndp_solicit(ncec, sender6, src_ill);
	else
		ms = arp_request(ncec, sender4, src_ill);
	mutex_exit(&ncec->ncec_lock);
	if (ms == 0) {
		/* Retransmit count exhausted: resolution has failed. */
		if (ncec->ncec_state != ND_REACHABLE) {
			if (ncec->ncec_ipversion == IPV6_VERSION)
				ndp_resolv_failed(ncec);
			else
				arp_resolv_failed(ncec);
			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
			nce_make_unreachable(ncec);
			ncec_delete(ncec);
		}
	} else {
		/* Schedule the next retransmission. */
		nce_restart_timer(ncec, (clock_t)ms);
	}
	/* NOTE(review): label below is currently unreferenced by any goto. */
done:
	ill_refrele(src_ill);
}
1185
1186 /*
1187 * Send an IPv6 neighbor solicitation.
1188 * Returns number of milliseconds after which we should either rexmit or abort.
1189 * Return of zero means we should abort.
1190 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1191 * The optional source address is used as a hint to ndp_solicit for
1192 * which source to use in the packet.
1193 *
1194 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1195 * the packet.
1196 */
1197 uint32_t
1198 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1199 {
1200 in6_addr_t dst;
1201 boolean_t dropped = B_FALSE;
1202
1203 ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1204 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1205
1206 if (ncec->ncec_rcnt == 0)
1207 return (0);
1208
1209 dst = ncec->ncec_addr;
1210 ncec->ncec_rcnt--;
1211 mutex_exit(&ncec->ncec_lock);
1212 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1213 ill->ill_phys_addr_length, &src, &dst, 0);
1214 mutex_enter(&ncec->ncec_lock);
1215 if (dropped)
1216 ncec->ncec_rcnt++;
1217 return (ncec->ncec_ill->ill_reachable_retrans_time);
1218 }
1219
1220 /*
1221 * Attempt to recover an address on an interface that's been marked as a
1222 * duplicate. Because NCEs are destroyed when the interface goes down, there's
1223 * no easy way to just probe the address and have the right thing happen if
1224 * it's no longer in use. Instead, we just bring it up normally and allow the
1225 * regular interface start-up logic to probe for a remaining duplicate and take
1226 * us back down if necessary.
1227 * Neither DHCP nor temporary addresses arrive here; they're excluded by
1228 * ip_ndp_excl.
1229 */
1230 /* ARGSUSED */
1231 void
1232 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1233 {
1234 ill_t *ill = rq->q_ptr;
1235 ipif_t *ipif;
1236 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1237 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1238 boolean_t addr_equal;
1239
1240 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1241 /*
1242 * We do not support recovery of proxy ARP'd interfaces,
1243 * because the system lacks a complete proxy ARP mechanism.
1244 */
1245 if (ill->ill_isv6) {
1246 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1247 addr6);
1248 } else {
1249 addr_equal = (ipif->ipif_lcl_addr == *addr4);
1250 }
1251
1252 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1253 continue;
1254
1255 /*
1256 * If we have already recovered or if the interface is going
1257 * away, then ignore.
1258 */
1259 mutex_enter(&ill->ill_lock);
1260 if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1261 (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1262 mutex_exit(&ill->ill_lock);
1263 continue;
1264 }
1265
1266 ipif->ipif_flags &= ~IPIF_DUPLICATE;
1267 ill->ill_ipif_dup_count--;
1268 mutex_exit(&ill->ill_lock);
1269 ipif->ipif_was_dup = B_TRUE;
1270
1271 if (ill->ill_isv6) {
1272 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1273 (void) ipif_up_done_v6(ipif);
1274 } else {
1275 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1276 EINPROGRESS);
1277 (void) ipif_up_done(ipif);
1278 }
1279 }
1280 freeb(mp);
1281 }
1282
1283 /*
1284 * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1285 * As long as someone else holds the address, the interface will stay down.
1286 * When that conflict goes away, the interface is brought back up. This is
1287 * done so that accidental shutdowns of addresses aren't made permanent. Your
1288 * server will recover from a failure.
1289 *
1290 * For DHCP and temporary addresses, recovery is not done in the kernel.
1291 * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1292 *
1293 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1294 */
1295 void
1296 ipif_dup_recovery(void *arg)
1297 {
1298 ipif_t *ipif = arg;
1299
1300 ipif->ipif_recovery_id = 0;
1301 if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1302 return;
1303
1304 /*
1305 * No lock, because this is just an optimization.
1306 */
1307 if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1308 return;
1309
1310 /* If the link is down, we'll retry this later */
1311 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1312 return;
1313
1314 ipif_do_recovery(ipif);
1315 }
1316
1317 /*
1318 * Perform interface recovery by forcing the duplicate interfaces up and
1319 * allowing the system to determine which ones should stay up.
1320 *
1321 * Called both by recovery timer expiry and link-up notification.
1322 */
1323 void
1324 ipif_do_recovery(ipif_t *ipif)
1325 {
1326 ill_t *ill = ipif->ipif_ill;
1327 mblk_t *mp;
1328 ip_stack_t *ipst = ill->ill_ipst;
1329 size_t mp_size;
1330
1331 if (ipif->ipif_isv6)
1332 mp_size = sizeof (ipif->ipif_v6lcl_addr);
1333 else
1334 mp_size = sizeof (ipif->ipif_lcl_addr);
1335 mp = allocb(mp_size, BPRI_MED);
1336 if (mp == NULL) {
1337 mutex_enter(&ill->ill_lock);
1338 if (ipst->ips_ip_dup_recovery > 0 &&
1339 ipif->ipif_recovery_id == 0 &&
1340 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1341 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1342 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1343 }
1344 mutex_exit(&ill->ill_lock);
1345 } else {
1346 /*
1347 * A recovery timer may still be running if we got here from
1348 * ill_restart_dad(); cancel that timer.
1349 */
1350 if (ipif->ipif_recovery_id != 0)
1351 (void) untimeout(ipif->ipif_recovery_id);
1352 ipif->ipif_recovery_id = 0;
1353
1354 if (ipif->ipif_isv6) {
1355 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1356 sizeof (ipif->ipif_v6lcl_addr));
1357 } else {
1358 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1359 sizeof (ipif->ipif_lcl_addr));
1360 }
1361 ill_refhold(ill);
1362 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1363 B_FALSE);
1364 }
1365 }
1366
1367 /*
1368 * Find the MAC and IP addresses in an NA/NS message.
1369 */
1370 static void
1371 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1372 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1373 {
1374 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1375 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1376 uchar_t *addr;
1377 int alen;
1378
1379 /* icmp_inbound_v6 ensures this */
1380 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1381
1382 addr = ira->ira_l2src;
1383 alen = ill->ill_phys_addr_length;
1384 if (alen > 0) {
1385 *haddr = addr;
1386 *haddrlenp = alen;
1387 } else {
1388 *haddr = NULL;
1389 *haddrlenp = 0;
1390 }
1391
1392 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1393 *targp = ns->nd_ns_target;
1394 }
1395
1396 /*
1397 * This is for exclusive changes due to NDP duplicate address detection
1398 * failure.
1399 */
1400 /* ARGSUSED */
1401 static void
1402 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1403 {
1404 ill_t *ill = rq->q_ptr;
1405 ipif_t *ipif;
1406 uchar_t *haddr;
1407 uint_t haddrlen;
1408 ip_stack_t *ipst = ill->ill_ipst;
1409 in6_addr_t targ;
1410 ip_recv_attr_t iras;
1411 mblk_t *attrmp;
1412
1413 attrmp = mp;
1414 mp = mp->b_cont;
1415 attrmp->b_cont = NULL;
1416 if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1417 /* The ill or ip_stack_t disappeared on us */
1418 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1419 ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1420 freemsg(mp);
1421 ira_cleanup(&iras, B_TRUE);
1422 return;
1423 }
1424
1425 ASSERT(ill == iras.ira_rill);
1426
1427 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1428 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1429 /*
1430 * Ignore conflicts generated by misbehaving switches that
1431 * just reflect our own messages back to us. For IPMP, we may
1432 * see reflections across any ill in the illgrp.
1433 *
1434 * RFC2462 and revisions tried to detect both the case
1435 * when a statically configured IPv6 address is a duplicate,
1436 * and the case when the L2 address itself is a duplicate. The
1437 * later is important because, with stateles address autoconf,
1438 * if the L2 address is a duplicate, the resulting IPv6
1439 * address(es) would also be duplicates. We rely on DAD of the
1440 * IPv6 address itself to detect the latter case.
1441 */
1442 /* For an under ill_grp can change under lock */
1443 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1444 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1445 IS_UNDER_IPMP(ill) &&
1446 ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1447 haddrlen) != NULL) {
1448 rw_exit(&ipst->ips_ill_g_lock);
1449 goto ignore_conflict;
1450 }
1451 rw_exit(&ipst->ips_ill_g_lock);
1452 }
1453
1454 /*
1455 * Look up the appropriate ipif.
1456 */
1457 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1458 if (ipif == NULL)
1459 goto ignore_conflict;
1460
1461 /* Reload the ill to match the ipif */
1462 ill = ipif->ipif_ill;
1463
1464 /* If it's already duplicate or ineligible, then don't do anything. */
1465 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1466 ipif_refrele(ipif);
1467 goto ignore_conflict;
1468 }
1469
1470 /*
1471 * If this is a failure during duplicate recovery, then don't
1472 * complain. It may take a long time to recover.
1473 */
1474 if (!ipif->ipif_was_dup) {
1475 char ibuf[LIFNAMSIZ];
1476 char hbuf[MAC_STR_LEN];
1477 char sbuf[INET6_ADDRSTRLEN];
1478
1479 ipif_get_name(ipif, ibuf, sizeof (ibuf));
1480 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1481 " disabled", ibuf,
1482 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1483 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1484 }
1485 mutex_enter(&ill->ill_lock);
1486 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1487 ipif->ipif_flags |= IPIF_DUPLICATE;
1488 ill->ill_ipif_dup_count++;
1489 mutex_exit(&ill->ill_lock);
1490 (void) ipif_down(ipif, NULL, NULL);
1491 (void) ipif_down_tail(ipif);
1492 mutex_enter(&ill->ill_lock);
1493 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1494 ill->ill_net_type == IRE_IF_RESOLVER &&
1495 !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1496 ipst->ips_ip_dup_recovery > 0) {
1497 ASSERT(ipif->ipif_recovery_id == 0);
1498 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1499 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1500 }
1501 mutex_exit(&ill->ill_lock);
1502 ipif_refrele(ipif);
1503
1504 ignore_conflict:
1505 freemsg(mp);
1506 ira_cleanup(&iras, B_TRUE);
1507 }
1508
1509 /*
1510 * Handle failure by tearing down the ipifs with the specified address. Note
1511 * that tearing down the ipif also means deleting the ncec through ipif_down, so
1512 * it's not possible to do recovery by just restarting the ncec timer. Instead,
1513 * we start a timer on the ipif.
1514 * Caller has to free mp;
1515 */
1516 static void
1517 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1518 {
1519 const uchar_t *haddr;
1520 ill_t *ill = ira->ira_rill;
1521
1522 /*
1523 * Ignore conflicts generated by misbehaving switches that just
1524 * reflect our own messages back to us.
1525 */
1526
1527 /* icmp_inbound_v6 ensures this */
1528 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1529 haddr = ira->ira_l2src;
1530 if (haddr != NULL &&
1531 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1532 return;
1533 }
1534
1535 if ((mp = copymsg(mp)) != NULL) {
1536 mblk_t *attrmp;
1537
1538 attrmp = ip_recv_attr_to_mblk(ira);
1539 if (attrmp == NULL) {
1540 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1541 ip_drop_input("ipIfStatsInDiscards", mp, ill);
1542 freemsg(mp);
1543 } else {
1544 ASSERT(attrmp->b_cont == NULL);
1545 attrmp->b_cont = mp;
1546 mp = attrmp;
1547 ill_refhold(ill);
1548 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1549 B_FALSE);
1550 }
1551 }
1552 }
1553
1554 /*
1555 * Handle a discovered conflict: some other system is advertising that it owns
1556 * one of our IP addresses. We need to defend ourselves, or just shut down the
1557 * interface.
1558 *
1559 * Handles both IPv4 and IPv6
1560 */
1561 boolean_t
1562 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1563 {
1564 ipif_t *ipif;
1565 clock_t now;
1566 uint_t maxdefense;
1567 uint_t defs;
1568 ill_t *ill = ira->ira_ill;
1569 ip_stack_t *ipst = ill->ill_ipst;
1570 uint32_t elapsed;
1571 boolean_t isv6 = ill->ill_isv6;
1572 ipaddr_t ncec_addr;
1573
1574 if (isv6) {
1575 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1576 ipst);
1577 } else {
1578 if (arp_no_defense) {
1579 /*
1580 * Yes, there is a conflict, but no, we do not
1581 * defend ourself.
1582 */
1583 return (B_TRUE);
1584 }
1585 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1586 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1587 ipst);
1588 }
1589 if (ipif == NULL)
1590 return (B_FALSE);
1591
1592 /*
1593 * First, figure out if this address is disposable.
1594 */
1595 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1596 maxdefense = ipst->ips_ip_max_temp_defend;
1597 else
1598 maxdefense = ipst->ips_ip_max_defend;
1599
1600 /*
1601 * Now figure out how many times we've defended ourselves. Ignore
1602 * defenses that happened long in the past.
1603 */
1604 now = ddi_get_lbolt();
1605 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1606 mutex_enter(&ncec->ncec_lock);
1607 if ((defs = ncec->ncec_defense_count) > 0 &&
1608 elapsed > ipst->ips_ip_defend_interval) {
1609 /*
1610 * ip_defend_interval has elapsed.
1611 * reset the defense count.
1612 */
1613 ncec->ncec_defense_count = defs = 0;
1614 }
1615 ncec->ncec_defense_count++;
1616 ncec->ncec_last_time_defended = now;
1617 mutex_exit(&ncec->ncec_lock);
1618 ipif_refrele(ipif);
1619
1620 /*
1621 * If we've defended ourselves too many times already, then give up and
1622 * tear down the interface(s) using this address.
1623 * Otherwise, caller has to defend by sending out an announce.
1624 */
1625 if (defs >= maxdefense) {
1626 if (isv6)
1627 ndp_failure(mp, ira);
1628 else
1629 arp_failure(mp, ira);
1630 } else {
1631 return (B_TRUE); /* caller must defend this address */
1632 }
1633 return (B_FALSE);
1634 }
1635
1636 /*
1637 * Handle reception of Neighbor Solicitation messages.
1638 */
1639 static void
1640 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1641 {
1642 ill_t *ill = ira->ira_ill, *under_ill;
1643 nd_neighbor_solicit_t *ns;
1644 uint32_t hlen = ill->ill_phys_addr_length;
1645 uchar_t *haddr = NULL;
1646 icmp6_t *icmp_nd;
1647 ip6_t *ip6h;
1648 ncec_t *our_ncec = NULL;
1649 in6_addr_t target;
1650 in6_addr_t src;
1651 int len;
1652 int flag = 0;
1653 nd_opt_hdr_t *opt = NULL;
1654 boolean_t bad_solicit = B_FALSE;
1655 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1656 boolean_t need_ill_refrele = B_FALSE;
1657
1658 ip6h = (ip6_t *)mp->b_rptr;
1659 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1660 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1661 src = ip6h->ip6_src;
1662 ns = (nd_neighbor_solicit_t *)icmp_nd;
1663 target = ns->nd_ns_target;
1664 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1665 IN6_IS_ADDR_LOOPBACK(&target)) {
1666 if (ip_debug > 2) {
1667 /* ip1dbg */
1668 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1669 AF_INET6, &target);
1670 }
1671 bad_solicit = B_TRUE;
1672 goto done;
1673 }
1674 if (len > sizeof (nd_neighbor_solicit_t)) {
1675 /* Options present */
1676 opt = (nd_opt_hdr_t *)&ns[1];
1677 len -= sizeof (nd_neighbor_solicit_t);
1678 if (!ndp_verify_optlen(opt, len)) {
1679 ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1680 bad_solicit = B_TRUE;
1681 goto done;
1682 }
1683 }
1684 if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1685 /* Check to see if this is a valid DAD solicitation */
1686 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1687 if (ip_debug > 2) {
1688 /* ip1dbg */
1689 pr_addr_dbg("ndp_input_solicit: IPv6 "
1690 "Destination is not solicited node "
1691 "multicast %s\n", AF_INET6,
1692 &ip6h->ip6_dst);
1693 }
1694 bad_solicit = B_TRUE;
1695 goto done;
1696 }
1697 }
1698
1699 /*
1700 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1701 * received this packet if it's multicast) is not the ill tied to
1702 * e.g. the IPMP ill's data link-local. So we match across the illgrp
1703 * to ensure we find the associated NCE.
1704 */
1705 our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1706 /*
1707 * If this is a valid Solicitation for an address we are publishing,
1708 * then a PUBLISH entry should exist in the cache
1709 */
1710 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1711 ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1712 "ifname=%s ", ill->ill_name));
1713 if (ip_debug > 2) {
1714 /* ip1dbg */
1715 pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1716 }
1717 if (our_ncec == NULL)
1718 bad_solicit = B_TRUE;
1719 goto done;
1720 }
1721
1722 /* At this point we should have a verified NS per spec */
1723 if (opt != NULL) {
1724 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1725 if (opt != NULL) {
1726 haddr = (uchar_t *)&opt[1];
1727 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1728 hlen == 0) {
1729 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1730 bad_solicit = B_TRUE;
1731 goto done;
1732 }
1733 }
1734 }
1735
1736 /* If sending directly to peer, set the unicast flag */
1737 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1738 flag |= NDP_UNICAST;
1739
1740 /*
1741 * Create/update the entry for the soliciting node on the ipmp_ill.
1742 * or respond to outstanding queries, don't if
1743 * the source is unspecified address.
1744 */
1745 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1746 int err;
1747 nce_t *nnce;
1748
1749 ASSERT(ill->ill_isv6);
1750 /*
1751 * Regular solicitations *must* include the Source Link-Layer
1752 * Address option. Ignore messages that do not.
1753 */
1754 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1755 ip1dbg(("ndp_input_solicit: source link-layer address "
1756 "option missing with a specified source.\n"));
1757 bad_solicit = B_TRUE;
1758 goto done;
1759 }
1760
1761 /*
1762 * This is a regular solicitation. If we're still in the
1763 * process of verifying the address, then don't respond at all
1764 * and don't keep track of the sender.
1765 */
1766 if (our_ncec->ncec_state == ND_PROBE)
1767 goto done;
1768
1769 /*
1770 * If the solicitation doesn't have sender hardware address
1771 * (legal for unicast solicitation), then process without
1772 * installing the return NCE. Either we already know it, or
1773 * we'll be forced to look it up when (and if) we reply to the
1774 * packet.
1775 */
1776 if (haddr == NULL)
1777 goto no_source;
1778
1779 under_ill = ill;
1780 if (IS_UNDER_IPMP(under_ill)) {
1781 ill = ipmp_ill_hold_ipmp_ill(under_ill);
1782 if (ill == NULL)
1783 ill = under_ill;
1784 else
1785 need_ill_refrele = B_TRUE;
1786 }
1787 err = nce_lookup_then_add_v6(ill,
1788 haddr, hlen,
1789 &src, /* Soliciting nodes address */
1790 0,
1791 ND_STALE,
1792 &nnce);
1793
1794 if (need_ill_refrele) {
1795 ill_refrele(ill);
1796 ill = under_ill;
1797 need_ill_refrele = B_FALSE;
1798 }
1799 switch (err) {
1800 case 0:
1801 /* done with this entry */
1802 nce_refrele(nnce);
1803 break;
1804 case EEXIST:
1805 /*
1806 * B_FALSE indicates this is not an an advertisement.
1807 */
1808 nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1809 nce_refrele(nnce);
1810 break;
1811 default:
1812 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1813 err));
1814 goto done;
1815 }
1816 no_source:
1817 flag |= NDP_SOLICITED;
1818 } else {
1819 /*
1820 * No source link layer address option should be present in a
1821 * valid DAD request.
1822 */
1823 if (haddr != NULL) {
1824 ip1dbg(("ndp_input_solicit: source link-layer address "
1825 "option present with an unspecified source.\n"));
1826 bad_solicit = B_TRUE;
1827 goto done;
1828 }
1829 if (our_ncec->ncec_state == ND_PROBE) {
1830 /*
1831 * Internally looped-back probes will have
1832 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1833 * transmissions.
1834 */
1835 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1836 /*
1837 * If someone else is probing our address, then
1838 * we've crossed wires. Declare failure.
1839 */
1840 ndp_failure(mp, ira);
1841 }
1842 goto done;
1843 }
1844 /*
1845 * This is a DAD probe. Multicast the advertisement to the
1846 * all-nodes address.
1847 */
1848 src = ipv6_all_hosts_mcast;
1849 }
1850 flag |= nce_advert_flags(our_ncec);
1851 (void) ndp_xmit(ill,
1852 ND_NEIGHBOR_ADVERT,
1853 our_ncec->ncec_lladdr,
1854 our_ncec->ncec_lladdr_length,
1855 &target, /* Source and target of the advertisement pkt */
1856 &src, /* IP Destination (source of original pkt) */
1857 flag);
1858 done:
1859 if (bad_solicit)
1860 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1861 if (our_ncec != NULL)
1862 ncec_refrele(our_ncec);
1863 }
1864
1865 /*
1866 * Handle reception of Neighbor Solicitation messages
1867 */
1868 void
1869 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1870 {
1871 ill_t *ill = ira->ira_ill;
1872 nd_neighbor_advert_t *na;
1873 uint32_t hlen = ill->ill_phys_addr_length;
1874 uchar_t *haddr = NULL;
1875 icmp6_t *icmp_nd;
1876 ip6_t *ip6h;
1877 ncec_t *dst_ncec = NULL;
1878 in6_addr_t target;
1879 nd_opt_hdr_t *opt = NULL;
1880 int len;
1881 ip_stack_t *ipst = ill->ill_ipst;
1882 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1883
1884 ip6h = (ip6_t *)mp->b_rptr;
1885 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1886 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1887 na = (nd_neighbor_advert_t *)icmp_nd;
1888
1889 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1890 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1891 ip1dbg(("ndp_input_advert: Target is multicast but the "
1892 "solicited flag is not zero\n"));
1893 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1894 return;
1895 }
1896 target = na->nd_na_target;
1897 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1898 IN6_IS_ADDR_LOOPBACK(&target)) {
1899 if (ip_debug > 2) {
1900 /* ip1dbg */
1901 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1902 AF_INET6, &target);
1903 }
1904 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1905 return;
1906 }
1907 if (len > sizeof (nd_neighbor_advert_t)) {
1908 opt = (nd_opt_hdr_t *)&na[1];
1909 if (!ndp_verify_optlen(opt,
1910 len - sizeof (nd_neighbor_advert_t))) {
1911 ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1912 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1913 return;
1914 }
1915 /* At this point we have a verified NA per spec */
1916 len -= sizeof (nd_neighbor_advert_t);
1917 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1918 if (opt != NULL) {
1919 haddr = (uchar_t *)&opt[1];
1920 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1921 hlen == 0) {
1922 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1923 BUMP_MIB(mib,
1924 ipv6IfIcmpInBadNeighborAdvertisements);
1925 return;
1926 }
1927 }
1928 }
1929
1930 /*
1931 * NOTE: we match across the illgrp since we need to do DAD for all of
1932 * our local addresses, and those are spread across all the active
1933 * ills in the group.
1934 */
1935 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1936 return;
1937
1938 if (NCE_PUBLISH(dst_ncec)) {
1939 /*
1940 * Someone just advertised an addresses that we publish. First,
1941 * check it it was us -- if so, we can safely ignore it.
1942 * We don't get the haddr from the ira_l2src because, in the
1943 * case that the packet originated from us, on an IPMP group,
1944 * the ira_l2src may would be the link-layer address of the
1945 * cast_ill used to send the packet, which may not be the same
1946 * as the dst_ncec->ncec_lladdr of the address.
1947 */
1948 if (haddr != NULL) {
1949 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1950 goto out;
1951
1952 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1953 goto out; /* from us -- no conflict */
1954
1955 /*
1956 * If we're in an IPMP group, check if this is an echo
1957 * from another ill in the group. Use the double-
1958 * checked locking pattern to avoid grabbing
1959 * ill_g_lock in the non-IPMP case.
1960 */
1961 if (IS_UNDER_IPMP(ill)) {
1962 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1963 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1964 ill->ill_grp, haddr, hlen) != NULL) {
1965 rw_exit(&ipst->ips_ill_g_lock);
1966 goto out;
1967 }
1968 rw_exit(&ipst->ips_ill_g_lock);
1969 }
1970 }
1971
1972 /*
1973 * This appears to be a real conflict. If we're trying to
1974 * configure this NCE (ND_PROBE), then shut it down.
1975 * Otherwise, handle the discovered conflict.
1976 */
1977 if (dst_ncec->ncec_state == ND_PROBE) {
1978 ndp_failure(mp, ira);
1979 } else {
1980 if (ip_nce_conflict(mp, ira, dst_ncec)) {
1981 char hbuf[MAC_STR_LEN];
1982 char sbuf[INET6_ADDRSTRLEN];
1983
1984 cmn_err(CE_WARN,
1985 "node '%s' is using %s on %s",
1986 inet_ntop(AF_INET6, &target, sbuf,
1987 sizeof (sbuf)),
1988 haddr == NULL ? "<none>" :
1989 mac_colon_addr(haddr, hlen, hbuf,
1990 sizeof (hbuf)), ill->ill_name);
1991 /*
1992 * RFC 4862, Section 5.4.4 does not mandate
1993 * any specific behavior when an NA matches
1994 * a non-tentative address assigned to the
1995 * receiver. We make the choice of defending
1996 * our address, based on the assumption that
1997 * the sender has not detected the Duplicate.
1998 *
1999 * ncec_last_time_defended has been adjusted
2000 * in ip_nce_conflict()
2001 */
2002 (void) ndp_announce(dst_ncec);
2003 }
2004 }
2005 } else {
2006 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2007 dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2008
2009 /* B_TRUE indicates this an advertisement */
2010 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2011 }
2012 out:
2013 ncec_refrele(dst_ncec);
2014 }
2015
2016 /*
2017 * Process NDP neighbor solicitation/advertisement messages.
2018 * The checksum has already checked o.k before reaching here.
2019 * Information about the datalink header is contained in ira_l2src, but
2020 * that should be ignored for loopback packets.
2021 */
2022 void
2023 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2024 {
2025 ill_t *ill = ira->ira_rill;
2026 icmp6_t *icmp_nd;
2027 ip6_t *ip6h;
2028 int len;
2029 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
2030 ill_t *orig_ill = NULL;
2031
2032 /*
2033 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2034 * and make it be the IPMP upper so avoid being confused by a packet
2035 * addressed to a unicast address on a different ill.
2036 */
2037 if (IS_UNDER_IPMP(ill)) {
2038 orig_ill = ill;
2039 ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2040 if (ill == NULL) {
2041 ill = orig_ill;
2042 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2043 ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2044 mp, ill);
2045 freemsg(mp);
2046 return;
2047 }
2048 ASSERT(ill != orig_ill);
2049 orig_ill = ira->ira_ill;
2050 ira->ira_ill = ill;
2051 mib = ill->ill_icmp6_mib;
2052 }
2053 if (!pullupmsg(mp, -1)) {
2054 ip1dbg(("ndp_input: pullupmsg failed\n"));
2055 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2056 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2057 goto done;
2058 }
2059 ip6h = (ip6_t *)mp->b_rptr;
2060 if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2061 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2062 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2063 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2064 goto done;
2065 }
2066 /*
2067 * NDP does not accept any extension headers between the
2068 * IP header and the ICMP header since e.g. a routing
2069 * header could be dangerous.
2070 * This assumes that any AH or ESP headers are removed
2071 * by ip prior to passing the packet to ndp_input.
2072 */
2073 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2074 ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2075 ip6h->ip6_nxt));
2076 ip_drop_input("Wrong next header", mp, ill);
2077 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2078 goto done;
2079 }
2080 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2081 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2082 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2083 if (icmp_nd->icmp6_code != 0) {
2084 ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2085 ip_drop_input("code non-zero", mp, ill);
2086 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2087 goto done;
2088 }
2089 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2090 /*
2091 * Make sure packet length is large enough for either
2092 * a NS or a NA icmp packet.
2093 */
2094 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2095 ip1dbg(("ndp_input: packet too short\n"));
2096 ip_drop_input("packet too short", mp, ill);
2097 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2098 goto done;
2099 }
2100 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2101 ndp_input_solicit(mp, ira);
2102 } else {
2103 ndp_input_advert(mp, ira);
2104 }
2105 done:
2106 freemsg(mp);
2107 if (orig_ill != NULL) {
2108 ill_refrele(ill);
2109 ira->ira_ill = orig_ill;
2110 }
2111 }
2112
2113 /*
2114 * ndp_xmit is called to form and transmit a ND solicitation or
2115 * advertisement ICMP packet.
2116 *
2117 * If the source address is unspecified and this isn't a probe (used for
2118 * duplicate address detection), an appropriate source address and link layer
2119 * address will be chosen here. The link layer address option is included if
2120 * the source is specified (i.e., all non-probe packets), and omitted (per the
2121 * specification) otherwise.
2122 *
2123 * It returns B_FALSE only if it does a successful put() to the
2124 * corresponding ill's ill_wq otherwise returns B_TRUE.
2125 */
static boolean_t
ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *sender, const in6_addr_t *target, int flag)
{
	uint32_t	len;
	icmp6_t		*icmp6;
	mblk_t		*mp;
	ip6_t		*ip6h;
	nd_opt_hdr_t	*opt;
	uint_t		plen;
	zoneid_t	zoneid = GLOBAL_ZONEID;
	ill_t		*hwaddr_ill = ill;
	ip_xmit_attr_t	ixas;
	ip_stack_t	*ipst = ill->ill_ipst;
	boolean_t	need_refrele = B_FALSE;
	boolean_t	probe = B_FALSE;

	if (IS_UNDER_IPMP(ill)) {
		/* A probe uses a test address assigned to the under-ill. */
		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
		/*
		 * We send non-probe packets on the upper IPMP interface.
		 * ip_output_simple() will use cast_ill for sending any
		 * multicast packets. Note that we can't follow the same
		 * logic for probe packets because all interfaces in the ipmp
		 * group may have failed, so that we really want to only try
		 * to send the ND packet on the ill corresponding to the src
		 * address.
		 */
		if (!probe) {
			ill = ipmp_ill_hold_ipmp_ill(ill);
			if (ill != NULL)
				need_refrele = B_TRUE;
			else
				ill = hwaddr_ill;
		}
	}

	/*
	 * If we have a unspecified source(sender) address, select a
	 * proper source address for the solicitation here itself so
	 * that we can initialize the h/w address correctly.
	 *
	 * If the sender is specified then we use this address in order
	 * to lookup the zoneid before calling ip_output_v6(). This is to
	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
	 * by IP (we cannot guarantee that the global zone has an interface
	 * route to the destination).
	 *
	 * Note that the NA never comes here with the unspecified source
	 * address.
	 */

	/*
	 * Probes will have unspec src at this point.
	 */
	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
		/*
		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
		 * ALL_ZONES if it cannot find a matching ipif for the address
		 * we are trying to use. In this case we err on the side of
		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = GLOBAL_ZONEID;
	}

	/*
	 * Option length in 8-octet units, rounded up (RFC 4861 option
	 * format).  The NS and NA fixed headers are the same size, so
	 * sizeof (nd_neighbor_advert_t) below covers both operations.
	 */
	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
	mp = allocb(len, BPRI_LO);
	if (mp == NULL) {
		if (need_refrele)
			ill_refrele(ill);
		/* B_TRUE == packet was dropped */
		return (B_TRUE);
	}

	bzero((char *)mp->b_rptr, len);
	mp->b_wptr = mp->b_rptr + len;

	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;

	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
	ixas.ixa_ipst = ipst;
	ixas.ixa_cred = kcred;
	ixas.ixa_cpid = NOPID;
	ixas.ixa_tsl = NULL;
	ixas.ixa_zoneid = zoneid;

	ip6h = (ip6_t *)mp->b_rptr;
	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
	ip6h->ip6_nxt = IPPROTO_ICMPV6;
	/* ND packets must be sent with hop limit 255 (RFC 4861). */
	ip6h->ip6_hops = IPV6_MAX_HOPS;
	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
	ip6h->ip6_dst = *target;
	icmp6 = (icmp6_t *)&ip6h[1];

	if (hw_addr_len != 0) {
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
		    sizeof (nd_neighbor_advert_t));
	} else {
		opt = NULL;
	}
	if (operation == ND_NEIGHBOR_SOLICIT) {
		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;

		if (opt != NULL && !(flag & NDP_PROBE)) {
			/*
			 * Note that we don't send out SLLA for ND probes
			 * per RFC 4862, even though we do send out the src
			 * haddr for IPv4 DAD probes, even though both IPv4
			 * and IPv6 go out with the unspecified/INADDR_ANY
			 * src IP addr.
			 */
			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
		}
		ip6h->ip6_src = *sender;
		ns->nd_ns_target = *target;
		if (!(flag & NDP_UNICAST)) {
			/* Form multicast address of the target */
			ip6h->ip6_dst = ipv6_solicited_node_mcast;
			ip6h->ip6_dst.s6_addr32[3] |=
			    ns->nd_ns_target.s6_addr32[3];
		}
	} else {
		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;

		/* Advertisements are never sent as DAD probes. */
		ASSERT(!(flag & NDP_PROBE));
		if (opt != NULL)
			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
		ip6h->ip6_src = *sender;
		na->nd_na_target = *sender;
		if (flag & NDP_ISROUTER)
			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
		if (flag & NDP_SOLICITED)
			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
		if (flag & NDP_ORIDE)
			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
	}

	if (!(flag & NDP_PROBE)) {
		if (hw_addr != NULL && opt != NULL) {
			/* Fill in link layer address and option len */
			opt->nd_opt_len = (uint8_t)plen;
			bcopy(hw_addr, &opt[1], hw_addr_len);
		}
	}
	if (opt != NULL && opt->nd_opt_type == 0) {
		/* If there's no link layer address option, then strip it. */
		len -= plen * 8;
		mp->b_wptr = mp->b_rptr + len;
		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
	}

	icmp6->icmp6_type = (uint8_t)operation;
	icmp6->icmp6_code = 0;
	/*
	 * Prepare for checksum by putting icmp length in the icmp
	 * checksum field. The checksum is calculated in ip_output.c.
	 */
	icmp6->icmp6_cksum = ip6h->ip6_plen;

	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);
	if (need_refrele)
		ill_refrele(ill);
	return (B_FALSE);
}
2295
/*
 * Used to set ND_UNREACHABLE before ncec_delete sets it NCE_F_CONDEMNED.
 * The datapath uses this as an indication that there
 * is a problem (as opposed to a NCE that was just
 * reclaimed due to lack of memory).
 * Note that static ARP entries never become unreachable.
 */
void
nce_make_unreachable(ncec_t *ncec)
{
	/* Take ncec_lock so the datapath sees a consistent state change. */
	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_state = ND_UNREACHABLE;
	mutex_exit(&ncec->ncec_lock);
}
2310
2311 /*
2312 * NCE retransmit timer. Common to IPv4 and IPv6.
2313 * This timer goes off when:
2314 * a. It is time to retransmit a resolution for resolver.
2315 * b. It is time to send reachability probes.
2316 */
void
nce_timer(void *arg)
{
	ncec_t		*ncec = arg;
	ill_t		*ill = ncec->ncec_ill, *src_ill;
	char		addrbuf[INET6_ADDRSTRLEN];
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = ncec->ncec_ipst;
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	in_addr_t	sender4 = INADDR_ANY;
	in6_addr_t	sender6 = ipv6_all_zeros;

	/*
	 * The timer has to be cancelled by ncec_delete before doing the final
	 * refrele. So the NCE is guaranteed to exist when the timer runs
	 * until it clears the timeout_id. Before clearing the timeout_id
	 * bump up the refcnt so that we can continue to use the ncec
	 */
	ASSERT(ncec != NULL);
	mutex_enter(&ncec->ncec_lock);
	ncec_refhold_locked(ncec);
	ncec->ncec_timeout_id = 0;
	mutex_exit(&ncec->ncec_lock);

	src_ill = nce_resolve_src(ncec, &sender6);
	/* if we could not find a sender address, return */
	if (src_ill == NULL) {
		if (!isv6) {
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
			    &sender4, addrbuf, sizeof (addrbuf))));
		} else {
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
		}
		/* Try again after a retransmit interval. */
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		ncec_refrele(ncec);
		return;
	}
	if (!isv6)
		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);

	mutex_enter(&ncec->ncec_lock);
	/*
	 * Check the reachability state.
	 */
	switch (ncec->ncec_state) {
	case ND_DELAY:
		/* DELAY expired: start unicast NUD probing (RFC 4861). */
		ASSERT(ncec->ncec_lladdr != NULL);
		ncec->ncec_state = ND_PROBE;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		if (isv6) {
			/* ncec_lock must be dropped before transmitting. */
			mutex_exit(&ncec->ncec_lock);
			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
			    src_ill->ill_phys_addr,
			    src_ill->ill_phys_addr_length,
			    &sender6, &ncec->ncec_addr,
			    NDP_UNICAST);
		} else {
			dropped = (arp_request(ncec, sender4, src_ill) == 0);
			mutex_exit(&ncec->ncec_lock);
		}
		if (!dropped) {
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
		}
		if (ip_debug > 3) {
			/* ip2dbg */
			pr_addr_dbg("nce_timer: state for %s changed "
			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
		}
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		break;
	case ND_PROBE:
		/* must be retransmit timer */
		ASSERT(ncec->ncec_pcnt >= -1);
		if (ncec->ncec_pcnt > 0) {
			/*
			 * As per RFC2461, the ncec gets deleted after
			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
			 * Note that the first unicast solicitation is sent
			 * during the DELAY state.
			 */
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt,
			    inet_ntop((isv6? AF_INET6 : AF_INET),
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			if (NCE_PUBLISH(ncec)) {
				mutex_exit(&ncec->ncec_lock);
				/*
				 * send out a probe; note that src_ill
				 * is ignored by nce_dad() for all
				 * DAD message types other than IPv6
				 * unicast probes
				 */
				nce_dad(ncec, src_ill, B_TRUE);
			} else {
				ASSERT(src_ill != NULL);
				if (isv6) {
					mutex_exit(&ncec->ncec_lock);
					dropped = ndp_xmit(src_ill,
					    ND_NEIGHBOR_SOLICIT,
					    src_ill->ill_phys_addr,
					    src_ill->ill_phys_addr_length,
					    &sender6, &ncec->ncec_addr,
					    NDP_UNICAST);
				} else {
					/*
					 * since the nce is REACHABLE,
					 * the ARP request will be sent out
					 * as a link-layer unicast.
					 */
					dropped = (arp_request(ncec, sender4,
					    src_ill) == 0);
					mutex_exit(&ncec->ncec_lock);
				}
				if (!dropped) {
					mutex_enter(&ncec->ncec_lock);
					ncec->ncec_pcnt--;
					mutex_exit(&ncec->ncec_lock);
				}
				nce_restart_timer(ncec,
				    ill->ill_reachable_retrans_time);
			}
		} else if (ncec->ncec_pcnt < 0) {
			/* No hope, delete the ncec */
			/* Tell datapath it went bad */
			ncec->ncec_state = ND_UNREACHABLE;
			mutex_exit(&ncec->ncec_lock);
			if (ip_debug > 2) {
				/* ip1dbg */
				pr_addr_dbg("nce_timer: Delete NCE for"
				    " dst %s\n", (isv6? AF_INET6: AF_INET),
				    &ncec->ncec_addr);
			}
			/* if static ARP can't delete. */
			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
				ncec_delete(ncec);

		} else if (!NCE_PUBLISH(ncec)) {
			/*
			 * Probe count is 0 for a dynamic entry (one that we
			 * ourselves are not publishing). We should never get
			 * here if NONUD was requested, hence the ASSERT below.
			 */
			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
			/* Wait one interval before killing */
			nce_restart_timer(ncec,
			    ill->ill_reachable_retrans_time);
		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
			ipif_t *ipif;
			ipaddr_t ncec_addr;

			/*
			 * We're done probing, and we can now declare this
			 * address to be usable. Let IP know that it's ok to
			 * use.
			 */
			ncec->ncec_state = ND_REACHABLE;
			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
			mutex_exit(&ncec->ncec_lock);
			if (isv6) {
				ipif = ipif_lookup_addr_exact_v6(
				    &ncec->ncec_addr, ill, ipst);
			} else {
				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
				    ncec_addr);
				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
				    ipst);
			}
			if (ipif != NULL) {
				if (ipif->ipif_was_dup) {
					char ibuf[LIFNAMSIZ];
					char sbuf[INET6_ADDRSTRLEN];

					/*
					 * A previously-duplicate address has
					 * been reclaimed; log the recovery.
					 */
					ipif->ipif_was_dup = B_FALSE;
					(void) inet_ntop(AF_INET6,
					    &ipif->ipif_v6lcl_addr,
					    sbuf, sizeof (sbuf));
					ipif_get_name(ipif, ibuf,
					    sizeof (ibuf));
					cmn_err(CE_NOTE, "recovered address "
					    "%s on %s", sbuf, ibuf);
				}
				if ((ipif->ipif_flags & IPIF_UP) &&
				    !ipif->ipif_addr_ready)
					ipif_up_notify(ipif);
				ipif->ipif_addr_ready = 1;
				ipif_refrele(ipif);
			}
			if (!isv6 && arp_no_defense)
				break;
			/* Begin defending our new address */
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				if (isv6) {
					dropped = ndp_announce(ncec);
				} else {
					dropped = arp_announce(ncec);
				}

				if (dropped)
					ncec->ncec_unsolicit_count++;
				else
					ncec->ncec_last_time_defended =
					    ddi_get_lbolt();
			}
			if (ncec->ncec_unsolicit_count > 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else if (DEFENSE_INTERVAL(isv6) != 0) {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			/*
			 * This is an address we're probing to be our own, but
			 * the ill is down. Wait until it comes back before
			 * doing anything, but switch to reachable state so
			 * that the restart will work.
			 */
			ncec->ncec_state = ND_REACHABLE;
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	case ND_INCOMPLETE: {
		mblk_t	*mp, *nextmp;
		mblk_t	**prevmpp;

		/*
		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
		 * for any IPMP probe packets, and toss them.  IPMP probe
		 * packets will always be at the head of ncec_qd_mp, so that
		 * we can stop at the first queued ND packet that is
		 * not a probe packet.
		 */
		prevmpp = &ncec->ncec_qd_mp;
		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
			nextmp = mp->b_next;

			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
				inet_freemsg(mp);
				ncec->ncec_nprobes--;
				*prevmpp = nextmp;
			} else {
				prevmpp = &mp->b_next;
			}
		}

		/*
		 * Must be resolver's retransmit timer.
		 */
		mutex_exit(&ncec->ncec_lock);
		ip_ndp_resolve(ncec);
		break;
	}
	case ND_REACHABLE:
		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
		    ncec->ncec_unsolicit_count != 0) ||
		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				mutex_exit(&ncec->ncec_lock);
				/*
				 * When we get to zero announcements left,
				 * switch to address defense
				 */
			} else {
				boolean_t rate_limit;

				mutex_exit(&ncec->ncec_lock);
				rate_limit = ill_defend_rate_limit(ill, ncec);
				if (rate_limit) {
					nce_restart_timer(ncec,
					    DEFENSE_INTERVAL(isv6));
					break;
				}
			}
			if (isv6) {
				dropped = ndp_announce(ncec);
			} else {
				dropped = arp_announce(ncec);
			}
			mutex_enter(&ncec->ncec_lock);
			if (dropped) {
				/* Announce failed; retry it later. */
				ncec->ncec_unsolicit_count++;
			} else {
				ncec->ncec_last_time_defended =
				    ddi_get_lbolt();
			}
			mutex_exit(&ncec->ncec_lock);
			if (ncec->ncec_unsolicit_count != 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	default:
		mutex_exit(&ncec->ncec_lock);
		break;
	}
	/*
	 * NOTE(review): no goto in this function targets this label; all
	 * paths fall through the switch to reach it.  Candidate for removal
	 * if compilers warn about unused labels.
	 */
done:
	ncec_refrele(ncec);
	ill_refrele(src_ill);
}
2631
2632 /*
2633 * Set a link layer address from the ll_addr passed in.
2634 * Copy SAP from ill.
2635 */
2636 static void
2637 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2638 {
2639 ill_t *ill = ncec->ncec_ill;
2640
2641 ASSERT(ll_addr != NULL);
2642 if (ill->ill_phys_addr_length > 0) {
2643 /*
2644 * The bcopy() below used to be called for the physical address
2645 * length rather than the link layer address length. For
2646 * ethernet and many other media, the phys_addr and lla are
2647 * identical.
2648 *
2649 * The phys_addr and lla may not be the same for devices that
2650 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2651 * no known instances of these.
2652 *
2653 * For PPP or other interfaces with a zero length
2654 * physical address, don't do anything here.
2655 * The bcopy() with a zero phys_addr length was previously
2656 * a no-op for interfaces with a zero-length physical address.
2657 * Using the lla for them would change the way they operate.
2658 * Doing nothing in such cases preserves expected behavior.
2659 */
2660 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2661 }
2662 }
2663
2664 boolean_t
2665 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2666 uint32_t ll_addr_len)
2667 {
2668 ASSERT(ncec->ncec_lladdr != NULL);
2669 if (ll_addr == NULL)
2670 return (B_FALSE);
2671 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2672 return (B_TRUE);
2673 return (B_FALSE);
2674 }
2675
2676 /*
2677 * Updates the link layer address or the reachability state of
2678 * a cache entry. Reset probe counter if needed.
2679 */
void
nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
{
	ill_t		*ill = ncec->ncec_ill;
	boolean_t	need_stop_timer = B_FALSE;
	boolean_t	need_fastpath_update = B_FALSE;
	nce_t		*nce = NULL;
	timeout_id_t	tid;

	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	/*
	 * If this interface does not do NUD, there is no point
	 * in allowing an update to the cache entry. Although
	 * we will respond to NS.
	 * The only time we accept an update for a resolver when
	 * NUD is turned off is when it has just been created.
	 * Non-Resolvers will always be created as REACHABLE.
	 */
	if (new_state != ND_UNCHANGED) {
		if ((ncec->ncec_flags & NCE_F_NONUD) &&
		    (ncec->ncec_state != ND_INCOMPLETE))
			return;
		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
		need_stop_timer = B_TRUE;
		if (new_state == ND_REACHABLE)
			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
		else {
			/* We force NUD in this case */
			ncec->ncec_last = 0;
		}
		ncec->ncec_state = new_state;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
		    new_state == ND_INCOMPLETE);
	}
	/*
	 * Capture the timeout id under the lock; the actual untimeout() is
	 * deferred until after ncec_lock is dropped below, since untimeout()
	 * may block waiting for a running nce_timer().
	 */
	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
		tid = ncec->ncec_timeout_id;
		ncec->ncec_timeout_id = 0;
	}
	/*
	 * Re-trigger fastpath probe and
	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
	 * whatever packets that happens to be transmitting at the time.
	 */
	if (new_ll_addr != NULL) {
		bcopy(new_ll_addr, ncec->ncec_lladdr,
		    ill->ill_phys_addr_length);
		need_fastpath_update = B_TRUE;
	}
	mutex_exit(&ncec->ncec_lock);
	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
		if (tid != 0)
			(void) untimeout(tid);
	}
	if (need_fastpath_update) {
		/*
		 * Delete any existing dlur_mp and fp_mp information.
		 * For IPMP interfaces, all underlying ill's must be checked
		 * and purged.
		 */
		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
		/*
		 * add the new dlur_mp and fp_mp
		 */
		nce = nce_fastpath(ncec, B_TRUE, NULL);
		if (nce != NULL)
			nce_refrele(nce);
	}
	/* Re-acquire ncec_lock; the caller expects it held on return. */
	mutex_enter(&ncec->ncec_lock);
}
2751
/*
 * Queue mp on ncec_qd_mp, either at the head (IPMP probe packets) or at
 * the tail.  If the queue exceeds ill_max_buf, the oldest (head) packet
 * is dropped to make room.  Caller must hold ncec_lock.
 */
static void
nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
	uint_t	count = 0;
	mblk_t	**mpp, *tmp;

	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* Walk to the tail, dropping head packets if the queue is full. */
	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
		if (++count > ncec->ncec_ill->ill_max_buf) {
			tmp = ncec->ncec_qd_mp->b_next;
			ncec->ncec_qd_mp->b_next = NULL;
			/*
			 * if we never create data addrs on the under_ill
			 * does this matter?
			 */
			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
			    ncec->ncec_ill);
			freemsg(ncec->ncec_qd_mp);
			ncec->ncec_qd_mp = tmp;
		}
	}

	if (head_insert) {
		/* IPMP probes go at the head; track them via ncec_nprobes. */
		ncec->ncec_nprobes++;
		mp->b_next = ncec->ncec_qd_mp;
		ncec->ncec_qd_mp = mp;
	} else {
		/* mpp now points at the tail's b_next (or the empty head). */
		*mpp = mp;
	}
}
2785
2786 /*
2787 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2788 * queued at the head or tail of the queue based on the input argument
2789 * 'head_insert'. The caller should specify this argument as B_TRUE if this
2790 * packet is an IPMP probe packet, in which case the following happens:
2791 *
2792 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal
 * (non-ipmp_probe) load-spreading case where the source address of the ND
2794 * packet is not tied to ncec_ill. If the ill bound to the source address
2795 * cannot receive, the response to the ND packet will not be received.
2796 * However, if ND packets for ncec_ill's probes are queued behind that ND
2797 * packet, those probes will also fail to be sent, and thus in.mpathd will
2798 * erroneously conclude that ncec_ill has also failed.
2799 *
2800 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
2801 * the first attempt. This ensures that ND problems do not manifest as
2802 * probe RTT spikes.
2803 *
2804 * We achieve this by inserting ipmp_probe() packets at the head of the
2805 * nce_queue.
2806 *
2807 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2808 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2809 */
void
nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
	/* See the block comment above for when to pass head_insert. */
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	nce_queue_mp_common(ncec, mp, head_insert);
}
2816
2817 /*
2818 * Called when address resolution failed due to a timeout.
2819 * Send an ICMP unreachable in response to all queued packets.
2820 */
void
ndp_resolv_failed(ncec_t *ncec)
{
	mblk_t	*mp, *nxt_mp;
	char	buf[INET6_ADDRSTRLEN];
	ill_t	*ill = ncec->ncec_ill;
	ip_recv_attr_t	iras;

	bzero(&iras, sizeof (iras));
	iras.ira_flags = 0;
	/*
	 * we are setting the ira_rill to the ipmp_ill (instead of
	 * the actual ill on which the packet was received), but this
	 * is ok because we don't actually need the real ira_rill
	 * to send the icmp unreachable to the sender.
	 */
	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;

	ip1dbg(("ndp_resolv_failed: dst %s\n",
	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
	/* Detach the queued packet chain from the ncec under the lock. */
	mutex_enter(&ncec->ncec_lock);
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);
	/* Send ICMPv6 address-unreachable for every queued packet. */
	while (mp != NULL) {
		nxt_mp = mp->b_next;
		mp->b_next = NULL;

		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
		    mp, ill);
		icmp_unreachable_v6(mp,
		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec);	/* finish off waiting callbacks */
}
2862
2863 /*
2864 * Handle the completion of NDP and ARP resolution.
2865 */
void
nce_resolv_ok(ncec_t *ncec)
{
	mblk_t *mp;
	uint_t pkt_len;
	iaflags_t ixaflags = IXAF_NO_TRACE;
	nce_t *nce;
	ill_t *ill = ncec->ncec_ill;
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ip_stack_t *ipst = ill->ill_ipst;

	if (IS_IPMP(ncec->ncec_ill)) {
		/* IPMP meta-interface entries take a separate path. */
		nce_resolv_ipmp_ok(ncec);
		return;
	}
	/* non IPMP case */

	/* Detach the queued packet chain under the lock, then drain it. */
	mutex_enter(&ncec->ncec_lock);
	ASSERT(ncec->ncec_nprobes == 0);
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		/* Recover the packet length from the queued IP header. */
		if (ill->ill_isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
		}
		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		/*
		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
		 * longer available, but it's ok to drop this flag because TCP
		 * has its own flow-control in effect, so TCP packets
		 * are not likely to get here when flow-control is in effect.
		 */
		mutex_enter(&ill->ill_lock);
		nce = nce_lookup(ill, &ncec->ncec_addr);
		mutex_exit(&ill->ill_lock);

		if (nce == NULL) {
			/* nce disappeared (e.g. condemned); drop the packet */
			if (isv6) {
				BUMP_MIB(&ipst->ips_ip6_mib,
				    ipIfStatsOutDiscards);
			} else {
				BUMP_MIB(&ipst->ips_ip_mib,
				    ipIfStatsOutDiscards);
			}
			ip_drop_output("ipIfStatsOutDiscards - no nce",
			    mp, NULL);
			freemsg(mp);
		} else {
			/*
			 * We don't know the zoneid, but
			 * ip_xmit does not care since IXAF_NO_TRACE
			 * is set. (We traced the packet the first
			 * time through ip_xmit.)
			 */
			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(nce);
		}
		mp = nxt_mp;
	}

	ncec_cb_dispatch(ncec);	/* complete callbacks */
}
2941
2942 /*
2943 * Called by SIOCSNDP* ioctl to add/change an ncec entry
2944 * and the corresponding attributes.
2945 * Disallow states other than ND_REACHABLE or ND_STALE.
2946 */
int
ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
{
	sin6_t		*sin6;
	in6_addr_t	*addr;
	ncec_t		*ncec;
	nce_t		*nce;
	int		err = 0;
	uint16_t	new_flags = 0;
	uint16_t	old_flags = 0;
	int		inflags = lnr->lnr_flags;
	ip_stack_t	*ipst = ill->ill_ipst;
	boolean_t	do_postprocess = B_FALSE;

	ASSERT(ill->ill_isv6);
	/* Only REACHABLE or STALE may be requested for a new entry. */
	if ((lnr->lnr_state_create != ND_REACHABLE) &&
	    (lnr->lnr_state_create != ND_STALE))
		return (EINVAL);

	sin6 = (sin6_t *)&lnr->lnr_addr;
	addr = &sin6->sin6_addr;

	/* ndp_g_lock serializes creation/update of v6 NCEs. */
	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	ASSERT(!IS_UNDER_IPMP(ill));
	nce = nce_lookup_addr(ill, addr);
	if (nce != NULL)
		new_flags = nce->nce_common->ncec_flags;

	/* Both ON and OFF for the same attribute is contradictory. */
	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
	case NDF_ISROUTER_ON:
		new_flags |= NCE_F_ISROUTER;
		break;
	case NDF_ISROUTER_OFF:
		new_flags &= ~NCE_F_ISROUTER;
		break;
	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		if (nce != NULL)
			nce_refrele(nce);
		return (EINVAL);
	}
	if (inflags & NDF_STATIC)
		new_flags |= NCE_F_STATIC;

	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
	case NDF_ANYCAST_ON:
		new_flags |= NCE_F_ANYCAST;
		break;
	case NDF_ANYCAST_OFF:
		new_flags &= ~NCE_F_ANYCAST;
		break;
	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		if (nce != NULL)
			nce_refrele(nce);
		return (EINVAL);
	}

	if (nce == NULL) {
		/* No existing entry; create one in the requested state. */
		err = nce_add_v6(ill,
		    (uchar_t *)lnr->lnr_hdw_addr,
		    ill->ill_phys_addr_length,
		    addr,
		    new_flags,
		    lnr->lnr_state_create,
		    &nce);
		if (err != 0) {
			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
			return (err);
		} else {
			/* postprocessing must happen after dropping the lock */
			do_postprocess = B_TRUE;
		}
	}
	ncec = nce->nce_common;
	old_flags = ncec->ncec_flags;
	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
		/* Demoting a router to a host; handled separately. */
		ncec_router_to_host(ncec);
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		if (do_postprocess)
			err = nce_add_v6_postprocess(nce);
		nce_refrele(nce);
		return (0);
	}
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);

	if (do_postprocess)
		err = nce_add_v6_postprocess(nce);
	/*
	 * err cannot be anything other than 0 because we don't support
	 * proxy arp of static addresses.
	 */
	ASSERT(err == 0);

	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_flags = new_flags;
	mutex_exit(&ncec->ncec_lock);
	/*
	 * Note that we ignore the state at this point, which
	 * should be either STALE or REACHABLE. Instead we let
	 * the link layer address passed in to determine the state
	 * much like incoming packets.
	 */
	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
	nce_refrele(nce);
	return (0);
}
3054
3055 /*
3056 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3057 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3058 * be held to ensure that they are in the same group.
3059 */
static nce_t *
nce_fastpath_create(ill_t *ill, ncec_t *ncec)
{

	nce_t *nce;

	nce = nce_ill_lookup_then_add(ill, ncec);

	/* Loopback and VNI interfaces need no fastpath information. */
	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
		return (nce);

	/*
	 * hold the ncec_lock to synchronize with nce_update() so that,
	 * at the end of this function, the contents of nce_dlur_mp are
	 * consistent with ncec->ncec_lladdr, even though some intermediate
	 * packet may have been sent out with a mangled address, which would
	 * only be a transient condition.
	 */
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_lladdr != NULL) {
		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
	} else {
		/*
		 * NOTE(review): this overwrites nce_dlur_mp without freeing
		 * any prior message; presumably the nce from
		 * nce_ill_lookup_then_add() has none in this path — confirm
		 * against nce_add() to rule out an mblk leak.
		 */
		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
		    ill->ill_sap_length);
	}
	mutex_exit(&ncec->ncec_lock);
	return (nce);
}
3089
3090 /*
3091 * we make nce_fp_mp to have an M_DATA prepend.
3092 * The caller ensures there is hold on ncec for this function.
3093 * Note that since ill_fastpath_probe() copies the mblk there is
3094 * no need to hold the nce or ncec beyond this function.
3095 *
3096 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3097 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3098 * and will be returned back by this function, so that no extra nce_refrele
3099 * is required for the caller. The calls from nce_add_common() use this
3100 * method. All other callers (that pass in NULL ncec_nce) will have to do a
3101 * nce_refrele of the returned nce (when it is non-null).
3102 */
nce_t *
nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
{
	nce_t *nce;
	ill_t *ill = ncec->ncec_ill;

	ASSERT(ill != NULL);

	/*
	 * For IPMP meta-interfaces, refresh the per-underlying-ill nces
	 * instead of probing the meta-interface itself.
	 */
	if (IS_IPMP(ill) && trigger_fp_req) {
		trigger_fp_req = B_FALSE;
		ipmp_ncec_refresh_nce(ncec);
	}

	/*
	 * If the caller already has the nce corresponding to the ill, use
	 * that one. Otherwise we have to lookup/add the nce. Calls from
	 * nce_add_common() fall in the former category, and have just done
	 * the nce lookup/add that can be reused.
	 */
	if (ncec_nce == NULL)
		nce = nce_fastpath_create(ill, ncec);
	else
		nce = ncec_nce;

	/* No fastpath probing for loopback or VNI interfaces. */
	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
		return (nce);

	if (trigger_fp_req)
		nce_fastpath_trigger(nce);
	return (nce);
}
3134
3135 /*
3136 * Trigger fastpath on nce. No locks may be held.
3137 */
3138 static void
3139 nce_fastpath_trigger(nce_t *nce)
3140 {
3141 int res;
3142 ill_t *ill = nce->nce_ill;
3143 ncec_t *ncec = nce->nce_common;
3144
3145 res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3146 /*
3147 * EAGAIN is an indication of a transient error
3148 * i.e. allocation failure etc. leave the ncec in the list it
3149 * will be updated when another probe happens for another ire
3150 * if not it will be taken out of the list when the ire is
3151 * deleted.
3152 */
3153 if (res != 0 && res != EAGAIN && res != ENOTSUP)
3154 nce_fastpath_list_delete(ill, ncec, NULL);
3155 }
3156
3157 /*
3158 * Add ncec to the nce fastpath list on ill.
3159 */
3160 static nce_t *
3161 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3162 {
3163 nce_t *nce = NULL;
3164
3165 ASSERT(MUTEX_HELD(&ill->ill_lock));
3166 /*
3167 * Atomically ensure that the ill is not CONDEMNED and is not going
3168 * down, before adding the NCE.
3169 */
3170 if (ill->ill_state_flags & ILL_CONDEMNED)
3171 return (NULL);
3172 mutex_enter(&ncec->ncec_lock);
3173 /*
3174 * if ncec has not been deleted and
3175 * is not already in the list add it.
3176 */
3177 if (!NCE_ISCONDEMNED(ncec)) {
3178 nce = nce_lookup(ill, &ncec->ncec_addr);
3179 if (nce != NULL)
3180 goto done;
3181 nce = nce_add(ill, ncec);
3182 }
3183 done:
3184 mutex_exit(&ncec->ncec_lock);
3185 return (nce);
3186 }
3187
3188 nce_t *
3189 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3190 {
3191 nce_t *nce;
3192
3193 mutex_enter(&ill->ill_lock);
3194 nce = nce_ill_lookup_then_add_locked(ill, ncec);
3195 mutex_exit(&ill->ill_lock);
3196 return (nce);
3197 }
3198
3199
3200 /*
3201 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3202 * nce is added to the 'dead' list, and the caller must nce_refrele() the
3203 * entry after all locks have been dropped.
3204 */
3205 void
3206 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3207 {
3208 nce_t *nce;
3209
3210 ASSERT(ill != NULL);
3211
3212 /* delete any nces referencing the ncec from underlying ills */
3213 if (IS_IPMP(ill))
3214 ipmp_ncec_delete_nce(ncec);
3215
3216 /* now the ill itself */
3217 mutex_enter(&ill->ill_lock);
3218 for (nce = list_head(&ill->ill_nce); nce != NULL;
3219 nce = list_next(&ill->ill_nce, nce)) {
3220 if (nce->nce_common == ncec) {
3221 nce_refhold(nce);
3222 nce_delete(nce);
3223 break;
3224 }
3225 }
3226 mutex_exit(&ill->ill_lock);
3227 if (nce != NULL) {
3228 if (dead == NULL)
3229 nce_refrele(nce);
3230 else
3231 list_insert_tail(dead, nce);
3232 }
3233 }
3234
3235 /*
3236 * when the fastpath response does not fit in the datab
3237 * associated with the existing nce_fp_mp, we delete and
3238 * add the nce to retrigger fastpath based on the information
3239 * in the ncec_t.
3240 */
static nce_t *
nce_delete_then_add(nce_t *nce)
{
	ill_t		*ill = nce->nce_ill;
	nce_t		*newnce = NULL;

	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
	    (void *)nce, ill->ill_name));
	/* both locks must be held across the delete/add to keep it atomic */
	mutex_enter(&ill->ill_lock);
	mutex_enter(&nce->nce_common->ncec_lock);
	nce_delete(nce);
	/*
	 * Make sure that ncec is not condemned before adding. We hold the
	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
	 * ipmp_ncec_delete_nce()
	 */
	if (!NCE_ISCONDEMNED(nce->nce_common))
		newnce = nce_add(ill, nce->nce_common);
	mutex_exit(&nce->nce_common->ncec_lock);
	mutex_exit(&ill->ill_lock);
	/* drop the caller's reference on the old (now deleted) nce */
	nce_refrele(nce);
	return (newnce); /* could be null if nomem */
}
3264
/*
 * Walk argument for nce_fastpath_match_dlur(): carries the fastpath ack
 * mblk being matched and, on a successful match, the (refheld) nce.
 */
typedef struct nce_fp_match_s {
	nce_t	*nce_fp_match_res;	/* matched nce (refheld), or NULL */
	mblk_t	*nce_fp_match_ack_mp;	/* fastpath ack being matched */
} nce_fp_match_t;
3269
3270 /* ARGSUSED */
3271 static int
3272 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3273 {
3274 nce_fp_match_t *nce_fp_marg = arg;
3275 ncec_t *ncec = nce->nce_common;
3276 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp;
3277 uchar_t *mp_rptr, *ud_mp_rptr;
3278 mblk_t *ud_mp = nce->nce_dlur_mp;
3279 ptrdiff_t cmplen;
3280
3281 /*
3282 * mp is the mp associated with the fastpath ack.
3283 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3284 * under consideration. If the contents match, then the
3285 * fastpath ack is used to update the nce.
3286 */
3287 if (ud_mp == NULL)
3288 return (0);
3289 mp_rptr = mp->b_rptr;
3290 cmplen = mp->b_wptr - mp_rptr;
3291 ASSERT(cmplen >= 0);
3292
3293 ud_mp_rptr = ud_mp->b_rptr;
3294 /*
3295 * The ncec is locked here to prevent any other threads from accessing
3296 * and changing nce_dlur_mp when the address becomes resolved to an
3297 * lla while we're in the middle of looking at and comparing the
3298 * hardware address (lla). It is also locked to prevent multiple
3299 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3300 * time.
3301 */
3302 mutex_enter(&ncec->ncec_lock);
3303 if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3304 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3305 nce_fp_marg->nce_fp_match_res = nce;
3306 mutex_exit(&ncec->ncec_lock);
3307 nce_refhold(nce);
3308 return (1);
3309 }
3310 mutex_exit(&ncec->ncec_lock);
3311 return (0);
3312 }
3313
3314 /*
3315 * Update all NCE's that are not in fastpath mode and
3316 * have an nce_fp_mp that matches mp. mp->b_cont contains
3317 * the fastpath header.
3318 *
 * The matched nce, if any, has its fastpath header (nce_fp_mp) installed
 * or updated in place; it is re-created when the new header does not fit.
3320 */
void
nce_fastpath_update(ill_t *ill, mblk_t *mp)
{
	nce_fp_match_t	nce_fp_marg;
	nce_t		*nce;
	mblk_t		*nce_fp_mp, *fp_mp;

	nce_fp_marg.nce_fp_match_res = NULL;
	nce_fp_marg.nce_fp_match_ack_mp = mp;

	/* find the nce whose DL_UNITDATA_REQ matches this fastpath ack */
	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);

	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
		return;

	/* nce was refheld by nce_fastpath_match_dlur() on the match */
	mutex_enter(&nce->nce_lock);
	nce_fp_mp = nce->nce_fp_mp;

	if (nce_fp_mp != NULL) {
		fp_mp = mp->b_cont;
		/*
		 * When the new fastpath header does not fit in the datab
		 * of the existing nce_fp_mp, re-create the nce (see block
		 * comment above) rather than updating it in place.  The
		 * lock must be dropped around nce_delete_then_add().
		 */
		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
		    nce_fp_mp->b_datap->db_lim) {
			mutex_exit(&nce->nce_lock);
			nce = nce_delete_then_add(nce);
			if (nce == NULL) {
				return;
			}
			mutex_enter(&nce->nce_lock);
			nce_fp_mp = nce->nce_fp_mp;
		}
	}

	/* Matched - install mp as the fastpath mp */
	if (nce_fp_mp == NULL) {
		/* dupb() failure leaves nce_fp_mp NULL, i.e. no fastpath */
		fp_mp = dupb(mp->b_cont);
		nce->nce_fp_mp = fp_mp;
	} else {
		/* new header fits: copy it over the old one in place */
		fp_mp = mp->b_cont;
		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
		    + MBLKL(fp_mp);
	}
	mutex_exit(&nce->nce_lock);
	nce_refrele(nce);
}
3366
3367 /*
3368 * Return a pointer to a given option in the packet.
3369 * Assumes that option part of the packet have already been validated.
3370 */
/*
 * Return a pointer to the first option of type opt_type in the option
 * block [opt, opt + optlen), or NULL when no such option is present.
 * Assumes that option part of the packet have already been validated
 * (see ndp_verify_optlen()).
 */
nd_opt_hdr_t *
ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
{
	int remaining = optlen;

	/* nd_opt_len is in units of 8 octets */
	while (remaining > 0) {
		int step = 8 * opt->nd_opt_len;

		if (opt->nd_opt_type == opt_type)
			return (opt);
		remaining -= step;
		opt = (nd_opt_hdr_t *)((char *)opt + step);
	}
	return (NULL);
}
3382
3383 /*
3384 * Verify all option lengths present are > 0, also check to see
3385 * if the option lengths and packet length are consistent.
3386 */
/*
 * Verify all option lengths present are > 0, also check to see
 * if the option lengths and packet length are consistent.
 */
boolean_t
ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
{
	int remaining;

	ASSERT(opt != NULL);
	for (remaining = optlen; remaining > 0; ) {
		int step;

		/* zero-length options would make the walk loop forever */
		if (opt->nd_opt_len == 0)
			return (B_FALSE);
		step = 8 * opt->nd_opt_len;
		remaining -= step;
		/* option runs past the end of the option block */
		if (remaining < 0)
			return (B_FALSE);
		opt = (nd_opt_hdr_t *)((char *)opt + step);
	}
	return (B_TRUE);
}
3401
3402 /*
3403 * ncec_walk function.
3404 * Free a fraction of the NCE cache entries.
3405 *
3406 * A possible optimization here would be to use ncec_last where possible, and
3407 * delete the least-frequently used entry, which would require more complex
3408 * computation as we walk through the ncec's (e.g., track ncec entries by
3409 * order of ncec_last and/or maintain state)
3410 */
3411 static void
3412 ncec_cache_reclaim(ncec_t *ncec, void *arg)
3413 {
3414 ip_stack_t *ipst = ncec->ncec_ipst;
3415 uint_t fraction = *(uint_t *)arg;
3416 uint_t rand;
3417
3418 if ((ncec->ncec_flags &
3419 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3420 return;
3421 }
3422
3423 rand = (uint_t)ddi_get_lbolt() +
3424 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3425 if ((rand/fraction)*fraction == rand) {
3426 IP_STAT(ipst, ip_nce_reclaim_deleted);
3427 ncec_delete(ncec);
3428 }
3429 }
3430
3431 /*
3432 * kmem_cache callback to free up memory.
3433 *
3434 * For now we just delete a fixed fraction.
3435 */
static void
ip_nce_reclaim_stack(ip_stack_t *ipst)
{
	/* tunable: reclaim roughly 1/fraction of the cache */
	uint_t	fraction = ipst->ips_ip_nce_reclaim_fraction;

	IP_STAT(ipst, ip_nce_reclaim_calls);

	/* walk every ncec in this stack, deleting a random subset */
	ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);

	/*
	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}
3452
3453 /*
3454 * Called by the memory allocator subsystem directly, when the system
3455 * is running low on memory.
3456 */
3457 /* ARGSUSED */
3458 void
3459 ip_nce_reclaim(void *args)
3460 {
3461 netstack_handle_t nh;
3462 netstack_t *ns;
3463 ip_stack_t *ipst;
3464
3465 netstack_next_init(&nh);
3466 while ((ns = netstack_next(&nh)) != NULL) {
3467 /*
3468 * netstack_next() can return a netstack_t with a NULL
3469 * netstack_ip at boot time.
3470 */
3471 if ((ipst = ns->netstack_ip) == NULL) {
3472 netstack_rele(ns);
3473 continue;
3474 }
3475 ip_nce_reclaim_stack(ipst);
3476 netstack_rele(ns);
3477 }
3478 netstack_next_fini(&nh);
3479 }
3480
3481 #ifdef DEBUG
/*
 * Record a refhold trace for ncec (DEBUG kernels only).  If trace
 * bookkeeping fails, disable tracing on this ncec and discard the
 * traces gathered so far.
 */
void
ncec_trace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (ncec->ncec_trace_disable)
		return;

	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
		ncec->ncec_trace_disable = B_TRUE;
		ncec_trace_cleanup(ncec);
	}
}
3495
/*
 * Record a refrele trace for ncec (DEBUG kernels only); a no-op when
 * tracing has been disabled on this ncec.
 */
void
ncec_untrace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (!ncec->ncec_trace_disable)
		th_trace_unref(ncec);
}
3504
/* Discard the reference traces accumulated for ncec. */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
3510 #endif
3511
3512 /*
3513 * Called when address resolution fails due to a timeout.
3514 * Send an ICMP unreachable in response to all queued packets.
3515 */
void
arp_resolv_failed(ncec_t *ncec)
{
	mblk_t		*mp, *nxt_mp;
	char		buf[INET6_ADDRSTRLEN];
	struct in_addr	ipv4addr;
	ill_t		*ill = ncec->ncec_ill;
	ip_stack_t	*ipst = ncec->ncec_ipst;
	ip_recv_attr_t	iras;

	bzero(&iras, sizeof (iras));
	iras.ira_flags = IRAF_IS_IPV4;
	/*
	 * We are setting the ira_rill to the ipmp_ill (instead of
	 * the actual ill on which the packet was received), but this
	 * is ok because we don't actually need the real ira_rill
	 * to send the icmp unreachable to the sender.
	 */
	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;

	/* ARP ncec addresses are stored as v4-mapped v6 addresses */
	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
	ip3dbg(("arp_resolv_failed: dst %s\n",
	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
	/* detach the queued-packet chain under the lock, then drain it */
	mutex_enter(&ncec->ncec_lock);
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);
	while (mp != NULL) {
		nxt_mp = mp->b_next;
		mp->b_next = NULL;

		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
		    mp, ill);
		/* ip_arp_icmp_error tunable controls whether we send ICMP */
		if (ipst->ips_ip_arp_icmp_error) {
			ip3dbg(("arp_resolv_failed: "
			    "Calling icmp_unreachable\n"));
			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
		} else {
			freemsg(mp);
		}
		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
}
3565
3566 /*
3567 * if ill is an under_ill, translate it to the ipmp_ill and add the
3568 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3569 * one on the underlying in_ill) will be created for the
3570 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3571 */
3572 int
3573 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3574 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3575 {
3576 int err;
3577 in6_addr_t addr6;
3578 ip_stack_t *ipst = ill->ill_ipst;
3579 nce_t *nce, *upper_nce = NULL;
3580 ill_t *in_ill = ill, *under = NULL;
3581 boolean_t need_ill_refrele = B_FALSE;
3582
3583 if (flags & NCE_F_MCAST) {
3584 /*
3585 * hw_addr will be figured out in nce_set_multicast_v4;
3586 * caller needs to pass in the cast_ill for ipmp
3587 */
3588 ASSERT(hw_addr == NULL);
3589 ASSERT(!IS_IPMP(ill));
3590 err = nce_set_multicast_v4(ill, addr, flags, newnce);
3591 return (err);
3592 }
3593
3594 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3595 ill = ipmp_ill_hold_ipmp_ill(ill);
3596 if (ill == NULL)
3597 return (ENXIO);
3598 need_ill_refrele = B_TRUE;
3599 }
3600 if ((flags & NCE_F_BCAST) != 0) {
3601 /*
3602 * IPv4 broadcast ncec: compute the hwaddr.
3603 */
3604 if (IS_IPMP(ill)) {
3605 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3606 if (under == NULL) {
3607 if (need_ill_refrele)
3608 ill_refrele(ill);
3609 return (ENETDOWN);
3610 }
3611 hw_addr = under->ill_bcast_mp->b_rptr +
3612 NCE_LL_ADDR_OFFSET(under);
3613 hw_addr_len = under->ill_phys_addr_length;
3614 } else {
3615 hw_addr = ill->ill_bcast_mp->b_rptr +
3616 NCE_LL_ADDR_OFFSET(ill),
3617 hw_addr_len = ill->ill_phys_addr_length;
3618 }
3619 }
3620
3621 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3622 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3623 nce = nce_lookup_addr(ill, &addr6);
3624 if (nce == NULL) {
3625 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3626 state, &nce);
3627 } else {
3628 err = EEXIST;
3629 }
3630 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3631 if (err == 0)
3632 err = nce_add_v4_postprocess(nce);
3633
3634 if (in_ill != ill && nce != NULL) {
3635 nce_t *under_nce = NULL;
3636
3637 /*
3638 * in_ill was the under_ill. Try to create the under_nce.
3639 * Hold the ill_g_lock to prevent changes to group membership
3640 * until we are done.
3641 */
3642 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3643 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3644 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3645 ill_t *, ill);
3646 rw_exit(&ipst->ips_ill_g_lock);
3647 err = ENXIO;
3648 nce_refrele(nce);
3649 nce = NULL;
3650 goto bail;
3651 }
3652 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3653 if (under_nce == NULL) {
3654 rw_exit(&ipst->ips_ill_g_lock);
3655 err = EINVAL;
3656 nce_refrele(nce);
3657 nce = NULL;
3658 goto bail;
3659 }
3660 rw_exit(&ipst->ips_ill_g_lock);
3661 upper_nce = nce;
3662 nce = under_nce; /* will be returned to caller */
3663 if (NCE_ISREACHABLE(nce->nce_common))
3664 nce_fastpath_trigger(under_nce);
3665 }
3666 if (nce != NULL) {
3667 if (newnce != NULL)
3668 *newnce = nce;
3669 else
3670 nce_refrele(nce);
3671 }
3672 bail:
3673 if (under != NULL)
3674 ill_refrele(under);
3675 if (upper_nce != NULL)
3676 nce_refrele(upper_nce);
3677 if (need_ill_refrele)
3678 ill_refrele(ill);
3679
3680 return (err);
3681 }
3682
3683 /*
3684 * NDP Cache Entry creation routine for IPv4.
3685 * This routine must always be called with ndp4->ndp_g_lock held.
3686 * Prior to return, ncec_refcnt is incremented.
3687 *
3688 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3689 * are always added pointing at the ipmp_ill. Thus, when the ill passed
3690 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3691 * entries will be created, both pointing at the same ncec_t. The nce_t
3692 * entries will have their nce_ill set to the ipmp_ill and the under_ill
3693 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3694 * Local addresses are always created on the ill passed to nce_add_v4.
3695 */
int
nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int		err;
	boolean_t	is_multicast = (flags & NCE_F_MCAST);
	struct in6_addr	addr6;
	nce_t		*nce;

	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
	ASSERT(!ill->ill_isv6);
	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);

	/* v4 addresses are stored internally as v4-mapped v6 addresses */
	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
	    &nce);
	ASSERT(newnce != NULL);
	*newnce = nce;
	return (err);
}
3716
3717 /*
3718 * Post-processing routine to be executed after nce_add_v4(). This function
3719 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3720 * and must be called without any locks held.
3721 *
3722 * Always returns 0, but we return an int to keep this symmetric with the
3723 * IPv6 counter-part.
3724 */
3725 int
3726 nce_add_v4_postprocess(nce_t *nce)
3727 {
3728 ncec_t *ncec = nce->nce_common;
3729 uint16_t flags = ncec->ncec_flags;
3730 boolean_t ndp_need_dad = B_FALSE;
3731 boolean_t dropped;
3732 clock_t delay;
3733 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst;
3734 uchar_t *hw_addr = ncec->ncec_lladdr;
3735 boolean_t trigger_fastpath = B_TRUE;
3736
3737 /*
3738 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3739 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3740 * We call nce_fastpath from nce_update if the link layer address of
3741 * the peer changes from nce_update
3742 */
3743 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3744 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3745 trigger_fastpath = B_FALSE;
3746
3747 if (trigger_fastpath)
3748 nce_fastpath_trigger(nce);
3749
3750 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3751 /*
3752 * Either the caller (by passing in ND_PROBE)
3753 * or nce_add_common() (by the internally computed state
3754 * based on ncec_addr and ill_net_type) has determined
3755 * that this unicast entry needs DAD. Trigger DAD.
3756 */
3757 ndp_need_dad = B_TRUE;
3758 } else if (flags & NCE_F_UNSOL_ADV) {
3759 /*
3760 * We account for the transmit below by assigning one
3761 * less than the ndd variable. Subsequent decrements
3762 * are done in nce_timer.
3763 */
3764 mutex_enter(&ncec->ncec_lock);
3765 ncec->ncec_unsolicit_count =
3766 ipst->ips_ip_arp_publish_count - 1;
3767 mutex_exit(&ncec->ncec_lock);
3768 dropped = arp_announce(ncec);
3769 mutex_enter(&ncec->ncec_lock);
3770 if (dropped)
3771 ncec->ncec_unsolicit_count++;
3772 else
3773 ncec->ncec_last_time_defended = ddi_get_lbolt();
3774 if (ncec->ncec_unsolicit_count != 0) {
3775 nce_start_timer(ncec,
3776 ipst->ips_ip_arp_publish_interval);
3777 }
3778 mutex_exit(&ncec->ncec_lock);
3779 }
3780
3781 /*
3782 * If ncec_xmit_interval is 0, user has configured us to send the first
3783 * probe right away. Do so, and set up for the subsequent probes.
3784 */
3785 if (ndp_need_dad) {
3786 mutex_enter(&ncec->ncec_lock);
3787 if (ncec->ncec_pcnt == 0) {
3788 /*
3789 * DAD probes and announce can be
3790 * administratively disabled by setting the
3791 * probe_count to zero. Restart the timer in
3792 * this case to mark the ipif as ready.
3793 */
3794 ncec->ncec_unsolicit_count = 0;
3795 mutex_exit(&ncec->ncec_lock);
3796 nce_restart_timer(ncec, 0);
3797 } else {
3798 mutex_exit(&ncec->ncec_lock);
3799 delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3800 ipst->ips_arp_probe_delay :
3801 ipst->ips_arp_fastprobe_delay);
3802 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3803 }
3804 }
3805 return (0);
3806 }
3807
3808 /*
3809 * ncec_walk routine to update all entries that have a given destination or
3810 * gateway address and cached link layer (MAC) address. This is used when ARP
3811 * informs us that a network-to-link-layer mapping may have changed.
3812 */
3813 void
3814 nce_update_hw_changed(ncec_t *ncec, void *arg)
3815 {
3816 nce_hw_map_t *hwm = arg;
3817 ipaddr_t ncec_addr;
3818
3819 if (ncec->ncec_state != ND_REACHABLE)
3820 return;
3821
3822 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3823 if (ncec_addr != hwm->hwm_addr)
3824 return;
3825
3826 mutex_enter(&ncec->ncec_lock);
3827 if (hwm->hwm_flags != 0)
3828 ncec->ncec_flags = hwm->hwm_flags;
3829 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3830 mutex_exit(&ncec->ncec_lock);
3831 }
3832
3833 void
3834 ncec_refhold(ncec_t *ncec)
3835 {
3836 mutex_enter(&(ncec)->ncec_lock);
3837 (ncec)->ncec_refcnt++;
3838 ASSERT((ncec)->ncec_refcnt != 0);
3839 #ifdef DEBUG
3840 ncec_trace_ref(ncec);
3841 #endif
3842 mutex_exit(&(ncec)->ncec_lock);
3843 }
3844
3845 void
3846 ncec_refhold_notr(ncec_t *ncec)
3847 {
3848 mutex_enter(&(ncec)->ncec_lock);
3849 (ncec)->ncec_refcnt++;
3850 ASSERT((ncec)->ncec_refcnt != 0);
3851 mutex_exit(&(ncec)->ncec_lock);
3852 }
3853
3854 static void
3855 ncec_refhold_locked(ncec_t *ncec)
3856 {
3857 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3858 (ncec)->ncec_refcnt++;
3859 #ifdef DEBUG
3860 ncec_trace_ref(ncec);
3861 #endif
3862 }
3863
3864 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3865 void
3866 ncec_refrele(ncec_t *ncec)
3867 {
3868 mutex_enter(&(ncec)->ncec_lock);
3869 #ifdef DEBUG
3870 ncec_untrace_ref(ncec);
3871 #endif
3872 ASSERT((ncec)->ncec_refcnt != 0);
3873 if (--(ncec)->ncec_refcnt == 0) {
3874 ncec_inactive(ncec);
3875 } else {
3876 mutex_exit(&(ncec)->ncec_lock);
3877 }
3878 }
3879
3880 void
3881 ncec_refrele_notr(ncec_t *ncec)
3882 {
3883 mutex_enter(&(ncec)->ncec_lock);
3884 ASSERT((ncec)->ncec_refcnt != 0);
3885 if (--(ncec)->ncec_refcnt == 0) {
3886 ncec_inactive(ncec);
3887 } else {
3888 mutex_exit(&(ncec)->ncec_lock);
3889 }
3890 }
3891
3892 /*
3893 * Common to IPv4 and IPv6.
3894 */
void
nce_restart_timer(ncec_t *ncec, uint_t ms)
{
	timeout_id_t tid;

	/* ncec_lock must not be held: we may block in untimeout() below */
	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));

	/* First cancel any running timer */
	mutex_enter(&ncec->ncec_lock);
	tid = ncec->ncec_timeout_id;
	ncec->ncec_timeout_id = 0;
	if (tid != 0) {
		/*
		 * untimeout() can wait for an executing callout to finish,
		 * so drop the lock around it to avoid deadlocking against
		 * nce_timer().
		 */
		mutex_exit(&ncec->ncec_lock);
		(void) untimeout(tid);
		mutex_enter(&ncec->ncec_lock);
	}

	/* Restart timer */
	nce_start_timer(ncec, ms);
	mutex_exit(&ncec->ncec_lock);
}
3916
3917 static void
3918 nce_start_timer(ncec_t *ncec, uint_t ms)
3919 {
3920 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3921 /*
3922 * Don't start the timer if the ncec has been deleted, or if the timer
3923 * is already running
3924 */
3925 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3926 ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3927 MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3928 }
3929 }
3930
int
nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t flags, nce_t **newnce)
{
	uchar_t		*hw_addr;
	int		err = 0;
	ip_stack_t	*ipst = ill->ill_ipst;
	in6_addr_t	dst6;
	nce_t		*nce;

	ASSERT(!ill->ill_isv6);

	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
	/* reuse an existing entry for dst, if there is one */
	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
		goto done;
	}
	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * For IRE_IF_RESOLVER a hardware mapping can be
		 * generated, for IRE_IF_NORESOLVER, resolution cookie
		 * in the ill is copied in nce_add_v4().
		 */
		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
		if (hw_addr == NULL) {
			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
			return (ENOMEM);
		}
		/* derive the link-layer address for this mcast group */
		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
	} else {
		/*
		 * IRE_IF_NORESOLVER type simply copies the resolution
		 * cookie passed in. So no hw_addr is needed.
		 */
		hw_addr = NULL;
	}
	ASSERT(flags & NCE_F_MCAST);
	ASSERT(flags & NCE_F_NONUD);
	/* nce_state will be computed by nce_add_common() */
	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
	    ND_UNCHANGED, &nce);
	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
	/* post-processing (fastpath/DAD) runs without the g_lock */
	if (err == 0)
		err = nce_add_v4_postprocess(nce);
	if (hw_addr != NULL)
		kmem_free(hw_addr, ill->ill_phys_addr_length);
	if (err != 0) {
		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
		return (err);
	}
done:
	/* hand the (refheld) nce to the caller, or drop the reference */
	if (newnce != NULL)
		*newnce = nce;
	else
		nce_refrele(nce);
	return (0);
}
3989
3990 /*
3991 * This is used when scanning for "old" (least recently broadcast) NCEs. We
3992 * don't want to have to walk the list for every single one, so we gather up
3993 * batches at a time.
3994 */
3995 #define NCE_RESCHED_LIST_LEN 8
3996
/*
 * Batch of reschedule candidates collected by ncec_reschedule() and
 * consumed by nce_ill_reschedule().
 */
typedef struct {
	ill_t		*ncert_ill;	/* ill whose nce list is walked */
	uint_t		ncert_num;	/* valid entries in ncert_nces */
	ncec_t		*ncert_nces[NCE_RESCHED_LIST_LEN]; /* refheld ncecs */
} nce_resched_t;
4002
4003 /*
4004 * Pick the longest waiting NCEs for defense.
4005 */
/* ARGSUSED */
static int
ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
{
	nce_resched_t	*ncert = arg;
	ncec_t		**ncecs;
	ncec_t		**ncec_max;
	ncec_t		*ncec_temp;
	ncec_t		*ncec = nce->nce_common;

	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
	/*
	 * Only reachable entries that are ready for announcement are eligible.
	 */
	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
		return (0);
	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
		/* room left in the batch: just append (refheld) */
		ncec_refhold(ncec);
		ncert->ncert_nces[ncert->ncert_num++] = ncec;
	} else {
		/*
		 * Batch is full: keep only the entries with the oldest
		 * ncec_last_time_defended.  Sweep the candidate through the
		 * array, swapping it with any entry defended more recently;
		 * whatever is left in `ncec' afterwards is the most recently
		 * defended one and is dropped (refrele'd).
		 */
		ncecs = ncert->ncert_nces;
		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
		ncec_refhold(ncec);
		for (; ncecs < ncec_max; ncecs++) {
			ASSERT(ncec != NULL);
			if ((*ncecs)->ncec_last_time_defended >
			    ncec->ncec_last_time_defended) {
				ncec_temp = *ncecs;
				*ncecs = ncec;
				ncec = ncec_temp;
			}
		}
		ncec_refrele(ncec);
	}
	return (0);
}
4042
4043 /*
4044 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this
4045 * doesn't happen very often (if at all), and thus it needn't be highly
4046 * optimized. (Note, though, that it's actually O(N) complexity, because the
4047 * outer loop is bounded by a constant rather than by the length of the list.)
4048 */
/* Called with ill_lock held (see ill_defend_rate_limit()). */
static void
nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
{
	ncec_t		*ncec;
	ip_stack_t	*ipst = ill->ill_ipst;
	uint_t		i, defend_rate;

	/* reset the per-period budget; remember how much was consumed */
	i = ill->ill_defend_count;
	ill->ill_defend_count = 0;
	if (ill->ill_isv6)
		defend_rate = ipst->ips_ndp_defend_rate;
	else
		defend_rate = ipst->ips_arp_defend_rate;
	/* If none could be sitting around, then don't reschedule */
	if (i < defend_rate) {
		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
		return;
	}
	ncert->ncert_ill = ill;
	/* gather batches of candidates until the rate budget is consumed */
	while (ill->ill_defend_count < defend_rate) {
		nce_walk_common(ill, ncec_reschedule, ncert);
		for (i = 0; i < ncert->ncert_num; i++) {

			ncec = ncert->ncert_nces[i];
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_flags |= NCE_F_DELAYED;
			mutex_exit(&ncec->ncec_lock);
			/*
			 * we plan to schedule this ncec, so incr the
			 * defend_count in anticipation.
			 */
			if (++ill->ill_defend_count >= defend_rate)
				break;
		}
		/* a short batch means the walk found no further candidates */
		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
			break;
	}
}
4087
4088 /*
4089 * Check if the current rate-limiting parameters permit the sending
4090 * of another address defense announcement for both IPv4 and IPv6.
4091 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4092 * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4093 * determines how many address defense announcements are permitted
 * in any `defend_period' interval.
4095 */
static boolean_t
ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
{
	clock_t		now = ddi_get_lbolt();
	ip_stack_t	*ipst = ill->ill_ipst;
	clock_t		start = ill->ill_defend_start;
	uint32_t	elapsed, defend_period, defend_rate;
	nce_resched_t	ncert;
	boolean_t	ret;
	int		i;

	if (ill->ill_isv6) {
		defend_period = ipst->ips_ndp_defend_period;
		defend_rate = ipst->ips_ndp_defend_rate;
	} else {
		defend_period = ipst->ips_arp_defend_period;
		defend_rate = ipst->ips_arp_defend_rate;
	}
	/* a rate of zero disables address defense entirely */
	if (defend_rate == 0)
		return (B_TRUE);
	bzero(&ncert, sizeof (ncert));
	mutex_enter(&ill->ill_lock);
	if (start > 0) {
		elapsed = now - start;
		if (elapsed > SEC_TO_TICK(defend_period)) {
			/* period expired: start a new one */
			ill->ill_defend_start = now;
			/*
			 * nce_ill_reschedule will attempt to
			 * prevent starvation by rescheduling the
			 * oldest entries, which are marked with
			 * the NCE_F_DELAYED flag.
			 */
			nce_ill_reschedule(ill, &ncert);
		}
	} else {
		ill->ill_defend_start = now;
	}
	ASSERT(ill->ill_defend_count <= defend_rate);
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_flags & NCE_F_DELAYED) {
		/*
		 * This ncec was rescheduled as one of the really old
		 * entries needing on-going defense. The
		 * ill_defend_count was already incremented in
		 * nce_ill_reschedule. Go ahead and send the announce.
		 */
		ncec->ncec_flags &= ~NCE_F_DELAYED;
		mutex_exit(&ncec->ncec_lock);
		ret = B_FALSE;
		goto done;
	}
	mutex_exit(&ncec->ncec_lock);
	if (ill->ill_defend_count < defend_rate)
		ill->ill_defend_count++;
	if (ill->ill_defend_count == defend_rate) {
		/*
		 * we are no longer allowed to send unbidden defense
		 * messages. Wait for rescheduling.
		 */
		ret = B_TRUE;
	} else {
		ret = B_FALSE;
	}
done:
	mutex_exit(&ill->ill_lock);
	/*
	 * After all the locks have been dropped we can restart nce timer,
	 * and refrele the delayed ncecs
	 */
	for (i = 0; i < ncert.ncert_num; i++) {
		clock_t	xmit_interval;
		ncec_t	*tmp;

		tmp = ncert.ncert_nces[i];
		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
		    B_FALSE);
		nce_restart_timer(tmp, xmit_interval);
		ncec_refrele(tmp);
	}
	return (ret);
}
4177
boolean_t
ndp_announce(ncec_t *ncec)
{
	/*
	 * Send an unsolicited Neighbor Advertisement for ncec's address to
	 * the all-hosts multicast group, returning the ndp_xmit() result
	 * (presumably B_TRUE when the transmit was dropped, mirroring the
	 * arp_announce() usage in nce_add_v4_postprocess() — verify against
	 * ndp_xmit()).
	 */
	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
	    nce_advert_flags(ncec)));
}
4185
/*
 * Select the source address and the source ill to use when transmitting
 * the resolution request (IPv6 NS or IPv4 ARP) for ncec.  On entry *src
 * must be unspecified (::); on success it is set to the chosen address
 * (v4-mapped for IPv4) and a held ill is returned, which the caller must
 * ill_refrele().  Returns NULL if the candidate ipif is still doing DAD
 * (the caller should postpone transmission) or if no usable source
 * address can be found.
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t *mp;
	in6_addr_t src6;
	ipaddr_t src4;
	ill_t *ill = ncec->ncec_ill;
	ill_t *src_ill = NULL;
	ipif_t *ipif = NULL;
	boolean_t is_myaddr = NCE_MYADDR(ncec);
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src6 = *src;
	if (is_myaddr) {
		/* DAD for a local address: probe using the address itself */
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t *ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t *ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses. Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6] below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface. We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		if (ipif == NULL && IS_IPMP(ill)) {
			/*
			 * No source on the IPMP meta-interface itself; retry
			 * the selection on one of the group's xmit ills.
			 */
			ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		/*
		 * Hand back a held ill: for IPMP the test-address owner
		 * (bound ill), otherwise the ipif's own ill.
		 */
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}
4328
/*
 * Update the ncec for *addr with the supplied hardware address and flags.
 * With a non-NULL ipif only the single ncec on that ipif's ill can match;
 * its state is refreshed (left unchanged if already reachable, otherwise
 * marked ND_STALE).  With a NULL ipif, every ncec matching *addr is
 * updated by walking the v4 ndp table.
 */
void
ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
    uchar_t *hwaddr, int hwaddr_len, int flags)
{
	ill_t *ill;
	ncec_t *ncec;
	nce_t *nce;
	uint16_t new_state;

	ill = (ipif ? ipif->ipif_ill : NULL);
	if (ill != NULL) {
		/*
		 * only one ncec is possible
		 */
		nce = nce_lookup_v4(ill, addr);
		if (nce != NULL) {
			ncec = nce->nce_common;
			mutex_enter(&ncec->ncec_lock);
			if (NCE_ISREACHABLE(ncec))
				new_state = ND_UNCHANGED;
			else
				new_state = ND_STALE;
			/* flags replace (not augment) the cached flags */
			ncec->ncec_flags = flags;
			nce_update(ncec, new_state, hwaddr);
			mutex_exit(&ncec->ncec_lock);
			nce_refrele(nce);
			return;
		}
	} else {
		/*
		 * ill is wildcard; clean up all ncec's and ire's
		 * that match on addr.
		 */
		nce_hw_map_t hwm;

		hwm.hwm_addr = *addr;
		hwm.hwm_hwlen = hwaddr_len;
		hwm.hwm_hwaddr = hwaddr;
		hwm.hwm_flags = flags;

		ncec_walk_common(ipst->ips_ndp4, NULL,
		    nce_update_hw_changed, &hwm, B_TRUE);
	}
}
4373
4374 /*
4375 * Common function to add ncec entries.
4376 * we always add the ncec with ncec_ill == ill, and always create
4377 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4378 * ncec is !reachable.
4379 *
4380 * When the caller passes in an nce_state of ND_UNCHANGED,
4381 * nce_add_common() will determine the state of the created nce based
4382 * on the ill_net_type and nce_flags used. Otherwise, the nce will
4383 * be created with state set to the passed in nce_state.
4384 */
4385 static int
4386 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4387 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4388 {
4389 static ncec_t nce_nil;
4390 uchar_t *template = NULL;
4391 int err;
4392 ncec_t *ncec;
4393 ncec_t **ncep;
4394 ip_stack_t *ipst = ill->ill_ipst;
4395 uint16_t state;
4396 boolean_t fastprobe = B_FALSE;
4397 struct ndp_g_s *ndp;
4398 nce_t *nce = NULL;
4399 mblk_t *dlur_mp = NULL;
4400
4401 if (ill->ill_isv6)
4402 ndp = ill->ill_ipst->ips_ndp6;
4403 else
4404 ndp = ill->ill_ipst->ips_ndp4;
4405
4406 *retnce = NULL;
4407
4408 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4409
4410 if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4411 ip0dbg(("nce_add_common: no addr\n"));
4412 return (EINVAL);
4413 }
4414 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4415 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4416 return (EINVAL);
4417 }
4418
4419 if (ill->ill_isv6) {
4420 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4421 } else {
4422 ipaddr_t v4addr;
4423
4424 IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4425 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4426 }
4427
4428 /*
4429 * The caller has ensured that there is no nce on ill, but there could
4430 * still be an nce_common_t for the address, so that we find exisiting
4431 * ncec_t strucutures first, and atomically add a new nce_t if
4432 * one is found. The ndp_g_lock ensures that we don't cross threads
4433 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4434 * compare for matches across the illgrp because this function is
4435 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4436 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4437 * appropriate.
4438 */
4439 ncec = *ncep;
4440 for (; ncec != NULL; ncec = ncec->ncec_next) {
4441 if (ncec->ncec_ill == ill) {
4442 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4443 /*
4444 * We should never find *retnce to be
4445 * MYADDR, since the caller may then
4446 * incorrectly restart a DAD timer that's
4447 * already running. However, if we are in
4448 * forwarding mode, and the interface is
4449 * moving in/out of groups, the data
4450 * path ire lookup (e.g., ire_revalidate_nce)
4451 * may have determined that some destination
4452 * is offlink while the control path is adding
4453 * that address as a local address.
4454 * Recover from this case by failing the
4455 * lookup
4456 */
4457 if (NCE_MYADDR(ncec))
4458 return (ENXIO);
4459 *retnce = nce_ill_lookup_then_add(ill, ncec);
4460 if (*retnce != NULL)
4461 break;
4462 }
4463 }
4464 }
4465 if (*retnce != NULL) /* caller must trigger fastpath on nce */
4466 return (0);
4467
4468 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4469 if (ncec == NULL)
4470 return (ENOMEM);
4471 *ncec = nce_nil;
4472 ncec->ncec_ill = ill;
4473 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4474 ncec->ncec_flags = flags;
4475 ncec->ncec_ipst = ipst; /* No netstack_hold */
4476
4477 if (!ill->ill_isv6) {
4478 ipaddr_t addr4;
4479
4480 /*
4481 * DAD probe interval and probe count are set based on
4482 * fast/slow probe settings. If the underlying link doesn't
4483 * have reliably up/down notifications or if we're working
4484 * with IPv4 169.254.0.0/16 Link Local Address space, then
4485 * don't use the fast timers. Otherwise, use them.
4486 */
4487 ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4488 IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4489 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4490 fastprobe = B_TRUE;
4491 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4492 !IS_IPV4_LL_SPACE(&addr4)) {
4493 ill_t *hwaddr_ill;
4494
4495 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4496 hw_addr_len);
4497 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4498 fastprobe = B_TRUE;
4499 }
4500 if (fastprobe) {
4501 ncec->ncec_xmit_interval =
4502 ipst->ips_arp_fastprobe_interval;
4503 ncec->ncec_pcnt =
4504 ipst->ips_arp_fastprobe_count;
4505 ncec->ncec_flags |= NCE_F_FAST;
4506 } else {
4507 ncec->ncec_xmit_interval =
4508 ipst->ips_arp_probe_interval;
4509 ncec->ncec_pcnt =
4510 ipst->ips_arp_probe_count;
4511 }
4512 if (NCE_PUBLISH(ncec)) {
4513 ncec->ncec_unsolicit_count =
4514 ipst->ips_ip_arp_publish_count;
4515 }
4516 } else {
4517 /*
4518 * probe interval is constant: ILL_PROBE_INTERVAL
4519 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4520 */
4521 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4522 if (NCE_PUBLISH(ncec)) {
4523 ncec->ncec_unsolicit_count =
4524 ipst->ips_ip_ndp_unsolicit_count;
4525 }
4526 }
4527 ncec->ncec_rcnt = ill->ill_xmit_count;
4528 ncec->ncec_addr = *addr;
4529 ncec->ncec_qd_mp = NULL;
4530 ncec->ncec_refcnt = 1; /* for ncec getting created */
4531 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4532 ncec->ncec_trace_disable = B_FALSE;
4533
4534 /*
4535 * ncec_lladdr holds link layer address
4536 */
4537 if (hw_addr_len > 0) {
4538 template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4539 if (template == NULL) {
4540 err = ENOMEM;
4541 goto err_ret;
4542 }
4543 ncec->ncec_lladdr = template;
4544 ncec->ncec_lladdr_length = hw_addr_len;
4545 bzero(ncec->ncec_lladdr, hw_addr_len);
4546 }
4547 if ((flags & NCE_F_BCAST) != 0) {
4548 state = ND_REACHABLE;
4549 ASSERT(hw_addr_len > 0);
4550 } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4551 state = ND_INITIAL;
4552 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4553 /*
4554 * NORESOLVER entries are always created in the REACHABLE
4555 * state.
4556 */
4557 state = ND_REACHABLE;
4558 if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4559 ill->ill_mactype != DL_IPV4 &&
4560 ill->ill_mactype != DL_6TO4) {
4561 /*
4562 * We create a nce_res_mp with the IP nexthop address
4563 * as the destination address if the physical length
4564 * is exactly 4 bytes for point-to-multipoint links
4565 * that do their own resolution from IP to link-layer
4566 * address (e.g. IP over X.25).
4567 */
4568 bcopy((uchar_t *)addr,
4569 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4570 }
4571 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4572 ill->ill_mactype != DL_IPV6) {
4573 /*
4574 * We create a nce_res_mp with the IP nexthop address
4575 * as the destination address if the physical legnth
4576 * is exactly 16 bytes for point-to-multipoint links
4577 * that do their own resolution from IP to link-layer
4578 * address.
4579 */
4580 bcopy((uchar_t *)addr,
4581 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4582 }
4583 /*
4584 * Since NUD is not part of the base IPv4 protocol definition,
4585 * IPv4 neighbor entries on NORESOLVER interfaces will never
4586 * age, and are marked NCE_F_NONUD.
4587 */
4588 if (!ill->ill_isv6)
4589 ncec->ncec_flags |= NCE_F_NONUD;
4590 } else if (ill->ill_net_type == IRE_LOOPBACK) {
4591 state = ND_REACHABLE;
4592 }
4593
4594 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4595 /*
4596 * We are adding an ncec with a deterministic hw_addr,
4597 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4598 *
4599 * if we are adding a unicast ncec for the local address
4600 * it would be REACHABLE; we would be adding a ND_STALE entry
4601 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4602 * addresses are added in PROBE to trigger DAD.
4603 */
4604 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4605 ill->ill_net_type == IRE_IF_NORESOLVER)
4606 state = ND_REACHABLE;
4607 else if (!NCE_PUBLISH(ncec))
4608 state = ND_STALE;
4609 else
4610 state = ND_PROBE;
4611 if (hw_addr != NULL)
4612 nce_set_ll(ncec, hw_addr);
4613 }
4614 /* caller overrides internally computed state */
4615 if (nce_state != ND_UNCHANGED)
4616 state = nce_state;
4617
4618 if (state == ND_PROBE)
4619 ncec->ncec_flags |= NCE_F_UNVERIFIED;
4620
4621 ncec->ncec_state = state;
4622
4623 if (state == ND_REACHABLE) {
4624 ncec->ncec_last = ncec->ncec_init_time =
4625 TICK_TO_MSEC(ddi_get_lbolt64());
4626 } else {
4627 ncec->ncec_last = 0;
4628 if (state == ND_INITIAL)
4629 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4630 }
4631 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4632 offsetof(ncec_cb_t, ncec_cb_node));
4633 /*
4634 * have all the memory allocations out of the way before taking locks
4635 * and adding the nce.
4636 */
4637 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4638 if (nce == NULL) {
4639 err = ENOMEM;
4640 goto err_ret;
4641 }
4642 if (ncec->ncec_lladdr != NULL ||
4643 ill->ill_net_type == IRE_IF_NORESOLVER) {
4644 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4645 ill->ill_phys_addr_length, ill->ill_sap,
4646 ill->ill_sap_length);
4647 if (dlur_mp == NULL) {
4648 err = ENOMEM;
4649 goto err_ret;
4650 }
4651 }
4652
4653 /*
4654 * Atomically ensure that the ill is not CONDEMNED, before
4655 * adding the NCE.
4656 */
4657 mutex_enter(&ill->ill_lock);
4658 if (ill->ill_state_flags & ILL_CONDEMNED) {
4659 mutex_exit(&ill->ill_lock);
4660 err = EINVAL;
4661 goto err_ret;
4662 }
4663 if (!NCE_MYADDR(ncec) &&
4664 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4665 mutex_exit(&ill->ill_lock);
4666 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4667 err = EINVAL;
4668 goto err_ret;
4669 }
4670 /*
4671 * Acquire the ncec_lock even before adding the ncec to the list
4672 * so that it cannot get deleted after the ncec is added, but
4673 * before we add the nce.
4674 */
4675 mutex_enter(&ncec->ncec_lock);
4676 if ((ncec->ncec_next = *ncep) != NULL)
4677 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4678 *ncep = ncec;
4679 ncec->ncec_ptpn = ncep;
4680
4681 /* Bump up the number of ncec's referencing this ill */
4682 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4683 (char *), "ncec", (void *), ncec);
4684 ill->ill_ncec_cnt++;
4685 /*
4686 * Since we hold the ncec_lock at this time, the ncec cannot be
4687 * condemned, and we can safely add the nce.
4688 */
4689 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4690 mutex_exit(&ncec->ncec_lock);
4691 mutex_exit(&ill->ill_lock);
4692
4693 /* caller must trigger fastpath on *retnce */
4694 return (0);
4695
4696 err_ret:
4697 if (ncec != NULL)
4698 kmem_cache_free(ncec_cache, ncec);
4699 if (nce != NULL)
4700 kmem_cache_free(nce_cache, nce);
4701 freemsg(dlur_mp);
4702 if (template != NULL)
4703 kmem_free(template, ill->ill_phys_addr_length);
4704 return (err);
4705 }
4706
4707 /*
4708 * take a ref on the nce
4709 */
4710 void
4711 nce_refhold(nce_t *nce)
4712 {
4713 mutex_enter(&nce->nce_lock);
4714 nce->nce_refcnt++;
4715 ASSERT((nce)->nce_refcnt != 0);
4716 mutex_exit(&nce->nce_lock);
4717 }
4718
4719 /*
4720 * release a ref on the nce; In general, this
4721 * cannot be called with locks held because nce_inactive
4722 * may result in nce_inactive which will take the ill_lock,
4723 * do ipif_ill_refrele_tail etc. Thus the one exception
4724 * where this can be called with locks held is when the caller
4725 * is certain that the nce_refcnt is sufficient to prevent
4726 * the invocation of nce_inactive.
4727 */
4728 void
4729 nce_refrele(nce_t *nce)
4730 {
4731 ASSERT((nce)->nce_refcnt != 0);
4732 mutex_enter(&nce->nce_lock);
4733 if (--nce->nce_refcnt == 0)
4734 nce_inactive(nce); /* destroys the mutex */
4735 else
4736 mutex_exit(&nce->nce_lock);
4737 }
4738
4739 /*
4740 * free the nce after all refs have gone away.
4741 */
4742 static void
4743 nce_inactive(nce_t *nce)
4744 {
4745 ill_t *ill = nce->nce_ill;
4746
4747 ASSERT(nce->nce_refcnt == 0);
4748
4749 ncec_refrele_notr(nce->nce_common);
4750 nce->nce_common = NULL;
4751 freemsg(nce->nce_fp_mp);
4752 freemsg(nce->nce_dlur_mp);
4753
4754 mutex_enter(&ill->ill_lock);
4755 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4756 (char *), "nce", (void *), nce);
4757 ill->ill_nce_cnt--;
4758 nce->nce_ill = NULL;
4759 /*
4760 * If the number of ncec's associated with this ill have dropped
4761 * to zero, check whether we need to restart any operation that
4762 * is waiting for this to happen.
4763 */
4764 if (ILL_DOWN_OK(ill)) {
4765 /* ipif_ill_refrele_tail drops the ill_lock */
4766 ipif_ill_refrele_tail(ill);
4767 } else {
4768 mutex_exit(&ill->ill_lock);
4769 }
4770
4771 mutex_destroy(&nce->nce_lock);
4772 kmem_cache_free(nce_cache, nce);
4773 }
4774
4775 /*
4776 * Add an nce to the ill_nce list.
4777 */
4778 static nce_t *
4779 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4780 {
4781 bzero(nce, sizeof (*nce));
4782 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4783 nce->nce_common = ncec;
4784 nce->nce_addr = ncec->ncec_addr;
4785 nce->nce_ill = ill;
4786 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4787 (char *), "nce", (void *), nce);
4788 ill->ill_nce_cnt++;
4789
4790 nce->nce_refcnt = 1; /* for the thread */
4791 ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4792 nce->nce_dlur_mp = dlur_mp;
4793
4794 /* add nce to the ill's fastpath list. */
4795 nce->nce_refcnt++; /* for the list */
4796 list_insert_head(&ill->ill_nce, nce);
4797 return (nce);
4798 }
4799
4800 static nce_t *
4801 nce_add(ill_t *ill, ncec_t *ncec)
4802 {
4803 nce_t *nce;
4804 mblk_t *dlur_mp = NULL;
4805
4806 ASSERT(MUTEX_HELD(&ill->ill_lock));
4807 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4808
4809 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4810 if (nce == NULL)
4811 return (NULL);
4812 if (ncec->ncec_lladdr != NULL ||
4813 ill->ill_net_type == IRE_IF_NORESOLVER) {
4814 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4815 ill->ill_phys_addr_length, ill->ill_sap,
4816 ill->ill_sap_length);
4817 if (dlur_mp == NULL) {
4818 kmem_cache_free(nce_cache, nce);
4819 return (NULL);
4820 }
4821 }
4822 return (nce_add_impl(ill, ncec, nce, dlur_mp));
4823 }
4824
4825 /*
4826 * remove the nce from the ill_faspath list
4827 */
4828 void
4829 nce_delete(nce_t *nce)
4830 {
4831 ill_t *ill = nce->nce_ill;
4832
4833 ASSERT(MUTEX_HELD(&ill->ill_lock));
4834
4835 mutex_enter(&nce->nce_lock);
4836 if (nce->nce_is_condemned) {
4837 /*
4838 * some other thread has removed this nce from the ill_nce list
4839 */
4840 mutex_exit(&nce->nce_lock);
4841 return;
4842 }
4843 nce->nce_is_condemned = B_TRUE;
4844 mutex_exit(&nce->nce_lock);
4845
4846 list_remove(&ill->ill_nce, nce);
4847 /*
4848 * even though we are holding the ill_lock, it is ok to
4849 * call nce_refrele here because we know that we should have
4850 * at least 2 refs on the nce: one for the thread, and one
4851 * for the list. The refrele below will release the one for
4852 * the list.
4853 */
4854 nce_refrele(nce);
4855 }
4856
4857 nce_t *
4858 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4859 {
4860 nce_t *nce = NULL;
4861
4862 ASSERT(ill != NULL);
4863 ASSERT(MUTEX_HELD(&ill->ill_lock));
4864
4865 for (nce = list_head(&ill->ill_nce); nce != NULL;
4866 nce = list_next(&ill->ill_nce, nce)) {
4867 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4868 break;
4869 }
4870
4871 /*
4872 * if we found the nce on the ill_nce list while holding
4873 * the ill_lock, then it cannot be condemned yet.
4874 */
4875 if (nce != NULL) {
4876 ASSERT(!nce->nce_is_condemned);
4877 nce_refhold(nce);
4878 }
4879 return (nce);
4880 }
4881
4882 /*
4883 * Walk the ill_nce list on ill. The callback function func() cannot perform
4884 * any destructive actions.
4885 */
4886 static void
4887 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4888 {
4889 nce_t *nce = NULL, *nce_next;
4890
4891 ASSERT(MUTEX_HELD(&ill->ill_lock));
4892 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4893 nce_next = list_next(&ill->ill_nce, nce);
4894 if (func(ill, nce, arg) != 0)
4895 break;
4896 nce = nce_next;
4897 }
4898 }
4899
/*
 * Walk ill's ill_nce list with func/arg (see nce_walk_common()),
 * acquiring and releasing ill_lock around the walk.
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}
4907
/*
 * Delete nces from ill's ill_nce list.  When flushall is B_FALSE, nces
 * whose ncec is published (NCE_PUBLISH) are preserved.  Deleted nces
 * are moved to a local "dead" list and their list references released
 * only after ill_lock is dropped, since the final nce_refrele() may
 * invoke nce_inactive(), which itself takes ill_lock.
 */
void
nce_flush(ill_t *ill, boolean_t flushall)
{
	nce_t *nce, *nce_next;
	list_t dead;

	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
	mutex_enter(&ill->ill_lock);
	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
		nce_next = list_next(&ill->ill_nce, nce);
		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
			nce = nce_next;
			continue;
		}
		/*
		 * nce_delete requires that the caller should either not
		 * be holding locks, or should hold a ref to ensure that
		 * we wont hit ncec_inactive. So take a ref and clean up
		 * after the list is flushed.
		 */
		nce_refhold(nce);
		nce_delete(nce);
		list_insert_tail(&dead, nce);
		nce = nce_next;
	}
	mutex_exit(&ill->ill_lock);
	/* now drop the thread refs taken above, with no locks held */
	while ((nce = list_head(&dead)) != NULL) {
		list_remove(&dead, nce);
		nce_refrele(nce);
	}
	ASSERT(list_is_empty(&dead));
	list_destroy(&dead);
}
4941
4942 /* Return an interval that is anywhere in the [1 .. intv] range */
4943 static clock_t
4944 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4945 {
4946 clock_t rnd, frac;
4947
4948 (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4949 /* Note that clock_t is signed; must chop off bits */
4950 rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4951 if (initial_time) {
4952 if (intv <= 0)
4953 intv = 1;
4954 else
4955 intv = (rnd % intv) + 1;
4956 } else {
4957 /* Compute 'frac' as 20% of the configured interval */
4958 if ((frac = intv / 5) <= 1)
4959 frac = 2;
4960 /* Set intv randomly in the range [intv-frac .. intv+frac] */
4961 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4962 intv = 1;
4963 }
4964 return (intv);
4965 }
4966
/*
 * Called when address resolution completes for an ncec on an IPMP
 * meta-interface.  Drain the packets queued on ncec_qd_mp: for each
 * packet, pick a send ill in the group (the ill owning the packet's
 * source address, or the IPMP xmit rotor for forwarded packets or
 * probes whose source was deleted), create an under-nce on it, and
 * transmit via ip_xmit().  Packets for which no send ill or under-nce
 * can be set up are counted and dropped.  Finishes by dispatching the
 * ncec's completion callbacks.
 */
void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
	mblk_t *mp;
	uint_t pkt_len;
	iaflags_t ixaflags = IXAF_NO_TRACE;
	nce_t *under_nce;
	ill_t *ill = ncec->ncec_ill;
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ipif_t *src_ipif = NULL;
	ip_stack_t *ipst = ill->ill_ipst;
	ill_t *send_ill;
	uint_t nprobes;

	ASSERT(IS_IPMP(ill));

	/* Detach the queued chain and probe count under ncec_lock. */
	mutex_enter(&ncec->ncec_lock);
	nprobes = ncec->ncec_nprobes;
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		if (isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
			    ill, ALL_ZONES, ipst);
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
			    ill, ALL_ZONES, ipst);
		}

		/*
		 * find a new nce based on an under_ill. The first IPMP probe
		 * packet gets queued, so we could still find a src_ipif that
		 * matches an IPMP test address.
		 */
		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
			/*
			 * if src_ipif is null, this could be either a
			 * forwarded packet or a probe whose src got deleted.
			 * We identify the former case by looking for the
			 * ncec_nprobes: the first ncec_nprobes packets are
			 * probes;
			 */
			if (src_ipif == NULL && nprobes > 0)
				goto drop_pkt;

			/*
			 * For forwarded packets, we use the ipmp rotor
			 * to find send_ill.
			 */
			send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
			    B_TRUE);
		} else {
			send_ill = src_ipif->ipif_ill;
			ill_refhold(send_ill);
		}

		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
		    (ncec_t *), ncec, (ipif_t *),
		    src_ipif, (ill_t *), send_ill);

		if (send_ill == NULL) {
			if (src_ipif != NULL)
				ipif_refrele(src_ipif);
			goto drop_pkt;
		}
		/* create an under_nce on send_ill */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
			under_nce = nce_fastpath_create(send_ill, ncec);
		else
			under_nce = NULL;
		rw_exit(&ipst->ips_ill_g_lock);
		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
			nce_fastpath_trigger(under_nce);

		ill_refrele(send_ill);
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);

		if (under_nce != NULL) {
			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(under_nce);
			if (nprobes > 0)
				nprobes--;
			mp = nxt_mp;
			continue;
		}
	drop_pkt:
		if (isv6) {
			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
		} else {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		}
		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
		freemsg(mp);
		if (nprobes > 0)
			nprobes--;
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec);	/* complete callbacks */
}