1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * Copyright (c) 2019, Joyent, Inc.
27 */
28
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/sysmacros.h>
34 #include <sys/errno.h>
35 #include <sys/dlpi.h>
36 #include <sys/socket.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/vtrace.h>
42 #include <sys/kmem.h>
43 #include <sys/zone.h>
44 #include <sys/ethernet.h>
45 #include <sys/sdt.h>
46 #include <sys/mac.h>
47
48 #include <net/if.h>
49 #include <net/if_types.h>
50 #include <net/if_dl.h>
51 #include <net/route.h>
52 #include <netinet/in.h>
53 #include <netinet/ip6.h>
54 #include <netinet/icmp6.h>
55
56 #include <inet/common.h>
57 #include <inet/mi.h>
58 #include <inet/mib2.h>
59 #include <inet/nd.h>
60 #include <inet/ip.h>
61 #include <inet/ip_impl.h>
62 #include <inet/ipclassifier.h>
63 #include <inet/ip_if.h>
64 #include <inet/ip_ire.h>
65 #include <inet/ip_rts.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_ndp.h>
68 #include <inet/sctp_ip.h>
69 #include <inet/ip_arp.h>
70 #include <inet/ip2mac_impl.h>
71
/*
 * Select the per-stack announcement interval tunable: the NDP unsolicited
 * interval when `isv6' is true, otherwise the ARP publish interval.
 *
 * The expansion refers to a variable named `ipst', which must be in scope
 * wherever the macro is used.  The argument is parenthesized so that compound
 * expressions (e.g. `a || b') select the branch as a whole.
 */
#define	ANNOUNCE_INTERVAL(isv6)	\
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)
75
/*
 * Select the per-stack address defense interval tunable: the NDP value when
 * `isv6' is true, otherwise the ARP value.
 *
 * The expansion refers to a variable named `ipst', which must be in scope
 * wherever the macro is used.  The argument is parenthesized so that compound
 * expressions (e.g. `a || b') select the branch as a whole.
 */
#define	DEFENSE_INTERVAL(isv6)	\
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)
79
/*
 * Probe interval used for IPv6 DAD.  It is not tunable; it depends only on
 * the link: 150 when the ill has ill_note_link set, 1500 otherwise.
 */
#define	ILL_PROBE_INTERVAL(ill)	(!(ill)->ill_note_link ? 1500 : 150)
82
/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * True when the first two bytes at `ptr' are 169 and 254.  `ptr' may be any
 * pointer expression: it is parenthesized before the cast, so that pointer
 * arithmetic in the argument (e.g. `p + 1') is performed in the argument's
 * own type rather than in bytes.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
	((uchar_t *)(ptr))[1] == 254)
89
/*
 * NCE_EXTERNAL_FLAGS_MASK is the set of ncec_flags that callers are allowed
 * to pass in to the ncec*add* functions.
 *
 * Two of them deserve a note:
 *   NCE_F_AUTHORITY	incoming adverts for the mapping are ignored (DAD is
 *			still performed for the mapping).
 *   NCE_F_PUBLISH	we respond to requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK					\
	(NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC |	\
	NCE_F_MYADDR | NCE_F_ANYCAST | NCE_F_ISROUTER |		\
	NCE_F_NONUD | NCE_F_UNSOL_ADV |				\
	NCE_F_BCAST | NCE_F_MCAST)
102
103 /*
104 * Lock ordering:
105 *
106 * ndp_g_lock -> ill_lock -> ncec_lock
107 *
108 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
109 * ncec_next. ncec_lock protects the contents of the NCE (particularly
110 * ncec_refcnt).
111 */
112
113 static void nce_cleanup_list(ncec_t *ncec);
114 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
115 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
116 ncec_t *);
117 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *);
118 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
119 uint16_t ncec_flags, nce_t **newnce);
120 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
121 uint16_t ncec_flags, nce_t **newnce);
122 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation,
123 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
124 const in6_addr_t *target, int flag);
125 static void ncec_refhold_locked(ncec_t *);
126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
127 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
128 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
129 uint16_t, uint16_t, nce_t **);
130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
131 static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
132 static void nce_inactive(nce_t *);
133 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
135 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
136 uint16_t, uint16_t, nce_t **);
137 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
138 uint16_t, uint16_t, nce_t **);
139 static int nce_add_v6_postprocess(nce_t *);
140 static int nce_add_v4_postprocess(nce_t *);
141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
143 static void nce_resolv_ipmp_ok(ncec_t *);
144 static void nce_walk_common(ill_t *, pfi_t, void *);
145 static void nce_start_timer(ncec_t *, uint_t);
146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
147 static void nce_fastpath_trigger(nce_t *);
148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
149
150 #ifdef DEBUG
151 static void ncec_trace_cleanup(const ncec_t *);
152 #endif
153
154 #define NCE_HASH_PTR_V4(ipst, addr) \
155 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
156
157 #define NCE_HASH_PTR_V6(ipst, addr) \
158 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
159 NCE_TABLE_SIZE)]))
160
161 extern kmem_cache_t *ncec_cache;
162 extern kmem_cache_t *nce_cache;
163
164 /*
165 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
166 * If src_ill is not null, the ncec_addr is bound to src_ill. The
167 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
168 * the probe is sent on the ncec_ill (in the non-IPMP case) or the
169 * IPMP cast_ill (in the IPMP case).
170 *
171 * Note that the probe interval is based on the src_ill for IPv6, and
172 * the ncec_xmit_interval for IPv4.
173 */
174 static void
175 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
176 {
177 boolean_t dropped;
178 uint32_t probe_interval;
179
180 ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
181 ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
182 if (ncec->ncec_ipversion == IPV6_VERSION) {
183 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
184 ncec->ncec_lladdr, ncec->ncec_lladdr_length,
185 &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
186 probe_interval = ILL_PROBE_INTERVAL(src_ill);
187 } else {
188 /* IPv4 DAD delay the initial probe. */
189 if (send_probe)
190 dropped = arp_probe(ncec);
191 else
192 dropped = B_TRUE;
193 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
194 !send_probe);
195 }
196 if (!dropped) {
197 mutex_enter(&ncec->ncec_lock);
198 ncec->ncec_pcnt--;
199 mutex_exit(&ncec->ncec_lock);
200 }
201 nce_restart_timer(ncec, probe_interval);
202 }
203
204 /*
205 * Compute default flags to use for an advertisement of this ncec's address.
206 */
207 static int
208 nce_advert_flags(const ncec_t *ncec)
209 {
210 int flag = 0;
211
212 if (ncec->ncec_flags & NCE_F_ISROUTER)
213 flag |= NDP_ISROUTER;
214 if (!(ncec->ncec_flags & NCE_F_ANYCAST))
215 flag |= NDP_ORIDE;
216
217 return (flag);
218 }
219
220 /*
221 * NDP Cache Entry creation routine.
222 * This routine must always be called with ndp6->ndp_g_lock held.
223 */
224 int
225 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
226 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
227 {
228 int err;
229 nce_t *nce;
230
231 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
232 ASSERT(ill != NULL && ill->ill_isv6);
233
234 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
235 &nce);
236 if (err != 0)
237 return (err);
238 ASSERT(newnce != NULL);
239 *newnce = nce;
240 return (err);
241 }
242
243 /*
244 * Post-processing routine to be executed after nce_add_v6(). This function
245 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
246 * and must be called without any locks held.
247 */
248 int
249 nce_add_v6_postprocess(nce_t *nce)
250 {
251 ncec_t *ncec = nce->nce_common;
252 boolean_t dropped = B_FALSE;
253 uchar_t *hw_addr = ncec->ncec_lladdr;
254 uint_t hw_addr_len = ncec->ncec_lladdr_length;
255 ill_t *ill = ncec->ncec_ill;
256 int err = 0;
257 uint16_t flags = ncec->ncec_flags;
258 ip_stack_t *ipst = ill->ill_ipst;
259 boolean_t trigger_fastpath = B_TRUE;
260
261 /*
262 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
263 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
264 * We call nce_fastpath from nce_update if the link layer address of
265 * the peer changes from nce_update
266 */
267 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
268 (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
269 trigger_fastpath = B_FALSE;
270
271 if (trigger_fastpath)
272 nce_fastpath_trigger(nce);
273 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
274 ill_t *hwaddr_ill;
275 /*
276 * Unicast entry that needs DAD.
277 */
278 if (IS_IPMP(ill)) {
279 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
280 hw_addr, hw_addr_len);
281 } else {
282 hwaddr_ill = ill;
283 }
284 nce_dad(ncec, hwaddr_ill, B_TRUE);
285 err = EINPROGRESS;
286 } else if (flags & NCE_F_UNSOL_ADV) {
287 /*
288 * We account for the transmit below by assigning one
289 * less than the ndd variable. Subsequent decrements
290 * are done in nce_timer.
291 */
292 mutex_enter(&ncec->ncec_lock);
293 ncec->ncec_unsolicit_count =
294 ipst->ips_ip_ndp_unsolicit_count - 1;
295 mutex_exit(&ncec->ncec_lock);
296 dropped = ndp_xmit(ill,
297 ND_NEIGHBOR_ADVERT,
298 hw_addr,
299 hw_addr_len,
300 &ncec->ncec_addr, /* Source and target of the adv */
301 &ipv6_all_hosts_mcast, /* Destination of the packet */
302 nce_advert_flags(ncec));
303 mutex_enter(&ncec->ncec_lock);
304 if (dropped)
305 ncec->ncec_unsolicit_count++;
306 else
307 ncec->ncec_last_time_defended = ddi_get_lbolt();
308 if (ncec->ncec_unsolicit_count != 0) {
309 nce_start_timer(ncec,
310 ipst->ips_ip_ndp_unsolicit_interval);
311 }
312 mutex_exit(&ncec->ncec_lock);
313 }
314 return (err);
315 }
316
317 /*
318 * Atomically lookup and add (if needed) Neighbor Cache information for
319 * an address.
320 *
321 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
322 * are always added pointing at the ipmp_ill. Thus, when the ill passed
323 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
324 * entries will be created, both pointing at the same ncec_t. The nce_t
325 * entries will have their nce_ill set to the ipmp_ill and the under_ill
326 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
327 * Local addresses are always created on the ill passed to nce_add_v6.
328 */
329 int
330 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
331 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
332 {
333 int err = 0;
334 ip_stack_t *ipst = ill->ill_ipst;
335 nce_t *nce, *upper_nce = NULL;
336 ill_t *in_ill = ill;
337 boolean_t need_ill_refrele = B_FALSE;
338
339 if (flags & NCE_F_MCAST) {
340 /*
341 * hw_addr will be figured out in nce_set_multicast_v6;
342 * caller has to select the cast_ill
343 */
344 ASSERT(hw_addr == NULL);
345 ASSERT(!IS_IPMP(ill));
346 err = nce_set_multicast_v6(ill, addr, flags, newnce);
347 return (err);
348 }
349 ASSERT(ill->ill_isv6);
350 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
351 ill = ipmp_ill_hold_ipmp_ill(ill);
352 if (ill == NULL)
353 return (ENXIO);
354 need_ill_refrele = B_TRUE;
355 }
356
357 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
358 nce = nce_lookup_addr(ill, addr);
359 if (nce == NULL) {
360 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
361 &nce);
362 } else {
363 err = EEXIST;
364 }
365 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
366 if (err == 0)
367 err = nce_add_v6_postprocess(nce);
368 if (in_ill != ill && nce != NULL) {
369 nce_t *under_nce = NULL;
370
371 /*
372 * in_ill was the under_ill. Try to create the under_nce.
373 * Hold the ill_g_lock to prevent changes to group membership
374 * until we are done.
375 */
376 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
377 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
378 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
379 ill_t *, ill);
380 rw_exit(&ipst->ips_ill_g_lock);
381 err = ENXIO;
382 nce_refrele(nce);
383 nce = NULL;
384 goto bail;
385 }
386 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
387 if (under_nce == NULL) {
388 rw_exit(&ipst->ips_ill_g_lock);
389 err = EINVAL;
390 nce_refrele(nce);
391 nce = NULL;
392 goto bail;
393 }
394 rw_exit(&ipst->ips_ill_g_lock);
395 upper_nce = nce;
396 nce = under_nce; /* will be returned to caller */
397 if (NCE_ISREACHABLE(nce->nce_common))
398 nce_fastpath_trigger(under_nce);
399 }
400 /* nce_refrele is deferred until the lock is dropped */
401 if (nce != NULL) {
402 if (newnce != NULL)
403 *newnce = nce;
404 else
405 nce_refrele(nce);
406 }
407 bail:
408 if (upper_nce != NULL)
409 nce_refrele(upper_nce);
410 if (need_ill_refrele)
411 ill_refrele(ill);
412 return (err);
413 }
414
415 /*
416 * Remove all the CONDEMNED nces from the appropriate hash table.
417 * We create a private list of NCEs, these may have ires pointing
418 * to them, so the list will be passed through to clean up dependent
419 * ires and only then we can do ncec_refrele() which can make NCE inactive.
420 */
421 static void
422 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
423 {
424 ncec_t *ncec1;
425 ncec_t **ptpn;
426
427 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
428 ASSERT(ndp->ndp_g_walker == 0);
429 for (; ncec; ncec = ncec1) {
430 ncec1 = ncec->ncec_next;
431 mutex_enter(&ncec->ncec_lock);
432 if (NCE_ISCONDEMNED(ncec)) {
433 ptpn = ncec->ncec_ptpn;
434 ncec1 = ncec->ncec_next;
435 if (ncec1 != NULL)
436 ncec1->ncec_ptpn = ptpn;
437 *ptpn = ncec1;
438 ncec->ncec_ptpn = NULL;
439 ncec->ncec_next = NULL;
440 ncec->ncec_next = *free_nce_list;
441 *free_nce_list = ncec;
442 }
443 mutex_exit(&ncec->ncec_lock);
444 }
445 }
446
447 /*
448 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
449 * will return this NCE. Also no new timeouts will
450 * be started (See nce_restart_timer).
451 * 2. Cancel any currently running timeouts.
452 * 3. If there is an ndp walker, return. The walker will do the cleanup.
453 * This ensures that walkers see a consistent list of NCEs while walking.
454 * 4. Otherwise remove the NCE from the list of NCEs
455 */
456 void
457 ncec_delete(ncec_t *ncec)
458 {
459 ncec_t **ptpn;
460 ncec_t *ncec1;
461 int ipversion = ncec->ncec_ipversion;
462 ndp_g_t *ndp;
463 ip_stack_t *ipst = ncec->ncec_ipst;
464
465 if (ipversion == IPV4_VERSION)
466 ndp = ipst->ips_ndp4;
467 else
468 ndp = ipst->ips_ndp6;
469
470 /* Serialize deletes */
471 mutex_enter(&ncec->ncec_lock);
472 if (NCE_ISCONDEMNED(ncec)) {
473 /* Some other thread is doing the delete */
474 mutex_exit(&ncec->ncec_lock);
475 return;
476 }
477 /*
478 * Caller has a refhold. Also 1 ref for being in the list. Thus
479 * refcnt has to be >= 2
480 */
481 ASSERT(ncec->ncec_refcnt >= 2);
482 ncec->ncec_flags |= NCE_F_CONDEMNED;
483 mutex_exit(&ncec->ncec_lock);
484
485 /* Count how many condemned ires for kmem_cache callback */
486 atomic_inc_32(&ipst->ips_num_nce_condemned);
487 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
488
489 /* Complete any waiting callbacks */
490 ncec_cb_dispatch(ncec);
491
492 /*
493 * Cancel any running timer. Timeout can't be restarted
494 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
495 * Passing invalid timeout id is fine.
496 */
497 if (ncec->ncec_timeout_id != 0) {
498 (void) untimeout(ncec->ncec_timeout_id);
499 ncec->ncec_timeout_id = 0;
500 }
501
502 mutex_enter(&ndp->ndp_g_lock);
503 if (ncec->ncec_ptpn == NULL) {
504 /*
505 * The last ndp walker has already removed this ncec from
506 * the list after we marked the ncec CONDEMNED and before
507 * we grabbed the global lock.
508 */
509 mutex_exit(&ndp->ndp_g_lock);
510 return;
511 }
512 if (ndp->ndp_g_walker > 0) {
513 /*
514 * Can't unlink. The walker will clean up
515 */
516 ndp->ndp_g_walker_cleanup = B_TRUE;
517 mutex_exit(&ndp->ndp_g_lock);
518 return;
519 }
520
521 /*
522 * Now remove the ncec from the list. nce_restart_timer won't restart
523 * the timer since it is marked CONDEMNED.
524 */
525 ptpn = ncec->ncec_ptpn;
526 ncec1 = ncec->ncec_next;
527 if (ncec1 != NULL)
528 ncec1->ncec_ptpn = ptpn;
529 *ptpn = ncec1;
530 ncec->ncec_ptpn = NULL;
531 ncec->ncec_next = NULL;
532 mutex_exit(&ndp->ndp_g_lock);
533
534 /* Removed from ncec_ptpn/ncec_next list */
535 ncec_refrele_notr(ncec);
536 }
537
538 void
539 ncec_inactive(ncec_t *ncec)
540 {
541 mblk_t **mpp;
542 ill_t *ill = ncec->ncec_ill;
543 ip_stack_t *ipst = ncec->ncec_ipst;
544
545 ASSERT(ncec->ncec_refcnt == 0);
546 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
547
548 /* Count how many condemned nces for kmem_cache callback */
549 if (NCE_ISCONDEMNED(ncec))
550 atomic_add_32(&ipst->ips_num_nce_condemned, -1);
551
552 /* Free all allocated messages */
553 mpp = &ncec->ncec_qd_mp;
554 while (*mpp != NULL) {
555 mblk_t *mp;
556
557 mp = *mpp;
558 *mpp = mp->b_next;
559
560 inet_freemsg(mp);
561 }
562 /*
563 * must have been cleaned up in ncec_delete
564 */
565 ASSERT(list_is_empty(&ncec->ncec_cb));
566 list_destroy(&ncec->ncec_cb);
567 /*
568 * free the ncec_lladdr if one was allocated in nce_add_common()
569 */
570 if (ncec->ncec_lladdr_length > 0)
571 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
572
573 #ifdef DEBUG
574 ncec_trace_cleanup(ncec);
575 #endif
576
577 mutex_enter(&ill->ill_lock);
578 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
579 (char *), "ncec", (void *), ncec);
580 ill->ill_ncec_cnt--;
581 ncec->ncec_ill = NULL;
582 /*
583 * If the number of ncec's associated with this ill have dropped
584 * to zero, check whether we need to restart any operation that
585 * is waiting for this to happen.
586 */
587 if (ILL_DOWN_OK(ill)) {
588 /* ipif_ill_refrele_tail drops the ill_lock */
589 ipif_ill_refrele_tail(ill);
590 } else {
591 mutex_exit(&ill->ill_lock);
592 }
593
594 mutex_destroy(&ncec->ncec_lock);
595 kmem_cache_free(ncec_cache, ncec);
596 }
597
598 /*
599 * ncec_walk routine. Delete the ncec if it is associated with the ill
600 * that is going away. Always called as a writer.
601 */
602 void
603 ncec_delete_per_ill(ncec_t *ncec, void *arg)
604 {
605 if ((ncec != NULL) && ncec->ncec_ill == arg) {
606 ncec_delete(ncec);
607 }
608 }
609
610 /*
611 * Neighbor Cache cleanup logic for a list of ncec_t entries.
612 */
613 static void
614 nce_cleanup_list(ncec_t *ncec)
615 {
616 ncec_t *ncec_next;
617
618 ASSERT(ncec != NULL);
619 while (ncec != NULL) {
620 ncec_next = ncec->ncec_next;
621 ncec->ncec_next = NULL;
622
623 /*
624 * It is possible for the last ndp walker (this thread)
625 * to come here after ncec_delete has marked the ncec CONDEMNED
626 * and before it has removed the ncec from the fastpath list
627 * or called untimeout. So we need to do it here. It is safe
628 * for both ncec_delete and this thread to do it twice or
629 * even simultaneously since each of the threads has a
630 * reference on the ncec.
631 */
632 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
633 /*
634 * Cancel any running timer. Timeout can't be restarted
635 * since CONDEMNED is set. The ncec_lock can't be
636 * held across untimeout though passing invalid timeout
637 * id is fine.
638 */
639 if (ncec->ncec_timeout_id != 0) {
640 (void) untimeout(ncec->ncec_timeout_id);
641 ncec->ncec_timeout_id = 0;
642 }
643 /* Removed from ncec_ptpn/ncec_next list */
644 ncec_refrele_notr(ncec);
645 ncec = ncec_next;
646 }
647 }
648
649 /*
650 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted.
651 */
652 boolean_t
653 nce_restart_dad(ncec_t *ncec)
654 {
655 boolean_t started;
656 ill_t *ill, *hwaddr_ill;
657
658 if (ncec == NULL)
659 return (B_FALSE);
660 ill = ncec->ncec_ill;
661 mutex_enter(&ncec->ncec_lock);
662 if (ncec->ncec_state == ND_PROBE) {
663 mutex_exit(&ncec->ncec_lock);
664 started = B_TRUE;
665 } else if (ncec->ncec_state == ND_REACHABLE) {
666 ASSERT(ncec->ncec_lladdr != NULL);
667 ncec->ncec_state = ND_PROBE;
668 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
669 /*
670 * Slight cheat here: we don't use the initial probe delay
671 * for IPv4 in this obscure case.
672 */
673 mutex_exit(&ncec->ncec_lock);
674 if (IS_IPMP(ill)) {
675 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
676 ncec->ncec_lladdr, ncec->ncec_lladdr_length);
677 } else {
678 hwaddr_ill = ill;
679 }
680 nce_dad(ncec, hwaddr_ill, B_TRUE);
681 started = B_TRUE;
682 } else {
683 mutex_exit(&ncec->ncec_lock);
684 started = B_FALSE;
685 }
686 return (started);
687 }
688
689 /*
690 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed.
691 * If one is found, the refcnt on the ncec will be incremented.
692 */
693 ncec_t *
694 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
695 {
696 ncec_t *ncec;
697 ip_stack_t *ipst = ill->ill_ipst;
698
699 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
700 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
701
702 /* Get head of v6 hash table */
703 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
704 ncec = ncec_lookup_illgrp(ill, addr, ncec);
705 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
706 rw_exit(&ipst->ips_ill_g_lock);
707 return (ncec);
708 }
709 /*
710 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed.
711 * If one is found, the refcnt on the ncec will be incremented.
712 */
713 ncec_t *
714 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
715 {
716 ncec_t *ncec = NULL;
717 in6_addr_t addr6;
718 ip_stack_t *ipst = ill->ill_ipst;
719
720 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
721 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
722
723 /* Get head of v4 hash table */
724 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
725 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
726 ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
727 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
728 rw_exit(&ipst->ips_ill_g_lock);
729 return (ncec);
730 }
731
732 /*
733 * Cache entry lookup. Try to find an ncec matching the parameters passed.
734 * If an ncec is found, increment the hold count on that ncec.
735 * The caller passes in the start of the appropriate hash table, and must
736 * be holding the appropriate global lock (ndp_g_lock). In addition, since
737 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
738 * must be held as reader.
739 *
740 * This function always matches across the ipmp group.
741 */
742 ncec_t *
743 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
744 {
745 ndp_g_t *ndp;
746 ip_stack_t *ipst = ill->ill_ipst;
747
748 if (ill->ill_isv6)
749 ndp = ipst->ips_ndp6;
750 else
751 ndp = ipst->ips_ndp4;
752
753 ASSERT(ill != NULL);
754 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
755 if (IN6_IS_ADDR_UNSPECIFIED(addr))
756 return (NULL);
757 for (; ncec != NULL; ncec = ncec->ncec_next) {
758 if (ncec->ncec_ill == ill ||
759 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
760 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
761 mutex_enter(&ncec->ncec_lock);
762 if (!NCE_ISCONDEMNED(ncec)) {
763 ncec_refhold_locked(ncec);
764 mutex_exit(&ncec->ncec_lock);
765 break;
766 }
767 mutex_exit(&ncec->ncec_lock);
768 }
769 }
770 }
771 return (ncec);
772 }
773
774 /*
775 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
776 * entries for ill only, i.e., when ill is part of an ipmp group,
777 * nce_lookup_v4 will never try to match across the group.
778 */
779 nce_t *
780 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
781 {
782 nce_t *nce;
783 in6_addr_t addr6;
784 ip_stack_t *ipst = ill->ill_ipst;
785
786 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
787 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
788 nce = nce_lookup_addr(ill, &addr6);
789 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
790 return (nce);
791 }
792
793 /*
794 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
795 * entries for ill only, i.e., when ill is part of an ipmp group,
796 * nce_lookup_v6 will never try to match across the group.
797 */
798 nce_t *
799 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
800 {
801 nce_t *nce;
802 ip_stack_t *ipst = ill->ill_ipst;
803
804 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
805 nce = nce_lookup_addr(ill, addr6);
806 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
807 return (nce);
808 }
809
810 static nce_t *
811 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
812 {
813 nce_t *nce;
814
815 ASSERT(ill != NULL);
816 #ifdef DEBUG
817 if (ill->ill_isv6)
818 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
819 else
820 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
821 #endif
822 mutex_enter(&ill->ill_lock);
823 nce = nce_lookup(ill, addr);
824 mutex_exit(&ill->ill_lock);
825 return (nce);
826 }
827
828
829 /*
830 * Router turned to host. We need to make sure that cached copies of the ncec
831 * are not used for forwarding packets if they were derived from the default
832 * route, and that the default route itself is removed, as required by
833 * section 7.2.5 of RFC 2461.
834 *
835 * Note that the ncec itself probably has valid link-layer information for the
836 * nexthop, so that there is no reason to delete the ncec, as long as the
837 * ISROUTER flag is turned off.
838 */
839 static void
840 ncec_router_to_host(ncec_t *ncec)
841 {
842 ire_t *ire;
843 ip_stack_t *ipst = ncec->ncec_ipst;
844
845 mutex_enter(&ncec->ncec_lock);
846 ncec->ncec_flags &= ~NCE_F_ISROUTER;
847 mutex_exit(&ncec->ncec_lock);
848
849 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
850 &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
851 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
852 if (ire != NULL) {
853 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
854 ire_delete(ire);
855 ire_refrele(ire);
856 }
857 }
858
859 /*
860 * Process passed in parameters either from an incoming packet or via
861 * user ioctl.
862 */
863 void
864 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
865 {
866 ill_t *ill = ncec->ncec_ill;
867 uint32_t hw_addr_len = ill->ill_phys_addr_length;
868 boolean_t ll_updated = B_FALSE;
869 boolean_t ll_changed;
870 nce_t *nce;
871
872 ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
873 /*
874 * No updates of link layer address or the neighbor state is
875 * allowed, when the cache is in NONUD state. This still
876 * allows for responding to reachability solicitation.
877 */
878 mutex_enter(&ncec->ncec_lock);
879 if (ncec->ncec_state == ND_INCOMPLETE) {
880 if (hw_addr == NULL) {
881 mutex_exit(&ncec->ncec_lock);
882 return;
883 }
884 nce_set_ll(ncec, hw_addr);
885 /*
886 * Update ncec state and send the queued packets
887 * back to ip this time ire will be added.
888 */
889 if (flag & ND_NA_FLAG_SOLICITED) {
890 nce_update(ncec, ND_REACHABLE, NULL);
891 } else {
892 nce_update(ncec, ND_STALE, NULL);
893 }
894 mutex_exit(&ncec->ncec_lock);
895 nce = nce_fastpath(ncec, B_TRUE, NULL);
896 nce_resolv_ok(ncec);
897 if (nce != NULL)
898 nce_refrele(nce);
899 return;
900 }
901 ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
902 if (!is_adv) {
903 /* If this is a SOLICITATION request only */
904 if (ll_changed)
905 nce_update(ncec, ND_STALE, hw_addr);
906 mutex_exit(&ncec->ncec_lock);
907 ncec_cb_dispatch(ncec);
908 return;
909 }
910 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
911 /* If in any other state than REACHABLE, ignore */
912 if (ncec->ncec_state == ND_REACHABLE) {
913 nce_update(ncec, ND_STALE, NULL);
914 }
915 mutex_exit(&ncec->ncec_lock);
916 ncec_cb_dispatch(ncec);
917 return;
918 } else {
919 if (ll_changed) {
920 nce_update(ncec, ND_UNCHANGED, hw_addr);
921 ll_updated = B_TRUE;
922 }
923 if (flag & ND_NA_FLAG_SOLICITED) {
924 nce_update(ncec, ND_REACHABLE, NULL);
925 } else {
926 if (ll_updated) {
927 nce_update(ncec, ND_STALE, NULL);
928 }
929 }
930 mutex_exit(&ncec->ncec_lock);
931 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
932 NCE_F_ISROUTER)) {
933 ncec_router_to_host(ncec);
934 } else {
935 ncec_cb_dispatch(ncec);
936 }
937 }
938 }
939
940 /*
941 * Pass arg1 to the cbf supplied, along with each ncec in existence.
942 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
943 * walking the hash list.
944 */
945 void
946 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
947 void *arg1, boolean_t trace)
948 {
949 ncec_t *ncec;
950 ncec_t *ncec1;
951 ncec_t **ncep;
952 ncec_t *free_nce_list = NULL;
953
954 mutex_enter(&ndp->ndp_g_lock);
955 /* Prevent ncec_delete from unlink and free of NCE */
956 ndp->ndp_g_walker++;
957 mutex_exit(&ndp->ndp_g_lock);
958 for (ncep = ndp->nce_hash_tbl;
959 ncep < A_END(ndp->nce_hash_tbl); ncep++) {
960 for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
961 ncec1 = ncec->ncec_next;
962 if (ill == NULL || ncec->ncec_ill == ill) {
963 if (trace) {
964 ncec_refhold(ncec);
965 (*cbf)(ncec, arg1);
966 ncec_refrele(ncec);
967 } else {
968 ncec_refhold_notr(ncec);
969 (*cbf)(ncec, arg1);
970 ncec_refrele_notr(ncec);
971 }
972 }
973 }
974 }
975 mutex_enter(&ndp->ndp_g_lock);
976 ndp->ndp_g_walker--;
977 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
978 /* Time to delete condemned entries */
979 for (ncep = ndp->nce_hash_tbl;
980 ncep < A_END(ndp->nce_hash_tbl); ncep++) {
981 ncec = *ncep;
982 if (ncec != NULL) {
983 nce_remove(ndp, ncec, &free_nce_list);
984 }
985 }
986 ndp->ndp_g_walker_cleanup = B_FALSE;
987 }
988
989 mutex_exit(&ndp->ndp_g_lock);
990
991 if (free_nce_list != NULL) {
992 nce_cleanup_list(free_nce_list);
993 }
994 }
995
996 /*
997 * Walk everything.
998 * Note that ill can be NULL hence can't derive the ipst from it.
999 */
1000 void
1001 ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
1002 {
1003 ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1004 ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1005 }
1006
1007 /*
1008 * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast
1009 * NCEs, and the number to reclaim if we hit the limit. Used by
1010 * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
1011 * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
1012 */
1013
1014 /* Maximum number of multicast NCEs on an ill. */
1015 uint_t ip_max_ill_mcast_nces = 16384;
1016 /*
1017 * Number of NCEs to delete if we hit the maximum above. 0 means *don't* and
1018 * return an error. Non-zero means delete so many, and if the number is >=
1019 * the max above, that means delete them all.
1020 */
1021 uint_t ip_ill_mcast_reclaim = 256;
1022
1023 /*
1024 * Encapsulate multicast ill capping in a function, for easier DTrace
1025 * detections. Return a list of refheld NCEs to destroy-via-refrele. That
1026 * list can be NULL, but can only be non-NULL if we successfully reclaimed.
1027 *
1028 * NOTE: This function must be called while holding the ill_lock AND
1029 * JUST PRIOR to making the insertion into the ill_nce list.
1030 *
1031 * We can't release the ones we delete ourselves because the ill_lock is held
1032 * by the caller. They are, instead, passed back in a list_t for deletion
1033 * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
1034 *
1035 * While this covers nce_t, ncec_t gets done even further down the road. See
1036 * nce_graveyard_free() for why.
1037 */
1038 static boolean_t
1039 nce_too_many_mcast(ill_t *ill, list_t *graveyard)
1040 {
1041 uint_t reclaim_count, max_count, reclaimed = 0;
1042 boolean_t too_many;
1043 nce_t *nce, *deadman;
1044
1045 ASSERT(graveyard != NULL);
1046 ASSERT(list_is_empty(graveyard));
1047 ASSERT(MUTEX_HELD(&ill->ill_lock));
1048
1049 /*
1050 * NOTE: Some grinning weirdo may have lowered the global max beyond
1051 * what this ill currently has. The behavior in this case will be
1052 * trim-back just by the reclaim amount for any new ones.
1053 */
1054 max_count = ip_max_ill_mcast_nces;
1055 reclaim_count = min(ip_ill_mcast_reclaim, max_count);
1056
1057 /* All good? */
1058 if (ill->ill_mcast_nces < max_count)
1059 return (B_FALSE); /* Yes, all good. */
1060
1061 if (reclaim_count == 0)
1062 return (B_TRUE); /* Don't bother - we're stuck. */
1063
1064 /* We need to reclaim now. Exploit our held ill_lock. */
1065
1066 /*
1067 * Start at the tail and work backwards, new nces are head-inserted,
1068 * so we'll be reaping the oldest entries.
1069 */
1070 nce = list_tail(&ill->ill_nce);
1071 while (reclaimed < reclaim_count) {
1072 /* Skip ahead to a multicast NCE. */
1073 while (nce != NULL &&
1074 (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
1075 nce = list_prev(&ill->ill_nce, nce);
1076 }
1077 if (nce == NULL)
1078 break;
1079
1080 /*
1081 * NOTE: For now, we just delete the first one(s) we find.
1082 * This is not optimal, and may require some inspection of nce
1083 * & its ncec to be better.
1084 */
1085 deadman = nce;
1086 nce = list_prev(&ill->ill_nce, nce);
1087
1088 /* nce_delete() requires caller holds... */
1089 nce_refhold(deadman);
1090 nce_delete(deadman); /* Bumps down ill_mcast_nces. */
1091
1092 /* Link the dead ones singly, still refheld... */
1093 list_insert_tail(graveyard, deadman);
1094 reclaimed++;
1095 }
1096
1097 if (reclaimed != reclaim_count) {
1098 /* We didn't have enough to reach reclaim_count. Why?!? */
1099 DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
1100 uint_t, reclaimed, uint_t, reclaim_count);
1101
1102 /* In case for some REALLY weird reason we found none! */
1103 too_many = (reclaimed == 0);
1104 } else {
1105 too_many = B_FALSE;
1106 }
1107
1108 return (too_many);
1109 }
1110
1111 static void
1112 ncec_mcast_reap_one(ncec_t *ncec, void *arg)
1113 {
1114 boolean_t reapit;
1115 ill_t *ill = (ill_t *)arg;
1116
1117 /* Obvious no-lock-needed checks... */
1118 if (ncec == NULL || ncec->ncec_ill != ill ||
1119 (ncec->ncec_flags & NCE_F_MCAST) == 0)
1120 return;
1121
1122 mutex_enter(&ncec->ncec_lock);
1123 /*
1124 * It's refheld by the walk infrastructure. It has one reference for
1125 * being in the ndp_g_hash, and if an nce_t exists, that's one more.
1126 * We want ones without an nce_t, so 2 is the magic number. If it's
1127 * LESS than 2, we have much bigger problems anyway.
1128 */
1129 ASSERT(ncec->ncec_refcnt >= 2);
1130 reapit = (ncec->ncec_refcnt == 2);
1131 mutex_exit(&ncec->ncec_lock);
1132
1133 if (reapit) {
1134 IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
1135 ncec_delete(ncec);
1136 }
1137 }
1138
1139 /*
1140 * Attempt to reap stray multicast ncec_t structures left in the wake of
1141 * nce_graveyard_free(). This is a taskq servicing routine, as it's well
1142 * outside any netstack-global locks being held - ndp_g_lock in this case. We
1143 * have a reference hold on the ill, which will prevent any unplumbing races.
1144 */
1145 static void
1146 ncec_mcast_reap(void *arg)
1147 {
1148 ill_t *ill = (ill_t *)arg;
1149
1150 IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
1151 ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
1152 mutex_enter(&ill->ill_lock);
1153 ill->ill_mcast_ncec_cleanup = B_FALSE;
1154 /*
1155 * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
1156 * below for why.
1157 */
1158 ill->ill_refcnt--;
1159 if (ill->ill_refcnt == 0)
1160 ipif_ill_refrele_tail(ill); /* Drops ill_lock. */
1161 else
1162 mutex_exit(&ill->ill_lock);
1163 }
1164
1165 /*
1166 * Free a list (including handling an empty list or NULL list) of
1167 * reference-held NCEs that were reaped from a nce_too_many_mcast()
1168 * call. Separate because the caller must have dropped ndp_g_lock first.
1169 *
1170 * This also schedules a taskq task to unlink underlying NCECs from the
1171 * ndp_g_hash, which are protected by ndp_g_lock.
1172 */
1173 static void
1174 nce_graveyard_free(list_t *graveyard)
1175 {
1176 nce_t *deadman, *current;
1177 ill_t *ill;
1178 boolean_t doit;
1179
1180 if (graveyard == NULL)
1181 return;
1182
1183 current = list_head(graveyard);
1184 if (current == NULL) {
1185 list_destroy(graveyard);
1186 return;
1187 }
1188
1189 ill = current->nce_ill;
1190 /*
1191 * Normally one should ill_refhold(ill) here. There's no _notr()
1192 * variant like there is for ire_t, dce_t, or even ncec_t, but this is
1193 * the ONLY case that'll break the mh_trace that IP debugging uses for
1194 * reference counts (i.e. they assume same thread releases as
1195 * holds). Instead, we inline ill_refhold() here. We must do the same
1196 * in the release done by the ncec_mcast_reap() above.
1197 */
1198 mutex_enter(&ill->ill_lock);
1199 ill->ill_refcnt++;
1200 mutex_exit(&ill->ill_lock);
1201
1202 while (current != NULL) {
1203 ASSERT3P(ill, ==, current->nce_ill);
1204 deadman = current;
1205 current = list_next(graveyard, deadman);
1206 list_remove(graveyard, deadman);
1207 ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
1208 0);
1209 nce_refrele(deadman);
1210 }
1211 list_destroy(graveyard);
1212
1213 mutex_enter(&ill->ill_lock);
1214 if (ill->ill_mcast_ncec_cleanup)
1215 doit = B_FALSE;
1216 else {
1217 ill->ill_mcast_ncec_cleanup = B_TRUE;
1218 doit = B_TRUE;
1219 }
1220 mutex_exit(&ill->ill_lock);
1221 if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
1222 ill, TQ_NOSLEEP) == NULL) {
1223 mutex_enter(&ill->ill_lock);
1224 if (doit) {
1225 IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
1226 ill->ill_mcast_ncec_cleanup = B_FALSE;
1227 }
1228 /* There's no _notr() for ill_refrele(), so inline it here. */
1229 ill->ill_refcnt--;
1230 if (ill->ill_refcnt == 0)
1231 ipif_ill_refrele_tail(ill); /* Drops ill_lock */
1232 else
1233 mutex_exit(&ill->ill_lock);
1234 }
1235 }
1236
1237 /*
1238 * For each interface an entry is added for the unspecified multicast group.
1239 * Here that mapping is used to form the multicast cache entry for a particular
1240 * multicast destination.
1241 */
1242 static int
1243 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1244 uint16_t flags, nce_t **newnce)
1245 {
1246 uchar_t *hw_addr;
1247 int err = 0;
1248 ip_stack_t *ipst = ill->ill_ipst;
1249 nce_t *nce;
1250
1251 ASSERT(ill != NULL);
1252 ASSERT(ill->ill_isv6);
1253 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1254
1255 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1256 nce = nce_lookup_addr(ill, dst);
1257 if (nce != NULL) {
1258 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1259 goto done;
1260 }
1261 if (ill->ill_net_type == IRE_IF_RESOLVER) {
1262 /*
1263 * For IRE_IF_RESOLVER a hardware mapping can be
1264 * generated.
1265 */
1266 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1267 if (hw_addr == NULL) {
1268 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1269 return (ENOMEM);
1270 }
1271 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1272 } else {
1273 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1274 hw_addr = NULL;
1275 }
1276 ASSERT((flags & NCE_F_MCAST) != 0);
1277 ASSERT((flags & NCE_F_NONUD) != 0);
1278 /* nce_state will be computed by nce_add_common() */
1279 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1280 ND_UNCHANGED, &nce);
1281 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1282 if (err == 0)
1283 err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
1284 if (hw_addr != NULL)
1285 kmem_free(hw_addr, ill->ill_nd_lla_len);
1286 if (err != 0) {
1287 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1288 return (err);
1289 }
1290 done:
1291 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1292 if (newnce != NULL)
1293 *newnce = nce;
1294 else
1295 nce_refrele(nce);
1296 return (0);
1297 }
1298
1299 /*
1300 * Return the link layer address, and any flags of a ncec.
1301 */
1302 int
1303 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1304 {
1305 ncec_t *ncec;
1306 in6_addr_t *addr;
1307 sin6_t *sin6;
1308
1309 ASSERT(ill != NULL && ill->ill_isv6);
1310 sin6 = (sin6_t *)&lnr->lnr_addr;
1311 addr = &sin6->sin6_addr;
1312
1313 /*
1314 * NOTE: if the ill is an IPMP interface, then match against the whole
1315 * illgrp. This e.g. allows in.ndpd to retrieve the link layer
1316 * addresses for the data addresses on an IPMP interface even though
1317 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1318 */
1319 ncec = ncec_lookup_illgrp_v6(ill, addr);
1320 if (ncec == NULL)
1321 return (ESRCH);
1322 /* If no link layer address is available yet, return ESRCH */
1323 if (!NCE_ISREACHABLE(ncec)) {
1324 ncec_refrele(ncec);
1325 return (ESRCH);
1326 }
1327 lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1328 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1329 lnr->lnr_hdw_len);
1330 if (ncec->ncec_flags & NCE_F_ISROUTER)
1331 lnr->lnr_flags = NDF_ISROUTER_ON;
1332 if (ncec->ncec_flags & NCE_F_ANYCAST)
1333 lnr->lnr_flags |= NDF_ANYCAST_ON;
1334 if (ncec->ncec_flags & NCE_F_STATIC)
1335 lnr->lnr_flags |= NDF_STATIC;
1336 ncec_refrele(ncec);
1337 return (0);
1338 }
1339
1340 /*
1341 * Finish setting up the Enable/Disable multicast for the driver.
1342 */
1343 mblk_t *
1344 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1345 uint32_t hw_addr_offset, mblk_t *mp)
1346 {
1347 uchar_t *hw_addr;
1348 ipaddr_t v4group;
1349 uchar_t *addr;
1350
1351 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1352 if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1353 IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1354
1355 ASSERT(CLASSD(v4group));
1356 ASSERT(!(ill->ill_isv6));
1357
1358 addr = (uchar_t *)&v4group;
1359 } else {
1360 ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1361 ASSERT(ill->ill_isv6);
1362
1363 addr = (uchar_t *)v6group;
1364 }
1365 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1366 if (hw_addr == NULL) {
1367 ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1368 freemsg(mp);
1369 return (NULL);
1370 }
1371
1372 ip_mcast_mapping(ill, addr, hw_addr);
1373 return (mp);
1374 }
1375
1376 void
1377 ip_ndp_resolve(ncec_t *ncec)
1378 {
1379 in_addr_t sender4 = INADDR_ANY;
1380 in6_addr_t sender6 = ipv6_all_zeros;
1381 ill_t *src_ill;
1382 uint32_t ms;
1383
1384 src_ill = nce_resolve_src(ncec, &sender6);
1385 if (src_ill == NULL) {
1386 /* Make sure we try again later */
1387 ms = ncec->ncec_ill->ill_reachable_retrans_time;
1388 nce_restart_timer(ncec, (clock_t)ms);
1389 return;
1390 }
1391 if (ncec->ncec_ipversion == IPV4_VERSION)
1392 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1393 mutex_enter(&ncec->ncec_lock);
1394 if (ncec->ncec_ipversion == IPV6_VERSION)
1395 ms = ndp_solicit(ncec, sender6, src_ill);
1396 else
1397 ms = arp_request(ncec, sender4, src_ill);
1398 mutex_exit(&ncec->ncec_lock);
1399 if (ms == 0) {
1400 if (ncec->ncec_state != ND_REACHABLE) {
1401 if (ncec->ncec_ipversion == IPV6_VERSION)
1402 ndp_resolv_failed(ncec);
1403 else
1404 arp_resolv_failed(ncec);
1405 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1406 nce_make_unreachable(ncec);
1407 ncec_delete(ncec);
1408 }
1409 } else {
1410 nce_restart_timer(ncec, (clock_t)ms);
1411 }
1412 done:
1413 ill_refrele(src_ill);
1414 }
1415
1416 /*
1417 * Send an IPv6 neighbor solicitation.
1418 * Returns number of milliseconds after which we should either rexmit or abort.
1419 * Return of zero means we should abort.
1420 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1421 * The optional source address is used as a hint to ndp_solicit for
1422 * which source to use in the packet.
1423 *
1424 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1425 * the packet.
1426 */
1427 uint32_t
1428 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1429 {
1430 in6_addr_t dst;
1431 boolean_t dropped = B_FALSE;
1432
1433 ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1434 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1435
1436 if (ncec->ncec_rcnt == 0)
1437 return (0);
1438
1439 dst = ncec->ncec_addr;
1440 ncec->ncec_rcnt--;
1441 mutex_exit(&ncec->ncec_lock);
1442 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1443 ill->ill_phys_addr_length, &src, &dst, 0);
1444 mutex_enter(&ncec->ncec_lock);
1445 if (dropped)
1446 ncec->ncec_rcnt++;
1447 return (ncec->ncec_ill->ill_reachable_retrans_time);
1448 }
1449
1450 /*
1451 * Attempt to recover an address on an interface that's been marked as a
1452 * duplicate. Because NCEs are destroyed when the interface goes down, there's
1453 * no easy way to just probe the address and have the right thing happen if
1454 * it's no longer in use. Instead, we just bring it up normally and allow the
1455 * regular interface start-up logic to probe for a remaining duplicate and take
1456 * us back down if necessary.
1457 * Neither DHCP nor temporary addresses arrive here; they're excluded by
1458 * ip_ndp_excl.
1459 */
1460 /* ARGSUSED */
1461 void
1462 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1463 {
1464 ill_t *ill = rq->q_ptr;
1465 ipif_t *ipif;
1466 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1467 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1468 boolean_t addr_equal;
1469
1470 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1471 /*
1472 * We do not support recovery of proxy ARP'd interfaces,
1473 * because the system lacks a complete proxy ARP mechanism.
1474 */
1475 if (ill->ill_isv6) {
1476 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1477 addr6);
1478 } else {
1479 addr_equal = (ipif->ipif_lcl_addr == *addr4);
1480 }
1481
1482 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1483 continue;
1484
1485 /*
1486 * If we have already recovered or if the interface is going
1487 * away, then ignore.
1488 */
1489 mutex_enter(&ill->ill_lock);
1490 if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1491 (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1492 mutex_exit(&ill->ill_lock);
1493 continue;
1494 }
1495
1496 ipif->ipif_flags &= ~IPIF_DUPLICATE;
1497 ill->ill_ipif_dup_count--;
1498 mutex_exit(&ill->ill_lock);
1499 ipif->ipif_was_dup = B_TRUE;
1500
1501 if (ill->ill_isv6) {
1502 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1503 (void) ipif_up_done_v6(ipif);
1504 } else {
1505 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1506 EINPROGRESS);
1507 (void) ipif_up_done(ipif);
1508 }
1509 }
1510 freeb(mp);
1511 }
1512
1513 /*
1514 * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1515 * As long as someone else holds the address, the interface will stay down.
1516 * When that conflict goes away, the interface is brought back up. This is
1517 * done so that accidental shutdowns of addresses aren't made permanent. Your
1518 * server will recover from a failure.
1519 *
1520 * For DHCP and temporary addresses, recovery is not done in the kernel.
1521 * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1522 *
1523 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1524 */
1525 void
1526 ipif_dup_recovery(void *arg)
1527 {
1528 ipif_t *ipif = arg;
1529
1530 ipif->ipif_recovery_id = 0;
1531 if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1532 return;
1533
1534 /*
1535 * No lock, because this is just an optimization.
1536 */
1537 if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1538 return;
1539
1540 /* If the link is down, we'll retry this later */
1541 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1542 return;
1543
1544 ipif_do_recovery(ipif);
1545 }
1546
1547 /*
1548 * Perform interface recovery by forcing the duplicate interfaces up and
1549 * allowing the system to determine which ones should stay up.
1550 *
1551 * Called both by recovery timer expiry and link-up notification.
1552 */
1553 void
1554 ipif_do_recovery(ipif_t *ipif)
1555 {
1556 ill_t *ill = ipif->ipif_ill;
1557 mblk_t *mp;
1558 ip_stack_t *ipst = ill->ill_ipst;
1559 size_t mp_size;
1560
1561 if (ipif->ipif_isv6)
1562 mp_size = sizeof (ipif->ipif_v6lcl_addr);
1563 else
1564 mp_size = sizeof (ipif->ipif_lcl_addr);
1565 mp = allocb(mp_size, BPRI_MED);
1566 if (mp == NULL) {
1567 mutex_enter(&ill->ill_lock);
1568 if (ipst->ips_ip_dup_recovery > 0 &&
1569 ipif->ipif_recovery_id == 0 &&
1570 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1571 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1572 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1573 }
1574 mutex_exit(&ill->ill_lock);
1575 } else {
1576 /*
1577 * A recovery timer may still be running if we got here from
1578 * ill_restart_dad(); cancel that timer.
1579 */
1580 if (ipif->ipif_recovery_id != 0)
1581 (void) untimeout(ipif->ipif_recovery_id);
1582 ipif->ipif_recovery_id = 0;
1583
1584 if (ipif->ipif_isv6) {
1585 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1586 sizeof (ipif->ipif_v6lcl_addr));
1587 } else {
1588 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1589 sizeof (ipif->ipif_lcl_addr));
1590 }
1591 ill_refhold(ill);
1592 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1593 B_FALSE);
1594 }
1595 }
1596
1597 /*
1598 * Find the MAC and IP addresses in an NA/NS message.
1599 */
1600 static void
1601 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1602 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1603 {
1604 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1605 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1606 uchar_t *addr;
1607 int alen;
1608
1609 /* icmp_inbound_v6 ensures this */
1610 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1611
1612 addr = ira->ira_l2src;
1613 alen = ill->ill_phys_addr_length;
1614 if (alen > 0) {
1615 *haddr = addr;
1616 *haddrlenp = alen;
1617 } else {
1618 *haddr = NULL;
1619 *haddrlenp = 0;
1620 }
1621
1622 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1623 *targp = ns->nd_ns_target;
1624 }
1625
1626 /*
1627 * This is for exclusive changes due to NDP duplicate address detection
1628 * failure.
1629 */
1630 /* ARGSUSED */
1631 static void
1632 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1633 {
1634 ill_t *ill = rq->q_ptr;
1635 ipif_t *ipif;
1636 uchar_t *haddr;
1637 uint_t haddrlen;
1638 ip_stack_t *ipst = ill->ill_ipst;
1639 in6_addr_t targ;
1640 ip_recv_attr_t iras;
1641 mblk_t *attrmp;
1642
1643 attrmp = mp;
1644 mp = mp->b_cont;
1645 attrmp->b_cont = NULL;
1646 if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1647 /* The ill or ip_stack_t disappeared on us */
1648 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1649 ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1650 freemsg(mp);
1651 ira_cleanup(&iras, B_TRUE);
1652 return;
1653 }
1654
1655 ASSERT(ill == iras.ira_rill);
1656
1657 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1658 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1659 /*
1660 * Ignore conflicts generated by misbehaving switches that
1661 * just reflect our own messages back to us. For IPMP, we may
1662 * see reflections across any ill in the illgrp.
1663 *
1664 * RFC2462 and revisions tried to detect both the case
1665 * when a statically configured IPv6 address is a duplicate,
1666 * and the case when the L2 address itself is a duplicate. The
1667 * later is important because, with stateles address autoconf,
1668 * if the L2 address is a duplicate, the resulting IPv6
1669 * address(es) would also be duplicates. We rely on DAD of the
1670 * IPv6 address itself to detect the latter case.
1671 */
1672 /* For an under ill_grp can change under lock */
1673 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1674 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1675 IS_UNDER_IPMP(ill) &&
1676 ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1677 haddrlen) != NULL) {
1678 rw_exit(&ipst->ips_ill_g_lock);
1679 goto ignore_conflict;
1680 }
1681 rw_exit(&ipst->ips_ill_g_lock);
1682 }
1683
1684 /*
1685 * Look up the appropriate ipif.
1686 */
1687 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1688 if (ipif == NULL)
1689 goto ignore_conflict;
1690
1691 /* Reload the ill to match the ipif */
1692 ill = ipif->ipif_ill;
1693
1694 /* If it's already duplicate or ineligible, then don't do anything. */
1695 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1696 ipif_refrele(ipif);
1697 goto ignore_conflict;
1698 }
1699
1700 /*
1701 * If this is a failure during duplicate recovery, then don't
1702 * complain. It may take a long time to recover.
1703 */
1704 if (!ipif->ipif_was_dup) {
1705 char ibuf[LIFNAMSIZ];
1706 char hbuf[MAC_STR_LEN];
1707 char sbuf[INET6_ADDRSTRLEN];
1708
1709 ipif_get_name(ipif, ibuf, sizeof (ibuf));
1710 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1711 " disabled", ibuf,
1712 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1713 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1714 }
1715 mutex_enter(&ill->ill_lock);
1716 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1717 ipif->ipif_flags |= IPIF_DUPLICATE;
1718 ill->ill_ipif_dup_count++;
1719 mutex_exit(&ill->ill_lock);
1720 (void) ipif_down(ipif, NULL, NULL);
1721 (void) ipif_down_tail(ipif);
1722 mutex_enter(&ill->ill_lock);
1723 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1724 ill->ill_net_type == IRE_IF_RESOLVER &&
1725 !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1726 ipst->ips_ip_dup_recovery > 0) {
1727 ASSERT(ipif->ipif_recovery_id == 0);
1728 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1729 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1730 }
1731 mutex_exit(&ill->ill_lock);
1732 ipif_refrele(ipif);
1733
1734 ignore_conflict:
1735 freemsg(mp);
1736 ira_cleanup(&iras, B_TRUE);
1737 }
1738
1739 /*
1740 * Handle failure by tearing down the ipifs with the specified address. Note
1741 * that tearing down the ipif also means deleting the ncec through ipif_down, so
1742 * it's not possible to do recovery by just restarting the ncec timer. Instead,
1743 * we start a timer on the ipif.
1744 * Caller has to free mp;
1745 */
1746 static void
1747 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1748 {
1749 const uchar_t *haddr;
1750 ill_t *ill = ira->ira_rill;
1751
1752 /*
1753 * Ignore conflicts generated by misbehaving switches that just
1754 * reflect our own messages back to us.
1755 */
1756
1757 /* icmp_inbound_v6 ensures this */
1758 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1759 haddr = ira->ira_l2src;
1760 if (haddr != NULL &&
1761 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1762 return;
1763 }
1764
1765 if ((mp = copymsg(mp)) != NULL) {
1766 mblk_t *attrmp;
1767
1768 attrmp = ip_recv_attr_to_mblk(ira);
1769 if (attrmp == NULL) {
1770 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1771 ip_drop_input("ipIfStatsInDiscards", mp, ill);
1772 freemsg(mp);
1773 } else {
1774 ASSERT(attrmp->b_cont == NULL);
1775 attrmp->b_cont = mp;
1776 mp = attrmp;
1777 ill_refhold(ill);
1778 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1779 B_FALSE);
1780 }
1781 }
1782 }
1783
1784 /*
1785 * Handle a discovered conflict: some other system is advertising that it owns
1786 * one of our IP addresses. We need to defend ourselves, or just shut down the
1787 * interface.
1788 *
1789 * Handles both IPv4 and IPv6
1790 */
1791 boolean_t
1792 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1793 {
1794 ipif_t *ipif;
1795 clock_t now;
1796 uint_t maxdefense;
1797 uint_t defs;
1798 ill_t *ill = ira->ira_ill;
1799 ip_stack_t *ipst = ill->ill_ipst;
1800 uint32_t elapsed;
1801 boolean_t isv6 = ill->ill_isv6;
1802 ipaddr_t ncec_addr;
1803
1804 if (isv6) {
1805 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1806 ipst);
1807 } else {
1808 if (arp_no_defense) {
1809 /*
1810 * Yes, there is a conflict, but no, we do not
1811 * defend ourself.
1812 */
1813 return (B_TRUE);
1814 }
1815 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1816 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1817 ipst);
1818 }
1819 if (ipif == NULL)
1820 return (B_FALSE);
1821
1822 /*
1823 * First, figure out if this address is disposable.
1824 */
1825 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1826 maxdefense = ipst->ips_ip_max_temp_defend;
1827 else
1828 maxdefense = ipst->ips_ip_max_defend;
1829
1830 /*
1831 * Now figure out how many times we've defended ourselves. Ignore
1832 * defenses that happened long in the past.
1833 */
1834 now = ddi_get_lbolt();
1835 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1836 mutex_enter(&ncec->ncec_lock);
1837 if ((defs = ncec->ncec_defense_count) > 0 &&
1838 elapsed > ipst->ips_ip_defend_interval) {
1839 /*
1840 * ip_defend_interval has elapsed.
1841 * reset the defense count.
1842 */
1843 ncec->ncec_defense_count = defs = 0;
1844 }
1845 ncec->ncec_defense_count++;
1846 ncec->ncec_last_time_defended = now;
1847 mutex_exit(&ncec->ncec_lock);
1848 ipif_refrele(ipif);
1849
1850 /*
1851 * If we've defended ourselves too many times already, then give up and
1852 * tear down the interface(s) using this address.
1853 * Otherwise, caller has to defend by sending out an announce.
1854 */
1855 if (defs >= maxdefense) {
1856 if (isv6)
1857 ndp_failure(mp, ira);
1858 else
1859 arp_failure(mp, ira);
1860 } else {
1861 return (B_TRUE); /* caller must defend this address */
1862 }
1863 return (B_FALSE);
1864 }
1865
1866 /*
1867 * Handle reception of Neighbor Solicitation messages.
1868 */
1869 static void
1870 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1871 {
1872 ill_t *ill = ira->ira_ill, *under_ill;
1873 nd_neighbor_solicit_t *ns;
1874 uint32_t hlen = ill->ill_phys_addr_length;
1875 uchar_t *haddr = NULL;
1876 icmp6_t *icmp_nd;
1877 ip6_t *ip6h;
1878 ncec_t *our_ncec = NULL;
1879 in6_addr_t target;
1880 in6_addr_t src;
1881 int len;
1882 int flag = 0;
1883 nd_opt_hdr_t *opt = NULL;
1884 boolean_t bad_solicit = B_FALSE;
1885 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1886 boolean_t need_ill_refrele = B_FALSE;
1887
1888 ip6h = (ip6_t *)mp->b_rptr;
1889 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1890 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1891 src = ip6h->ip6_src;
1892 ns = (nd_neighbor_solicit_t *)icmp_nd;
1893 target = ns->nd_ns_target;
1894 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1895 IN6_IS_ADDR_LOOPBACK(&target)) {
1896 if (ip_debug > 2) {
1897 /* ip1dbg */
1898 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1899 AF_INET6, &target);
1900 }
1901 bad_solicit = B_TRUE;
1902 goto done;
1903 }
1904 if (len > sizeof (nd_neighbor_solicit_t)) {
1905 /* Options present */
1906 opt = (nd_opt_hdr_t *)&ns[1];
1907 len -= sizeof (nd_neighbor_solicit_t);
1908 if (!ndp_verify_optlen(opt, len)) {
1909 ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1910 bad_solicit = B_TRUE;
1911 goto done;
1912 }
1913 }
1914 if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1915 /* Check to see if this is a valid DAD solicitation */
1916 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1917 if (ip_debug > 2) {
1918 /* ip1dbg */
1919 pr_addr_dbg("ndp_input_solicit: IPv6 "
1920 "Destination is not solicited node "
1921 "multicast %s\n", AF_INET6,
1922 &ip6h->ip6_dst);
1923 }
1924 bad_solicit = B_TRUE;
1925 goto done;
1926 }
1927 }
1928
1929 /*
1930 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1931 * received this packet if it's multicast) is not the ill tied to
1932 * e.g. the IPMP ill's data link-local. So we match across the illgrp
1933 * to ensure we find the associated NCE.
1934 */
1935 our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1936 /*
1937 * If this is a valid Solicitation for an address we are publishing,
1938 * then a PUBLISH entry should exist in the cache
1939 */
1940 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1941 ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1942 "ifname=%s ", ill->ill_name));
1943 if (ip_debug > 2) {
1944 /* ip1dbg */
1945 pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1946 }
1947 if (our_ncec == NULL)
1948 bad_solicit = B_TRUE;
1949 goto done;
1950 }
1951
1952 /* At this point we should have a verified NS per spec */
1953 if (opt != NULL) {
1954 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1955 if (opt != NULL) {
1956 haddr = (uchar_t *)&opt[1];
1957 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1958 hlen == 0) {
1959 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1960 bad_solicit = B_TRUE;
1961 goto done;
1962 }
1963 }
1964 }
1965
1966 /* If sending directly to peer, set the unicast flag */
1967 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1968 flag |= NDP_UNICAST;
1969
1970 /*
1971 * Create/update the entry for the soliciting node on the ipmp_ill.
1972 * or respond to outstanding queries, don't if
1973 * the source is unspecified address.
1974 */
1975 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1976 int err;
1977 nce_t *nnce;
1978
1979 ASSERT(ill->ill_isv6);
1980 /*
1981 * Regular solicitations *must* include the Source Link-Layer
1982 * Address option. Ignore messages that do not.
1983 */
1984 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1985 ip1dbg(("ndp_input_solicit: source link-layer address "
1986 "option missing with a specified source.\n"));
1987 bad_solicit = B_TRUE;
1988 goto done;
1989 }
1990
1991 /*
1992 * This is a regular solicitation. If we're still in the
1993 * process of verifying the address, then don't respond at all
1994 * and don't keep track of the sender.
1995 */
1996 if (our_ncec->ncec_state == ND_PROBE)
1997 goto done;
1998
1999 /*
2000 * If the solicitation doesn't have sender hardware address
2001 * (legal for unicast solicitation), then process without
2002 * installing the return NCE. Either we already know it, or
2003 * we'll be forced to look it up when (and if) we reply to the
2004 * packet.
2005 */
2006 if (haddr == NULL)
2007 goto no_source;
2008
2009 under_ill = ill;
2010 if (IS_UNDER_IPMP(under_ill)) {
2011 ill = ipmp_ill_hold_ipmp_ill(under_ill);
2012 if (ill == NULL)
2013 ill = under_ill;
2014 else
2015 need_ill_refrele = B_TRUE;
2016 }
2017 err = nce_lookup_then_add_v6(ill,
2018 haddr, hlen,
2019 &src, /* Soliciting nodes address */
2020 0,
2021 ND_STALE,
2022 &nnce);
2023
2024 if (need_ill_refrele) {
2025 ill_refrele(ill);
2026 ill = under_ill;
2027 need_ill_refrele = B_FALSE;
2028 }
2029 switch (err) {
2030 case 0:
2031 /* done with this entry */
2032 nce_refrele(nnce);
2033 break;
2034 case EEXIST:
2035 /*
2036 * B_FALSE indicates this is not an an advertisement.
2037 */
2038 nce_process(nnce->nce_common, haddr, 0, B_FALSE);
2039 nce_refrele(nnce);
2040 break;
2041 default:
2042 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
2043 err));
2044 goto done;
2045 }
2046 no_source:
2047 flag |= NDP_SOLICITED;
2048 } else {
2049 /*
2050 * No source link layer address option should be present in a
2051 * valid DAD request.
2052 */
2053 if (haddr != NULL) {
2054 ip1dbg(("ndp_input_solicit: source link-layer address "
2055 "option present with an unspecified source.\n"));
2056 bad_solicit = B_TRUE;
2057 goto done;
2058 }
2059 if (our_ncec->ncec_state == ND_PROBE) {
2060 /*
2061 * Internally looped-back probes will have
2062 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
2063 * transmissions.
2064 */
2065 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
2066 /*
2067 * If someone else is probing our address, then
2068 * we've crossed wires. Declare failure.
2069 */
2070 ndp_failure(mp, ira);
2071 }
2072 goto done;
2073 }
2074 /*
2075 * This is a DAD probe. Multicast the advertisement to the
2076 * all-nodes address.
2077 */
2078 src = ipv6_all_hosts_mcast;
2079 }
2080 flag |= nce_advert_flags(our_ncec);
2081 (void) ndp_xmit(ill,
2082 ND_NEIGHBOR_ADVERT,
2083 our_ncec->ncec_lladdr,
2084 our_ncec->ncec_lladdr_length,
2085 &target, /* Source and target of the advertisement pkt */
2086 &src, /* IP Destination (source of original pkt) */
2087 flag);
2088 done:
2089 if (bad_solicit)
2090 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2091 if (our_ncec != NULL)
2092 ncec_refrele(our_ncec);
2093 }
2094
2095 /*
2096 * Handle reception of Neighbor Solicitation messages
2097 */
2098 void
2099 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
2100 {
2101 ill_t *ill = ira->ira_ill;
2102 nd_neighbor_advert_t *na;
2103 uint32_t hlen = ill->ill_phys_addr_length;
2104 uchar_t *haddr = NULL;
2105 icmp6_t *icmp_nd;
2106 ip6_t *ip6h;
2107 ncec_t *dst_ncec = NULL;
2108 in6_addr_t target;
2109 nd_opt_hdr_t *opt = NULL;
2110 int len;
2111 ip_stack_t *ipst = ill->ill_ipst;
2112 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
2113
2114 ip6h = (ip6_t *)mp->b_rptr;
2115 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2116 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2117 na = (nd_neighbor_advert_t *)icmp_nd;
2118
2119 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2120 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2121 ip1dbg(("ndp_input_advert: Target is multicast but the "
2122 "solicited flag is not zero\n"));
2123 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2124 return;
2125 }
2126 target = na->nd_na_target;
2127 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
2128 IN6_IS_ADDR_LOOPBACK(&target)) {
2129 if (ip_debug > 2) {
2130 /* ip1dbg */
2131 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
2132 AF_INET6, &target);
2133 }
2134 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2135 return;
2136 }
2137 if (len > sizeof (nd_neighbor_advert_t)) {
2138 opt = (nd_opt_hdr_t *)&na[1];
2139 if (!ndp_verify_optlen(opt,
2140 len - sizeof (nd_neighbor_advert_t))) {
2141 ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2142 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2143 return;
2144 }
2145 /* At this point we have a verified NA per spec */
2146 len -= sizeof (nd_neighbor_advert_t);
2147 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2148 if (opt != NULL) {
2149 haddr = (uchar_t *)&opt[1];
2150 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2151 hlen == 0) {
2152 ip1dbg(("ndp_input_advert: bad SLLA\n"));
2153 BUMP_MIB(mib,
2154 ipv6IfIcmpInBadNeighborAdvertisements);
2155 return;
2156 }
2157 }
2158 }
2159
2160 /*
2161 * NOTE: we match across the illgrp since we need to do DAD for all of
2162 * our local addresses, and those are spread across all the active
2163 * ills in the group.
2164 */
2165 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
2166 return;
2167
2168 if (NCE_PUBLISH(dst_ncec)) {
2169 /*
2170 * Someone just advertised an addresses that we publish. First,
2171 * check it it was us -- if so, we can safely ignore it.
2172 * We don't get the haddr from the ira_l2src because, in the
2173 * case that the packet originated from us, on an IPMP group,
2174 * the ira_l2src may would be the link-layer address of the
2175 * cast_ill used to send the packet, which may not be the same
2176 * as the dst_ncec->ncec_lladdr of the address.
2177 */
2178 if (haddr != NULL) {
2179 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
2180 goto out;
2181
2182 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
2183 goto out; /* from us -- no conflict */
2184
2185 /*
2186 * If we're in an IPMP group, check if this is an echo
2187 * from another ill in the group. Use the double-
2188 * checked locking pattern to avoid grabbing
2189 * ill_g_lock in the non-IPMP case.
2190 */
2191 if (IS_UNDER_IPMP(ill)) {
2192 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2193 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
2194 ill->ill_grp, haddr, hlen) != NULL) {
2195 rw_exit(&ipst->ips_ill_g_lock);
2196 goto out;
2197 }
2198 rw_exit(&ipst->ips_ill_g_lock);
2199 }
2200 }
2201
2202 /*
2203 * This appears to be a real conflict. If we're trying to
2204 * configure this NCE (ND_PROBE), then shut it down.
2205 * Otherwise, handle the discovered conflict.
2206 */
2207 if (dst_ncec->ncec_state == ND_PROBE) {
2208 ndp_failure(mp, ira);
2209 } else {
2210 if (ip_nce_conflict(mp, ira, dst_ncec)) {
2211 char hbuf[MAC_STR_LEN];
2212 char sbuf[INET6_ADDRSTRLEN];
2213
2214 cmn_err(CE_WARN,
2215 "node '%s' is using %s on %s",
2216 inet_ntop(AF_INET6, &target, sbuf,
2217 sizeof (sbuf)),
2218 haddr == NULL ? "<none>" :
2219 mac_colon_addr(haddr, hlen, hbuf,
2220 sizeof (hbuf)), ill->ill_name);
2221 /*
2222 * RFC 4862, Section 5.4.4 does not mandate
2223 * any specific behavior when an NA matches
2224 * a non-tentative address assigned to the
2225 * receiver. We make the choice of defending
2226 * our address, based on the assumption that
2227 * the sender has not detected the Duplicate.
2228 *
2229 * ncec_last_time_defended has been adjusted
2230 * in ip_nce_conflict()
2231 */
2232 (void) ndp_announce(dst_ncec);
2233 }
2234 }
2235 } else {
2236 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2237 dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2238
2239 /* B_TRUE indicates this an advertisement */
2240 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2241 }
2242 out:
2243 ncec_refrele(dst_ncec);
2244 }
2245
2246 /*
2247 * Process NDP neighbor solicitation/advertisement messages.
2248 * The checksum has already checked o.k before reaching here.
2249 * Information about the datalink header is contained in ira_l2src, but
2250 * that should be ignored for loopback packets.
2251 */
2252 void
2253 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2254 {
2255 ill_t *ill = ira->ira_rill;
2256 icmp6_t *icmp_nd;
2257 ip6_t *ip6h;
2258 int len;
2259 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
2260 ill_t *orig_ill = NULL;
2261
2262 /*
2263 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2264 * and make it be the IPMP upper so avoid being confused by a packet
2265 * addressed to a unicast address on a different ill.
2266 */
2267 if (IS_UNDER_IPMP(ill)) {
2268 orig_ill = ill;
2269 ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2270 if (ill == NULL) {
2271 ill = orig_ill;
2272 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2273 ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2274 mp, ill);
2275 freemsg(mp);
2276 return;
2277 }
2278 ASSERT(ill != orig_ill);
2279 orig_ill = ira->ira_ill;
2280 ira->ira_ill = ill;
2281 mib = ill->ill_icmp6_mib;
2282 }
2283 if (!pullupmsg(mp, -1)) {
2284 ip1dbg(("ndp_input: pullupmsg failed\n"));
2285 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2286 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2287 goto done;
2288 }
2289 ip6h = (ip6_t *)mp->b_rptr;
2290 if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2291 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2292 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2293 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2294 goto done;
2295 }
2296 /*
2297 * NDP does not accept any extension headers between the
2298 * IP header and the ICMP header since e.g. a routing
2299 * header could be dangerous.
2300 * This assumes that any AH or ESP headers are removed
2301 * by ip prior to passing the packet to ndp_input.
2302 */
2303 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2304 ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2305 ip6h->ip6_nxt));
2306 ip_drop_input("Wrong next header", mp, ill);
2307 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2308 goto done;
2309 }
2310 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2311 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2312 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2313 if (icmp_nd->icmp6_code != 0) {
2314 ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2315 ip_drop_input("code non-zero", mp, ill);
2316 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2317 goto done;
2318 }
2319 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2320 /*
2321 * Make sure packet length is large enough for either
2322 * a NS or a NA icmp packet.
2323 */
2324 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2325 ip1dbg(("ndp_input: packet too short\n"));
2326 ip_drop_input("packet too short", mp, ill);
2327 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2328 goto done;
2329 }
2330 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2331 ndp_input_solicit(mp, ira);
2332 } else {
2333 ndp_input_advert(mp, ira);
2334 }
2335 done:
2336 freemsg(mp);
2337 if (orig_ill != NULL) {
2338 ill_refrele(ill);
2339 ira->ira_ill = orig_ill;
2340 }
2341 }
2342
2343 /*
2344 * ndp_xmit is called to form and transmit a ND solicitation or
2345 * advertisement ICMP packet.
2346 *
2347 * If the source address is unspecified and this isn't a probe (used for
2348 * duplicate address detection), an appropriate source address and link layer
2349 * address will be chosen here. The link layer address option is included if
2350 * the source is specified (i.e., all non-probe packets), and omitted (per the
2351 * specification) otherwise.
2352 *
2353 * It returns B_FALSE only if it does a successful put() to the
2354 * corresponding ill's ill_wq otherwise returns B_TRUE.
2355 */
2356 static boolean_t
2357 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2358 const in6_addr_t *sender, const in6_addr_t *target, int flag)
2359 {
2360 uint32_t len;
2361 icmp6_t *icmp6;
2362 mblk_t *mp;
2363 ip6_t *ip6h;
2364 nd_opt_hdr_t *opt;
2365 uint_t plen;
2366 zoneid_t zoneid = GLOBAL_ZONEID;
2367 ill_t *hwaddr_ill = ill;
2368 ip_xmit_attr_t ixas;
2369 ip_stack_t *ipst = ill->ill_ipst;
2370 boolean_t need_refrele = B_FALSE;
2371 boolean_t probe = B_FALSE;
2372
2373 if (IS_UNDER_IPMP(ill)) {
2374 probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2375 /*
2376 * We send non-probe packets on the upper IPMP interface.
2377 * ip_output_simple() will use cast_ill for sending any
2378 * multicast packets. Note that we can't follow the same
2379 * logic for probe packets because all interfaces in the ipmp
2380 * group may have failed, so that we really want to only try
2381 * to send the ND packet on the ill corresponding to the src
2382 * address.
2383 */
2384 if (!probe) {
2385 ill = ipmp_ill_hold_ipmp_ill(ill);
2386 if (ill != NULL)
2387 need_refrele = B_TRUE;
2388 else
2389 ill = hwaddr_ill;
2390 }
2391 }
2392
2393 /*
2394 * If we have a unspecified source(sender) address, select a
2395 * proper source address for the solicitation here itself so
2396 * that we can initialize the h/w address correctly.
2397 *
2398 * If the sender is specified then we use this address in order
2399 * to lookup the zoneid before calling ip_output_v6(). This is to
2400 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2401 * by IP (we cannot guarantee that the global zone has an interface
2402 * route to the destination).
2403 *
2404 * Note that the NA never comes here with the unspecified source
2405 * address.
2406 */
2407
2408 /*
2409 * Probes will have unspec src at this point.
2410 */
2411 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2412 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2413 /*
2414 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2415 * ALL_ZONES if it cannot find a matching ipif for the address
2416 * we are trying to use. In this case we err on the side of
2417 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2418 */
2419 if (zoneid == ALL_ZONES)
2420 zoneid = GLOBAL_ZONEID;
2421 }
2422
2423 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2424 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2425 mp = allocb(len, BPRI_LO);
2426 if (mp == NULL) {
2427 if (need_refrele)
2428 ill_refrele(ill);
2429 return (B_TRUE);
2430 }
2431
2432 bzero((char *)mp->b_rptr, len);
2433 mp->b_wptr = mp->b_rptr + len;
2434
2435 bzero(&ixas, sizeof (ixas));
2436 ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2437
2438 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2439 ixas.ixa_ipst = ipst;
2440 ixas.ixa_cred = kcred;
2441 ixas.ixa_cpid = NOPID;
2442 ixas.ixa_tsl = NULL;
2443 ixas.ixa_zoneid = zoneid;
2444
2445 ip6h = (ip6_t *)mp->b_rptr;
2446 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2447 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2448 ip6h->ip6_nxt = IPPROTO_ICMPV6;
2449 ip6h->ip6_hops = IPV6_MAX_HOPS;
2450 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2451 ip6h->ip6_dst = *target;
2452 icmp6 = (icmp6_t *)&ip6h[1];
2453
2454 if (hw_addr_len != 0) {
2455 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2456 sizeof (nd_neighbor_advert_t));
2457 } else {
2458 opt = NULL;
2459 }
2460 if (operation == ND_NEIGHBOR_SOLICIT) {
2461 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2462
2463 if (opt != NULL && !(flag & NDP_PROBE)) {
2464 /*
2465 * Note that we don't send out SLLA for ND probes
2466 * per RFC 4862, even though we do send out the src
2467 * haddr for IPv4 DAD probes, even though both IPv4
2468 * and IPv6 go out with the unspecified/INADDR_ANY
2469 * src IP addr.
2470 */
2471 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2472 }
2473 ip6h->ip6_src = *sender;
2474 ns->nd_ns_target = *target;
2475 if (!(flag & NDP_UNICAST)) {
2476 /* Form multicast address of the target */
2477 ip6h->ip6_dst = ipv6_solicited_node_mcast;
2478 ip6h->ip6_dst.s6_addr32[3] |=
2479 ns->nd_ns_target.s6_addr32[3];
2480 }
2481 } else {
2482 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2483
2484 ASSERT(!(flag & NDP_PROBE));
2485 if (opt != NULL)
2486 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2487 ip6h->ip6_src = *sender;
2488 na->nd_na_target = *sender;
2489 if (flag & NDP_ISROUTER)
2490 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2491 if (flag & NDP_SOLICITED)
2492 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2493 if (flag & NDP_ORIDE)
2494 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2495 }
2496
2497 if (!(flag & NDP_PROBE)) {
2498 if (hw_addr != NULL && opt != NULL) {
2499 /* Fill in link layer address and option len */
2500 opt->nd_opt_len = (uint8_t)plen;
2501 bcopy(hw_addr, &opt[1], hw_addr_len);
2502 }
2503 }
2504 if (opt != NULL && opt->nd_opt_type == 0) {
2505 /* If there's no link layer address option, then strip it. */
2506 len -= plen * 8;
2507 mp->b_wptr = mp->b_rptr + len;
2508 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2509 }
2510
2511 icmp6->icmp6_type = (uint8_t)operation;
2512 icmp6->icmp6_code = 0;
2513 /*
2514 * Prepare for checksum by putting icmp length in the icmp
2515 * checksum field. The checksum is calculated in ip_output.c.
2516 */
2517 icmp6->icmp6_cksum = ip6h->ip6_plen;
2518
2519 (void) ip_output_simple(mp, &ixas);
2520 ixa_cleanup(&ixas);
2521 if (need_refrele)
2522 ill_refrele(ill);
2523 return (B_FALSE);
2524 }
2525
2526 /*
2527 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
2528 * The datapath uses this as an indication that there
2529 * is a problem (as opposed to a NCE that was just
2530 * reclaimed due to lack of memory.
2531 * Note that static ARP entries never become unreachable.
2532 */
2533 void
2534 nce_make_unreachable(ncec_t *ncec)
2535 {
2536 mutex_enter(&ncec->ncec_lock);
2537 ncec->ncec_state = ND_UNREACHABLE;
2538 mutex_exit(&ncec->ncec_lock);
2539 }
2540
/*
 * NCE retransmit timer. Common to IPv4 and IPv6.
 * This timer goes off when:
 * a. It is time to retransmit a resolution for resolver.
 * b. It is time to send reachability probes.
 *
 * arg is the ncec_t the timeout was armed for.  The handler takes its own
 * reference on the ncec, clears ncec_timeout_id, picks a source ill/address
 * with nce_resolve_src() and then acts on the current ncec_state.  Both the
 * ncec reference and src_ill are released before returning.
 *
 * Locking: ncec_lock is taken before the state switch and every case is
 * responsible for dropping it.  Note that it is dropped before calling
 * ndp_xmit() but is still held across arp_request().
 */
void
nce_timer(void *arg)
{
	ncec_t		*ncec = arg;
	ill_t		*ill = ncec->ncec_ill, *src_ill;
	char		addrbuf[INET6_ADDRSTRLEN];
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = ncec->ncec_ipst;
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	in_addr_t	sender4 = INADDR_ANY;
	in6_addr_t	sender6 = ipv6_all_zeros;

	/*
	 * The timer has to be cancelled by ncec_delete before doing the final
	 * refrele. So the NCE is guaranteed to exist when the timer runs
	 * until it clears the timeout_id. Before clearing the timeout_id
	 * bump up the refcnt so that we can continue to use the ncec
	 */
	ASSERT(ncec != NULL);
	mutex_enter(&ncec->ncec_lock);
	ncec_refhold_locked(ncec);
	ncec->ncec_timeout_id = 0;
	mutex_exit(&ncec->ncec_lock);

	src_ill = nce_resolve_src(ncec, &sender6);
	/* if we could not find a sender address, return */
	if (src_ill == NULL) {
		if (!isv6) {
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
			    &sender4, addrbuf, sizeof (addrbuf))));
		} else {
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
		}
		/* Try again after another retransmit interval. */
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		ncec_refrele(ncec);
		return;
	}
	/* For IPv4 the chosen source comes back V4-mapped in sender6. */
	if (!isv6)
		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);

	mutex_enter(&ncec->ncec_lock);
	/*
	 * Check the reachability state.
	 */
	switch (ncec->ncec_state) {
	case ND_DELAY:
		/*
		 * The delay expired: move to ND_PROBE and send the first
		 * unicast solicitation / ARP request.
		 */
		ASSERT(ncec->ncec_lladdr != NULL);
		ncec->ncec_state = ND_PROBE;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		if (isv6) {
			mutex_exit(&ncec->ncec_lock);
			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
			    src_ill->ill_phys_addr,
			    src_ill->ill_phys_addr_length,
			    &sender6, &ncec->ncec_addr,
			    NDP_UNICAST);
		} else {
			dropped = (arp_request(ncec, sender4, src_ill) == 0);
			mutex_exit(&ncec->ncec_lock);
		}
		/* Only count the probe if it was actually handed off. */
		if (!dropped) {
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
		}
		if (ip_debug > 3) {
			/* ip2dbg */
			pr_addr_dbg("nce_timer: state for %s changed "
			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
		}
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		break;
	case ND_PROBE:
		/* must be retransmit timer */
		ASSERT(ncec->ncec_pcnt >= -1);
		if (ncec->ncec_pcnt > 0) {
			/*
			 * As per RFC2461, the ncec gets deleted after
			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
			 * Note that the first unicast solicitation is sent
			 * during the DELAY state.
			 */
			/*
			 * NOTE(review): for IPv4 this passes AF_INET together
			 * with a pointer to the in6_addr_t ncec_addr, unlike
			 * the src_ill == NULL path above, which converts with
			 * IN6_V4MAPPED_TO_IPADDR first -- confirm the debug
			 * output is what is intended.
			 */
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt,
			    inet_ntop((isv6? AF_INET6 : AF_INET),
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			if (NCE_PUBLISH(ncec)) {
				mutex_exit(&ncec->ncec_lock);
				/*
				 * send out a probe; note that src_ill
				 * is ignored by nce_dad() for all
				 * DAD message types other than IPv6
				 * unicast probes
				 */
				nce_dad(ncec, src_ill, B_TRUE);
			} else {
				ASSERT(src_ill != NULL);
				if (isv6) {
					mutex_exit(&ncec->ncec_lock);
					dropped = ndp_xmit(src_ill,
					    ND_NEIGHBOR_SOLICIT,
					    src_ill->ill_phys_addr,
					    src_ill->ill_phys_addr_length,
					    &sender6, &ncec->ncec_addr,
					    NDP_UNICAST);
				} else {
					/*
					 * since the nce is REACHABLE,
					 * the ARP request will be sent out
					 * as a link-layer unicast.
					 */
					dropped = (arp_request(ncec, sender4,
					    src_ill) == 0);
					mutex_exit(&ncec->ncec_lock);
				}
				if (!dropped) {
					mutex_enter(&ncec->ncec_lock);
					ncec->ncec_pcnt--;
					mutex_exit(&ncec->ncec_lock);
				}
				nce_restart_timer(ncec,
				    ill->ill_reachable_retrans_time);
			}
		} else if (ncec->ncec_pcnt < 0) {
			/* No hope, delete the ncec */
			/* Tell datapath it went bad */
			ncec->ncec_state = ND_UNREACHABLE;
			mutex_exit(&ncec->ncec_lock);
			if (ip_debug > 2) {
				/* ip1dbg */
				/*
				 * NOTE(review): as above, AF_INET is paired
				 * with the in6_addr_t ncec_addr for IPv4 --
				 * confirm.
				 */
				pr_addr_dbg("nce_timer: Delete NCE for"
				    " dst %s\n", (isv6? AF_INET6: AF_INET),
				    &ncec->ncec_addr);
			}
			/* if static ARP can't delete. */
			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
				ncec_delete(ncec);

		} else if (!NCE_PUBLISH(ncec)) {
			/*
			 * Probe count is 0 for a dynamic entry (one that we
			 * ourselves are not publishing). We should never get
			 * here if NONUD was requested, hence the ASSERT below.
			 */
			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			/* Drop to -1 so the next expiry takes the delete path. */
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
			/* Wait one interval before killing */
			nce_restart_timer(ncec,
			    ill->ill_reachable_retrans_time);
		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
			ipif_t *ipif;
			ipaddr_t ncec_addr;

			/*
			 * We're done probing, and we can now declare this
			 * address to be usable.  Let IP know that it's ok to
			 * use.
			 */
			ncec->ncec_state = ND_REACHABLE;
			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
			mutex_exit(&ncec->ncec_lock);
			if (isv6) {
				ipif = ipif_lookup_addr_exact_v6(
				    &ncec->ncec_addr, ill, ipst);
			} else {
				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
				    ncec_addr);
				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
				    ipst);
			}
			if (ipif != NULL) {
				if (ipif->ipif_was_dup) {
					char ibuf[LIFNAMSIZ];
					char sbuf[INET6_ADDRSTRLEN];

					ipif->ipif_was_dup = B_FALSE;
					(void) inet_ntop(AF_INET6,
					    &ipif->ipif_v6lcl_addr,
					    sbuf, sizeof (sbuf));
					ipif_get_name(ipif, ibuf,
					    sizeof (ibuf));
					cmn_err(CE_NOTE, "recovered address "
					    "%s on %s", sbuf, ibuf);
				}
				/* Notify only on the first transition to ready. */
				if ((ipif->ipif_flags & IPIF_UP) &&
				    !ipif->ipif_addr_ready)
					ipif_up_notify(ipif);
				ipif->ipif_addr_ready = 1;
				ipif_refrele(ipif);
			}
			if (!isv6 && arp_no_defense)
				break;
			/* Begin defending our new address */
			/*
			 * NOTE(review): ncec_unsolicit_count and
			 * ncec_last_time_defended are updated here without
			 * ncec_lock, whereas the ND_REACHABLE case below
			 * updates them under the lock -- confirm.
			 */
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				if (isv6) {
					dropped = ndp_announce(ncec);
				} else {
					dropped = arp_announce(ncec);
				}

				/* Give the announcement back if it was dropped. */
				if (dropped)
					ncec->ncec_unsolicit_count++;
				else
					ncec->ncec_last_time_defended =
					    ddi_get_lbolt();
			}
			if (ncec->ncec_unsolicit_count > 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else if (DEFENSE_INTERVAL(isv6) != 0) {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			/*
			 * This is an address we're probing to be our own, but
			 * the ill is down.  Wait until it comes back before
			 * doing anything, but switch to reachable state so
			 * that the restart will work.
			 */
			ncec->ncec_state = ND_REACHABLE;
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	case ND_INCOMPLETE: {
		mblk_t	*mp, *nextmp;
		mblk_t	**prevmpp;

		/*
		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
		 * for any IPMP probe packets, and toss them.  IPMP probe
		 * packets will always be at the head of ncec_qd_mp, so that
		 * we can stop at the first queued ND packet that is
		 * not a probe packet.
		 */
		prevmpp = &ncec->ncec_qd_mp;
		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
			nextmp = mp->b_next;

			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
				inet_freemsg(mp);
				ncec->ncec_nprobes--;
				/* Unlink mp: previous link now skips it. */
				*prevmpp = nextmp;
			} else {
				prevmpp = &mp->b_next;
			}
		}

		/*
		 * Must be resolver's retransmit timer.
		 */
		mutex_exit(&ncec->ncec_lock);
		ip_ndp_resolve(ncec);
		break;
	}
	case ND_REACHABLE:
		/*
		 * Either unsolicited announcements are still owed, or this is
		 * an address we publish and periodic defense is enabled.
		 */
		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
		    ncec->ncec_unsolicit_count != 0) ||
		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				mutex_exit(&ncec->ncec_lock);
				/*
				 * When we get to zero announcements left,
				 * switch to address defense
				 */
			} else {
				boolean_t rate_limit;

				mutex_exit(&ncec->ncec_lock);
				rate_limit = ill_defend_rate_limit(ill, ncec);
				if (rate_limit) {
					/* Skip this round, retry later. */
					nce_restart_timer(ncec,
					    DEFENSE_INTERVAL(isv6));
					break;
				}
			}
			if (isv6) {
				dropped = ndp_announce(ncec);
			} else {
				dropped = arp_announce(ncec);
			}
			mutex_enter(&ncec->ncec_lock);
			if (dropped) {
				ncec->ncec_unsolicit_count++;
			} else {
				ncec->ncec_last_time_defended =
				    ddi_get_lbolt();
			}
			mutex_exit(&ncec->ncec_lock);
			if (ncec->ncec_unsolicit_count != 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	default:
		/* Nothing to do for the remaining states. */
		mutex_exit(&ncec->ncec_lock);
		break;
	}
done:
	/* Drop the hold taken at entry and release src_ill. */
	ncec_refrele(ncec);
	ill_refrele(src_ill);
}
2861
2862 /*
2863 * Set a link layer address from the ll_addr passed in.
2864 * Copy SAP from ill.
2865 */
2866 static void
2867 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2868 {
2869 ill_t *ill = ncec->ncec_ill;
2870
2871 ASSERT(ll_addr != NULL);
2872 if (ill->ill_phys_addr_length > 0) {
2873 /*
2874 * The bcopy() below used to be called for the physical address
2875 * length rather than the link layer address length. For
2876 * ethernet and many other media, the phys_addr and lla are
2877 * identical.
2878 *
2879 * The phys_addr and lla may not be the same for devices that
2880 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2881 * no known instances of these.
2882 *
2883 * For PPP or other interfaces with a zero length
2884 * physical address, don't do anything here.
2885 * The bcopy() with a zero phys_addr length was previously
2886 * a no-op for interfaces with a zero-length physical address.
2887 * Using the lla for them would change the way they operate.
2888 * Doing nothing in such cases preserves expected behavior.
2889 */
2890 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2891 }
2892 }
2893
2894 boolean_t
2895 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2896 uint32_t ll_addr_len)
2897 {
2898 ASSERT(ncec->ncec_lladdr != NULL);
2899 if (ll_addr == NULL)
2900 return (B_FALSE);
2901 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2902 return (B_TRUE);
2903 return (B_FALSE);
2904 }
2905
2906 /*
2907 * Updates the link layer address or the reachability state of
2908 * a cache entry. Reset probe counter if needed.
2909 */
2910 void
2911 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2912 {
2913 ill_t *ill = ncec->ncec_ill;
2914 boolean_t need_stop_timer = B_FALSE;
2915 boolean_t need_fastpath_update = B_FALSE;
2916 nce_t *nce = NULL;
2917 timeout_id_t tid;
2918
2919 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2920 /*
2921 * If this interface does not do NUD, there is no point
2922 * in allowing an update to the cache entry. Although
2923 * we will respond to NS.
2924 * The only time we accept an update for a resolver when
2925 * NUD is turned off is when it has just been created.
2926 * Non-Resolvers will always be created as REACHABLE.
2927 */
2928 if (new_state != ND_UNCHANGED) {
2929 if ((ncec->ncec_flags & NCE_F_NONUD) &&
2930 (ncec->ncec_state != ND_INCOMPLETE))
2931 return;
2932 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2933 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2934 need_stop_timer = B_TRUE;
2935 if (new_state == ND_REACHABLE)
2936 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2937 else {
2938 /* We force NUD in this case */
2939 ncec->ncec_last = 0;
2940 }
2941 ncec->ncec_state = new_state;
2942 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2943 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2944 new_state == ND_INCOMPLETE);
2945 }
2946 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2947 tid = ncec->ncec_timeout_id;
2948 ncec->ncec_timeout_id = 0;
2949 }
2950 /*
2951 * Re-trigger fastpath probe and
2952 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2953 * whatever packets that happens to be transmitting at the time.
2954 */
2955 if (new_ll_addr != NULL) {
2956 bcopy(new_ll_addr, ncec->ncec_lladdr,
2957 ill->ill_phys_addr_length);
2958 need_fastpath_update = B_TRUE;
2959 }
2960 mutex_exit(&ncec->ncec_lock);
2961 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2962 if (tid != 0)
2963 (void) untimeout(tid);
2964 }
2965 if (need_fastpath_update) {
2966 /*
2967 * Delete any existing existing dlur_mp and fp_mp information.
2968 * For IPMP interfaces, all underlying ill's must be checked
2969 * and purged.
2970 */
2971 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2972 /*
2973 * add the new dlur_mp and fp_mp
2974 */
2975 nce = nce_fastpath(ncec, B_TRUE, NULL);
2976 if (nce != NULL)
2977 nce_refrele(nce);
2978 }
2979 mutex_enter(&ncec->ncec_lock);
2980 }
2981
2982 static void
2983 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2984 {
2985 uint_t count = 0;
2986 mblk_t **mpp, *tmp;
2987
2988 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2989
2990 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2991 if (++count > ncec->ncec_ill->ill_max_buf) {
2992 tmp = ncec->ncec_qd_mp->b_next;
2993 ncec->ncec_qd_mp->b_next = NULL;
2994 /*
2995 * if we never create data addrs on the under_ill
2996 * does this matter?
2997 */
2998 BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2999 ipIfStatsOutDiscards);
3000 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
3001 ncec->ncec_ill);
3002 freemsg(ncec->ncec_qd_mp);
3003 ncec->ncec_qd_mp = tmp;
3004 }
3005 }
3006
3007 if (head_insert) {
3008 ncec->ncec_nprobes++;
3009 mp->b_next = ncec->ncec_qd_mp;
3010 ncec->ncec_qd_mp = mp;
3011 } else {
3012 *mpp = mp;
3013 }
3014 }
3015
3016 /*
3017 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
3018 * queued at the head or tail of the queue based on the input argument
3019 * 'head_insert'. The caller should specify this argument as B_TRUE if this
3020 * packet is an IPMP probe packet, in which case the following happens:
3021 *
3022 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal
3023 * (non-ipmp_probe) load-speading case where the source address of the ND
3024 * packet is not tied to ncec_ill. If the ill bound to the source address
3025 * cannot receive, the response to the ND packet will not be received.
3026 * However, if ND packets for ncec_ill's probes are queued behind that ND
3027 * packet, those probes will also fail to be sent, and thus in.mpathd will
3028 * erroneously conclude that ncec_ill has also failed.
3029 *
3030 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
3031 * the first attempt. This ensures that ND problems do not manifest as
3032 * probe RTT spikes.
3033 *
3034 * We achieve this by inserting ipmp_probe() packets at the head of the
3035 * nce_queue.
3036 *
3037 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
3038 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
3039 */
3040 void
3041 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
3042 {
3043 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3044 nce_queue_mp_common(ncec, mp, head_insert);
3045 }
3046
3047 /*
3048 * Called when address resolution failed due to a timeout.
3049 * Send an ICMP unreachable in response to all queued packets.
3050 */
3051 void
3052 ndp_resolv_failed(ncec_t *ncec)
3053 {
3054 mblk_t *mp, *nxt_mp;
3055 char buf[INET6_ADDRSTRLEN];
3056 ill_t *ill = ncec->ncec_ill;
3057 ip_recv_attr_t iras;
3058
3059 bzero(&iras, sizeof (iras));
3060 iras.ira_flags = 0;
3061 /*
3062 * we are setting the ira_rill to the ipmp_ill (instead of
3063 * the actual ill on which the packet was received), but this
3064 * is ok because we don't actually need the real ira_rill.
3065 * to send the icmp unreachable to the sender.
3066 */
3067 iras.ira_ill = iras.ira_rill = ill;
3068 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3069 iras.ira_rifindex = iras.ira_ruifindex;
3070
3071 ip1dbg(("ndp_resolv_failed: dst %s\n",
3072 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
3073 mutex_enter(&ncec->ncec_lock);
3074 mp = ncec->ncec_qd_mp;
3075 ncec->ncec_qd_mp = NULL;
3076 ncec->ncec_nprobes = 0;
3077 mutex_exit(&ncec->ncec_lock);
3078 while (mp != NULL) {
3079 nxt_mp = mp->b_next;
3080 mp->b_next = NULL;
3081
3082 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3083 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3084 mp, ill);
3085 icmp_unreachable_v6(mp,
3086 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
3087 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3088 mp = nxt_mp;
3089 }
3090 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3091 }
3092
3093 /*
3094 * Handle the completion of NDP and ARP resolution.
3095 */
3096 void
3097 nce_resolv_ok(ncec_t *ncec)
3098 {
3099 mblk_t *mp;
3100 uint_t pkt_len;
3101 iaflags_t ixaflags = IXAF_NO_TRACE;
3102 nce_t *nce;
3103 ill_t *ill = ncec->ncec_ill;
3104 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
3105 ip_stack_t *ipst = ill->ill_ipst;
3106
3107 if (IS_IPMP(ncec->ncec_ill)) {
3108 nce_resolv_ipmp_ok(ncec);
3109 return;
3110 }
3111 /* non IPMP case */
3112
3113 mutex_enter(&ncec->ncec_lock);
3114 ASSERT(ncec->ncec_nprobes == 0);
3115 mp = ncec->ncec_qd_mp;
3116 ncec->ncec_qd_mp = NULL;
3117 mutex_exit(&ncec->ncec_lock);
3118
3119 while (mp != NULL) {
3120 mblk_t *nxt_mp;
3121
3122 if (ill->ill_isv6) {
3123 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
3124
3125 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
3126 } else {
3127 ipha_t *ipha = (ipha_t *)mp->b_rptr;
3128
3129 ixaflags |= IXAF_IS_IPV4;
3130 pkt_len = ntohs(ipha->ipha_length);
3131 }
3132 nxt_mp = mp->b_next;
3133 mp->b_next = NULL;
3134 /*
3135 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
3136 * longer available, but it's ok to drop this flag because TCP
3137 * has its own flow-control in effect, so TCP packets
3138 * are not likely to get here when flow-control is in effect.
3139 */
3140 mutex_enter(&ill->ill_lock);
3141 nce = nce_lookup(ill, &ncec->ncec_addr);
3142 mutex_exit(&ill->ill_lock);
3143
3144 if (nce == NULL) {
3145 if (isv6) {
3146 BUMP_MIB(&ipst->ips_ip6_mib,
3147 ipIfStatsOutDiscards);
3148 } else {
3149 BUMP_MIB(&ipst->ips_ip_mib,
3150 ipIfStatsOutDiscards);
3151 }
3152 ip_drop_output("ipIfStatsOutDiscards - no nce",
3153 mp, NULL);
3154 freemsg(mp);
3155 } else {
3156 /*
3157 * We don't know the zoneid, but
3158 * ip_xmit does not care since IXAF_NO_TRACE
3159 * is set. (We traced the packet the first
3160 * time through ip_xmit.)
3161 */
3162 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
3163 ALL_ZONES, 0, NULL);
3164 nce_refrele(nce);
3165 }
3166 mp = nxt_mp;
3167 }
3168
3169 ncec_cb_dispatch(ncec); /* complete callbacks */
3170 }
3171
3172 /*
3173 * Called by SIOCSNDP* ioctl to add/change an ncec entry
3174 * and the corresponding attributes.
3175 * Disallow states other than ND_REACHABLE or ND_STALE.
3176 */
3177 int
3178 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3179 {
3180 sin6_t *sin6;
3181 in6_addr_t *addr;
3182 ncec_t *ncec;
3183 nce_t *nce;
3184 int err = 0;
3185 uint16_t new_flags = 0;
3186 uint16_t old_flags = 0;
3187 int inflags = lnr->lnr_flags;
3188 ip_stack_t *ipst = ill->ill_ipst;
3189 boolean_t do_postprocess = B_FALSE;
3190
3191 ASSERT(ill->ill_isv6);
3192 if ((lnr->lnr_state_create != ND_REACHABLE) &&
3193 (lnr->lnr_state_create != ND_STALE))
3194 return (EINVAL);
3195
3196 sin6 = (sin6_t *)&lnr->lnr_addr;
3197 addr = &sin6->sin6_addr;
3198
3199 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3200 ASSERT(!IS_UNDER_IPMP(ill));
3201 nce = nce_lookup_addr(ill, addr);
3202 if (nce != NULL)
3203 new_flags = nce->nce_common->ncec_flags;
3204
3205 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3206 case NDF_ISROUTER_ON:
3207 new_flags |= NCE_F_ISROUTER;
3208 break;
3209 case NDF_ISROUTER_OFF:
3210 new_flags &= ~NCE_F_ISROUTER;
3211 break;
3212 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3213 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3214 if (nce != NULL)
3215 nce_refrele(nce);
3216 return (EINVAL);
3217 }
3218 if (inflags & NDF_STATIC)
3219 new_flags |= NCE_F_STATIC;
3220
3221 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3222 case NDF_ANYCAST_ON:
3223 new_flags |= NCE_F_ANYCAST;
3224 break;
3225 case NDF_ANYCAST_OFF:
3226 new_flags &= ~NCE_F_ANYCAST;
3227 break;
3228 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3229 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3230 if (nce != NULL)
3231 nce_refrele(nce);
3232 return (EINVAL);
3233 }
3234
3235 if (nce == NULL) {
3236 err = nce_add_v6(ill,
3237 (uchar_t *)lnr->lnr_hdw_addr,
3238 ill->ill_phys_addr_length,
3239 addr,
3240 new_flags,
3241 lnr->lnr_state_create,
3242 &nce);
3243 if (err != 0) {
3244 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3245 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3246 return (err);
3247 } else {
3248 do_postprocess = B_TRUE;
3249 }
3250 }
3251 ncec = nce->nce_common;
3252 old_flags = ncec->ncec_flags;
3253 if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3254 ncec_router_to_host(ncec);
3255 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3256 if (do_postprocess)
3257 err = nce_add_v6_postprocess(nce);
3258 nce_refrele(nce);
3259 return (0);
3260 }
3261 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3262
3263 if (do_postprocess)
3264 err = nce_add_v6_postprocess(nce);
3265 /*
3266 * err cannot be anything other than 0 because we don't support
3267 * proxy arp of static addresses.
3268 */
3269 ASSERT(err == 0);
3270
3271 mutex_enter(&ncec->ncec_lock);
3272 ncec->ncec_flags = new_flags;
3273 mutex_exit(&ncec->ncec_lock);
3274 /*
3275 * Note that we ignore the state at this point, which
3276 * should be either STALE or REACHABLE. Instead we let
3277 * the link layer address passed in to determine the state
3278 * much like incoming packets.
3279 */
3280 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3281 nce_refrele(nce);
3282 return (0);
3283 }
3284
3285 /*
3286 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3287 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3288 * be held to ensure that they are in the same group.
3289 */
3290 static nce_t *
3291 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3292 {
3293
3294 nce_t *nce;
3295
3296 nce = nce_ill_lookup_then_add(ill, ncec);
3297
3298 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3299 return (nce);
3300
3301 /*
3302 * hold the ncec_lock to synchronize with nce_update() so that,
3303 * at the end of this function, the contents of nce_dlur_mp are
3304 * consistent with ncec->ncec_lladdr, even though some intermediate
3305 * packet may have been sent out with a mangled address, which would
3306 * only be a transient condition.
3307 */
3308 mutex_enter(&ncec->ncec_lock);
3309 if (ncec->ncec_lladdr != NULL) {
3310 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3311 NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3312 } else {
3313 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3314 ill->ill_sap_length);
3315 }
3316 mutex_exit(&ncec->ncec_lock);
3317 return (nce);
3318 }
3319
3320 /*
3321 * we make nce_fp_mp to have an M_DATA prepend.
3322 * The caller ensures there is hold on ncec for this function.
3323 * Note that since ill_fastpath_probe() copies the mblk there is
3324 * no need to hold the nce or ncec beyond this function.
3325 *
3326 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3327 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3328 * and will be returned back by this function, so that no extra nce_refrele
3329 * is required for the caller. The calls from nce_add_common() use this
3330 * method. All other callers (that pass in NULL ncec_nce) will have to do a
3331 * nce_refrele of the returned nce (when it is non-null).
3332 */
3333 static nce_t *
3334 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3335 {
3336 nce_t *nce;
3337 ill_t *ill = ncec->ncec_ill;
3338
3339 ASSERT(ill != NULL);
3340
3341 if (IS_IPMP(ill) && trigger_fp_req) {
3342 trigger_fp_req = B_FALSE;
3343 ipmp_ncec_refresh_nce(ncec);
3344 }
3345
3346 /*
3347 * If the caller already has the nce corresponding to the ill, use
3348 * that one. Otherwise we have to lookup/add the nce. Calls from
3349 * nce_add_common() fall in the former category, and have just done
3350 * the nce lookup/add that can be reused.
3351 */
3352 if (ncec_nce == NULL)
3353 nce = nce_fastpath_create(ill, ncec);
3354 else
3355 nce = ncec_nce;
3356
3357 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3358 return (nce);
3359
3360 if (trigger_fp_req)
3361 nce_fastpath_trigger(nce);
3362 return (nce);
3363 }
3364
3365 /*
3366 * Trigger fastpath on nce. No locks may be held.
3367 */
3368 static void
3369 nce_fastpath_trigger(nce_t *nce)
3370 {
3371 int res;
3372 ill_t *ill = nce->nce_ill;
3373 ncec_t *ncec = nce->nce_common;
3374
3375 res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3376 /*
3377 * EAGAIN is an indication of a transient error
3378 * i.e. allocation failure etc. leave the ncec in the list it
3379 * will be updated when another probe happens for another ire
3380 * if not it will be taken out of the list when the ire is
3381 * deleted.
3382 */
3383 if (res != 0 && res != EAGAIN && res != ENOTSUP)
3384 nce_fastpath_list_delete(ill, ncec, NULL);
3385 }
3386
3387 /*
3388 * Add ncec to the nce fastpath list on ill.
3389 */
3390 static nce_t *
3391 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
3392 {
3393 nce_t *nce = NULL;
3394
3395 ASSERT(MUTEX_HELD(&ill->ill_lock));
3396 /*
3397 * Atomically ensure that the ill is not CONDEMNED and is not going
3398 * down, before adding the NCE.
3399 */
3400 if (ill->ill_state_flags & ILL_CONDEMNED)
3401 return (NULL);
3402 mutex_enter(&ncec->ncec_lock);
3403 /*
3404 * if ncec has not been deleted and
3405 * is not already in the list add it.
3406 */
3407 if (!NCE_ISCONDEMNED(ncec)) {
3408 nce = nce_lookup(ill, &ncec->ncec_addr);
3409 if (nce != NULL)
3410 goto done;
3411 nce = nce_add(ill, ncec, graveyard);
3412 }
3413 done:
3414 mutex_exit(&ncec->ncec_lock);
3415 return (nce);
3416 }
3417
3418 static nce_t *
3419 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3420 {
3421 nce_t *nce;
3422 list_t graveyard;
3423
3424 list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3425 mutex_enter(&ill->ill_lock);
3426 nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
3427 mutex_exit(&ill->ill_lock);
3428 nce_graveyard_free(&graveyard);
3429 return (nce);
3430 }
3431
3432
3433 /*
3434 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3435 * nce is added to the 'dead' list, and the caller must nce_refrele() the
3436 * entry after all locks have been dropped.
3437 */
3438 void
3439 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3440 {
3441 nce_t *nce;
3442
3443 ASSERT(ill != NULL);
3444
3445 /* delete any nces referencing the ncec from underlying ills */
3446 if (IS_IPMP(ill))
3447 ipmp_ncec_delete_nce(ncec);
3448
3449 /* now the ill itself */
3450 mutex_enter(&ill->ill_lock);
3451 for (nce = list_head(&ill->ill_nce); nce != NULL;
3452 nce = list_next(&ill->ill_nce, nce)) {
3453 if (nce->nce_common == ncec) {
3454 nce_refhold(nce);
3455 nce_delete(nce);
3456 break;
3457 }
3458 }
3459 mutex_exit(&ill->ill_lock);
3460 if (nce != NULL) {
3461 if (dead == NULL)
3462 nce_refrele(nce);
3463 else
3464 list_insert_tail(dead, nce);
3465 }
3466 }
3467
3468 /*
3469 * when the fastpath response does not fit in the datab
3470 * associated with the existing nce_fp_mp, we delete and
3471 * add the nce to retrigger fastpath based on the information
3472 * in the ncec_t.
3473 */
3474 static nce_t *
3475 nce_delete_then_add(nce_t *nce)
3476 {
3477 ill_t *ill = nce->nce_ill;
3478 nce_t *newnce = NULL;
3479 list_t graveyard;
3480
3481 list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3482 ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3483 (void *)nce, ill->ill_name));
3484 mutex_enter(&ill->ill_lock);
3485 mutex_enter(&nce->nce_common->ncec_lock);
3486 nce_delete(nce);
3487 /*
3488 * Make sure that ncec is not condemned before adding. We hold the
3489 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3490 * ipmp_ncec_delete_nce()
3491 */
3492 if (!NCE_ISCONDEMNED(nce->nce_common))
3493 newnce = nce_add(ill, nce->nce_common, &graveyard);
3494 mutex_exit(&nce->nce_common->ncec_lock);
3495 mutex_exit(&ill->ill_lock);
3496 nce_graveyard_free(&graveyard);
3497 nce_refrele(nce);
3498 return (newnce); /* could be null if nomem */
3499 }
3500
3501 typedef struct nce_fp_match_s {
3502 nce_t *nce_fp_match_res;
3503 mblk_t *nce_fp_match_ack_mp;
3504 } nce_fp_match_t;
3505
3506 /* ARGSUSED */
3507 static int
3508 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3509 {
3510 nce_fp_match_t *nce_fp_marg = arg;
3511 ncec_t *ncec = nce->nce_common;
3512 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp;
3513 uchar_t *mp_rptr, *ud_mp_rptr;
3514 mblk_t *ud_mp = nce->nce_dlur_mp;
3515 ptrdiff_t cmplen;
3516
3517 /*
3518 * mp is the mp associated with the fastpath ack.
3519 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3520 * under consideration. If the contents match, then the
3521 * fastpath ack is used to update the nce.
3522 */
3523 if (ud_mp == NULL)
3524 return (0);
3525 mp_rptr = mp->b_rptr;
3526 cmplen = mp->b_wptr - mp_rptr;
3527 ASSERT(cmplen >= 0);
3528
3529 ud_mp_rptr = ud_mp->b_rptr;
3530 /*
3531 * The ncec is locked here to prevent any other threads from accessing
3532 * and changing nce_dlur_mp when the address becomes resolved to an
3533 * lla while we're in the middle of looking at and comparing the
3534 * hardware address (lla). It is also locked to prevent multiple
3535 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3536 * time.
3537 */
3538 mutex_enter(&ncec->ncec_lock);
3539 if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3540 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3541 nce_fp_marg->nce_fp_match_res = nce;
3542 mutex_exit(&ncec->ncec_lock);
3543 nce_refhold(nce);
3544 return (1);
3545 }
3546 mutex_exit(&ncec->ncec_lock);
3547 return (0);
3548 }
3549
3550 /*
3551 * Update all NCE's that are not in fastpath mode and
3552 * have an nce_fp_mp that matches mp. mp->b_cont contains
3553 * the fastpath header.
3554 *
3555 * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3556 */
3557 void
3558 nce_fastpath_update(ill_t *ill, mblk_t *mp)
3559 {
3560 nce_fp_match_t nce_fp_marg;
3561 nce_t *nce;
3562 mblk_t *nce_fp_mp, *fp_mp;
3563
3564 nce_fp_marg.nce_fp_match_res = NULL;
3565 nce_fp_marg.nce_fp_match_ack_mp = mp;
3566
3567 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3568
3569 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3570 return;
3571
3572 mutex_enter(&nce->nce_lock);
3573 nce_fp_mp = nce->nce_fp_mp;
3574
3575 if (nce_fp_mp != NULL) {
3576 fp_mp = mp->b_cont;
3577 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3578 nce_fp_mp->b_datap->db_lim) {
3579 mutex_exit(&nce->nce_lock);
3580 nce = nce_delete_then_add(nce);
3581 if (nce == NULL) {
3582 return;
3583 }
3584 mutex_enter(&nce->nce_lock);
3585 nce_fp_mp = nce->nce_fp_mp;
3586 }
3587 }
3588
3589 /* Matched - install mp as the fastpath mp */
3590 if (nce_fp_mp == NULL) {
3591 fp_mp = dupb(mp->b_cont);
3592 nce->nce_fp_mp = fp_mp;
3593 } else {
3594 fp_mp = mp->b_cont;
3595 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3596 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3597 + MBLKL(fp_mp);
3598 }
3599 mutex_exit(&nce->nce_lock);
3600 nce_refrele(nce);
3601 }
3602
3603 /*
3604 * Return a pointer to a given option in the packet.
3605 * Assumes that option part of the packet have already been validated.
3606 */
3607 nd_opt_hdr_t *
3608 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3609 {
3610 while (optlen > 0) {
3611 if (opt->nd_opt_type == opt_type)
3612 return (opt);
3613 optlen -= 8 * opt->nd_opt_len;
3614 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3615 }
3616 return (NULL);
3617 }
3618
3619 /*
3620 * Verify all option lengths present are > 0, also check to see
3621 * if the option lengths and packet length are consistent.
3622 */
3623 boolean_t
3624 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3625 {
3626 ASSERT(opt != NULL);
3627 while (optlen > 0) {
3628 if (opt->nd_opt_len == 0)
3629 return (B_FALSE);
3630 optlen -= 8 * opt->nd_opt_len;
3631 if (optlen < 0)
3632 return (B_FALSE);
3633 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3634 }
3635 return (B_TRUE);
3636 }
3637
3638 /*
3639 * ncec_walk function.
3640 * Free a fraction of the NCE cache entries.
3641 *
3642 * A possible optimization here would be to use ncec_last where possible, and
3643 * delete the least-frequently used entry, which would require more complex
3644 * computation as we walk through the ncec's (e.g., track ncec entries by
3645 * order of ncec_last and/or maintain state)
3646 */
3647 static void
3648 ncec_cache_reclaim(ncec_t *ncec, void *arg)
3649 {
3650 ip_stack_t *ipst = ncec->ncec_ipst;
3651 uint_t fraction = *(uint_t *)arg;
3652 uint_t rand;
3653
3654 if ((ncec->ncec_flags &
3655 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3656 return;
3657 }
3658
3659 rand = (uint_t)ddi_get_lbolt() +
3660 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3661 if ((rand/fraction)*fraction == rand) {
3662 IP_STAT(ipst, ip_nce_reclaim_deleted);
3663 ncec_delete(ncec);
3664 }
3665 }
3666
3667 /*
3668 * kmem_cache callback to free up memory.
3669 *
3670 * For now we just delete a fixed fraction.
3671 */
3672 static void
3673 ip_nce_reclaim_stack(ip_stack_t *ipst)
3674 {
3675 uint_t fraction = ipst->ips_ip_nce_reclaim_fraction;
3676
3677 IP_STAT(ipst, ip_nce_reclaim_calls);
3678
3679 ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);
3680
3681 /*
3682 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3683 * Get them to update any stale references to drop any refholds they
3684 * have.
3685 */
3686 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3687 }
3688
3689 /*
3690 * Called by the memory allocator subsystem directly, when the system
3691 * is running low on memory.
3692 */
3693 /* ARGSUSED */
3694 void
3695 ip_nce_reclaim(void *args)
3696 {
3697 netstack_handle_t nh;
3698 netstack_t *ns;
3699 ip_stack_t *ipst;
3700
3701 netstack_next_init(&nh);
3702 while ((ns = netstack_next(&nh)) != NULL) {
3703 /*
3704 * netstack_next() can return a netstack_t with a NULL
3705 * netstack_ip at boot time.
3706 */
3707 if ((ipst = ns->netstack_ip) == NULL) {
3708 netstack_rele(ns);
3709 continue;
3710 }
3711 ip_nce_reclaim_stack(ipst);
3712 netstack_rele(ns);
3713 }
3714 netstack_next_fini(&nh);
3715 }
3716
3717 #ifdef DEBUG
3718 void
3719 ncec_trace_ref(ncec_t *ncec)
3720 {
3721 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3722
3723 if (ncec->ncec_trace_disable)
3724 return;
3725
3726 if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3727 ncec->ncec_trace_disable = B_TRUE;
3728 ncec_trace_cleanup(ncec);
3729 }
3730 }
3731
3732 void
3733 ncec_untrace_ref(ncec_t *ncec)
3734 {
3735 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3736
3737 if (!ncec->ncec_trace_disable)
3738 th_trace_unref(ncec);
3739 }
3740
3741 static void
3742 ncec_trace_cleanup(const ncec_t *ncec)
3743 {
3744 th_trace_cleanup(ncec, ncec->ncec_trace_disable);
3745 }
3746 #endif
3747
3748 /*
3749 * Called when address resolution fails due to a timeout.
3750 * Send an ICMP unreachable in response to all queued packets.
3751 */
3752 void
3753 arp_resolv_failed(ncec_t *ncec)
3754 {
3755 mblk_t *mp, *nxt_mp;
3756 char buf[INET6_ADDRSTRLEN];
3757 struct in_addr ipv4addr;
3758 ill_t *ill = ncec->ncec_ill;
3759 ip_stack_t *ipst = ncec->ncec_ipst;
3760 ip_recv_attr_t iras;
3761
3762 bzero(&iras, sizeof (iras));
3763 iras.ira_flags = IRAF_IS_IPV4;
3764 /*
3765 * we are setting the ira_rill to the ipmp_ill (instead of
3766 * the actual ill on which the packet was received), but this
3767 * is ok because we don't actually need the real ira_rill.
3768 * to send the icmp unreachable to the sender.
3769 */
3770 iras.ira_ill = iras.ira_rill = ill;
3771 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3772 iras.ira_rifindex = iras.ira_ruifindex;
3773
3774 IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3775 ip3dbg(("arp_resolv_failed: dst %s\n",
3776 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3777 mutex_enter(&ncec->ncec_lock);
3778 mp = ncec->ncec_qd_mp;
3779 ncec->ncec_qd_mp = NULL;
3780 ncec->ncec_nprobes = 0;
3781 mutex_exit(&ncec->ncec_lock);
3782 while (mp != NULL) {
3783 nxt_mp = mp->b_next;
3784 mp->b_next = NULL;
3785
3786 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3787 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3788 mp, ill);
3789 if (ipst->ips_ip_arp_icmp_error) {
3790 ip3dbg(("arp_resolv_failed: "
3791 "Calling icmp_unreachable\n"));
3792 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3793 } else {
3794 freemsg(mp);
3795 }
3796 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3797 mp = nxt_mp;
3798 }
3799 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3800 }
3801
3802 /*
3803 * if ill is an under_ill, translate it to the ipmp_ill and add the
3804 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3805 * one on the underlying in_ill) will be created for the
3806 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3807 */
3808 int
3809 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3810 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3811 {
3812 int err;
3813 in6_addr_t addr6;
3814 ip_stack_t *ipst = ill->ill_ipst;
3815 nce_t *nce, *upper_nce = NULL;
3816 ill_t *in_ill = ill, *under = NULL;
3817 boolean_t need_ill_refrele = B_FALSE;
3818
3819 if (flags & NCE_F_MCAST) {
3820 /*
3821 * hw_addr will be figured out in nce_set_multicast_v4;
3822 * caller needs to pass in the cast_ill for ipmp
3823 */
3824 ASSERT(hw_addr == NULL);
3825 ASSERT(!IS_IPMP(ill));
3826 err = nce_set_multicast_v4(ill, addr, flags, newnce);
3827 return (err);
3828 }
3829
3830 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3831 ill = ipmp_ill_hold_ipmp_ill(ill);
3832 if (ill == NULL)
3833 return (ENXIO);
3834 need_ill_refrele = B_TRUE;
3835 }
3836 if ((flags & NCE_F_BCAST) != 0) {
3837 /*
3838 * IPv4 broadcast ncec: compute the hwaddr.
3839 */
3840 if (IS_IPMP(ill)) {
3841 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3842 if (under == NULL) {
3843 if (need_ill_refrele)
3844 ill_refrele(ill);
3845 return (ENETDOWN);
3846 }
3847 hw_addr = under->ill_bcast_mp->b_rptr +
3848 NCE_LL_ADDR_OFFSET(under);
3849 hw_addr_len = under->ill_phys_addr_length;
3850 } else {
3851 hw_addr = ill->ill_bcast_mp->b_rptr +
3852 NCE_LL_ADDR_OFFSET(ill),
3853 hw_addr_len = ill->ill_phys_addr_length;
3854 }
3855 }
3856
3857 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3858 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3859 nce = nce_lookup_addr(ill, &addr6);
3860 if (nce == NULL) {
3861 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3862 state, &nce);
3863 } else {
3864 err = EEXIST;
3865 }
3866 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3867 if (err == 0)
3868 err = nce_add_v4_postprocess(nce);
3869
3870 if (in_ill != ill && nce != NULL) {
3871 nce_t *under_nce = NULL;
3872
3873 /*
3874 * in_ill was the under_ill. Try to create the under_nce.
3875 * Hold the ill_g_lock to prevent changes to group membership
3876 * until we are done.
3877 */
3878 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3879 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3880 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3881 ill_t *, ill);
3882 rw_exit(&ipst->ips_ill_g_lock);
3883 err = ENXIO;
3884 nce_refrele(nce);
3885 nce = NULL;
3886 goto bail;
3887 }
3888 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3889 if (under_nce == NULL) {
3890 rw_exit(&ipst->ips_ill_g_lock);
3891 err = EINVAL;
3892 nce_refrele(nce);
3893 nce = NULL;
3894 goto bail;
3895 }
3896 rw_exit(&ipst->ips_ill_g_lock);
3897 upper_nce = nce;
3898 nce = under_nce; /* will be returned to caller */
3899 if (NCE_ISREACHABLE(nce->nce_common))
3900 nce_fastpath_trigger(under_nce);
3901 }
3902 if (nce != NULL) {
3903 if (newnce != NULL)
3904 *newnce = nce;
3905 else
3906 nce_refrele(nce);
3907 }
3908 bail:
3909 if (under != NULL)
3910 ill_refrele(under);
3911 if (upper_nce != NULL)
3912 nce_refrele(upper_nce);
3913 if (need_ill_refrele)
3914 ill_refrele(ill);
3915
3916 return (err);
3917 }
3918
3919 /*
3920 * NDP Cache Entry creation routine for IPv4.
3921 * This routine must always be called with ndp4->ndp_g_lock held.
3922 * Prior to return, ncec_refcnt is incremented.
3923 *
3924 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3925 * are always added pointing at the ipmp_ill. Thus, when the ill passed
3926 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3927 * entries will be created, both pointing at the same ncec_t. The nce_t
3928 * entries will have their nce_ill set to the ipmp_ill and the under_ill
3929 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3930 * Local addresses are always created on the ill passed to nce_add_v4.
3931 */
3932 int
3933 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3934 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3935 {
3936 int err;
3937 boolean_t is_multicast = (flags & NCE_F_MCAST);
3938 struct in6_addr addr6;
3939 nce_t *nce;
3940
3941 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3942 ASSERT(!ill->ill_isv6);
3943 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3944
3945 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3946 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3947 &nce);
3948 ASSERT(newnce != NULL);
3949 *newnce = nce;
3950 return (err);
3951 }
3952
3953 /*
3954 * Post-processing routine to be executed after nce_add_v4(). This function
3955 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3956 * and must be called without any locks held.
3957 *
3958 * Always returns 0, but we return an int to keep this symmetric with the
3959 * IPv6 counter-part.
3960 */
3961 int
3962 nce_add_v4_postprocess(nce_t *nce)
3963 {
3964 ncec_t *ncec = nce->nce_common;
3965 uint16_t flags = ncec->ncec_flags;
3966 boolean_t ndp_need_dad = B_FALSE;
3967 boolean_t dropped;
3968 clock_t delay;
3969 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst;
3970 uchar_t *hw_addr = ncec->ncec_lladdr;
3971 boolean_t trigger_fastpath = B_TRUE;
3972
3973 /*
3974 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3975 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3976 * We call nce_fastpath from nce_update if the link layer address of
3977 * the peer changes from nce_update
3978 */
3979 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3980 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3981 trigger_fastpath = B_FALSE;
3982
3983 if (trigger_fastpath)
3984 nce_fastpath_trigger(nce);
3985
3986 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3987 /*
3988 * Either the caller (by passing in ND_PROBE)
3989 * or nce_add_common() (by the internally computed state
3990 * based on ncec_addr and ill_net_type) has determined
3991 * that this unicast entry needs DAD. Trigger DAD.
3992 */
3993 ndp_need_dad = B_TRUE;
3994 } else if (flags & NCE_F_UNSOL_ADV) {
3995 /*
3996 * We account for the transmit below by assigning one
3997 * less than the ndd variable. Subsequent decrements
3998 * are done in nce_timer.
3999 */
4000 mutex_enter(&ncec->ncec_lock);
4001 ncec->ncec_unsolicit_count =
4002 ipst->ips_ip_arp_publish_count - 1;
4003 mutex_exit(&ncec->ncec_lock);
4004 dropped = arp_announce(ncec);
4005 mutex_enter(&ncec->ncec_lock);
4006 if (dropped)
4007 ncec->ncec_unsolicit_count++;
4008 else
4009 ncec->ncec_last_time_defended = ddi_get_lbolt();
4010 if (ncec->ncec_unsolicit_count != 0) {
4011 nce_start_timer(ncec,
4012 ipst->ips_ip_arp_publish_interval);
4013 }
4014 mutex_exit(&ncec->ncec_lock);
4015 }
4016
4017 /*
4018 * If ncec_xmit_interval is 0, user has configured us to send the first
4019 * probe right away. Do so, and set up for the subsequent probes.
4020 */
4021 if (ndp_need_dad) {
4022 mutex_enter(&ncec->ncec_lock);
4023 if (ncec->ncec_pcnt == 0) {
4024 /*
4025 * DAD probes and announce can be
4026 * administratively disabled by setting the
4027 * probe_count to zero. Restart the timer in
4028 * this case to mark the ipif as ready.
4029 */
4030 ncec->ncec_unsolicit_count = 0;
4031 mutex_exit(&ncec->ncec_lock);
4032 nce_restart_timer(ncec, 0);
4033 } else {
4034 mutex_exit(&ncec->ncec_lock);
4035 delay = ((ncec->ncec_flags & NCE_F_FAST) ?
4036 ipst->ips_arp_probe_delay :
4037 ipst->ips_arp_fastprobe_delay);
4038 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
4039 }
4040 }
4041 return (0);
4042 }
4043
4044 /*
4045 * ncec_walk routine to update all entries that have a given destination or
4046 * gateway address and cached link layer (MAC) address. This is used when ARP
4047 * informs us that a network-to-link-layer mapping may have changed.
4048 */
4049 void
4050 nce_update_hw_changed(ncec_t *ncec, void *arg)
4051 {
4052 nce_hw_map_t *hwm = arg;
4053 ipaddr_t ncec_addr;
4054
4055 if (ncec->ncec_state != ND_REACHABLE)
4056 return;
4057
4058 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
4059 if (ncec_addr != hwm->hwm_addr)
4060 return;
4061
4062 mutex_enter(&ncec->ncec_lock);
4063 if (hwm->hwm_flags != 0)
4064 ncec->ncec_flags = hwm->hwm_flags;
4065 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
4066 mutex_exit(&ncec->ncec_lock);
4067 }
4068
4069 void
4070 ncec_refhold(ncec_t *ncec)
4071 {
4072 mutex_enter(&(ncec)->ncec_lock);
4073 (ncec)->ncec_refcnt++;
4074 ASSERT((ncec)->ncec_refcnt != 0);
4075 #ifdef DEBUG
4076 ncec_trace_ref(ncec);
4077 #endif
4078 mutex_exit(&(ncec)->ncec_lock);
4079 }
4080
4081 void
4082 ncec_refhold_notr(ncec_t *ncec)
4083 {
4084 mutex_enter(&(ncec)->ncec_lock);
4085 (ncec)->ncec_refcnt++;
4086 ASSERT((ncec)->ncec_refcnt != 0);
4087 mutex_exit(&(ncec)->ncec_lock);
4088 }
4089
4090 static void
4091 ncec_refhold_locked(ncec_t *ncec)
4092 {
4093 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
4094 (ncec)->ncec_refcnt++;
4095 #ifdef DEBUG
4096 ncec_trace_ref(ncec);
4097 #endif
4098 }
4099
4100 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
4101 void
4102 ncec_refrele(ncec_t *ncec)
4103 {
4104 mutex_enter(&(ncec)->ncec_lock);
4105 #ifdef DEBUG
4106 ncec_untrace_ref(ncec);
4107 #endif
4108 ASSERT((ncec)->ncec_refcnt != 0);
4109 if (--(ncec)->ncec_refcnt == 0) {
4110 ncec_inactive(ncec);
4111 } else {
4112 mutex_exit(&(ncec)->ncec_lock);
4113 }
4114 }
4115
4116 void
4117 ncec_refrele_notr(ncec_t *ncec)
4118 {
4119 mutex_enter(&(ncec)->ncec_lock);
4120 ASSERT((ncec)->ncec_refcnt != 0);
4121 if (--(ncec)->ncec_refcnt == 0) {
4122 ncec_inactive(ncec);
4123 } else {
4124 mutex_exit(&(ncec)->ncec_lock);
4125 }
4126 }
4127
4128 /*
4129 * Common to IPv4 and IPv6.
4130 */
4131 void
4132 nce_restart_timer(ncec_t *ncec, uint_t ms)
4133 {
4134 timeout_id_t tid;
4135
4136 ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
4137
4138 /* First cancel any running timer */
4139 mutex_enter(&ncec->ncec_lock);
4140 tid = ncec->ncec_timeout_id;
4141 ncec->ncec_timeout_id = 0;
4142 if (tid != 0) {
4143 mutex_exit(&ncec->ncec_lock);
4144 (void) untimeout(tid);
4145 mutex_enter(&ncec->ncec_lock);
4146 }
4147
4148 /* Restart timer */
4149 nce_start_timer(ncec, ms);
4150 mutex_exit(&ncec->ncec_lock);
4151 }
4152
4153 static void
4154 nce_start_timer(ncec_t *ncec, uint_t ms)
4155 {
4156 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4157 /*
4158 * Don't start the timer if the ncec has been deleted, or if the timer
4159 * is already running
4160 */
4161 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
4162 ncec->ncec_timeout_id = timeout(nce_timer, ncec,
4163 MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
4164 }
4165 }
4166
4167 int
4168 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
4169 uint16_t flags, nce_t **newnce)
4170 {
4171 uchar_t *hw_addr;
4172 int err = 0;
4173 ip_stack_t *ipst = ill->ill_ipst;
4174 in6_addr_t dst6;
4175 nce_t *nce;
4176
4177 ASSERT(!ill->ill_isv6);
4178
4179 IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
4180 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
4181 if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
4182 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4183 goto done;
4184 }
4185 if (ill->ill_net_type == IRE_IF_RESOLVER) {
4186 /*
4187 * For IRE_IF_RESOLVER a hardware mapping can be
4188 * generated, for IRE_IF_NORESOLVER, resolution cookie
4189 * in the ill is copied in nce_add_v4().
4190 */
4191 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
4192 if (hw_addr == NULL) {
4193 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4194 return (ENOMEM);
4195 }
4196 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
4197 } else {
4198 /*
4199 * IRE_IF_NORESOLVER type simply copies the resolution
4200 * cookie passed in. So no hw_addr is needed.
4201 */
4202 hw_addr = NULL;
4203 }
4204 ASSERT(flags & NCE_F_MCAST);
4205 ASSERT(flags & NCE_F_NONUD);
4206 /* nce_state will be computed by nce_add_common() */
4207 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
4208 ND_UNCHANGED, &nce);
4209 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4210 if (err == 0)
4211 err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
4212 if (hw_addr != NULL)
4213 kmem_free(hw_addr, ill->ill_phys_addr_length);
4214 if (err != 0) {
4215 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
4216 return (err);
4217 }
4218 done:
4219 if (newnce != NULL)
4220 *newnce = nce;
4221 else
4222 nce_refrele(nce);
4223 return (0);
4224 }
4225
/*
 * This is used when scanning for "old" (least recently broadcast) NCEs.  We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 */
#define	NCE_RESCHED_LIST_LEN	8

typedef struct {
	/* ill whose nce list is being scanned (set by nce_ill_reschedule) */
	ill_t		*ncert_ill;
	/* number of valid entries in ncert_nces[] */
	uint_t		ncert_num;
	/* candidate ncecs; each entry has a reference held on it */
	ncec_t		*ncert_nces[NCE_RESCHED_LIST_LEN];
} nce_resched_t;
4238
4239 /*
4240 * Pick the longest waiting NCEs for defense.
4241 */
4242 /* ARGSUSED */
4243 static int
4244 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4245 {
4246 nce_resched_t *ncert = arg;
4247 ncec_t **ncecs;
4248 ncec_t **ncec_max;
4249 ncec_t *ncec_temp;
4250 ncec_t *ncec = nce->nce_common;
4251
4252 ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4253 /*
4254 * Only reachable entries that are ready for announcement are eligible.
4255 */
4256 if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4257 return (0);
4258 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4259 ncec_refhold(ncec);
4260 ncert->ncert_nces[ncert->ncert_num++] = ncec;
4261 } else {
4262 ncecs = ncert->ncert_nces;
4263 ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4264 ncec_refhold(ncec);
4265 for (; ncecs < ncec_max; ncecs++) {
4266 ASSERT(ncec != NULL);
4267 if ((*ncecs)->ncec_last_time_defended >
4268 ncec->ncec_last_time_defended) {
4269 ncec_temp = *ncecs;
4270 *ncecs = ncec;
4271 ncec = ncec_temp;
4272 }
4273 }
4274 ncec_refrele(ncec);
4275 }
4276 return (0);
4277 }
4278
/*
 * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
 * doesn't happen very often (if at all), and thus it needn't be highly
 * optimized.  (Note, though, that it's actually O(N) complexity, because the
 * outer loop is bounded by a constant rather than by the length of the list.)
 *
 * The per-ill defend count is reset to zero.  If the previous count had not
 * reached the configured defend rate, nothing can have been held back and we
 * return immediately.  Otherwise batches of candidates are gathered into
 * `ncert' via ncec_reschedule(), each one is flagged NCE_F_DELAYED, and
 * ill_defend_count is charged for it up front.
 *
 * The ncecs left in ncert->ncert_nces[] keep the references taken by
 * ncec_reschedule(); the caller is responsible for releasing them.
 *
 * Called with ill_lock held (nce_walk_common() asserts this).
 */
static void
nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
{
	ncec_t *ncec;
	ip_stack_t *ipst = ill->ill_ipst;
	uint_t i, defend_rate;

	/* Remember the old count, then start a fresh accounting period. */
	i = ill->ill_defend_count;
	ill->ill_defend_count = 0;
	if (ill->ill_isv6)
		defend_rate = ipst->ips_ndp_defend_rate;
	else
		defend_rate = ipst->ips_arp_defend_rate;
	/* If none could be sitting around, then don't reschedule */
	if (i < defend_rate) {
		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
		return;
	}
	ncert->ncert_ill = ill;
	while (ill->ill_defend_count < defend_rate) {
		nce_walk_common(ill, ncec_reschedule, ncert);
		for (i = 0; i < ncert->ncert_num; i++) {

			ncec = ncert->ncert_nces[i];
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_flags |= NCE_F_DELAYED;
			mutex_exit(&ncec->ncec_lock);
			/*
			 * we plan to schedule this ncec, so incr the
			 * defend_count in anticipation.
			 */
			if (++ill->ill_defend_count >= defend_rate)
				break;
		}
		/* A short batch means the walk found no more candidates. */
		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
			break;
	}
}
4323
/*
 * Check if the current rate-limiting parameters permit the sending
 * of another address defense announcement for both IPv4 and IPv6.
 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
 * permitted), and B_FALSE otherwise.  The `defend_rate' parameter
 * determines how many address defense announcements are permitted
 * in any `defend_period' interval.
 *
 * A defend rate of zero always rate-limits.  An ncec that carries
 * NCE_F_DELAYED was already accounted for by nce_ill_reschedule(), so the
 * flag is cleared and the send is permitted without touching the count.
 *
 * Takes and drops ill_lock and ncec_lock internally.
 */
static boolean_t
ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
{
	clock_t now = ddi_get_lbolt();
	ip_stack_t *ipst = ill->ill_ipst;
	clock_t start = ill->ill_defend_start;
	uint32_t elapsed, defend_period, defend_rate;
	nce_resched_t ncert;
	boolean_t ret;
	int i;

	if (ill->ill_isv6) {
		defend_period = ipst->ips_ndp_defend_period;
		defend_rate = ipst->ips_ndp_defend_rate;
	} else {
		defend_period = ipst->ips_arp_defend_period;
		defend_rate = ipst->ips_arp_defend_rate;
	}
	if (defend_rate == 0)
		return (B_TRUE);
	/* Start with an empty batch; ncert_num == 0 makes the loop below a no-op. */
	bzero(&ncert, sizeof (ncert));
	mutex_enter(&ill->ill_lock);
	if (start > 0) {
		elapsed = now - start;
		if (elapsed > SEC_TO_TICK(defend_period)) {
			ill->ill_defend_start = now;
			/*
			 * nce_ill_reschedule will attempt to
			 * prevent starvation by rescheduling the
			 * oldest entries, which are marked with
			 * the NCE_F_DELAYED flag.
			 */
			nce_ill_reschedule(ill, &ncert);
		}
	} else {
		/* First use: begin the initial accounting period now. */
		ill->ill_defend_start = now;
	}
	ASSERT(ill->ill_defend_count <= defend_rate);
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_flags & NCE_F_DELAYED) {
		/*
		 * This ncec was rescheduled as one of the really old
		 * entries needing on-going defense. The
		 * ill_defend_count was already incremented in
		 * nce_ill_reschedule. Go ahead and send the announce.
		 */
		ncec->ncec_flags &= ~NCE_F_DELAYED;
		mutex_exit(&ncec->ncec_lock);
		ret = B_FALSE;
		goto done;
	}
	mutex_exit(&ncec->ncec_lock);
	if (ill->ill_defend_count < defend_rate)
		ill->ill_defend_count++;
	if (ill->ill_defend_count == defend_rate) {
		/*
		 * we are no longer allowed to send unbidden defense
		 * messages. Wait for rescheduling.
		 */
		ret = B_TRUE;
	} else {
		ret = B_FALSE;
	}
done:
	mutex_exit(&ill->ill_lock);
	/*
	 * After all the locks have been dropped we can restart nce timer,
	 * and refrele the delayed ncecs
	 */
	for (i = 0; i < ncert.ncert_num; i++) {
		clock_t xmit_interval;
		ncec_t *tmp;

		tmp = ncert.ncert_nces[i];
		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
		    B_FALSE);
		nce_restart_timer(tmp, xmit_interval);
		/* Drop the hold taken by ncec_reschedule(). */
		ncec_refrele(tmp);
	}
	return (ret);
}
4413
4414 boolean_t
4415 ndp_announce(ncec_t *ncec)
4416 {
4417 return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4418 ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4419 nce_advert_flags(ncec)));
4420 }
4421
/*
 * Determine the source address, and the ill to send on, for a resolution or
 * announcement message concerning `ncec'.
 *
 * `src' must point at an unspecified address on entry.  On a non-NULL return
 * *src holds the chosen source address and the returned ill carries a hold
 * that the caller must release.  NULL is returned, with *src unmodified, when
 * the matching ipif has not finished DAD (and this is not one of our own
 * addresses) or when no usable source ipif can be found.  For one of our own
 * addresses with no matching ipif, *src is set to the unspecified address
 * and NULL is returned.
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t *mp;
	in6_addr_t src6;
	ipaddr_t src4;
	ill_t *ill = ncec->ncec_ill;
	ill_t *src_ill = NULL;
	ipif_t *ipif = NULL;
	boolean_t is_myaddr = NCE_MYADDR(ncec);
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src6 = *src;
	if (is_myaddr) {
		/* For our own address the source is the ncec address itself. */
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t *ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t *ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6] below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface.  We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		/*
		 * Nothing usable on the IPMP ill itself: retry the selection
		 * on the ill returned by ipmp_ill_hold_xmit_ill().
		 */
		if (ipif == NULL && IS_IPMP(ill)) {
			ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		/*
		 * Trade the ipif hold for a hold on the ill to send from.
		 * NOTE(review): ipmp_ipif_hold_bound_ill() presumably returns
		 * a held ill and may return NULL -- confirm against its
		 * definition.
		 */
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}
4564
4565 void
4566 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4567 uchar_t *hwaddr, int hwaddr_len, int flags)
4568 {
4569 ill_t *ill;
4570 ncec_t *ncec;
4571 nce_t *nce;
4572 uint16_t new_state;
4573
4574 ill = (ipif ? ipif->ipif_ill : NULL);
4575 if (ill != NULL) {
4576 /*
4577 * only one ncec is possible
4578 */
4579 nce = nce_lookup_v4(ill, addr);
4580 if (nce != NULL) {
4581 ncec = nce->nce_common;
4582 mutex_enter(&ncec->ncec_lock);
4583 if (NCE_ISREACHABLE(ncec))
4584 new_state = ND_UNCHANGED;
4585 else
4586 new_state = ND_STALE;
4587 ncec->ncec_flags = flags;
4588 nce_update(ncec, new_state, hwaddr);
4589 mutex_exit(&ncec->ncec_lock);
4590 nce_refrele(nce);
4591 return;
4592 }
4593 } else {
4594 /*
4595 * ill is wildcard; clean up all ncec's and ire's
4596 * that match on addr.
4597 */
4598 nce_hw_map_t hwm;
4599
4600 hwm.hwm_addr = *addr;
4601 hwm.hwm_hwlen = hwaddr_len;
4602 hwm.hwm_hwaddr = hwaddr;
4603 hwm.hwm_flags = flags;
4604
4605 ncec_walk_common(ipst->ips_ndp4, NULL,
4606 nce_update_hw_changed, &hwm, B_TRUE);
4607 }
4608 }
4609
4610 /*
4611 * Common function to add ncec entries.
4612 * we always add the ncec with ncec_ill == ill, and always create
4613 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4614 * ncec is !reachable.
4615 *
4616 * When the caller passes in an nce_state of ND_UNCHANGED,
4617 * nce_add_common() will determine the state of the created nce based
4618 * on the ill_net_type and nce_flags used. Otherwise, the nce will
4619 * be created with state set to the passed in nce_state.
4620 */
4621 static int
4622 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4623 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4624 {
4625 static ncec_t nce_nil;
4626 uchar_t *template = NULL;
4627 int err;
4628 ncec_t *ncec;
4629 ncec_t **ncep;
4630 ip_stack_t *ipst = ill->ill_ipst;
4631 uint16_t state;
4632 boolean_t fastprobe = B_FALSE;
4633 struct ndp_g_s *ndp;
4634 nce_t *nce = NULL;
4635 list_t graveyard;
4636 mblk_t *dlur_mp = NULL;
4637
4638 if (ill->ill_isv6)
4639 ndp = ill->ill_ipst->ips_ndp6;
4640 else
4641 ndp = ill->ill_ipst->ips_ndp4;
4642
4643 *retnce = NULL;
4644
4645 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4646
4647 if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4648 ip0dbg(("nce_add_common: no addr\n"));
4649 return (EINVAL);
4650 }
4651 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4652 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4653 return (EINVAL);
4654 }
4655
4656 if (ill->ill_isv6) {
4657 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4658 } else {
4659 ipaddr_t v4addr;
4660
4661 IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4662 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4663 }
4664
4665 /*
4666 * The caller has ensured that there is no nce on ill, but there could
4667 * still be an nce_common_t for the address, so that we find exisiting
4668 * ncec_t strucutures first, and atomically add a new nce_t if
4669 * one is found. The ndp_g_lock ensures that we don't cross threads
4670 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4671 * compare for matches across the illgrp because this function is
4672 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4673 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4674 * appropriate.
4675 */
4676 ncec = *ncep;
4677 for (; ncec != NULL; ncec = ncec->ncec_next) {
4678 if (ncec->ncec_ill == ill) {
4679 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4680 /*
4681 * We should never find *retnce to be
4682 * MYADDR, since the caller may then
4683 * incorrectly restart a DAD timer that's
4684 * already running. However, if we are in
4685 * forwarding mode, and the interface is
4686 * moving in/out of groups, the data
4687 * path ire lookup (e.g., ire_revalidate_nce)
4688 * may have determined that some destination
4689 * is offlink while the control path is adding
4690 * that address as a local address.
4691 * Recover from this case by failing the
4692 * lookup
4693 */
4694 if (NCE_MYADDR(ncec))
4695 return (ENXIO);
4696 *retnce = nce_ill_lookup_then_add(ill, ncec);
4697 if (*retnce != NULL)
4698 break;
4699 }
4700 }
4701 }
4702 if (*retnce != NULL) /* caller must trigger fastpath on nce */
4703 return (0);
4704
4705 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4706 if (ncec == NULL)
4707 return (ENOMEM);
4708 *ncec = nce_nil;
4709 ncec->ncec_ill = ill;
4710 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4711 ncec->ncec_flags = flags;
4712 ncec->ncec_ipst = ipst; /* No netstack_hold */
4713
4714 if (!ill->ill_isv6) {
4715 ipaddr_t addr4;
4716
4717 /*
4718 * DAD probe interval and probe count are set based on
4719 * fast/slow probe settings. If the underlying link doesn't
4720 * have reliably up/down notifications or if we're working
4721 * with IPv4 169.254.0.0/16 Link Local Address space, then
4722 * don't use the fast timers. Otherwise, use them.
4723 */
4724 ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4725 IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4726 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4727 fastprobe = B_TRUE;
4728 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4729 !IS_IPV4_LL_SPACE(&addr4)) {
4730 ill_t *hwaddr_ill;
4731
4732 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4733 hw_addr_len);
4734 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4735 fastprobe = B_TRUE;
4736 }
4737 if (fastprobe) {
4738 ncec->ncec_xmit_interval =
4739 ipst->ips_arp_fastprobe_interval;
4740 ncec->ncec_pcnt =
4741 ipst->ips_arp_fastprobe_count;
4742 ncec->ncec_flags |= NCE_F_FAST;
4743 } else {
4744 ncec->ncec_xmit_interval =
4745 ipst->ips_arp_probe_interval;
4746 ncec->ncec_pcnt =
4747 ipst->ips_arp_probe_count;
4748 }
4749 if (NCE_PUBLISH(ncec)) {
4750 ncec->ncec_unsolicit_count =
4751 ipst->ips_ip_arp_publish_count;
4752 }
4753 } else {
4754 /*
4755 * probe interval is constant: ILL_PROBE_INTERVAL
4756 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4757 */
4758 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4759 if (NCE_PUBLISH(ncec)) {
4760 ncec->ncec_unsolicit_count =
4761 ipst->ips_ip_ndp_unsolicit_count;
4762 }
4763 }
4764 ncec->ncec_rcnt = ill->ill_xmit_count;
4765 ncec->ncec_addr = *addr;
4766 ncec->ncec_qd_mp = NULL;
4767 ncec->ncec_refcnt = 1; /* for ncec getting created */
4768 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4769 ncec->ncec_trace_disable = B_FALSE;
4770
4771 /*
4772 * ncec_lladdr holds link layer address
4773 */
4774 if (hw_addr_len > 0) {
4775 template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4776 if (template == NULL) {
4777 err = ENOMEM;
4778 goto err_ret;
4779 }
4780 ncec->ncec_lladdr = template;
4781 ncec->ncec_lladdr_length = hw_addr_len;
4782 bzero(ncec->ncec_lladdr, hw_addr_len);
4783 }
4784 if ((flags & NCE_F_BCAST) != 0) {
4785 state = ND_REACHABLE;
4786 ASSERT(hw_addr_len > 0);
4787 } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4788 state = ND_INITIAL;
4789 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4790 /*
4791 * NORESOLVER entries are always created in the REACHABLE
4792 * state.
4793 */
4794 state = ND_REACHABLE;
4795 if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4796 ill->ill_mactype != DL_IPV4 &&
4797 ill->ill_mactype != DL_6TO4) {
4798 /*
4799 * We create a nce_res_mp with the IP nexthop address
4800 * as the destination address if the physical length
4801 * is exactly 4 bytes for point-to-multipoint links
4802 * that do their own resolution from IP to link-layer
4803 * address (e.g. IP over X.25).
4804 */
4805 bcopy((uchar_t *)addr,
4806 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4807 }
4808 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4809 ill->ill_mactype != DL_IPV6) {
4810 /*
4811 * We create a nce_res_mp with the IP nexthop address
4812 * as the destination address if the physical legnth
4813 * is exactly 16 bytes for point-to-multipoint links
4814 * that do their own resolution from IP to link-layer
4815 * address.
4816 */
4817 bcopy((uchar_t *)addr,
4818 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4819 }
4820 /*
4821 * Since NUD is not part of the base IPv4 protocol definition,
4822 * IPv4 neighbor entries on NORESOLVER interfaces will never
4823 * age, and are marked NCE_F_NONUD.
4824 */
4825 if (!ill->ill_isv6)
4826 ncec->ncec_flags |= NCE_F_NONUD;
4827 } else if (ill->ill_net_type == IRE_LOOPBACK) {
4828 state = ND_REACHABLE;
4829 }
4830
4831 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4832 /*
4833 * We are adding an ncec with a deterministic hw_addr,
4834 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4835 *
4836 * if we are adding a unicast ncec for the local address
4837 * it would be REACHABLE; we would be adding a ND_STALE entry
4838 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4839 * addresses are added in PROBE to trigger DAD.
4840 */
4841 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4842 ill->ill_net_type == IRE_IF_NORESOLVER)
4843 state = ND_REACHABLE;
4844 else if (!NCE_PUBLISH(ncec))
4845 state = ND_STALE;
4846 else
4847 state = ND_PROBE;
4848 if (hw_addr != NULL)
4849 nce_set_ll(ncec, hw_addr);
4850 }
4851 /* caller overrides internally computed state */
4852 if (nce_state != ND_UNCHANGED)
4853 state = nce_state;
4854
4855 if (state == ND_PROBE)
4856 ncec->ncec_flags |= NCE_F_UNVERIFIED;
4857
4858 ncec->ncec_state = state;
4859
4860 if (state == ND_REACHABLE) {
4861 ncec->ncec_last = ncec->ncec_init_time =
4862 TICK_TO_MSEC(ddi_get_lbolt64());
4863 } else {
4864 ncec->ncec_last = 0;
4865 if (state == ND_INITIAL)
4866 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4867 }
4868 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4869 offsetof(ncec_cb_t, ncec_cb_node));
4870 /*
4871 * have all the memory allocations out of the way before taking locks
4872 * and adding the nce.
4873 */
4874 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4875 if (nce == NULL) {
4876 err = ENOMEM;
4877 goto err_ret;
4878 }
4879 if (ncec->ncec_lladdr != NULL ||
4880 ill->ill_net_type == IRE_IF_NORESOLVER) {
4881 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4882 ill->ill_phys_addr_length, ill->ill_sap,
4883 ill->ill_sap_length);
4884 if (dlur_mp == NULL) {
4885 err = ENOMEM;
4886 goto err_ret;
4887 }
4888 }
4889
4890 /*
4891 * Atomically ensure that the ill is not CONDEMNED, before
4892 * adding the NCE.
4893 */
4894 mutex_enter(&ill->ill_lock);
4895 if (ill->ill_state_flags & ILL_CONDEMNED) {
4896 mutex_exit(&ill->ill_lock);
4897 err = EINVAL;
4898 goto err_ret;
4899 }
4900 if (!NCE_MYADDR(ncec) &&
4901 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4902 mutex_exit(&ill->ill_lock);
4903 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4904 err = EINVAL;
4905 goto err_ret;
4906 }
4907 /*
4908 * Acquire the ncec_lock even before adding the ncec to the list
4909 * so that it cannot get deleted after the ncec is added, but
4910 * before we add the nce.
4911 */
4912 mutex_enter(&ncec->ncec_lock);
4913 if ((ncec->ncec_next = *ncep) != NULL)
4914 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4915 *ncep = ncec;
4916 ncec->ncec_ptpn = ncep;
4917
4918 /* Bump up the number of ncec's referencing this ill */
4919 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4920 (char *), "ncec", (void *), ncec);
4921 ill->ill_ncec_cnt++;
4922 /*
4923 * Since we hold the ncec_lock at this time, the ncec cannot be
4924 * condemned, and we can safely add the nce.
4925 */
4926 list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
4927 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
4928 mutex_exit(&ncec->ncec_lock);
4929 mutex_exit(&ill->ill_lock);
4930 nce_graveyard_free(&graveyard);
4931
4932 /* caller must trigger fastpath on *retnce */
4933 return (0);
4934
4935 err_ret:
4936 if (ncec != NULL)
4937 kmem_cache_free(ncec_cache, ncec);
4938 if (nce != NULL)
4939 kmem_cache_free(nce_cache, nce);
4940 freemsg(dlur_mp);
4941 if (template != NULL)
4942 kmem_free(template, ill->ill_phys_addr_length);
4943 return (err);
4944 }
4945
4946 /*
4947 * take a ref on the nce
4948 */
4949 void
4950 nce_refhold(nce_t *nce)
4951 {
4952 mutex_enter(&nce->nce_lock);
4953 nce->nce_refcnt++;
4954 ASSERT((nce)->nce_refcnt != 0);
4955 mutex_exit(&nce->nce_lock);
4956 }
4957
4958 /*
4959 * release a ref on the nce; In general, this
4960 * cannot be called with locks held because nce_inactive
4961 * may result in nce_inactive which will take the ill_lock,
4962 * do ipif_ill_refrele_tail etc. Thus the one exception
4963 * where this can be called with locks held is when the caller
4964 * is certain that the nce_refcnt is sufficient to prevent
4965 * the invocation of nce_inactive.
4966 */
4967 void
4968 nce_refrele(nce_t *nce)
4969 {
4970 ASSERT((nce)->nce_refcnt != 0);
4971 mutex_enter(&nce->nce_lock);
4972 if (--nce->nce_refcnt == 0)
4973 nce_inactive(nce); /* destroys the mutex */
4974 else
4975 mutex_exit(&nce->nce_lock);
4976 }
4977
4978 /*
4979 * free the nce after all refs have gone away.
4980 */
4981 static void
4982 nce_inactive(nce_t *nce)
4983 {
4984 ill_t *ill = nce->nce_ill;
4985
4986 ASSERT(nce->nce_refcnt == 0);
4987
4988 ncec_refrele_notr(nce->nce_common);
4989 nce->nce_common = NULL;
4990 freemsg(nce->nce_fp_mp);
4991 freemsg(nce->nce_dlur_mp);
4992
4993 mutex_enter(&ill->ill_lock);
4994 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4995 (char *), "nce", (void *), nce);
4996 ill->ill_nce_cnt--;
4997 nce->nce_ill = NULL;
4998 /*
4999 * If the number of ncec's associated with this ill have dropped
5000 * to zero, check whether we need to restart any operation that
5001 * is waiting for this to happen.
5002 */
5003 if (ILL_DOWN_OK(ill)) {
5004 /* ipif_ill_refrele_tail drops the ill_lock */
5005 ipif_ill_refrele_tail(ill);
5006 } else {
5007 mutex_exit(&ill->ill_lock);
5008 }
5009
5010 mutex_destroy(&nce->nce_lock);
5011 kmem_cache_free(nce_cache, nce);
5012 }
5013
5014 /*
5015 * Add an nce to the ill_nce list.
5016 *
5017 * Adding multicast NCEs is subject to a per-ill limit. This function returns
5018 * NULL if that's the case, and it may reap a number of multicast nces.
5019 * Callers (and upstack) must be able to cope with NULL returns.
5020 */
5021 static nce_t *
5022 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
5023 list_t *graveyard)
5024 {
5025 ASSERT(MUTEX_HELD(&ill->ill_lock));
5026
5027 if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
5028 if (nce_too_many_mcast(ill, graveyard)) {
5029 kmem_cache_free(nce_cache, nce);
5030 return (NULL);
5031 }
5032 ill->ill_mcast_nces++;
5033 }
5034
5035 bzero(nce, sizeof (*nce));
5036 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
5037 nce->nce_common = ncec;
5038 nce->nce_addr = ncec->ncec_addr;
5039 nce->nce_ill = ill;
5040 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
5041 (char *), "nce", (void *), nce);
5042 ill->ill_nce_cnt++;
5043
5044 nce->nce_refcnt = 1; /* for the thread */
5045 ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
5046 nce->nce_dlur_mp = dlur_mp;
5047
5048 /* add nce to the ill's fastpath list. */
5049 nce->nce_refcnt++; /* for the list */
5050 list_insert_head(&ill->ill_nce, nce);
5051 return (nce);
5052 }
5053
5054 static nce_t *
5055 nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
5056 {
5057 nce_t *nce;
5058 mblk_t *dlur_mp = NULL;
5059
5060 ASSERT(MUTEX_HELD(&ill->ill_lock));
5061 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
5062
5063 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
5064 if (nce == NULL)
5065 return (NULL);
5066 if (ncec->ncec_lladdr != NULL ||
5067 ill->ill_net_type == IRE_IF_NORESOLVER) {
5068 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
5069 ill->ill_phys_addr_length, ill->ill_sap,
5070 ill->ill_sap_length);
5071 if (dlur_mp == NULL) {
5072 kmem_cache_free(nce_cache, nce);
5073 return (NULL);
5074 }
5075 }
5076 /*
5077 * If nce_add_impl() returns NULL due to on multicast limiting, caller
5078 * will (correctly) assume ENOMEM.
5079 */
5080 return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
5081 }
5082
5083 /*
5084 * remove the nce from the ill_faspath list
5085 */
5086 void
5087 nce_delete(nce_t *nce)
5088 {
5089 ill_t *ill = nce->nce_ill;
5090
5091 ASSERT(MUTEX_HELD(&ill->ill_lock));
5092
5093 mutex_enter(&nce->nce_lock);
5094 if (nce->nce_is_condemned) {
5095 /*
5096 * some other thread has removed this nce from the ill_nce list
5097 */
5098 mutex_exit(&nce->nce_lock);
5099 return;
5100 }
5101 nce->nce_is_condemned = B_TRUE;
5102 mutex_exit(&nce->nce_lock);
5103
5104 /* Update the count of multicast NCEs. */
5105 if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
5106 ill->ill_mcast_nces--;
5107
5108 list_remove(&ill->ill_nce, nce);
5109 /*
5110 * even though we are holding the ill_lock, it is ok to
5111 * call nce_refrele here because we know that we should have
5112 * at least 2 refs on the nce: one for the thread, and one
5113 * for the list. The refrele below will release the one for
5114 * the list.
5115 */
5116 nce_refrele(nce);
5117 }
5118
5119 nce_t *
5120 nce_lookup(ill_t *ill, const in6_addr_t *addr)
5121 {
5122 nce_t *nce = NULL;
5123
5124 ASSERT(ill != NULL);
5125 ASSERT(MUTEX_HELD(&ill->ill_lock));
5126
5127 for (nce = list_head(&ill->ill_nce); nce != NULL;
5128 nce = list_next(&ill->ill_nce, nce)) {
5129 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
5130 break;
5131 }
5132
5133 /*
5134 * if we found the nce on the ill_nce list while holding
5135 * the ill_lock, then it cannot be condemned yet.
5136 */
5137 if (nce != NULL) {
5138 ASSERT(!nce->nce_is_condemned);
5139 nce_refhold(nce);
5140 }
5141 return (nce);
5142 }
5143
5144 /*
5145 * Walk the ill_nce list on ill. The callback function func() cannot perform
5146 * any destructive actions.
5147 */
5148 static void
5149 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
5150 {
5151 nce_t *nce = NULL, *nce_next;
5152
5153 ASSERT(MUTEX_HELD(&ill->ill_lock));
5154 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
5155 nce_next = list_next(&ill->ill_nce, nce);
5156 if (func(ill, nce, arg) != 0)
5157 break;
5158 nce = nce_next;
5159 }
5160 }
5161
/*
 * Walk the ill_nce list on ill, calling func(ill, nce, arg) for each entry.
 * This is nce_walk_common() wrapped in ill_lock, for callers that do not
 * already hold it.
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}
5169
5170 void
5171 nce_flush(ill_t *ill, boolean_t flushall)
5172 {
5173 nce_t *nce, *nce_next;
5174 list_t dead;
5175
5176 list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
5177 mutex_enter(&ill->ill_lock);
5178 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
5179 nce_next = list_next(&ill->ill_nce, nce);
5180 if (!flushall && NCE_PUBLISH(nce->nce_common)) {
5181 nce = nce_next;
5182 continue;
5183 }
5184 /*
5185 * nce_delete requires that the caller should either not
5186 * be holding locks, or should hold a ref to ensure that
5187 * we wont hit ncec_inactive. So take a ref and clean up
5188 * after the list is flushed.
5189 */
5190 nce_refhold(nce);
5191 nce_delete(nce);
5192 list_insert_tail(&dead, nce);
5193 nce = nce_next;
5194 }
5195 mutex_exit(&ill->ill_lock);
5196 while ((nce = list_head(&dead)) != NULL) {
5197 list_remove(&dead, nce);
5198 nce_refrele(nce);
5199 }
5200 ASSERT(list_is_empty(&dead));
5201 list_destroy(&dead);
5202 }
5203
5204 /* Return an interval that is anywhere in the [1 .. intv] range */
5205 static clock_t
5206 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
5207 {
5208 clock_t rnd, frac;
5209
5210 (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
5211 /* Note that clock_t is signed; must chop off bits */
5212 rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
5213 if (initial_time) {
5214 if (intv <= 0)
5215 intv = 1;
5216 else
5217 intv = (rnd % intv) + 1;
5218 } else {
5219 /* Compute 'frac' as 20% of the configured interval */
5220 if ((frac = intv / 5) <= 1)
5221 frac = 2;
5222 /* Set intv randomly in the range [intv-frac .. intv+frac] */
5223 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
5224 intv = 1;
5225 }
5226 return (intv);
5227 }
5228
5229 void
5230 nce_resolv_ipmp_ok(ncec_t *ncec)
5231 {
5232 mblk_t *mp;
5233 uint_t pkt_len;
5234 iaflags_t ixaflags = IXAF_NO_TRACE;
5235 nce_t *under_nce;
5236 ill_t *ill = ncec->ncec_ill;
5237 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
5238 ipif_t *src_ipif = NULL;
5239 ip_stack_t *ipst = ill->ill_ipst;
5240 ill_t *send_ill;
5241 uint_t nprobes;
5242
5243 ASSERT(IS_IPMP(ill));
5244
5245 mutex_enter(&ncec->ncec_lock);
5246 nprobes = ncec->ncec_nprobes;
5247 mp = ncec->ncec_qd_mp;
5248 ncec->ncec_qd_mp = NULL;
5249 ncec->ncec_nprobes = 0;
5250 mutex_exit(&ncec->ncec_lock);
5251
5252 while (mp != NULL) {
5253 mblk_t *nxt_mp;
5254
5255 nxt_mp = mp->b_next;
5256 mp->b_next = NULL;
5257 if (isv6) {
5258 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
5259
5260 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
5261 src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
5262 ill, ALL_ZONES, ipst);
5263 } else {
5264 ipha_t *ipha = (ipha_t *)mp->b_rptr;
5265
5266 ixaflags |= IXAF_IS_IPV4;
5267 pkt_len = ntohs(ipha->ipha_length);
5268 src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5269 ill, ALL_ZONES, ipst);
5270 }
5271
5272 /*
5273 * find a new nce based on an under_ill. The first IPMP probe
5274 * packet gets queued, so we could still find a src_ipif that
5275 * matches an IPMP test address.
5276 */
5277 if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5278 /*
5279 * if src_ipif is null, this could be either a
5280 * forwarded packet or a probe whose src got deleted.
5281 * We identify the former case by looking for the
5282 * ncec_nprobes: the first ncec_nprobes packets are
5283 * probes;
5284 */
5285 if (src_ipif == NULL && nprobes > 0)
5286 goto drop_pkt;
5287
5288 /*
5289 * For forwarded packets, we use the ipmp rotor
5290 * to find send_ill.
5291 */
5292 send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
5293 B_TRUE);
5294 } else {
5295 send_ill = src_ipif->ipif_ill;
5296 ill_refhold(send_ill);
5297 }
5298
5299 DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5300 (ncec_t *), ncec, (ipif_t *),
5301 src_ipif, (ill_t *), send_ill);
5302
5303 if (send_ill == NULL) {
5304 if (src_ipif != NULL)
5305 ipif_refrele(src_ipif);
5306 goto drop_pkt;
5307 }
5308 /* create an under_nce on send_ill */
5309 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5310 if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5311 under_nce = nce_fastpath_create(send_ill, ncec);
5312 else
5313 under_nce = NULL;
5314 rw_exit(&ipst->ips_ill_g_lock);
5315 if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5316 nce_fastpath_trigger(under_nce);
5317
5318 ill_refrele(send_ill);
5319 if (src_ipif != NULL)
5320 ipif_refrele(src_ipif);
5321
5322 if (under_nce != NULL) {
5323 (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5324 ALL_ZONES, 0, NULL);
5325 nce_refrele(under_nce);
5326 if (nprobes > 0)
5327 nprobes--;
5328 mp = nxt_mp;
5329 continue;
5330 }
5331 drop_pkt:
5332 if (isv6) {
5333 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5334 } else {
5335 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5336 }
5337 ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5338 freemsg(mp);
5339 if (nprobes > 0)
5340 nprobes--;
5341 mp = nxt_mp;
5342 }
5343 ncec_cb_dispatch(ncec); /* complete callbacks */
5344 }