1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * Copyright (c) 2019, Joyent, Inc.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/stropts.h>
  32 #include <sys/strsun.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/errno.h>
  35 #include <sys/dlpi.h>
  36 #include <sys/socket.h>
  37 #include <sys/ddi.h>
  38 #include <sys/sunddi.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/debug.h>
  41 #include <sys/vtrace.h>
  42 #include <sys/kmem.h>
  43 #include <sys/zone.h>
  44 #include <sys/ethernet.h>
  45 #include <sys/sdt.h>
  46 #include <sys/mac.h>
  47 
  48 #include <net/if.h>
  49 #include <net/if_types.h>
  50 #include <net/if_dl.h>
  51 #include <net/route.h>
  52 #include <netinet/in.h>
  53 #include <netinet/ip6.h>
  54 #include <netinet/icmp6.h>
  55 
  56 #include <inet/common.h>
  57 #include <inet/mi.h>
  58 #include <inet/mib2.h>
  59 #include <inet/nd.h>
  60 #include <inet/ip.h>
  61 #include <inet/ip_impl.h>
  62 #include <inet/ipclassifier.h>
  63 #include <inet/ip_if.h>
  64 #include <inet/ip_ire.h>
  65 #include <inet/ip_rts.h>
  66 #include <inet/ip6.h>
  67 #include <inet/ip_ndp.h>
  68 #include <inet/sctp_ip.h>
  69 #include <inet/ip_arp.h>
  70 #include <inet/ip2mac_impl.h>
  71 
  72 #define ANNOUNCE_INTERVAL(isv6) \
  73         (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
  74         ipst->ips_ip_arp_publish_interval)
  75 
  76 #define DEFENSE_INTERVAL(isv6) \
  77         (isv6 ? ipst->ips_ndp_defend_interval : \
  78         ipst->ips_arp_defend_interval)
  79 
  80 /* Non-tunable probe interval, based on link capabilities */
  81 #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)
  82 
  83 /*
  84  * The IPv4 Link Local address space is special; we do extra duplicate checking
  85  * there, as the entire assignment mechanism rests on random numbers.
  86  */
  87 #define IS_IPV4_LL_SPACE(ptr)   (((uchar_t *)ptr)[0] == 169 && \
  88                                 ((uchar_t *)ptr)[1] == 254)
  89 
  90 /*
  91  * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
  92  * in to the ncec*add* functions.
  93  *
  94  * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
  95  * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
  96  * that we will respond to requests for the protocol address.
  97  */
  98 #define NCE_EXTERNAL_FLAGS_MASK \
  99         (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
 100         NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
 101         NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
 102 
 103 /*
 104  * Lock ordering:
 105  *
 106  *      ndp_g_lock -> ill_lock -> ncec_lock
 107  *
 108  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 109  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
 110  * ncec_refcnt).
 111  */
 112 
 113 static  void    nce_cleanup_list(ncec_t *ncec);
 114 static  void    nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
 115 static  ncec_t  *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
 116     ncec_t *);
 117 static  nce_t   *nce_lookup_addr(ill_t *, const in6_addr_t *);
 118 static  int     nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
 119     uint16_t ncec_flags, nce_t **newnce);
 120 static  int     nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
 121     uint16_t ncec_flags, nce_t **newnce);
 122 static  boolean_t       ndp_xmit(ill_t *ill, uint32_t operation,
 123     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
 124     const in6_addr_t *target, int flag);
 125 static void     ncec_refhold_locked(ncec_t *);
 126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
 127 static  void    nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
 128 static  int     nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 129     uint16_t, uint16_t, nce_t **);
 130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
 131 static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
 132 static void nce_inactive(nce_t *);
 133 extern nce_t    *nce_lookup(ill_t *, const in6_addr_t *);
 134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
 135 static int      nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 136     uint16_t, uint16_t, nce_t **);
 137 static int      nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
 138     uint16_t, uint16_t, nce_t **);
 139 static int  nce_add_v6_postprocess(nce_t *);
 140 static int  nce_add_v4_postprocess(nce_t *);
 141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
 142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
 143 static void nce_resolv_ipmp_ok(ncec_t *);
 144 static void nce_walk_common(ill_t *, pfi_t, void *);
 145 static void nce_start_timer(ncec_t *, uint_t);
 146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
 147 static void nce_fastpath_trigger(nce_t *);
 148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
 149 
 150 #ifdef DEBUG
 151 static void     ncec_trace_cleanup(const ncec_t *);
 152 #endif
 153 
 154 #define NCE_HASH_PTR_V4(ipst, addr)                                     \
 155         (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
 156 
 157 #define NCE_HASH_PTR_V6(ipst, addr)                              \
 158         (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
 159                 NCE_TABLE_SIZE)]))
 160 
 161 extern kmem_cache_t *ncec_cache;
 162 extern kmem_cache_t *nce_cache;
 163 
 164 /*
 165  * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
 166  * If src_ill is not null, the ncec_addr is bound to src_ill. The
 167  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
 168  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
 169  * IPMP cast_ill (in the IPMP case).
 170  *
 171  * Note that the probe interval is based on the src_ill for IPv6, and
 172  * the ncec_xmit_interval for IPv4.
 173  */
 174 static void
 175 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
 176 {
 177         boolean_t dropped;
 178         uint32_t probe_interval;
 179 
 180         ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
 181         ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
 182         if (ncec->ncec_ipversion == IPV6_VERSION) {
 183                 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
 184                     ncec->ncec_lladdr, ncec->ncec_lladdr_length,
 185                     &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
 186                 probe_interval = ILL_PROBE_INTERVAL(src_ill);
 187         } else {
 188                 /* IPv4 DAD delay the initial probe. */
 189                 if (send_probe)
 190                         dropped = arp_probe(ncec);
 191                 else
 192                         dropped = B_TRUE;
 193                 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
 194                     !send_probe);
 195         }
 196         if (!dropped) {
 197                 mutex_enter(&ncec->ncec_lock);
 198                 ncec->ncec_pcnt--;
 199                 mutex_exit(&ncec->ncec_lock);
 200         }
 201         nce_restart_timer(ncec, probe_interval);
 202 }
 203 
 204 /*
 205  * Compute default flags to use for an advertisement of this ncec's address.
 206  */
 207 static int
 208 nce_advert_flags(const ncec_t *ncec)
 209 {
 210         int flag = 0;
 211 
 212         if (ncec->ncec_flags & NCE_F_ISROUTER)
 213                 flag |= NDP_ISROUTER;
 214         if (!(ncec->ncec_flags & NCE_F_ANYCAST))
 215                 flag |= NDP_ORIDE;
 216 
 217         return (flag);
 218 }
 219 
 220 /*
 221  * NDP Cache Entry creation routine.
 222  * This routine must always be called with ndp6->ndp_g_lock held.
 223  */
 224 int
 225 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
 226     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
 227 {
 228         int             err;
 229         nce_t           *nce;
 230 
 231         ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
 232         ASSERT(ill != NULL && ill->ill_isv6);
 233 
 234         err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
 235             &nce);
 236         if (err != 0)
 237                 return (err);
 238         ASSERT(newnce != NULL);
 239         *newnce = nce;
 240         return (err);
 241 }
 242 
 243 /*
 244  * Post-processing routine to be executed after nce_add_v6(). This function
 245  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
 246  * and must be called without any locks held.
 247  */
 248 int
 249 nce_add_v6_postprocess(nce_t *nce)
 250 {
 251         ncec_t          *ncec = nce->nce_common;
 252         boolean_t       dropped = B_FALSE;
 253         uchar_t         *hw_addr = ncec->ncec_lladdr;
 254         uint_t          hw_addr_len = ncec->ncec_lladdr_length;
 255         ill_t           *ill = ncec->ncec_ill;
 256         int             err = 0;
 257         uint16_t        flags = ncec->ncec_flags;
 258         ip_stack_t      *ipst = ill->ill_ipst;
 259         boolean_t       trigger_fastpath = B_TRUE;
 260 
 261         /*
 262          * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
 263          * we call nce_fastpath as soon as the ncec is resolved in nce_process.
 264          * We call nce_fastpath from nce_update if the link layer address of
 265          * the peer changes from nce_update
 266          */
 267         if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
 268             (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
 269                 trigger_fastpath = B_FALSE;
 270 
 271         if (trigger_fastpath)
 272                 nce_fastpath_trigger(nce);
 273         if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
 274                 ill_t *hwaddr_ill;
 275                 /*
 276                  * Unicast entry that needs DAD.
 277                  */
 278                 if (IS_IPMP(ill)) {
 279                         hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
 280                             hw_addr, hw_addr_len);
 281                 } else {
 282                         hwaddr_ill = ill;
 283                 }
 284                 nce_dad(ncec, hwaddr_ill, B_TRUE);
 285                 err = EINPROGRESS;
 286         } else if (flags & NCE_F_UNSOL_ADV) {
 287                 /*
 288                  * We account for the transmit below by assigning one
 289                  * less than the ndd variable. Subsequent decrements
 290                  * are done in nce_timer.
 291                  */
 292                 mutex_enter(&ncec->ncec_lock);
 293                 ncec->ncec_unsolicit_count =
 294                     ipst->ips_ip_ndp_unsolicit_count - 1;
 295                 mutex_exit(&ncec->ncec_lock);
 296                 dropped = ndp_xmit(ill,
 297                     ND_NEIGHBOR_ADVERT,
 298                     hw_addr,
 299                     hw_addr_len,
 300                     &ncec->ncec_addr,    /* Source and target of the adv */
 301                     &ipv6_all_hosts_mcast, /* Destination of the packet */
 302                     nce_advert_flags(ncec));
 303                 mutex_enter(&ncec->ncec_lock);
 304                 if (dropped)
 305                         ncec->ncec_unsolicit_count++;
 306                 else
 307                         ncec->ncec_last_time_defended = ddi_get_lbolt();
 308                 if (ncec->ncec_unsolicit_count != 0) {
 309                         nce_start_timer(ncec,
 310                             ipst->ips_ip_ndp_unsolicit_interval);
 311                 }
 312                 mutex_exit(&ncec->ncec_lock);
 313         }
 314         return (err);
 315 }
 316 
 317 /*
 318  * Atomically lookup and add (if needed) Neighbor Cache information for
 319  * an address.
 320  *
 321  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
 322  * are always added pointing at the ipmp_ill. Thus, when the ill passed
 323  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 324  * entries will be created, both pointing at the same ncec_t. The nce_t
 325  * entries will have their nce_ill set to the ipmp_ill and the under_ill
 326  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 327  * Local addresses are always created on the ill passed to nce_add_v6.
 328  */
 329 int
 330 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
 331     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
 332 {
 333         int             err = 0;
 334         ip_stack_t      *ipst = ill->ill_ipst;
 335         nce_t           *nce, *upper_nce = NULL;
 336         ill_t           *in_ill = ill;
 337         boolean_t       need_ill_refrele = B_FALSE;
 338 
 339         if (flags & NCE_F_MCAST) {
 340                 /*
 341                  * hw_addr will be figured out in nce_set_multicast_v6;
 342                  * caller has to select the cast_ill
 343                  */
 344                 ASSERT(hw_addr == NULL);
 345                 ASSERT(!IS_IPMP(ill));
 346                 err = nce_set_multicast_v6(ill, addr, flags, newnce);
 347                 return (err);
 348         }
 349         ASSERT(ill->ill_isv6);
 350         if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
 351                 ill = ipmp_ill_hold_ipmp_ill(ill);
 352                 if (ill == NULL)
 353                         return (ENXIO);
 354                 need_ill_refrele = B_TRUE;
 355         }
 356 
 357         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 358         nce = nce_lookup_addr(ill, addr);
 359         if (nce == NULL) {
 360                 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
 361                     &nce);
 362         } else {
 363                 err = EEXIST;
 364         }
 365         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 366         if (err == 0)
 367                 err = nce_add_v6_postprocess(nce);
 368         if (in_ill != ill && nce != NULL) {
 369                 nce_t *under_nce = NULL;
 370 
 371                 /*
 372                  * in_ill was the under_ill. Try to create the under_nce.
 373                  * Hold the ill_g_lock to prevent changes to group membership
 374                  * until we are done.
 375                  */
 376                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 377                 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
 378                         DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
 379                             ill_t *, ill);
 380                         rw_exit(&ipst->ips_ill_g_lock);
 381                         err = ENXIO;
 382                         nce_refrele(nce);
 383                         nce = NULL;
 384                         goto bail;
 385                 }
 386                 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
 387                 if (under_nce == NULL) {
 388                         rw_exit(&ipst->ips_ill_g_lock);
 389                         err = EINVAL;
 390                         nce_refrele(nce);
 391                         nce = NULL;
 392                         goto bail;
 393                 }
 394                 rw_exit(&ipst->ips_ill_g_lock);
 395                 upper_nce = nce;
 396                 nce = under_nce; /* will be returned to caller */
 397                 if (NCE_ISREACHABLE(nce->nce_common))
 398                         nce_fastpath_trigger(under_nce);
 399         }
 400         /* nce_refrele is deferred until the lock is dropped  */
 401         if (nce != NULL) {
 402                 if (newnce != NULL)
 403                         *newnce = nce;
 404                 else
 405                         nce_refrele(nce);
 406         }
 407 bail:
 408         if (upper_nce != NULL)
 409                 nce_refrele(upper_nce);
 410         if (need_ill_refrele)
 411                 ill_refrele(ill);
 412         return (err);
 413 }
 414 
 415 /*
 416  * Remove all the CONDEMNED nces from the appropriate hash table.
 417  * We create a private list of NCEs, these may have ires pointing
 418  * to them, so the list will be passed through to clean up dependent
 419  * ires and only then we can do ncec_refrele() which can make NCE inactive.
 420  */
 421 static void
 422 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
 423 {
 424         ncec_t *ncec1;
 425         ncec_t **ptpn;
 426 
 427         ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
 428         ASSERT(ndp->ndp_g_walker == 0);
 429         for (; ncec; ncec = ncec1) {
 430                 ncec1 = ncec->ncec_next;
 431                 mutex_enter(&ncec->ncec_lock);
 432                 if (NCE_ISCONDEMNED(ncec)) {
 433                         ptpn = ncec->ncec_ptpn;
 434                         ncec1 = ncec->ncec_next;
 435                         if (ncec1 != NULL)
 436                                 ncec1->ncec_ptpn = ptpn;
 437                         *ptpn = ncec1;
 438                         ncec->ncec_ptpn = NULL;
 439                         ncec->ncec_next = NULL;
 440                         ncec->ncec_next = *free_nce_list;
 441                         *free_nce_list = ncec;
 442                 }
 443                 mutex_exit(&ncec->ncec_lock);
 444         }
 445 }
 446 
 447 /*
 448  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
 449  *    will return this NCE. Also no new timeouts will
 450  *    be started (See nce_restart_timer).
 451  * 2. Cancel any currently running timeouts.
 452  * 3. If there is an ndp walker, return. The walker will do the cleanup.
 453  *    This ensures that walkers see a consistent list of NCEs while walking.
 454  * 4. Otherwise remove the NCE from the list of NCEs
 455  */
 456 void
 457 ncec_delete(ncec_t *ncec)
 458 {
 459         ncec_t  **ptpn;
 460         ncec_t  *ncec1;
 461         int     ipversion = ncec->ncec_ipversion;
 462         ndp_g_t *ndp;
 463         ip_stack_t      *ipst = ncec->ncec_ipst;
 464 
 465         if (ipversion == IPV4_VERSION)
 466                 ndp = ipst->ips_ndp4;
 467         else
 468                 ndp = ipst->ips_ndp6;
 469 
 470         /* Serialize deletes */
 471         mutex_enter(&ncec->ncec_lock);
 472         if (NCE_ISCONDEMNED(ncec)) {
 473                 /* Some other thread is doing the delete */
 474                 mutex_exit(&ncec->ncec_lock);
 475                 return;
 476         }
 477         /*
 478          * Caller has a refhold. Also 1 ref for being in the list. Thus
 479          * refcnt has to be >= 2
 480          */
 481         ASSERT(ncec->ncec_refcnt >= 2);
 482         ncec->ncec_flags |= NCE_F_CONDEMNED;
 483         mutex_exit(&ncec->ncec_lock);
 484 
 485         /* Count how many condemned ires for kmem_cache callback */
 486         atomic_inc_32(&ipst->ips_num_nce_condemned);
 487         nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
 488 
 489         /* Complete any waiting callbacks */
 490         ncec_cb_dispatch(ncec);
 491 
 492         /*
 493          * Cancel any running timer. Timeout can't be restarted
 494          * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
 495          * Passing invalid timeout id is fine.
 496          */
 497         if (ncec->ncec_timeout_id != 0) {
 498                 (void) untimeout(ncec->ncec_timeout_id);
 499                 ncec->ncec_timeout_id = 0;
 500         }
 501 
 502         mutex_enter(&ndp->ndp_g_lock);
 503         if (ncec->ncec_ptpn == NULL) {
 504                 /*
 505                  * The last ndp walker has already removed this ncec from
 506                  * the list after we marked the ncec CONDEMNED and before
 507                  * we grabbed the global lock.
 508                  */
 509                 mutex_exit(&ndp->ndp_g_lock);
 510                 return;
 511         }
 512         if (ndp->ndp_g_walker > 0) {
 513                 /*
 514                  * Can't unlink. The walker will clean up
 515                  */
 516                 ndp->ndp_g_walker_cleanup = B_TRUE;
 517                 mutex_exit(&ndp->ndp_g_lock);
 518                 return;
 519         }
 520 
 521         /*
 522          * Now remove the ncec from the list. nce_restart_timer won't restart
 523          * the timer since it is marked CONDEMNED.
 524          */
 525         ptpn = ncec->ncec_ptpn;
 526         ncec1 = ncec->ncec_next;
 527         if (ncec1 != NULL)
 528                 ncec1->ncec_ptpn = ptpn;
 529         *ptpn = ncec1;
 530         ncec->ncec_ptpn = NULL;
 531         ncec->ncec_next = NULL;
 532         mutex_exit(&ndp->ndp_g_lock);
 533 
 534         /* Removed from ncec_ptpn/ncec_next list */
 535         ncec_refrele_notr(ncec);
 536 }
 537 
 538 void
 539 ncec_inactive(ncec_t *ncec)
 540 {
 541         mblk_t          **mpp;
 542         ill_t           *ill = ncec->ncec_ill;
 543         ip_stack_t      *ipst = ncec->ncec_ipst;
 544 
 545         ASSERT(ncec->ncec_refcnt == 0);
 546         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
 547 
 548         /* Count how many condemned nces for kmem_cache callback */
 549         if (NCE_ISCONDEMNED(ncec))
 550                 atomic_add_32(&ipst->ips_num_nce_condemned, -1);
 551 
 552         /* Free all allocated messages */
 553         mpp = &ncec->ncec_qd_mp;
 554         while (*mpp != NULL) {
 555                 mblk_t  *mp;
 556 
 557                 mp = *mpp;
 558                 *mpp = mp->b_next;
 559 
 560                 inet_freemsg(mp);
 561         }
 562         /*
 563          * must have been cleaned up in ncec_delete
 564          */
 565         ASSERT(list_is_empty(&ncec->ncec_cb));
 566         list_destroy(&ncec->ncec_cb);
 567         /*
 568          * free the ncec_lladdr if one was allocated in nce_add_common()
 569          */
 570         if (ncec->ncec_lladdr_length > 0)
 571                 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
 572 
 573 #ifdef DEBUG
 574         ncec_trace_cleanup(ncec);
 575 #endif
 576 
 577         mutex_enter(&ill->ill_lock);
 578         DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
 579             (char *), "ncec", (void *), ncec);
 580         ill->ill_ncec_cnt--;
 581         ncec->ncec_ill = NULL;
 582         /*
 583          * If the number of ncec's associated with this ill have dropped
 584          * to zero, check whether we need to restart any operation that
 585          * is waiting for this to happen.
 586          */
 587         if (ILL_DOWN_OK(ill)) {
 588                 /* ipif_ill_refrele_tail drops the ill_lock */
 589                 ipif_ill_refrele_tail(ill);
 590         } else {
 591                 mutex_exit(&ill->ill_lock);
 592         }
 593 
 594         mutex_destroy(&ncec->ncec_lock);
 595         kmem_cache_free(ncec_cache, ncec);
 596 }
 597 
 598 /*
 599  * ncec_walk routine.  Delete the ncec if it is associated with the ill
 600  * that is going away.  Always called as a writer.
 601  */
 602 void
 603 ncec_delete_per_ill(ncec_t *ncec, void *arg)
 604 {
 605         if ((ncec != NULL) && ncec->ncec_ill == arg) {
 606                 ncec_delete(ncec);
 607         }
 608 }
 609 
 610 /*
 611  * Neighbor Cache cleanup logic for a list of ncec_t entries.
 612  */
 613 static void
 614 nce_cleanup_list(ncec_t *ncec)
 615 {
 616         ncec_t *ncec_next;
 617 
 618         ASSERT(ncec != NULL);
 619         while (ncec != NULL) {
 620                 ncec_next = ncec->ncec_next;
 621                 ncec->ncec_next = NULL;
 622 
 623                 /*
 624                  * It is possible for the last ndp walker (this thread)
 625                  * to come here after ncec_delete has marked the ncec CONDEMNED
 626                  * and before it has removed the ncec from the fastpath list
 627                  * or called untimeout. So we need to do it here. It is safe
 628                  * for both ncec_delete and this thread to do it twice or
 629                  * even simultaneously since each of the threads has a
 630                  * reference on the ncec.
 631                  */
 632                 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
 633                 /*
 634                  * Cancel any running timer. Timeout can't be restarted
 635                  * since CONDEMNED is set. The ncec_lock can't be
 636                  * held across untimeout though passing invalid timeout
 637                  * id is fine.
 638                  */
 639                 if (ncec->ncec_timeout_id != 0) {
 640                         (void) untimeout(ncec->ncec_timeout_id);
 641                         ncec->ncec_timeout_id = 0;
 642                 }
 643                 /* Removed from ncec_ptpn/ncec_next list */
 644                 ncec_refrele_notr(ncec);
 645                 ncec = ncec_next;
 646         }
 647 }
 648 
 649 /*
 650  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
 651  */
 652 boolean_t
 653 nce_restart_dad(ncec_t *ncec)
 654 {
 655         boolean_t started;
 656         ill_t *ill, *hwaddr_ill;
 657 
 658         if (ncec == NULL)
 659                 return (B_FALSE);
 660         ill = ncec->ncec_ill;
 661         mutex_enter(&ncec->ncec_lock);
 662         if (ncec->ncec_state == ND_PROBE) {
 663                 mutex_exit(&ncec->ncec_lock);
 664                 started = B_TRUE;
 665         } else if (ncec->ncec_state == ND_REACHABLE) {
 666                 ASSERT(ncec->ncec_lladdr != NULL);
 667                 ncec->ncec_state = ND_PROBE;
 668                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
 669                 /*
 670                  * Slight cheat here: we don't use the initial probe delay
 671                  * for IPv4 in this obscure case.
 672                  */
 673                 mutex_exit(&ncec->ncec_lock);
 674                 if (IS_IPMP(ill)) {
 675                         hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
 676                             ncec->ncec_lladdr, ncec->ncec_lladdr_length);
 677                 } else {
 678                         hwaddr_ill = ill;
 679                 }
 680                 nce_dad(ncec, hwaddr_ill, B_TRUE);
 681                 started = B_TRUE;
 682         } else {
 683                 mutex_exit(&ncec->ncec_lock);
 684                 started = B_FALSE;
 685         }
 686         return (started);
 687 }
 688 
 689 /*
 690  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
 691  * If one is found, the refcnt on the ncec will be incremented.
 692  */
 693 ncec_t *
 694 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
 695 {
 696         ncec_t          *ncec;
 697         ip_stack_t      *ipst = ill->ill_ipst;
 698 
 699         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 700         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 701 
 702         /* Get head of v6 hash table */
 703         ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
 704         ncec = ncec_lookup_illgrp(ill, addr, ncec);
 705         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 706         rw_exit(&ipst->ips_ill_g_lock);
 707         return (ncec);
 708 }
 709 /*
 710  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
 711  * If one is found, the refcnt on the ncec will be incremented.
 712  */
 713 ncec_t *
 714 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
 715 {
 716         ncec_t  *ncec = NULL;
 717         in6_addr_t addr6;
 718         ip_stack_t *ipst = ill->ill_ipst;
 719 
 720         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 721         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
 722 
 723         /* Get head of v4 hash table */
 724         ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
 725         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
 726         ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
 727         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
 728         rw_exit(&ipst->ips_ill_g_lock);
 729         return (ncec);
 730 }
 731 
 732 /*
 733  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
 734  * If an ncec is found, increment the hold count on that ncec.
 735  * The caller passes in the start of the appropriate hash table, and must
 736  * be holding the appropriate global lock (ndp_g_lock). In addition, since
 737  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
 738  * must be held as reader.
 739  *
 740  * This function always matches across the ipmp group.
 741  */
 742 ncec_t *
 743 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
 744 {
 745         ndp_g_t         *ndp;
 746         ip_stack_t      *ipst = ill->ill_ipst;
 747 
 748         if (ill->ill_isv6)
 749                 ndp = ipst->ips_ndp6;
 750         else
 751                 ndp = ipst->ips_ndp4;
 752 
 753         ASSERT(ill != NULL);
 754         ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
 755         if (IN6_IS_ADDR_UNSPECIFIED(addr))
 756                 return (NULL);
 757         for (; ncec != NULL; ncec = ncec->ncec_next) {
 758                 if (ncec->ncec_ill == ill ||
 759                     IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
 760                         if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
 761                                 mutex_enter(&ncec->ncec_lock);
 762                                 if (!NCE_ISCONDEMNED(ncec)) {
 763                                         ncec_refhold_locked(ncec);
 764                                         mutex_exit(&ncec->ncec_lock);
 765                                         break;
 766                                 }
 767                                 mutex_exit(&ncec->ncec_lock);
 768                         }
 769                 }
 770         }
 771         return (ncec);
 772 }
 773 
 774 /*
 775  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 776  * entries for ill only, i.e., when ill is part of an ipmp group,
 777  * nce_lookup_v4 will never try to match across the group.
 778  */
 779 nce_t *
 780 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
 781 {
 782         nce_t *nce;
 783         in6_addr_t addr6;
 784         ip_stack_t *ipst = ill->ill_ipst;
 785 
 786         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
 787         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
 788         nce = nce_lookup_addr(ill, &addr6);
 789         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
 790         return (nce);
 791 }
 792 
 793 /*
 794  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 795  * entries for ill only, i.e., when ill is part of an ipmp group,
 796  * nce_lookup_v6 will never try to match across the group.
 797  */
 798 nce_t *
 799 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
 800 {
 801         nce_t *nce;
 802         ip_stack_t *ipst = ill->ill_ipst;
 803 
 804         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 805         nce = nce_lookup_addr(ill, addr6);
 806         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 807         return (nce);
 808 }
 809 
 810 static nce_t *
 811 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
 812 {
 813         nce_t *nce;
 814 
 815         ASSERT(ill != NULL);
 816 #ifdef DEBUG
 817         if (ill->ill_isv6)
 818                 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
 819         else
 820                 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
 821 #endif
 822         mutex_enter(&ill->ill_lock);
 823         nce = nce_lookup(ill, addr);
 824         mutex_exit(&ill->ill_lock);
 825         return (nce);
 826 }
 827 
 828 
 829 /*
 830  * Router turned to host.  We need to make sure that cached copies of the ncec
 831  * are not used for forwarding packets if they were derived from the default
 832  * route, and that the default route itself is removed, as  required by
 833  * section 7.2.5 of RFC 2461.
 834  *
 835  * Note that the ncec itself probably has valid link-layer information for the
 836  * nexthop, so that there is no reason to delete the ncec, as long as the
 837  * ISROUTER flag is turned off.
 838  */
 839 static void
 840 ncec_router_to_host(ncec_t *ncec)
 841 {
 842         ire_t           *ire;
 843         ip_stack_t      *ipst = ncec->ncec_ipst;
 844 
 845         mutex_enter(&ncec->ncec_lock);
 846         ncec->ncec_flags &= ~NCE_F_ISROUTER;
 847         mutex_exit(&ncec->ncec_lock);
 848 
 849         ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
 850             &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
 851             MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
 852         if (ire != NULL) {
 853                 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
 854                 ire_delete(ire);
 855                 ire_refrele(ire);
 856         }
 857 }
 858 
 859 /*
 860  * Process passed in parameters either from an incoming packet or via
 861  * user ioctl.
 862  */
 863 void
 864 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
 865 {
 866         ill_t   *ill = ncec->ncec_ill;
 867         uint32_t hw_addr_len = ill->ill_phys_addr_length;
 868         boolean_t ll_updated = B_FALSE;
 869         boolean_t ll_changed;
 870         nce_t   *nce;
 871 
 872         ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
 873         /*
 874          * No updates of link layer address or the neighbor state is
 875          * allowed, when the cache is in NONUD state.  This still
 876          * allows for responding to reachability solicitation.
 877          */
 878         mutex_enter(&ncec->ncec_lock);
 879         if (ncec->ncec_state == ND_INCOMPLETE) {
 880                 if (hw_addr == NULL) {
 881                         mutex_exit(&ncec->ncec_lock);
 882                         return;
 883                 }
 884                 nce_set_ll(ncec, hw_addr);
 885                 /*
 886                  * Update ncec state and send the queued packets
 887                  * back to ip this time ire will be added.
 888                  */
 889                 if (flag & ND_NA_FLAG_SOLICITED) {
 890                         nce_update(ncec, ND_REACHABLE, NULL);
 891                 } else {
 892                         nce_update(ncec, ND_STALE, NULL);
 893                 }
 894                 mutex_exit(&ncec->ncec_lock);
 895                 nce = nce_fastpath(ncec, B_TRUE, NULL);
 896                 nce_resolv_ok(ncec);
 897                 if (nce != NULL)
 898                         nce_refrele(nce);
 899                 return;
 900         }
 901         ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
 902         if (!is_adv) {
 903                 /* If this is a SOLICITATION request only */
 904                 if (ll_changed)
 905                         nce_update(ncec, ND_STALE, hw_addr);
 906                 mutex_exit(&ncec->ncec_lock);
 907                 ncec_cb_dispatch(ncec);
 908                 return;
 909         }
 910         if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
 911                 /* If in any other state than REACHABLE, ignore */
 912                 if (ncec->ncec_state == ND_REACHABLE) {
 913                         nce_update(ncec, ND_STALE, NULL);
 914                 }
 915                 mutex_exit(&ncec->ncec_lock);
 916                 ncec_cb_dispatch(ncec);
 917                 return;
 918         } else {
 919                 if (ll_changed) {
 920                         nce_update(ncec, ND_UNCHANGED, hw_addr);
 921                         ll_updated = B_TRUE;
 922                 }
 923                 if (flag & ND_NA_FLAG_SOLICITED) {
 924                         nce_update(ncec, ND_REACHABLE, NULL);
 925                 } else {
 926                         if (ll_updated) {
 927                                 nce_update(ncec, ND_STALE, NULL);
 928                         }
 929                 }
 930                 mutex_exit(&ncec->ncec_lock);
 931                 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
 932                     NCE_F_ISROUTER)) {
 933                         ncec_router_to_host(ncec);
 934                 } else {
 935                         ncec_cb_dispatch(ncec);
 936                 }
 937         }
 938 }
 939 
 940 /*
 941  * Pass arg1 to the cbf supplied, along with each ncec in existence.
 942  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
 943  * walking the hash list.
 944  */
 945 void
 946 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
 947     void *arg1, boolean_t trace)
 948 {
 949         ncec_t  *ncec;
 950         ncec_t  *ncec1;
 951         ncec_t  **ncep;
 952         ncec_t  *free_nce_list = NULL;
 953 
 954         mutex_enter(&ndp->ndp_g_lock);
 955         /* Prevent ncec_delete from unlink and free of NCE */
 956         ndp->ndp_g_walker++;
 957         mutex_exit(&ndp->ndp_g_lock);
 958         for (ncep = ndp->nce_hash_tbl;
 959             ncep < A_END(ndp->nce_hash_tbl); ncep++) {
 960                 for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
 961                         ncec1 = ncec->ncec_next;
 962                         if (ill == NULL || ncec->ncec_ill == ill) {
 963                                 if (trace) {
 964                                         ncec_refhold(ncec);
 965                                         (*cbf)(ncec, arg1);
 966                                         ncec_refrele(ncec);
 967                                 } else {
 968                                         ncec_refhold_notr(ncec);
 969                                         (*cbf)(ncec, arg1);
 970                                         ncec_refrele_notr(ncec);
 971                                 }
 972                         }
 973                 }
 974         }
 975         mutex_enter(&ndp->ndp_g_lock);
 976         ndp->ndp_g_walker--;
 977         if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
 978                 /* Time to delete condemned entries */
 979                 for (ncep = ndp->nce_hash_tbl;
 980                     ncep < A_END(ndp->nce_hash_tbl); ncep++) {
 981                         ncec = *ncep;
 982                         if (ncec != NULL) {
 983                                 nce_remove(ndp, ncec, &free_nce_list);
 984                         }
 985                 }
 986                 ndp->ndp_g_walker_cleanup = B_FALSE;
 987         }
 988 
 989         mutex_exit(&ndp->ndp_g_lock);
 990 
 991         if (free_nce_list != NULL) {
 992                 nce_cleanup_list(free_nce_list);
 993         }
 994 }
 995 
 996 /*
 997  * Walk everything.
 998  * Note that ill can be NULL hence can't derive the ipst from it.
 999  */
1000 void
1001 ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
1002 {
1003         ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1004         ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1005 }
1006 
/*
 * Cheesy globals (i.e. shared by all netstacks) for both a limit on per-ill
 * multicast NCEs, and the number to reclaim if we hit the limit.  Used by
 * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
 * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
 */

/*
 * Maximum number of multicast NCEs on an ill.  nce_too_many_mcast() compares
 * ill_mcast_nces against this value.
 */
uint_t ip_max_ill_mcast_nces = 16384;
/*
 * Number of NCEs to delete if we hit the maximum above.  0 means *don't*
 * reclaim: nce_too_many_mcast() then simply reports that the ill is full.
 * Non-zero means delete that many; a value larger than the maximum above is
 * clamped to it.
 */
uint_t ip_ill_mcast_reclaim = 256;
1022 
1023 /*
1024  * Encapsulate multicast ill capping in a function, for easier DTrace
1025  * detections.  Return a list of refheld NCEs to destroy-via-refrele.  That
1026  * list can be NULL, but can only be non-NULL if we successfully reclaimed.
1027  *
1028  * NOTE:  This function must be called while holding the ill_lock AND
1029  * JUST PRIOR to making the insertion into the ill_nce list.
1030  *
1031  * We can't release the ones we delete ourselves because the ill_lock is held
1032  * by the caller. They are, instead, passed back in a list_t for deletion
1033  * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
1034  *
1035  * While this covers nce_t, ncec_t gets done even further down the road.  See
1036  * nce_graveyard_free() for why.
1037  */
1038 static boolean_t
1039 nce_too_many_mcast(ill_t *ill, list_t *graveyard)
1040 {
1041         uint_t reclaim_count, max_count, reclaimed = 0;
1042         boolean_t too_many;
1043         nce_t *nce, *deadman;
1044 
1045         ASSERT(graveyard != NULL);
1046         ASSERT(list_is_empty(graveyard));
1047         ASSERT(MUTEX_HELD(&ill->ill_lock));
1048 
1049         /*
1050          * NOTE: Some grinning weirdo may have lowered the global max beyond
1051          * what this ill currently has.  The behavior in this case will be
1052          * trim-back just by the reclaim amount for any new ones.
1053          */
1054         max_count = ip_max_ill_mcast_nces;
1055         reclaim_count = min(ip_ill_mcast_reclaim, max_count);
1056 
1057         /* All good? */
1058         if (ill->ill_mcast_nces < max_count)
1059                 return (B_FALSE);       /* Yes, all good. */
1060 
1061         if (reclaim_count == 0)
1062                 return (B_TRUE);        /* Don't bother - we're stuck. */
1063 
1064         /* We need to reclaim now.  Exploit our held ill_lock. */
1065 
1066         /*
1067          * Start at the tail and work backwards, new nces are head-inserted,
1068          * so we'll be reaping the oldest entries.
1069          */
1070         nce = list_tail(&ill->ill_nce);
1071         while (reclaimed < reclaim_count) {
1072                 /* Skip ahead to a multicast NCE. */
1073                 while (nce != NULL &&
1074                     (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
1075                         nce = list_prev(&ill->ill_nce, nce);
1076                 }
1077                 if (nce == NULL)
1078                         break;
1079 
1080                 /*
1081                  * NOTE: For now, we just delete the first one(s) we find.
1082                  * This is not optimal, and may require some inspection of nce
1083                  * & its ncec to be better.
1084                  */
1085                 deadman = nce;
1086                 nce = list_prev(&ill->ill_nce, nce);
1087 
1088                 /* nce_delete() requires caller holds... */
1089                 nce_refhold(deadman);
1090                 nce_delete(deadman);    /* Bumps down ill_mcast_nces. */
1091 
1092                 /* Link the dead ones singly, still refheld... */
1093                 list_insert_tail(graveyard, deadman);
1094                 reclaimed++;
1095         }
1096 
1097         if (reclaimed != reclaim_count) {
1098                 /* We didn't have enough to reach reclaim_count. Why?!? */
1099                 DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
1100                     uint_t, reclaimed, uint_t, reclaim_count);
1101 
1102                 /* In case for some REALLY weird reason we found none! */
1103                 too_many = (reclaimed == 0);
1104         } else {
1105                 too_many = B_FALSE;
1106         }
1107 
1108         return (too_many);
1109 }
1110 
1111 static void
1112 ncec_mcast_reap_one(ncec_t *ncec, void *arg)
1113 {
1114         boolean_t reapit;
1115         ill_t *ill = (ill_t *)arg;
1116 
1117         /* Obvious no-lock-needed checks... */
1118         if (ncec == NULL || ncec->ncec_ill != ill ||
1119             (ncec->ncec_flags & NCE_F_MCAST) == 0)
1120                 return;
1121 
1122         mutex_enter(&ncec->ncec_lock);
1123         /*
1124          * It's refheld by the walk infrastructure. It has one reference for
1125          * being in the ndp_g_hash, and if an nce_t exists, that's one more.
1126          * We want ones without an nce_t, so 2 is the magic number.  If it's
1127          * LESS than 2, we have much bigger problems anyway.
1128          */
1129         ASSERT(ncec->ncec_refcnt >= 2);
1130         reapit = (ncec->ncec_refcnt == 2);
1131         mutex_exit(&ncec->ncec_lock);
1132 
1133         if (reapit) {
1134                 IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
1135                 ncec_delete(ncec);
1136         }
1137 }
1138 
1139 /*
1140  * Attempt to reap stray multicast ncec_t structures left in the wake of
1141  * nce_graveyard_free(). This is a taskq servicing routine, as it's well
1142  * outside any netstack-global locks being held - ndp_g_lock in this case.  We
1143  * have a reference hold on the ill, which will prevent any unplumbing races.
1144  */
1145 static void
1146 ncec_mcast_reap(void *arg)
1147 {
1148         ill_t *ill = (ill_t *)arg;
1149 
1150         IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
1151         ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
1152         mutex_enter(&ill->ill_lock);
1153         ill->ill_mcast_ncec_cleanup = B_FALSE;
1154         /*
1155          * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
1156          * below for why.
1157          */
1158         ill->ill_refcnt--;
1159         if (ill->ill_refcnt == 0)
1160                 ipif_ill_refrele_tail(ill);     /* Drops ill_lock. */
1161         else
1162                 mutex_exit(&ill->ill_lock);
1163 }
1164 
1165 /*
1166  * Free a list (including handling an empty list or NULL list) of
1167  * reference-held NCEs that were reaped from a nce_too_many_mcast()
1168  * call. Separate because the caller must have dropped ndp_g_lock first.
1169  *
1170  * This also schedules a taskq task to unlink underlying NCECs from the
1171  * ndp_g_hash, which are protected by ndp_g_lock.
1172  */
1173 static void
1174 nce_graveyard_free(list_t *graveyard)
1175 {
1176         nce_t *deadman, *current;
1177         ill_t *ill;
1178         boolean_t doit;
1179 
1180         if (graveyard == NULL)
1181                 return;
1182 
1183         current = list_head(graveyard);
1184         if (current == NULL) {
1185                 list_destroy(graveyard);
1186                 return;
1187         }
1188 
1189         ill = current->nce_ill;
1190         /*
1191          * Normally one should ill_refhold(ill) here.  There's no _notr()
1192          * variant like there is for ire_t, dce_t, or even ncec_t, but this is
1193          * the ONLY case that'll break the mh_trace that IP debugging uses for
1194          * reference counts (i.e. they assume same thread releases as
1195          * holds). Instead, we inline ill_refhold() here.  We must do the same
1196          * in the release done by the ncec_mcast_reap() above.
1197          */
1198         mutex_enter(&ill->ill_lock);
1199         ill->ill_refcnt++;
1200         mutex_exit(&ill->ill_lock);
1201 
1202         while (current != NULL) {
1203                 ASSERT3P(ill, ==, current->nce_ill);
1204                 deadman = current;
1205                 current = list_next(graveyard, deadman);
1206                 list_remove(graveyard, deadman);
1207                 ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
1208                     0);
1209                 nce_refrele(deadman);
1210         }
1211         list_destroy(graveyard);
1212 
1213         mutex_enter(&ill->ill_lock);
1214         if (ill->ill_mcast_ncec_cleanup)
1215                 doit = B_FALSE;
1216         else {
1217                 ill->ill_mcast_ncec_cleanup = B_TRUE;
1218                 doit = B_TRUE;
1219         }
1220         mutex_exit(&ill->ill_lock);
1221         if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
1222             ill, TQ_NOSLEEP) == NULL) {
1223                 mutex_enter(&ill->ill_lock);
1224                 if (doit) {
1225                         IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
1226                         ill->ill_mcast_ncec_cleanup = B_FALSE;
1227                 }
1228                 /* There's no _notr() for ill_refrele(), so inline it here. */
1229                 ill->ill_refcnt--;
1230                 if (ill->ill_refcnt == 0)
1231                         ipif_ill_refrele_tail(ill);     /* Drops ill_lock */
1232                 else
1233                         mutex_exit(&ill->ill_lock);
1234         }
1235 }
1236 
1237 /*
1238  * For each interface an entry is added for the unspecified multicast group.
1239  * Here that mapping is used to form the multicast cache entry for a particular
1240  * multicast destination.
1241  */
1242 static int
1243 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1244     uint16_t flags, nce_t **newnce)
1245 {
1246         uchar_t         *hw_addr;
1247         int             err = 0;
1248         ip_stack_t      *ipst = ill->ill_ipst;
1249         nce_t           *nce;
1250 
1251         ASSERT(ill != NULL);
1252         ASSERT(ill->ill_isv6);
1253         ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1254 
1255         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1256         nce = nce_lookup_addr(ill, dst);
1257         if (nce != NULL) {
1258                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1259                 goto done;
1260         }
1261         if (ill->ill_net_type == IRE_IF_RESOLVER) {
1262                 /*
1263                  * For IRE_IF_RESOLVER a hardware mapping can be
1264                  * generated.
1265                  */
1266                 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1267                 if (hw_addr == NULL) {
1268                         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1269                         return (ENOMEM);
1270                 }
1271                 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1272         } else {
1273                 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1274                 hw_addr = NULL;
1275         }
1276         ASSERT((flags & NCE_F_MCAST) != 0);
1277         ASSERT((flags & NCE_F_NONUD) != 0);
1278         /* nce_state will be computed by nce_add_common() */
1279         err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1280             ND_UNCHANGED, &nce);
1281         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1282         if (err == 0)
1283                 err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
1284         if (hw_addr != NULL)
1285                 kmem_free(hw_addr, ill->ill_nd_lla_len);
1286         if (err != 0) {
1287                 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1288                 return (err);
1289         }
1290 done:
1291         ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1292         if (newnce != NULL)
1293                 *newnce = nce;
1294         else
1295                 nce_refrele(nce);
1296         return (0);
1297 }
1298 
1299 /*
1300  * Return the link layer address, and any flags of a ncec.
1301  */
1302 int
1303 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1304 {
1305         ncec_t          *ncec;
1306         in6_addr_t      *addr;
1307         sin6_t          *sin6;
1308 
1309         ASSERT(ill != NULL && ill->ill_isv6);
1310         sin6 = (sin6_t *)&lnr->lnr_addr;
1311         addr =  &sin6->sin6_addr;
1312 
1313         /*
1314          * NOTE: if the ill is an IPMP interface, then match against the whole
1315          * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1316          * addresses for the data addresses on an IPMP interface even though
1317          * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1318          */
1319         ncec = ncec_lookup_illgrp_v6(ill, addr);
1320         if (ncec == NULL)
1321                 return (ESRCH);
1322         /* If no link layer address is available yet, return ESRCH */
1323         if (!NCE_ISREACHABLE(ncec)) {
1324                 ncec_refrele(ncec);
1325                 return (ESRCH);
1326         }
1327         lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1328         bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1329             lnr->lnr_hdw_len);
1330         if (ncec->ncec_flags & NCE_F_ISROUTER)
1331                 lnr->lnr_flags = NDF_ISROUTER_ON;
1332         if (ncec->ncec_flags & NCE_F_ANYCAST)
1333                 lnr->lnr_flags |= NDF_ANYCAST_ON;
1334         if (ncec->ncec_flags & NCE_F_STATIC)
1335                 lnr->lnr_flags |= NDF_STATIC;
1336         ncec_refrele(ncec);
1337         return (0);
1338 }
1339 
1340 /*
1341  * Finish setting up the Enable/Disable multicast for the driver.
1342  */
1343 mblk_t *
1344 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1345     uint32_t hw_addr_offset, mblk_t *mp)
1346 {
1347         uchar_t         *hw_addr;
1348         ipaddr_t        v4group;
1349         uchar_t         *addr;
1350 
1351         ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1352         if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1353                 IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1354 
1355                 ASSERT(CLASSD(v4group));
1356                 ASSERT(!(ill->ill_isv6));
1357 
1358                 addr = (uchar_t *)&v4group;
1359         } else {
1360                 ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1361                 ASSERT(ill->ill_isv6);
1362 
1363                 addr = (uchar_t *)v6group;
1364         }
1365         hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1366         if (hw_addr == NULL) {
1367                 ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1368                 freemsg(mp);
1369                 return (NULL);
1370         }
1371 
1372         ip_mcast_mapping(ill, addr, hw_addr);
1373         return (mp);
1374 }
1375 
1376 void
1377 ip_ndp_resolve(ncec_t *ncec)
1378 {
1379         in_addr_t       sender4 = INADDR_ANY;
1380         in6_addr_t      sender6 = ipv6_all_zeros;
1381         ill_t           *src_ill;
1382         uint32_t        ms;
1383 
1384         src_ill = nce_resolve_src(ncec, &sender6);
1385         if (src_ill == NULL) {
1386                 /* Make sure we try again later */
1387                 ms = ncec->ncec_ill->ill_reachable_retrans_time;
1388                 nce_restart_timer(ncec, (clock_t)ms);
1389                 return;
1390         }
1391         if (ncec->ncec_ipversion == IPV4_VERSION)
1392                 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1393         mutex_enter(&ncec->ncec_lock);
1394         if (ncec->ncec_ipversion == IPV6_VERSION)
1395                 ms = ndp_solicit(ncec, sender6, src_ill);
1396         else
1397                 ms = arp_request(ncec, sender4, src_ill);
1398         mutex_exit(&ncec->ncec_lock);
1399         if (ms == 0) {
1400                 if (ncec->ncec_state != ND_REACHABLE) {
1401                         if (ncec->ncec_ipversion == IPV6_VERSION)
1402                                 ndp_resolv_failed(ncec);
1403                         else
1404                                 arp_resolv_failed(ncec);
1405                         ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1406                         nce_make_unreachable(ncec);
1407                         ncec_delete(ncec);
1408                 }
1409         } else {
1410                 nce_restart_timer(ncec, (clock_t)ms);
1411         }
1412 done:
1413         ill_refrele(src_ill);
1414 }
1415 
1416 /*
1417  * Send an IPv6 neighbor solicitation.
1418  * Returns number of milliseconds after which we should either rexmit or abort.
1419  * Return of zero means we should abort.
1420  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1421  * The optional source address is used as a hint to ndp_solicit for
1422  * which source to use in the packet.
1423  *
1424  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1425  * the packet.
1426  */
1427 uint32_t
1428 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1429 {
1430         in6_addr_t      dst;
1431         boolean_t       dropped = B_FALSE;
1432 
1433         ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1434         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1435 
1436         if (ncec->ncec_rcnt == 0)
1437                 return (0);
1438 
1439         dst = ncec->ncec_addr;
1440         ncec->ncec_rcnt--;
1441         mutex_exit(&ncec->ncec_lock);
1442         dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1443             ill->ill_phys_addr_length, &src, &dst, 0);
1444         mutex_enter(&ncec->ncec_lock);
1445         if (dropped)
1446                 ncec->ncec_rcnt++;
1447         return (ncec->ncec_ill->ill_reachable_retrans_time);
1448 }
1449 
1450 /*
1451  * Attempt to recover an address on an interface that's been marked as a
1452  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1453  * no easy way to just probe the address and have the right thing happen if
1454  * it's no longer in use.  Instead, we just bring it up normally and allow the
1455  * regular interface start-up logic to probe for a remaining duplicate and take
1456  * us back down if necessary.
1457  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1458  * ip_ndp_excl.
1459  */
1460 /* ARGSUSED */
1461 void
1462 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1463 {
1464         ill_t   *ill = rq->q_ptr;
1465         ipif_t  *ipif;
1466         in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1467         in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1468         boolean_t addr_equal;
1469 
1470         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1471                 /*
1472                  * We do not support recovery of proxy ARP'd interfaces,
1473                  * because the system lacks a complete proxy ARP mechanism.
1474                  */
1475                 if (ill->ill_isv6) {
1476                         addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1477                             addr6);
1478                 } else {
1479                         addr_equal = (ipif->ipif_lcl_addr == *addr4);
1480                 }
1481 
1482                 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1483                         continue;
1484 
1485                 /*
1486                  * If we have already recovered or if the interface is going
1487                  * away, then ignore.
1488                  */
1489                 mutex_enter(&ill->ill_lock);
1490                 if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1491                     (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1492                         mutex_exit(&ill->ill_lock);
1493                         continue;
1494                 }
1495 
1496                 ipif->ipif_flags &= ~IPIF_DUPLICATE;
1497                 ill->ill_ipif_dup_count--;
1498                 mutex_exit(&ill->ill_lock);
1499                 ipif->ipif_was_dup = B_TRUE;
1500 
1501                 if (ill->ill_isv6) {
1502                         VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1503                         (void) ipif_up_done_v6(ipif);
1504                 } else {
1505                         VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1506                             EINPROGRESS);
1507                         (void) ipif_up_done(ipif);
1508                 }
1509         }
1510         freeb(mp);
1511 }
1512 
1513 /*
1514  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1515  * As long as someone else holds the address, the interface will stay down.
1516  * When that conflict goes away, the interface is brought back up.  This is
1517  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1518  * server will recover from a failure.
1519  *
1520  * For DHCP and temporary addresses, recovery is not done in the kernel.
1521  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1522  *
1523  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1524  */
1525 void
1526 ipif_dup_recovery(void *arg)
1527 {
1528         ipif_t *ipif = arg;
1529 
1530         ipif->ipif_recovery_id = 0;
1531         if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1532                 return;
1533 
1534         /*
1535          * No lock, because this is just an optimization.
1536          */
1537         if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1538                 return;
1539 
1540         /* If the link is down, we'll retry this later */
1541         if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1542                 return;
1543 
1544         ipif_do_recovery(ipif);
1545 }
1546 
1547 /*
1548  * Perform interface recovery by forcing the duplicate interfaces up and
1549  * allowing the system to determine which ones should stay up.
1550  *
1551  * Called both by recovery timer expiry and link-up notification.
1552  */
1553 void
1554 ipif_do_recovery(ipif_t *ipif)
1555 {
1556         ill_t *ill = ipif->ipif_ill;
1557         mblk_t *mp;
1558         ip_stack_t *ipst = ill->ill_ipst;
1559         size_t mp_size;
1560 
1561         if (ipif->ipif_isv6)
1562                 mp_size = sizeof (ipif->ipif_v6lcl_addr);
1563         else
1564                 mp_size = sizeof (ipif->ipif_lcl_addr);
1565         mp = allocb(mp_size, BPRI_MED);
1566         if (mp == NULL) {
1567                 mutex_enter(&ill->ill_lock);
1568                 if (ipst->ips_ip_dup_recovery > 0 &&
1569                     ipif->ipif_recovery_id == 0 &&
1570                     !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1571                         ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1572                             ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1573                 }
1574                 mutex_exit(&ill->ill_lock);
1575         } else {
1576                 /*
1577                  * A recovery timer may still be running if we got here from
1578                  * ill_restart_dad(); cancel that timer.
1579                  */
1580                 if (ipif->ipif_recovery_id != 0)
1581                         (void) untimeout(ipif->ipif_recovery_id);
1582                 ipif->ipif_recovery_id = 0;
1583 
1584                 if (ipif->ipif_isv6) {
1585                         bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1586                             sizeof (ipif->ipif_v6lcl_addr));
1587                 } else  {
1588                         bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1589                             sizeof (ipif->ipif_lcl_addr));
1590                 }
1591                 ill_refhold(ill);
1592                 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1593                     B_FALSE);
1594         }
1595 }
1596 
1597 /*
1598  * Find the MAC and IP addresses in an NA/NS message.
1599  */
1600 static void
1601 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1602     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1603 {
1604         icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1605         nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1606         uchar_t *addr;
1607         int alen;
1608 
1609         /* icmp_inbound_v6 ensures this */
1610         ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1611 
1612         addr = ira->ira_l2src;
1613         alen = ill->ill_phys_addr_length;
1614         if (alen > 0) {
1615                 *haddr = addr;
1616                 *haddrlenp = alen;
1617         } else {
1618                 *haddr = NULL;
1619                 *haddrlenp = 0;
1620         }
1621 
1622         /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1623         *targp = ns->nd_ns_target;
1624 }
1625 
1626 /*
1627  * This is for exclusive changes due to NDP duplicate address detection
1628  * failure.
1629  */
1630 /* ARGSUSED */
1631 static void
1632 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1633 {
1634         ill_t   *ill = rq->q_ptr;
1635         ipif_t  *ipif;
1636         uchar_t *haddr;
1637         uint_t  haddrlen;
1638         ip_stack_t *ipst = ill->ill_ipst;
1639         in6_addr_t targ;
1640         ip_recv_attr_t iras;
1641         mblk_t  *attrmp;
1642 
1643         attrmp = mp;
1644         mp = mp->b_cont;
1645         attrmp->b_cont = NULL;
1646         if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1647                 /* The ill or ip_stack_t disappeared on us */
1648                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1649                 ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1650                 freemsg(mp);
1651                 ira_cleanup(&iras, B_TRUE);
1652                 return;
1653         }
1654 
1655         ASSERT(ill == iras.ira_rill);
1656 
1657         ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1658         if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1659                 /*
1660                  * Ignore conflicts generated by misbehaving switches that
1661                  * just reflect our own messages back to us.  For IPMP, we may
1662                  * see reflections across any ill in the illgrp.
1663                  *
1664                  * RFC2462 and revisions tried to detect both the case
1665                  * when a statically configured IPv6 address is a duplicate,
1666                  * and the case when the L2 address itself is a duplicate. The
1667                  * later is important because, with stateles address autoconf,
1668                  * if the L2 address is a duplicate, the resulting IPv6
1669                  * address(es) would also be duplicates. We rely on DAD of the
1670                  * IPv6 address itself to detect the latter case.
1671                  */
1672                 /* For an under ill_grp can change under lock */
1673                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1674                 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1675                     IS_UNDER_IPMP(ill) &&
1676                     ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1677                     haddrlen) != NULL) {
1678                         rw_exit(&ipst->ips_ill_g_lock);
1679                         goto ignore_conflict;
1680                 }
1681                 rw_exit(&ipst->ips_ill_g_lock);
1682         }
1683 
1684         /*
1685          * Look up the appropriate ipif.
1686          */
1687         ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1688         if (ipif == NULL)
1689                 goto ignore_conflict;
1690 
1691         /* Reload the ill to match the ipif */
1692         ill = ipif->ipif_ill;
1693 
1694         /* If it's already duplicate or ineligible, then don't do anything. */
1695         if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1696                 ipif_refrele(ipif);
1697                 goto ignore_conflict;
1698         }
1699 
1700         /*
1701          * If this is a failure during duplicate recovery, then don't
1702          * complain.  It may take a long time to recover.
1703          */
1704         if (!ipif->ipif_was_dup) {
1705                 char ibuf[LIFNAMSIZ];
1706                 char hbuf[MAC_STR_LEN];
1707                 char sbuf[INET6_ADDRSTRLEN];
1708 
1709                 ipif_get_name(ipif, ibuf, sizeof (ibuf));
1710                 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1711                     " disabled", ibuf,
1712                     inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1713                     mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1714         }
1715         mutex_enter(&ill->ill_lock);
1716         ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1717         ipif->ipif_flags |= IPIF_DUPLICATE;
1718         ill->ill_ipif_dup_count++;
1719         mutex_exit(&ill->ill_lock);
1720         (void) ipif_down(ipif, NULL, NULL);
1721         (void) ipif_down_tail(ipif);
1722         mutex_enter(&ill->ill_lock);
1723         if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1724             ill->ill_net_type == IRE_IF_RESOLVER &&
1725             !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1726             ipst->ips_ip_dup_recovery > 0) {
1727                 ASSERT(ipif->ipif_recovery_id == 0);
1728                 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1729                     ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1730         }
1731         mutex_exit(&ill->ill_lock);
1732         ipif_refrele(ipif);
1733 
1734 ignore_conflict:
1735         freemsg(mp);
1736         ira_cleanup(&iras, B_TRUE);
1737 }
1738 
1739 /*
1740  * Handle failure by tearing down the ipifs with the specified address.  Note
1741  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1742  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1743  * we start a timer on the ipif.
1744  * Caller has to free mp;
1745  */
1746 static void
1747 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1748 {
1749         const uchar_t   *haddr;
1750         ill_t           *ill = ira->ira_rill;
1751 
1752         /*
1753          * Ignore conflicts generated by misbehaving switches that just
1754          * reflect our own messages back to us.
1755          */
1756 
1757         /* icmp_inbound_v6 ensures this */
1758         ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1759         haddr = ira->ira_l2src;
1760         if (haddr != NULL &&
1761             bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1762                 return;
1763         }
1764 
1765         if ((mp = copymsg(mp)) != NULL) {
1766                 mblk_t  *attrmp;
1767 
1768                 attrmp = ip_recv_attr_to_mblk(ira);
1769                 if (attrmp == NULL) {
1770                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1771                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
1772                         freemsg(mp);
1773                 } else {
1774                         ASSERT(attrmp->b_cont == NULL);
1775                         attrmp->b_cont = mp;
1776                         mp = attrmp;
1777                         ill_refhold(ill);
1778                         qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1779                             B_FALSE);
1780                 }
1781         }
1782 }
1783 
1784 /*
1785  * Handle a discovered conflict: some other system is advertising that it owns
1786  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1787  * interface.
1788  *
1789  * Handles both IPv4 and IPv6
1790  */
1791 boolean_t
1792 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1793 {
1794         ipif_t          *ipif;
1795         clock_t         now;
1796         uint_t          maxdefense;
1797         uint_t          defs;
1798         ill_t           *ill = ira->ira_ill;
1799         ip_stack_t      *ipst = ill->ill_ipst;
1800         uint32_t        elapsed;
1801         boolean_t       isv6 = ill->ill_isv6;
1802         ipaddr_t        ncec_addr;
1803 
1804         if (isv6) {
1805                 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1806                     ipst);
1807         } else {
1808                 if (arp_no_defense) {
1809                         /*
1810                          * Yes, there is a conflict, but no, we do not
1811                          * defend ourself.
1812                          */
1813                         return (B_TRUE);
1814                 }
1815                 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1816                 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1817                     ipst);
1818         }
1819         if (ipif == NULL)
1820                 return (B_FALSE);
1821 
1822         /*
1823          * First, figure out if this address is disposable.
1824          */
1825         if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1826                 maxdefense = ipst->ips_ip_max_temp_defend;
1827         else
1828                 maxdefense = ipst->ips_ip_max_defend;
1829 
1830         /*
1831          * Now figure out how many times we've defended ourselves.  Ignore
1832          * defenses that happened long in the past.
1833          */
1834         now = ddi_get_lbolt();
1835         elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1836         mutex_enter(&ncec->ncec_lock);
1837         if ((defs = ncec->ncec_defense_count) > 0 &&
1838             elapsed > ipst->ips_ip_defend_interval) {
1839                 /*
1840                  * ip_defend_interval has elapsed.
1841                  * reset the defense count.
1842                  */
1843                 ncec->ncec_defense_count = defs = 0;
1844         }
1845         ncec->ncec_defense_count++;
1846         ncec->ncec_last_time_defended = now;
1847         mutex_exit(&ncec->ncec_lock);
1848         ipif_refrele(ipif);
1849 
1850         /*
1851          * If we've defended ourselves too many times already, then give up and
1852          * tear down the interface(s) using this address.
1853          * Otherwise, caller has to defend by sending out an announce.
1854          */
1855         if (defs >= maxdefense) {
1856                 if (isv6)
1857                         ndp_failure(mp, ira);
1858                 else
1859                         arp_failure(mp, ira);
1860         } else {
1861                 return (B_TRUE); /* caller must defend this address */
1862         }
1863         return (B_FALSE);
1864 }
1865 
1866 /*
1867  * Handle reception of Neighbor Solicitation messages.
1868  */
1869 static void
1870 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1871 {
1872         ill_t           *ill = ira->ira_ill, *under_ill;
1873         nd_neighbor_solicit_t *ns;
1874         uint32_t        hlen = ill->ill_phys_addr_length;
1875         uchar_t         *haddr = NULL;
1876         icmp6_t         *icmp_nd;
1877         ip6_t           *ip6h;
1878         ncec_t          *our_ncec = NULL;
1879         in6_addr_t      target;
1880         in6_addr_t      src;
1881         int             len;
1882         int             flag = 0;
1883         nd_opt_hdr_t    *opt = NULL;
1884         boolean_t       bad_solicit = B_FALSE;
1885         mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
1886         boolean_t       need_ill_refrele = B_FALSE;
1887 
1888         ip6h = (ip6_t *)mp->b_rptr;
1889         icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1890         len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1891         src = ip6h->ip6_src;
1892         ns = (nd_neighbor_solicit_t *)icmp_nd;
1893         target = ns->nd_ns_target;
1894         if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1895             IN6_IS_ADDR_LOOPBACK(&target)) {
1896                 if (ip_debug > 2) {
1897                         /* ip1dbg */
1898                         pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1899                             AF_INET6, &target);
1900                 }
1901                 bad_solicit = B_TRUE;
1902                 goto done;
1903         }
1904         if (len > sizeof (nd_neighbor_solicit_t)) {
1905                 /* Options present */
1906                 opt = (nd_opt_hdr_t *)&ns[1];
1907                 len -= sizeof (nd_neighbor_solicit_t);
1908                 if (!ndp_verify_optlen(opt, len)) {
1909                         ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1910                         bad_solicit = B_TRUE;
1911                         goto done;
1912                 }
1913         }
1914         if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1915                 /* Check to see if this is a valid DAD solicitation */
1916                 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1917                         if (ip_debug > 2) {
1918                                 /* ip1dbg */
1919                                 pr_addr_dbg("ndp_input_solicit: IPv6 "
1920                                     "Destination is not solicited node "
1921                                     "multicast %s\n", AF_INET6,
1922                                     &ip6h->ip6_dst);
1923                         }
1924                         bad_solicit = B_TRUE;
1925                         goto done;
1926                 }
1927         }
1928 
1929         /*
1930          * NOTE: with IPMP, it's possible the nominated multicast ill (which
1931          * received this packet if it's multicast) is not the ill tied to
1932          * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1933          * to ensure we find the associated NCE.
1934          */
1935         our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1936         /*
1937          * If this is a valid Solicitation for an address we are publishing,
1938          * then a PUBLISH entry should exist in the cache
1939          */
1940         if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1941                 ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1942                     "ifname=%s ", ill->ill_name));
1943                 if (ip_debug > 2) {
1944                         /* ip1dbg */
1945                         pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1946                 }
1947                 if (our_ncec == NULL)
1948                         bad_solicit = B_TRUE;
1949                 goto done;
1950         }
1951 
1952         /* At this point we should have a verified NS per spec */
1953         if (opt != NULL) {
1954                 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1955                 if (opt != NULL) {
1956                         haddr = (uchar_t *)&opt[1];
1957                         if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1958                             hlen == 0) {
1959                                 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1960                                 bad_solicit = B_TRUE;
1961                                 goto done;
1962                         }
1963                 }
1964         }
1965 
1966         /* If sending directly to peer, set the unicast flag */
1967         if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1968                 flag |= NDP_UNICAST;
1969 
1970         /*
1971          * Create/update the entry for the soliciting node on the ipmp_ill.
1972          * or respond to outstanding queries, don't if
1973          * the source is unspecified address.
1974          */
1975         if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1976                 int     err;
1977                 nce_t   *nnce;
1978 
1979                 ASSERT(ill->ill_isv6);
1980                 /*
1981                  * Regular solicitations *must* include the Source Link-Layer
1982                  * Address option.  Ignore messages that do not.
1983                  */
1984                 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1985                         ip1dbg(("ndp_input_solicit: source link-layer address "
1986                             "option missing with a specified source.\n"));
1987                         bad_solicit = B_TRUE;
1988                         goto done;
1989                 }
1990 
1991                 /*
1992                  * This is a regular solicitation.  If we're still in the
1993                  * process of verifying the address, then don't respond at all
1994                  * and don't keep track of the sender.
1995                  */
1996                 if (our_ncec->ncec_state == ND_PROBE)
1997                         goto done;
1998 
1999                 /*
2000                  * If the solicitation doesn't have sender hardware address
2001                  * (legal for unicast solicitation), then process without
2002                  * installing the return NCE.  Either we already know it, or
2003                  * we'll be forced to look it up when (and if) we reply to the
2004                  * packet.
2005                  */
2006                 if (haddr == NULL)
2007                         goto no_source;
2008 
2009                 under_ill = ill;
2010                 if (IS_UNDER_IPMP(under_ill)) {
2011                         ill = ipmp_ill_hold_ipmp_ill(under_ill);
2012                         if (ill == NULL)
2013                                 ill = under_ill;
2014                         else
2015                                 need_ill_refrele = B_TRUE;
2016                 }
2017                 err = nce_lookup_then_add_v6(ill,
2018                     haddr, hlen,
2019                     &src,   /* Soliciting nodes address */
2020                     0,
2021                     ND_STALE,
2022                     &nnce);
2023 
2024                 if (need_ill_refrele) {
2025                         ill_refrele(ill);
2026                         ill = under_ill;
2027                         need_ill_refrele =  B_FALSE;
2028                 }
2029                 switch (err) {
2030                 case 0:
2031                         /* done with this entry */
2032                         nce_refrele(nnce);
2033                         break;
2034                 case EEXIST:
2035                         /*
2036                          * B_FALSE indicates this is not an an advertisement.
2037                          */
2038                         nce_process(nnce->nce_common, haddr, 0, B_FALSE);
2039                         nce_refrele(nnce);
2040                         break;
2041                 default:
2042                         ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
2043                             err));
2044                         goto done;
2045                 }
2046 no_source:
2047                 flag |= NDP_SOLICITED;
2048         } else {
2049                 /*
2050                  * No source link layer address option should be present in a
2051                  * valid DAD request.
2052                  */
2053                 if (haddr != NULL) {
2054                         ip1dbg(("ndp_input_solicit: source link-layer address "
2055                             "option present with an unspecified source.\n"));
2056                         bad_solicit = B_TRUE;
2057                         goto done;
2058                 }
2059                 if (our_ncec->ncec_state == ND_PROBE) {
2060                         /*
2061                          * Internally looped-back probes will have
2062                          * IRAF_L2SRC_LOOPBACK set so we can ignore our own
2063                          * transmissions.
2064                          */
2065                         if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
2066                                 /*
2067                                  * If someone else is probing our address, then
2068                                  * we've crossed wires.  Declare failure.
2069                                  */
2070                                 ndp_failure(mp, ira);
2071                         }
2072                         goto done;
2073                 }
2074                 /*
2075                  * This is a DAD probe.  Multicast the advertisement to the
2076                  * all-nodes address.
2077                  */
2078                 src = ipv6_all_hosts_mcast;
2079         }
2080         flag |= nce_advert_flags(our_ncec);
2081         (void) ndp_xmit(ill,
2082             ND_NEIGHBOR_ADVERT,
2083             our_ncec->ncec_lladdr,
2084             our_ncec->ncec_lladdr_length,
2085             &target,        /* Source and target of the advertisement pkt */
2086             &src,   /* IP Destination (source of original pkt) */
2087             flag);
2088 done:
2089         if (bad_solicit)
2090                 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2091         if (our_ncec != NULL)
2092                 ncec_refrele(our_ncec);
2093 }
2094 
2095 /*
2096  * Handle reception of Neighbor Solicitation messages
2097  */
2098 void
2099 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
2100 {
2101         ill_t           *ill = ira->ira_ill;
2102         nd_neighbor_advert_t *na;
2103         uint32_t        hlen = ill->ill_phys_addr_length;
2104         uchar_t         *haddr = NULL;
2105         icmp6_t         *icmp_nd;
2106         ip6_t           *ip6h;
2107         ncec_t          *dst_ncec = NULL;
2108         in6_addr_t      target;
2109         nd_opt_hdr_t    *opt = NULL;
2110         int             len;
2111         ip_stack_t      *ipst = ill->ill_ipst;
2112         mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
2113 
2114         ip6h = (ip6_t *)mp->b_rptr;
2115         icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2116         len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2117         na = (nd_neighbor_advert_t *)icmp_nd;
2118 
2119         if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2120             (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2121                 ip1dbg(("ndp_input_advert: Target is multicast but the "
2122                     "solicited flag is not zero\n"));
2123                 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2124                 return;
2125         }
2126         target = na->nd_na_target;
2127         if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
2128             IN6_IS_ADDR_LOOPBACK(&target)) {
2129                 if (ip_debug > 2) {
2130                         /* ip1dbg */
2131                         pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
2132                             AF_INET6, &target);
2133                 }
2134                 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2135                 return;
2136         }
2137         if (len > sizeof (nd_neighbor_advert_t)) {
2138                 opt = (nd_opt_hdr_t *)&na[1];
2139                 if (!ndp_verify_optlen(opt,
2140                     len - sizeof (nd_neighbor_advert_t))) {
2141                         ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2142                         BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2143                         return;
2144                 }
2145                 /* At this point we have a verified NA per spec */
2146                 len -= sizeof (nd_neighbor_advert_t);
2147                 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2148                 if (opt != NULL) {
2149                         haddr = (uchar_t *)&opt[1];
2150                         if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2151                             hlen == 0) {
2152                                 ip1dbg(("ndp_input_advert: bad SLLA\n"));
2153                                 BUMP_MIB(mib,
2154                                     ipv6IfIcmpInBadNeighborAdvertisements);
2155                                 return;
2156                         }
2157                 }
2158         }
2159 
2160         /*
2161          * NOTE: we match across the illgrp since we need to do DAD for all of
2162          * our local addresses, and those are spread across all the active
2163          * ills in the group.
2164          */
2165         if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
2166                 return;
2167 
2168         if (NCE_PUBLISH(dst_ncec)) {
2169                 /*
2170                  * Someone just advertised an addresses that we publish. First,
2171                  * check it it was us -- if so, we can safely ignore it.
2172                  * We don't get the haddr from the ira_l2src because, in the
2173                  * case that the packet originated from us, on an IPMP group,
2174                  * the ira_l2src may would be the link-layer address of the
2175                  * cast_ill used to send the packet, which may not be the same
2176                  * as the dst_ncec->ncec_lladdr of the address.
2177                  */
2178                 if (haddr != NULL) {
2179                         if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
2180                                 goto out;
2181 
2182                         if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
2183                                 goto out;   /* from us -- no conflict */
2184 
2185                         /*
2186                          * If we're in an IPMP group, check if this is an echo
2187                          * from another ill in the group.  Use the double-
2188                          * checked locking pattern to avoid grabbing
2189                          * ill_g_lock in the non-IPMP case.
2190                          */
2191                         if (IS_UNDER_IPMP(ill)) {
2192                                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2193                                 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
2194                                     ill->ill_grp, haddr, hlen) != NULL) {
2195                                         rw_exit(&ipst->ips_ill_g_lock);
2196                                         goto out;
2197                                 }
2198                                 rw_exit(&ipst->ips_ill_g_lock);
2199                         }
2200                 }
2201 
2202                 /*
2203                  * This appears to be a real conflict.  If we're trying to
2204                  * configure this NCE (ND_PROBE), then shut it down.
2205                  * Otherwise, handle the discovered conflict.
2206                  */
2207                 if (dst_ncec->ncec_state == ND_PROBE) {
2208                         ndp_failure(mp, ira);
2209                 } else {
2210                         if (ip_nce_conflict(mp, ira, dst_ncec)) {
2211                                 char hbuf[MAC_STR_LEN];
2212                                 char sbuf[INET6_ADDRSTRLEN];
2213 
2214                                 cmn_err(CE_WARN,
2215                                     "node '%s' is using %s on %s",
2216                                     inet_ntop(AF_INET6, &target, sbuf,
2217                                     sizeof (sbuf)),
2218                                     haddr == NULL ? "<none>" :
2219                                     mac_colon_addr(haddr, hlen, hbuf,
2220                                     sizeof (hbuf)), ill->ill_name);
2221                                 /*
2222                                  * RFC 4862, Section 5.4.4 does not mandate
2223                                  * any specific behavior when an NA matches
2224                                  * a non-tentative address assigned to the
2225                                  * receiver. We make the choice of defending
2226                                  * our address, based on the assumption that
2227                                  * the sender has not detected the Duplicate.
2228                                  *
2229                                  * ncec_last_time_defended has been adjusted
2230                                  * in ip_nce_conflict()
2231                                  */
2232                                 (void) ndp_announce(dst_ncec);
2233                         }
2234                 }
2235         } else {
2236                 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2237                         dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2238 
2239                 /* B_TRUE indicates this an advertisement */
2240                 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2241         }
2242 out:
2243         ncec_refrele(dst_ncec);
2244 }
2245 
2246 /*
2247  * Process NDP neighbor solicitation/advertisement messages.
2248  * The checksum has already checked o.k before reaching here.
2249  * Information about the datalink header is contained in ira_l2src, but
2250  * that should be ignored for loopback packets.
2251  */
2252 void
2253 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2254 {
2255         ill_t           *ill = ira->ira_rill;
2256         icmp6_t         *icmp_nd;
2257         ip6_t           *ip6h;
2258         int             len;
2259         mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
2260         ill_t           *orig_ill = NULL;
2261 
2262         /*
2263          * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2264          * and make it be the IPMP upper so avoid being confused by a packet
2265          * addressed to a unicast address on a different ill.
2266          */
2267         if (IS_UNDER_IPMP(ill)) {
2268                 orig_ill = ill;
2269                 ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2270                 if (ill == NULL) {
2271                         ill = orig_ill;
2272                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2273                         ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2274                             mp, ill);
2275                         freemsg(mp);
2276                         return;
2277                 }
2278                 ASSERT(ill != orig_ill);
2279                 orig_ill = ira->ira_ill;
2280                 ira->ira_ill = ill;
2281                 mib = ill->ill_icmp6_mib;
2282         }
2283         if (!pullupmsg(mp, -1)) {
2284                 ip1dbg(("ndp_input: pullupmsg failed\n"));
2285                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2286                 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2287                 goto done;
2288         }
2289         ip6h = (ip6_t *)mp->b_rptr;
2290         if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2291                 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2292                 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2293                 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2294                 goto done;
2295         }
2296         /*
2297          * NDP does not accept any extension headers between the
2298          * IP header and the ICMP header since e.g. a routing
2299          * header could be dangerous.
2300          * This assumes that any AH or ESP headers are removed
2301          * by ip prior to passing the packet to ndp_input.
2302          */
2303         if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2304                 ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2305                     ip6h->ip6_nxt));
2306                 ip_drop_input("Wrong next header", mp, ill);
2307                 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2308                 goto done;
2309         }
2310         icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2311         ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2312             icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2313         if (icmp_nd->icmp6_code != 0) {
2314                 ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2315                 ip_drop_input("code non-zero", mp, ill);
2316                 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2317                 goto done;
2318         }
2319         len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2320         /*
2321          * Make sure packet length is large enough for either
2322          * a NS or a NA icmp packet.
2323          */
2324         if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2325                 ip1dbg(("ndp_input: packet too short\n"));
2326                 ip_drop_input("packet too short", mp, ill);
2327                 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2328                 goto done;
2329         }
2330         if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2331                 ndp_input_solicit(mp, ira);
2332         } else {
2333                 ndp_input_advert(mp, ira);
2334         }
2335 done:
2336         freemsg(mp);
2337         if (orig_ill != NULL) {
2338                 ill_refrele(ill);
2339                 ira->ira_ill = orig_ill;
2340         }
2341 }
2342 
2343 /*
2344  * ndp_xmit is called to form and transmit a ND solicitation or
2345  * advertisement ICMP packet.
2346  *
2347  * If the source address is unspecified and this isn't a probe (used for
2348  * duplicate address detection), an appropriate source address and link layer
2349  * address will be chosen here.  The link layer address option is included if
2350  * the source is specified (i.e., all non-probe packets), and omitted (per the
2351  * specification) otherwise.
2352  *
2353  * It returns B_FALSE only if it does a successful put() to the
2354  * corresponding ill's ill_wq otherwise returns B_TRUE.
2355  */
2356 static boolean_t
2357 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2358     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2359 {
2360         uint32_t        len;
2361         icmp6_t         *icmp6;
2362         mblk_t          *mp;
2363         ip6_t           *ip6h;
2364         nd_opt_hdr_t    *opt;
2365         uint_t          plen;
2366         zoneid_t        zoneid = GLOBAL_ZONEID;
2367         ill_t           *hwaddr_ill = ill;
2368         ip_xmit_attr_t  ixas;
2369         ip_stack_t      *ipst = ill->ill_ipst;
2370         boolean_t       need_refrele = B_FALSE;
2371         boolean_t       probe = B_FALSE;
2372 
2373         if (IS_UNDER_IPMP(ill)) {
2374                 probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2375                 /*
2376                  * We send non-probe packets on the upper IPMP interface.
2377                  * ip_output_simple() will use cast_ill for sending any
2378                  * multicast packets. Note that we can't follow the same
2379                  * logic for probe packets because all interfaces in the ipmp
2380                  * group may have failed, so that we really want to only try
2381                  * to send the ND packet on the ill corresponding to the src
2382                  * address.
2383                  */
2384                 if (!probe) {
2385                         ill = ipmp_ill_hold_ipmp_ill(ill);
2386                         if (ill != NULL)
2387                                 need_refrele = B_TRUE;
2388                         else
2389                                 ill = hwaddr_ill;
2390                 }
2391         }
2392 
2393         /*
2394          * If we have a unspecified source(sender) address, select a
2395          * proper source address for the solicitation here itself so
2396          * that we can initialize the h/w address correctly.
2397          *
2398          * If the sender is specified then we use this address in order
2399          * to lookup the zoneid before calling ip_output_v6(). This is to
2400          * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2401          * by IP (we cannot guarantee that the global zone has an interface
2402          * route to the destination).
2403          *
2404          * Note that the NA never comes here with the unspecified source
2405          * address.
2406          */
2407 
2408         /*
2409          * Probes will have unspec src at this point.
2410          */
2411         if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2412                 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2413                 /*
2414                  * It's possible for ipif_lookup_addr_zoneid_v6() to return
2415                  * ALL_ZONES if it cannot find a matching ipif for the address
2416                  * we are trying to use. In this case we err on the side of
2417                  * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2418                  */
2419                 if (zoneid == ALL_ZONES)
2420                         zoneid = GLOBAL_ZONEID;
2421         }
2422 
2423         plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2424         len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2425         mp = allocb(len,  BPRI_LO);
2426         if (mp == NULL) {
2427                 if (need_refrele)
2428                         ill_refrele(ill);
2429                 return (B_TRUE);
2430         }
2431 
2432         bzero((char *)mp->b_rptr, len);
2433         mp->b_wptr = mp->b_rptr + len;
2434 
2435         bzero(&ixas, sizeof (ixas));
2436         ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2437 
2438         ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2439         ixas.ixa_ipst = ipst;
2440         ixas.ixa_cred = kcred;
2441         ixas.ixa_cpid = NOPID;
2442         ixas.ixa_tsl = NULL;
2443         ixas.ixa_zoneid = zoneid;
2444 
2445         ip6h = (ip6_t *)mp->b_rptr;
2446         ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2447         ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2448         ip6h->ip6_nxt = IPPROTO_ICMPV6;
2449         ip6h->ip6_hops = IPV6_MAX_HOPS;
2450         ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2451         ip6h->ip6_dst = *target;
2452         icmp6 = (icmp6_t *)&ip6h[1];
2453 
2454         if (hw_addr_len != 0) {
2455                 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2456                     sizeof (nd_neighbor_advert_t));
2457         } else {
2458                 opt = NULL;
2459         }
2460         if (operation == ND_NEIGHBOR_SOLICIT) {
2461                 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2462 
2463                 if (opt != NULL && !(flag & NDP_PROBE)) {
2464                         /*
2465                          * Note that we don't send out SLLA for ND probes
2466                          * per RFC 4862, even though we do send out the src
2467                          * haddr for IPv4 DAD probes, even though both IPv4
2468                          * and IPv6 go out with the unspecified/INADDR_ANY
2469                          * src IP addr.
2470                          */
2471                         opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2472                 }
2473                 ip6h->ip6_src = *sender;
2474                 ns->nd_ns_target = *target;
2475                 if (!(flag & NDP_UNICAST)) {
2476                         /* Form multicast address of the target */
2477                         ip6h->ip6_dst = ipv6_solicited_node_mcast;
2478                         ip6h->ip6_dst.s6_addr32[3] |=
2479                             ns->nd_ns_target.s6_addr32[3];
2480                 }
2481         } else {
2482                 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2483 
2484                 ASSERT(!(flag & NDP_PROBE));
2485                 if (opt != NULL)
2486                         opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2487                 ip6h->ip6_src = *sender;
2488                 na->nd_na_target = *sender;
2489                 if (flag & NDP_ISROUTER)
2490                         na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2491                 if (flag & NDP_SOLICITED)
2492                         na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2493                 if (flag & NDP_ORIDE)
2494                         na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2495         }
2496 
2497         if (!(flag & NDP_PROBE)) {
2498                 if (hw_addr != NULL && opt != NULL) {
2499                         /* Fill in link layer address and option len */
2500                         opt->nd_opt_len = (uint8_t)plen;
2501                         bcopy(hw_addr, &opt[1], hw_addr_len);
2502                 }
2503         }
2504         if (opt != NULL && opt->nd_opt_type == 0) {
2505                 /* If there's no link layer address option, then strip it. */
2506                 len -= plen * 8;
2507                 mp->b_wptr = mp->b_rptr + len;
2508                 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2509         }
2510 
2511         icmp6->icmp6_type = (uint8_t)operation;
2512         icmp6->icmp6_code = 0;
2513         /*
2514          * Prepare for checksum by putting icmp length in the icmp
2515          * checksum field. The checksum is calculated in ip_output.c.
2516          */
2517         icmp6->icmp6_cksum = ip6h->ip6_plen;
2518 
2519         (void) ip_output_simple(mp, &ixas);
2520         ixa_cleanup(&ixas);
2521         if (need_refrele)
2522                 ill_refrele(ill);
2523         return (B_FALSE);
2524 }
2525 
/*
 * Used to set ND_UNREACHABLE before ncec_delete sets NCE_F_CONDEMNED.
 * The datapath uses this as an indication that there is a problem (as
 * opposed to an NCE that was just reclaimed due to lack of memory).
 * Note that static ARP entries never become unreachable.
 *
 * The state change is made under ncec_lock, which this function acquires
 * and releases itself.
 */
void
nce_make_unreachable(ncec_t *ncec)
{
        mutex_enter(&ncec->ncec_lock);
        ncec->ncec_state = ND_UNREACHABLE;
        mutex_exit(&ncec->ncec_lock);
}
2540 
2541 /*
2542  * NCE retransmit timer. Common to IPv4 and IPv6.
2543  * This timer goes off when:
2544  * a. It is time to retransmit a resolution for resolver.
2545  * b. It is time to send reachability probes.
2546  */
2547 void
2548 nce_timer(void *arg)
2549 {
2550         ncec_t          *ncec = arg;
2551         ill_t           *ill = ncec->ncec_ill, *src_ill;
2552         char            addrbuf[INET6_ADDRSTRLEN];
2553         boolean_t       dropped = B_FALSE;
2554         ip_stack_t      *ipst = ncec->ncec_ipst;
2555         boolean_t       isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2556         in_addr_t       sender4 = INADDR_ANY;
2557         in6_addr_t      sender6 = ipv6_all_zeros;
2558 
2559         /*
2560          * The timer has to be cancelled by ncec_delete before doing the final
2561          * refrele. So the NCE is guaranteed to exist when the timer runs
2562          * until it clears the timeout_id. Before clearing the timeout_id
2563          * bump up the refcnt so that we can continue to use the ncec
2564          */
2565         ASSERT(ncec != NULL);
2566         mutex_enter(&ncec->ncec_lock);
2567         ncec_refhold_locked(ncec);
2568         ncec->ncec_timeout_id = 0;
2569         mutex_exit(&ncec->ncec_lock);
2570 
2571         src_ill = nce_resolve_src(ncec, &sender6);
2572         /* if we could not find a sender address, return */
2573         if (src_ill == NULL) {
2574                 if (!isv6) {
2575                         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2576                         ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2577                             &sender4, addrbuf, sizeof (addrbuf))));
2578                 } else {
2579                         ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2580                             &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2581                 }
2582                 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2583                 ncec_refrele(ncec);
2584                 return;
2585         }
2586         if (!isv6)
2587                 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2588 
2589         mutex_enter(&ncec->ncec_lock);
2590         /*
2591          * Check the reachability state.
2592          */
2593         switch (ncec->ncec_state) {
2594         case ND_DELAY:
2595                 ASSERT(ncec->ncec_lladdr != NULL);
2596                 ncec->ncec_state = ND_PROBE;
2597                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2598                 if (isv6) {
2599                         mutex_exit(&ncec->ncec_lock);
2600                         dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2601                             src_ill->ill_phys_addr,
2602                             src_ill->ill_phys_addr_length,
2603                             &sender6, &ncec->ncec_addr,
2604                             NDP_UNICAST);
2605                 } else {
2606                         dropped = (arp_request(ncec, sender4, src_ill) == 0);
2607                         mutex_exit(&ncec->ncec_lock);
2608                 }
2609                 if (!dropped) {
2610                         mutex_enter(&ncec->ncec_lock);
2611                         ncec->ncec_pcnt--;
2612                         mutex_exit(&ncec->ncec_lock);
2613                 }
2614                 if (ip_debug > 3) {
2615                         /* ip2dbg */
2616                         pr_addr_dbg("nce_timer: state for %s changed "
2617                             "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2618                 }
2619                 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2620                 break;
2621         case ND_PROBE:
2622                 /* must be retransmit timer */
2623                 ASSERT(ncec->ncec_pcnt >= -1);
2624                 if (ncec->ncec_pcnt > 0) {
2625                         /*
2626                          * As per RFC2461, the ncec gets deleted after
2627                          * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2628                          * Note that the first unicast solicitation is sent
2629                          * during the DELAY state.
2630                          */
2631                         ip2dbg(("nce_timer: pcount=%x dst %s\n",
2632                             ncec->ncec_pcnt,
2633                             inet_ntop((isv6? AF_INET6 : AF_INET),
2634                             &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2635                         if (NCE_PUBLISH(ncec)) {
2636                                 mutex_exit(&ncec->ncec_lock);
2637                                 /*
2638                                  * send out a probe; note that src_ill
2639                                  * is ignored by nce_dad() for all
2640                                  * DAD message types other than IPv6
2641                                  * unicast probes
2642                                  */
2643                                 nce_dad(ncec, src_ill, B_TRUE);
2644                         } else {
2645                                 ASSERT(src_ill != NULL);
2646                                 if (isv6) {
2647                                         mutex_exit(&ncec->ncec_lock);
2648                                         dropped = ndp_xmit(src_ill,
2649                                             ND_NEIGHBOR_SOLICIT,
2650                                             src_ill->ill_phys_addr,
2651                                             src_ill->ill_phys_addr_length,
2652                                             &sender6, &ncec->ncec_addr,
2653                                             NDP_UNICAST);
2654                                 } else {
2655                                         /*
2656                                          * since the nce is REACHABLE,
2657                                          * the ARP request will be sent out
2658                                          * as a link-layer unicast.
2659                                          */
2660                                         dropped = (arp_request(ncec, sender4,
2661                                             src_ill) == 0);
2662                                         mutex_exit(&ncec->ncec_lock);
2663                                 }
2664                                 if (!dropped) {
2665                                         mutex_enter(&ncec->ncec_lock);
2666                                         ncec->ncec_pcnt--;
2667                                         mutex_exit(&ncec->ncec_lock);
2668                                 }
2669                                 nce_restart_timer(ncec,
2670                                     ill->ill_reachable_retrans_time);
2671                         }
2672                 } else if (ncec->ncec_pcnt < 0) {
2673                         /* No hope, delete the ncec */
2674                         /* Tell datapath it went bad */
2675                         ncec->ncec_state = ND_UNREACHABLE;
2676                         mutex_exit(&ncec->ncec_lock);
2677                         if (ip_debug > 2) {
2678                                 /* ip1dbg */
2679                                 pr_addr_dbg("nce_timer: Delete NCE for"
2680                                     " dst %s\n", (isv6? AF_INET6: AF_INET),
2681                                     &ncec->ncec_addr);
2682                         }
2683                         /* if static ARP can't delete. */
2684                         if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2685                                 ncec_delete(ncec);
2686 
2687                 } else if (!NCE_PUBLISH(ncec)) {
2688                         /*
2689                          * Probe count is 0 for a dynamic entry (one that we
2690                          * ourselves are not publishing). We should never get
2691                          * here if NONUD was requested, hence the ASSERT below.
2692                          */
2693                         ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2694                         ip2dbg(("nce_timer: pcount=%x dst %s\n",
2695                             ncec->ncec_pcnt, inet_ntop(AF_INET6,
2696                             &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2697                         ncec->ncec_pcnt--;
2698                         mutex_exit(&ncec->ncec_lock);
2699                         /* Wait one interval before killing */
2700                         nce_restart_timer(ncec,
2701                             ill->ill_reachable_retrans_time);
2702                 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2703                         ipif_t *ipif;
2704                         ipaddr_t ncec_addr;
2705 
2706                         /*
2707                          * We're done probing, and we can now declare this
2708                          * address to be usable.  Let IP know that it's ok to
2709                          * use.
2710                          */
2711                         ncec->ncec_state = ND_REACHABLE;
2712                         ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2713                         mutex_exit(&ncec->ncec_lock);
2714                         if (isv6) {
2715                                 ipif = ipif_lookup_addr_exact_v6(
2716                                     &ncec->ncec_addr, ill, ipst);
2717                         } else {
2718                                 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2719                                     ncec_addr);
2720                                 ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2721                                     ipst);
2722                         }
2723                         if (ipif != NULL) {
2724                                 if (ipif->ipif_was_dup) {
2725                                         char ibuf[LIFNAMSIZ];
2726                                         char sbuf[INET6_ADDRSTRLEN];
2727 
2728                                         ipif->ipif_was_dup = B_FALSE;
2729                                         (void) inet_ntop(AF_INET6,
2730                                             &ipif->ipif_v6lcl_addr,
2731                                             sbuf, sizeof (sbuf));
2732                                         ipif_get_name(ipif, ibuf,
2733                                             sizeof (ibuf));
2734                                         cmn_err(CE_NOTE, "recovered address "
2735                                             "%s on %s", sbuf, ibuf);
2736                                 }
2737                                 if ((ipif->ipif_flags & IPIF_UP) &&
2738                                     !ipif->ipif_addr_ready)
2739                                         ipif_up_notify(ipif);
2740                                 ipif->ipif_addr_ready = 1;
2741                                 ipif_refrele(ipif);
2742                         }
2743                         if (!isv6 && arp_no_defense)
2744                                 break;
2745                         /* Begin defending our new address */
2746                         if (ncec->ncec_unsolicit_count > 0) {
2747                                 ncec->ncec_unsolicit_count--;
2748                                 if (isv6) {
2749                                         dropped = ndp_announce(ncec);
2750                                 } else {
2751                                         dropped = arp_announce(ncec);
2752                                 }
2753 
2754                                 if (dropped)
2755                                         ncec->ncec_unsolicit_count++;
2756                                 else
2757                                         ncec->ncec_last_time_defended =
2758                                             ddi_get_lbolt();
2759                         }
2760                         if (ncec->ncec_unsolicit_count > 0) {
2761                                 nce_restart_timer(ncec,
2762                                     ANNOUNCE_INTERVAL(isv6));
2763                         } else if (DEFENSE_INTERVAL(isv6) != 0) {
2764                                 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2765                         }
2766                 } else {
2767                         /*
2768                          * This is an address we're probing to be our own, but
2769                          * the ill is down.  Wait until it comes back before
2770                          * doing anything, but switch to reachable state so
2771                          * that the restart will work.
2772                          */
2773                         ncec->ncec_state = ND_REACHABLE;
2774                         mutex_exit(&ncec->ncec_lock);
2775                 }
2776                 break;
2777         case ND_INCOMPLETE: {
2778                 mblk_t  *mp, *nextmp;
2779                 mblk_t  **prevmpp;
2780 
2781                 /*
2782                  * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2783                  * for any IPMP probe packets, and toss them.  IPMP probe
2784                  * packets will always be at the head of ncec_qd_mp, so that
2785                  * we can stop at the first queued ND packet that is
2786                  * not a probe packet.
2787                  */
2788                 prevmpp = &ncec->ncec_qd_mp;
2789                 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2790                         nextmp = mp->b_next;
2791 
2792                         if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2793                                 inet_freemsg(mp);
2794                                 ncec->ncec_nprobes--;
2795                                 *prevmpp = nextmp;
2796                         } else {
2797                                 prevmpp = &mp->b_next;
2798                         }
2799                 }
2800 
2801                 /*
2802                  * Must be resolver's retransmit timer.
2803                  */
2804                 mutex_exit(&ncec->ncec_lock);
2805                 ip_ndp_resolve(ncec);
2806                 break;
2807         }
2808         case ND_REACHABLE:
2809                 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2810                     ncec->ncec_unsolicit_count != 0) ||
2811                     (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2812                         if (ncec->ncec_unsolicit_count > 0) {
2813                                 ncec->ncec_unsolicit_count--;
2814                                 mutex_exit(&ncec->ncec_lock);
2815                                 /*
2816                                  * When we get to zero announcements left,
2817                                  * switch to address defense
2818                                  */
2819                         } else {
2820                                 boolean_t rate_limit;
2821 
2822                                 mutex_exit(&ncec->ncec_lock);
2823                                 rate_limit = ill_defend_rate_limit(ill, ncec);
2824                                 if (rate_limit) {
2825                                         nce_restart_timer(ncec,
2826                                             DEFENSE_INTERVAL(isv6));
2827                                         break;
2828                                 }
2829                         }
2830                         if (isv6) {
2831                                 dropped = ndp_announce(ncec);
2832                         } else {
2833                                 dropped = arp_announce(ncec);
2834                         }
2835                         mutex_enter(&ncec->ncec_lock);
2836                         if (dropped) {
2837                                 ncec->ncec_unsolicit_count++;
2838                         } else {
2839                                 ncec->ncec_last_time_defended =
2840                                     ddi_get_lbolt();
2841                         }
2842                         mutex_exit(&ncec->ncec_lock);
2843                         if (ncec->ncec_unsolicit_count != 0) {
2844                                 nce_restart_timer(ncec,
2845                                     ANNOUNCE_INTERVAL(isv6));
2846                         } else {
2847                                 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2848                         }
2849                 } else {
2850                         mutex_exit(&ncec->ncec_lock);
2851                 }
2852                 break;
2853         default:
2854                 mutex_exit(&ncec->ncec_lock);
2855                 break;
2856         }
2857 done:
2858         ncec_refrele(ncec);
2859         ill_refrele(src_ill);
2860 }
2861 
2862 /*
2863  * Set a link layer address from the ll_addr passed in.
2864  * Copy SAP from ill.
2865  */
2866 static void
2867 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2868 {
2869         ill_t   *ill = ncec->ncec_ill;
2870 
2871         ASSERT(ll_addr != NULL);
2872         if (ill->ill_phys_addr_length > 0) {
2873                 /*
2874                  * The bcopy() below used to be called for the physical address
2875                  * length rather than the link layer address length. For
2876                  * ethernet and many other media, the phys_addr and lla are
2877                  * identical.
2878                  *
2879                  * The phys_addr and lla may not be the same for devices that
2880                  * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2881                  * no known instances of these.
2882                  *
2883                  * For PPP or other interfaces with a zero length
2884                  * physical address, don't do anything here.
2885                  * The bcopy() with a zero phys_addr length was previously
2886                  * a no-op for interfaces with a zero-length physical address.
2887                  * Using the lla for them would change the way they operate.
2888                  * Doing nothing in such cases preserves expected behavior.
2889                  */
2890                 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2891         }
2892 }
2893 
2894 boolean_t
2895 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2896     uint32_t ll_addr_len)
2897 {
2898         ASSERT(ncec->ncec_lladdr != NULL);
2899         if (ll_addr == NULL)
2900                 return (B_FALSE);
2901         if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2902                 return (B_TRUE);
2903         return (B_FALSE);
2904 }
2905 
2906 /*
2907  * Updates the link layer address or the reachability state of
2908  * a cache entry.  Reset probe counter if needed.
2909  */
2910 void
2911 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2912 {
2913         ill_t   *ill = ncec->ncec_ill;
2914         boolean_t need_stop_timer = B_FALSE;
2915         boolean_t need_fastpath_update = B_FALSE;
2916         nce_t   *nce = NULL;
2917         timeout_id_t tid;
2918 
2919         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2920         /*
2921          * If this interface does not do NUD, there is no point
2922          * in allowing an update to the cache entry.  Although
2923          * we will respond to NS.
2924          * The only time we accept an update for a resolver when
2925          * NUD is turned off is when it has just been created.
2926          * Non-Resolvers will always be created as REACHABLE.
2927          */
2928         if (new_state != ND_UNCHANGED) {
2929                 if ((ncec->ncec_flags & NCE_F_NONUD) &&
2930                     (ncec->ncec_state != ND_INCOMPLETE))
2931                         return;
2932                 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2933                 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2934                 need_stop_timer = B_TRUE;
2935                 if (new_state == ND_REACHABLE)
2936                         ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2937                 else {
2938                         /* We force NUD in this case */
2939                         ncec->ncec_last = 0;
2940                 }
2941                 ncec->ncec_state = new_state;
2942                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2943                 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2944                     new_state == ND_INCOMPLETE);
2945         }
2946         if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2947                 tid = ncec->ncec_timeout_id;
2948                 ncec->ncec_timeout_id = 0;
2949         }
2950         /*
2951          * Re-trigger fastpath probe and
2952          * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2953          * whatever packets that happens to be transmitting at the time.
2954          */
2955         if (new_ll_addr != NULL) {
2956                 bcopy(new_ll_addr, ncec->ncec_lladdr,
2957                     ill->ill_phys_addr_length);
2958                 need_fastpath_update = B_TRUE;
2959         }
2960         mutex_exit(&ncec->ncec_lock);
2961         if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2962                 if (tid != 0)
2963                         (void) untimeout(tid);
2964         }
2965         if (need_fastpath_update) {
2966                 /*
2967                  * Delete any existing existing dlur_mp and fp_mp information.
2968                  * For IPMP interfaces, all underlying ill's must be checked
2969                  * and purged.
2970                  */
2971                 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2972                 /*
2973                  * add the new dlur_mp and fp_mp
2974                  */
2975                 nce = nce_fastpath(ncec, B_TRUE, NULL);
2976                 if (nce != NULL)
2977                         nce_refrele(nce);
2978         }
2979         mutex_enter(&ncec->ncec_lock);
2980 }
2981 
2982 static void
2983 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2984 {
2985         uint_t  count = 0;
2986         mblk_t  **mpp, *tmp;
2987 
2988         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2989 
2990         for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2991                 if (++count > ncec->ncec_ill->ill_max_buf) {
2992                         tmp = ncec->ncec_qd_mp->b_next;
2993                         ncec->ncec_qd_mp->b_next = NULL;
2994                         /*
2995                          * if we never create data addrs on the under_ill
2996                          * does this matter?
2997                          */
2998                         BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2999                             ipIfStatsOutDiscards);
3000                         ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
3001                             ncec->ncec_ill);
3002                         freemsg(ncec->ncec_qd_mp);
3003                         ncec->ncec_qd_mp = tmp;
3004                 }
3005         }
3006 
3007         if (head_insert) {
3008                 ncec->ncec_nprobes++;
3009                 mp->b_next = ncec->ncec_qd_mp;
3010                 ncec->ncec_qd_mp = mp;
3011         } else {
3012                 *mpp = mp;
3013         }
3014 }
3015 
3016 /*
3017  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
3018  * queued at the head or tail of the queue based on the input argument
3019  * 'head_insert'. The caller should specify this argument as B_TRUE if this
3020  * packet is an IPMP probe packet, in which case the following happens:
3021  *
3022  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
3023  *      (non-ipmp_probe) load-speading case where the source address of the ND
3024  *      packet is not tied to ncec_ill. If the ill bound to the source address
3025  *      cannot receive, the response to the ND packet will not be received.
3026  *      However, if ND packets for ncec_ill's probes are queued behind that ND
3027  *      packet, those probes will also fail to be sent, and thus in.mpathd will
3028  *       erroneously conclude that ncec_ill has also failed.
3029  *
3030  *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
3031  *      the first attempt.  This ensures that ND problems do not manifest as
3032  *      probe RTT spikes.
3033  *
3034  * We achieve this by inserting ipmp_probe() packets at the head of the
3035  * nce_queue.
3036  *
3037  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
3038  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
3039  */
3040 void
3041 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
3042 {
3043         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3044         nce_queue_mp_common(ncec, mp, head_insert);
3045 }
3046 
3047 /*
3048  * Called when address resolution failed due to a timeout.
3049  * Send an ICMP unreachable in response to all queued packets.
3050  */
3051 void
3052 ndp_resolv_failed(ncec_t *ncec)
3053 {
3054         mblk_t  *mp, *nxt_mp;
3055         char    buf[INET6_ADDRSTRLEN];
3056         ill_t *ill = ncec->ncec_ill;
3057         ip_recv_attr_t  iras;
3058 
3059         bzero(&iras, sizeof (iras));
3060         iras.ira_flags = 0;
3061         /*
3062          * we are setting the ira_rill to the ipmp_ill (instead of
3063          * the actual ill on which the packet was received), but this
3064          * is ok because we don't actually need the real ira_rill.
3065          * to send the icmp unreachable to the sender.
3066          */
3067         iras.ira_ill = iras.ira_rill = ill;
3068         iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3069         iras.ira_rifindex = iras.ira_ruifindex;
3070 
3071         ip1dbg(("ndp_resolv_failed: dst %s\n",
3072             inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
3073         mutex_enter(&ncec->ncec_lock);
3074         mp = ncec->ncec_qd_mp;
3075         ncec->ncec_qd_mp = NULL;
3076         ncec->ncec_nprobes = 0;
3077         mutex_exit(&ncec->ncec_lock);
3078         while (mp != NULL) {
3079                 nxt_mp = mp->b_next;
3080                 mp->b_next = NULL;
3081 
3082                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3083                 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3084                     mp, ill);
3085                 icmp_unreachable_v6(mp,
3086                     ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
3087                 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3088                 mp = nxt_mp;
3089         }
3090         ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3091 }
3092 
3093 /*
3094  * Handle the completion of NDP and ARP resolution.
3095  */
3096 void
3097 nce_resolv_ok(ncec_t *ncec)
3098 {
3099         mblk_t *mp;
3100         uint_t pkt_len;
3101         iaflags_t ixaflags = IXAF_NO_TRACE;
3102         nce_t *nce;
3103         ill_t   *ill = ncec->ncec_ill;
3104         boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
3105         ip_stack_t *ipst = ill->ill_ipst;
3106 
3107         if (IS_IPMP(ncec->ncec_ill)) {
3108                 nce_resolv_ipmp_ok(ncec);
3109                 return;
3110         }
3111         /* non IPMP case */
3112 
3113         mutex_enter(&ncec->ncec_lock);
3114         ASSERT(ncec->ncec_nprobes == 0);
3115         mp = ncec->ncec_qd_mp;
3116         ncec->ncec_qd_mp = NULL;
3117         mutex_exit(&ncec->ncec_lock);
3118 
3119         while (mp != NULL) {
3120                 mblk_t *nxt_mp;
3121 
3122                 if (ill->ill_isv6) {
3123                         ip6_t *ip6h = (ip6_t *)mp->b_rptr;
3124 
3125                         pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
3126                 } else {
3127                         ipha_t *ipha = (ipha_t *)mp->b_rptr;
3128 
3129                         ixaflags |= IXAF_IS_IPV4;
3130                         pkt_len = ntohs(ipha->ipha_length);
3131                 }
3132                 nxt_mp = mp->b_next;
3133                 mp->b_next = NULL;
3134                 /*
3135                  * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
3136                  * longer available, but it's ok to drop this flag because TCP
3137                  * has its own flow-control in effect, so TCP packets
3138                  * are not likely to get here when flow-control is in effect.
3139                  */
3140                 mutex_enter(&ill->ill_lock);
3141                 nce = nce_lookup(ill, &ncec->ncec_addr);
3142                 mutex_exit(&ill->ill_lock);
3143 
3144                 if (nce == NULL) {
3145                         if (isv6) {
3146                                 BUMP_MIB(&ipst->ips_ip6_mib,
3147                                     ipIfStatsOutDiscards);
3148                         } else {
3149                                 BUMP_MIB(&ipst->ips_ip_mib,
3150                                     ipIfStatsOutDiscards);
3151                         }
3152                         ip_drop_output("ipIfStatsOutDiscards - no nce",
3153                             mp, NULL);
3154                         freemsg(mp);
3155                 } else {
3156                         /*
3157                          * We don't know the zoneid, but
3158                          * ip_xmit does not care since IXAF_NO_TRACE
3159                          * is set. (We traced the packet the first
3160                          * time through ip_xmit.)
3161                          */
3162                         (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
3163                             ALL_ZONES, 0, NULL);
3164                         nce_refrele(nce);
3165                 }
3166                 mp = nxt_mp;
3167         }
3168 
3169         ncec_cb_dispatch(ncec); /* complete callbacks */
3170 }
3171 
3172 /*
3173  * Called by SIOCSNDP* ioctl to add/change an ncec entry
3174  * and the corresponding attributes.
3175  * Disallow states other than ND_REACHABLE or ND_STALE.
3176  */
3177 int
3178 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3179 {
3180         sin6_t          *sin6;
3181         in6_addr_t      *addr;
3182         ncec_t          *ncec;
3183         nce_t           *nce;
3184         int             err = 0;
3185         uint16_t        new_flags = 0;
3186         uint16_t        old_flags = 0;
3187         int             inflags = lnr->lnr_flags;
3188         ip_stack_t      *ipst = ill->ill_ipst;
3189         boolean_t       do_postprocess = B_FALSE;
3190 
3191         ASSERT(ill->ill_isv6);
3192         if ((lnr->lnr_state_create != ND_REACHABLE) &&
3193             (lnr->lnr_state_create != ND_STALE))
3194                 return (EINVAL);
3195 
3196         sin6 = (sin6_t *)&lnr->lnr_addr;
3197         addr = &sin6->sin6_addr;
3198 
3199         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3200         ASSERT(!IS_UNDER_IPMP(ill));
3201         nce = nce_lookup_addr(ill, addr);
3202         if (nce != NULL)
3203                 new_flags = nce->nce_common->ncec_flags;
3204 
3205         switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3206         case NDF_ISROUTER_ON:
3207                 new_flags |= NCE_F_ISROUTER;
3208                 break;
3209         case NDF_ISROUTER_OFF:
3210                 new_flags &= ~NCE_F_ISROUTER;
3211                 break;
3212         case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3213                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3214                 if (nce != NULL)
3215                         nce_refrele(nce);
3216                 return (EINVAL);
3217         }
3218         if (inflags & NDF_STATIC)
3219                 new_flags |= NCE_F_STATIC;
3220 
3221         switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3222         case NDF_ANYCAST_ON:
3223                 new_flags |= NCE_F_ANYCAST;
3224                 break;
3225         case NDF_ANYCAST_OFF:
3226                 new_flags &= ~NCE_F_ANYCAST;
3227                 break;
3228         case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3229                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3230                 if (nce != NULL)
3231                         nce_refrele(nce);
3232                 return (EINVAL);
3233         }
3234 
3235         if (nce == NULL) {
3236                 err = nce_add_v6(ill,
3237                     (uchar_t *)lnr->lnr_hdw_addr,
3238                     ill->ill_phys_addr_length,
3239                     addr,
3240                     new_flags,
3241                     lnr->lnr_state_create,
3242                     &nce);
3243                 if (err != 0) {
3244                         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3245                         ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3246                         return (err);
3247                 } else {
3248                         do_postprocess = B_TRUE;
3249                 }
3250         }
3251         ncec = nce->nce_common;
3252         old_flags = ncec->ncec_flags;
3253         if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3254                 ncec_router_to_host(ncec);
3255                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3256                 if (do_postprocess)
3257                         err = nce_add_v6_postprocess(nce);
3258                 nce_refrele(nce);
3259                 return (0);
3260         }
3261         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3262 
3263         if (do_postprocess)
3264                 err = nce_add_v6_postprocess(nce);
3265         /*
3266          * err cannot be anything other than 0 because we don't support
3267          * proxy arp of static addresses.
3268          */
3269         ASSERT(err == 0);
3270 
3271         mutex_enter(&ncec->ncec_lock);
3272         ncec->ncec_flags = new_flags;
3273         mutex_exit(&ncec->ncec_lock);
3274         /*
3275          * Note that we ignore the state at this point, which
3276          * should be either STALE or REACHABLE.  Instead we let
3277          * the link layer address passed in to determine the state
3278          * much like incoming packets.
3279          */
3280         nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3281         nce_refrele(nce);
3282         return (0);
3283 }
3284 
3285 /*
3286  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3287  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3288  * be held to ensure that they are in the same group.
3289  */
3290 static nce_t *
3291 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3292 {
3293 
3294         nce_t *nce;
3295 
3296         nce = nce_ill_lookup_then_add(ill, ncec);
3297 
3298         if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3299                 return (nce);
3300 
3301         /*
3302          * hold the ncec_lock to synchronize with nce_update() so that,
3303          * at the end of this function, the contents of nce_dlur_mp are
3304          * consistent with ncec->ncec_lladdr, even though some intermediate
3305          * packet may have been sent out with a mangled address, which would
3306          * only be a transient condition.
3307          */
3308         mutex_enter(&ncec->ncec_lock);
3309         if (ncec->ncec_lladdr != NULL) {
3310                 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3311                     NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3312         } else {
3313                 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3314                     ill->ill_sap_length);
3315         }
3316         mutex_exit(&ncec->ncec_lock);
3317         return (nce);
3318 }
3319 
3320 /*
3321  * we make nce_fp_mp to have an M_DATA prepend.
3322  * The caller ensures there is hold on ncec for this function.
3323  * Note that since ill_fastpath_probe() copies the mblk there is
3324  * no need to hold the nce or ncec beyond this function.
3325  *
3326  * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3327  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3328  * and will be returned back by this function, so that no extra nce_refrele
3329  * is required for the caller. The calls from nce_add_common() use this
3330  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3331  * nce_refrele of the returned nce (when it is non-null).
3332  */
3333 static nce_t *
3334 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3335 {
3336         nce_t *nce;
3337         ill_t *ill = ncec->ncec_ill;
3338 
3339         ASSERT(ill != NULL);
3340 
3341         if (IS_IPMP(ill) && trigger_fp_req) {
3342                 trigger_fp_req = B_FALSE;
3343                 ipmp_ncec_refresh_nce(ncec);
3344         }
3345 
3346         /*
3347          * If the caller already has the nce corresponding to the ill, use
3348          * that one. Otherwise we have to lookup/add the nce. Calls from
3349          * nce_add_common() fall in the former category, and have just done
3350          * the nce lookup/add that can be reused.
3351          */
3352         if (ncec_nce == NULL)
3353                 nce = nce_fastpath_create(ill, ncec);
3354         else
3355                 nce = ncec_nce;
3356 
3357         if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3358                 return (nce);
3359 
3360         if (trigger_fp_req)
3361                 nce_fastpath_trigger(nce);
3362         return (nce);
3363 }
3364 
3365 /*
3366  * Trigger fastpath on nce. No locks may be held.
3367  */
3368 static void
3369 nce_fastpath_trigger(nce_t *nce)
3370 {
3371         int res;
3372         ill_t *ill = nce->nce_ill;
3373         ncec_t *ncec = nce->nce_common;
3374 
3375         res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3376         /*
3377          * EAGAIN is an indication of a transient error
3378          * i.e. allocation failure etc. leave the ncec in the list it
3379          * will be updated when another probe happens for another ire
3380          * if not it will be taken out of the list when the ire is
3381          * deleted.
3382          */
3383         if (res != 0 && res != EAGAIN && res != ENOTSUP)
3384                 nce_fastpath_list_delete(ill, ncec, NULL);
3385 }
3386 
3387 /*
3388  * Add ncec to the nce fastpath list on ill.
3389  */
3390 static nce_t *
3391 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
3392 {
3393         nce_t *nce = NULL;
3394 
3395         ASSERT(MUTEX_HELD(&ill->ill_lock));
3396         /*
3397          * Atomically ensure that the ill is not CONDEMNED and is not going
3398          * down, before adding the NCE.
3399          */
3400         if (ill->ill_state_flags & ILL_CONDEMNED)
3401                 return (NULL);
3402         mutex_enter(&ncec->ncec_lock);
3403         /*
3404          * if ncec has not been deleted and
3405          * is not already in the list add it.
3406          */
3407         if (!NCE_ISCONDEMNED(ncec)) {
3408                 nce = nce_lookup(ill, &ncec->ncec_addr);
3409                 if (nce != NULL)
3410                         goto done;
3411                 nce = nce_add(ill, ncec, graveyard);
3412         }
3413 done:
3414         mutex_exit(&ncec->ncec_lock);
3415         return (nce);
3416 }
3417 
3418 static nce_t *
3419 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3420 {
3421         nce_t *nce;
3422         list_t graveyard;
3423 
3424         list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3425         mutex_enter(&ill->ill_lock);
3426         nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
3427         mutex_exit(&ill->ill_lock);
3428         nce_graveyard_free(&graveyard);
3429         return (nce);
3430 }
3431 
3432 
3433 /*
3434  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3435  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3436  * entry after all locks have been dropped.
3437  */
3438 void
3439 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3440 {
3441         nce_t *nce;
3442 
3443         ASSERT(ill != NULL);
3444 
3445         /* delete any nces referencing the ncec from underlying ills */
3446         if (IS_IPMP(ill))
3447                 ipmp_ncec_delete_nce(ncec);
3448 
3449         /* now the ill itself */
3450         mutex_enter(&ill->ill_lock);
3451         for (nce = list_head(&ill->ill_nce); nce != NULL;
3452             nce = list_next(&ill->ill_nce, nce)) {
3453                 if (nce->nce_common == ncec) {
3454                         nce_refhold(nce);
3455                         nce_delete(nce);
3456                         break;
3457                 }
3458         }
3459         mutex_exit(&ill->ill_lock);
3460         if (nce != NULL) {
3461                 if (dead == NULL)
3462                         nce_refrele(nce);
3463                 else
3464                         list_insert_tail(dead, nce);
3465         }
3466 }
3467 
3468 /*
3469  * when the fastpath response does not fit in the datab
3470  * associated with the existing nce_fp_mp, we delete and
3471  * add the nce to retrigger fastpath based on the information
3472  * in the ncec_t.
3473  */
3474 static nce_t *
3475 nce_delete_then_add(nce_t *nce)
3476 {
3477         ill_t           *ill = nce->nce_ill;
3478         nce_t           *newnce = NULL;
3479         list_t          graveyard;
3480 
3481         list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3482         ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3483             (void *)nce, ill->ill_name));
3484         mutex_enter(&ill->ill_lock);
3485         mutex_enter(&nce->nce_common->ncec_lock);
3486         nce_delete(nce);
3487         /*
3488          * Make sure that ncec is not condemned before adding. We hold the
3489          * ill_lock and ncec_lock to synchronize with ncec_delete() and
3490          * ipmp_ncec_delete_nce()
3491          */
3492         if (!NCE_ISCONDEMNED(nce->nce_common))
3493                 newnce = nce_add(ill, nce->nce_common, &graveyard);
3494         mutex_exit(&nce->nce_common->ncec_lock);
3495         mutex_exit(&ill->ill_lock);
3496         nce_graveyard_free(&graveyard);
3497         nce_refrele(nce);
3498         return (newnce); /* could be null if nomem */
3499 }
3500 
3501 typedef struct nce_fp_match_s {
3502         nce_t   *nce_fp_match_res;
3503         mblk_t  *nce_fp_match_ack_mp;
3504 } nce_fp_match_t;
3505 
3506 /* ARGSUSED */
3507 static int
3508 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3509 {
3510         nce_fp_match_t  *nce_fp_marg = arg;
3511         ncec_t          *ncec = nce->nce_common;
3512         mblk_t          *mp = nce_fp_marg->nce_fp_match_ack_mp;
3513         uchar_t *mp_rptr, *ud_mp_rptr;
3514         mblk_t          *ud_mp = nce->nce_dlur_mp;
3515         ptrdiff_t       cmplen;
3516 
3517         /*
3518          * mp is the mp associated with the fastpath ack.
3519          * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3520          * under consideration. If the contents match, then the
3521          * fastpath ack is used to update the nce.
3522          */
3523         if (ud_mp == NULL)
3524                 return (0);
3525         mp_rptr = mp->b_rptr;
3526         cmplen = mp->b_wptr - mp_rptr;
3527         ASSERT(cmplen >= 0);
3528 
3529         ud_mp_rptr = ud_mp->b_rptr;
3530         /*
3531          * The ncec is locked here to prevent any other threads from accessing
3532          * and changing nce_dlur_mp when the address becomes resolved to an
3533          * lla while we're in the middle of looking at and comparing the
3534          * hardware address (lla). It is also locked to prevent multiple
3535          * threads in nce_fastpath() from examining nce_dlur_mp at the same
3536          * time.
3537          */
3538         mutex_enter(&ncec->ncec_lock);
3539         if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3540             bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3541                 nce_fp_marg->nce_fp_match_res = nce;
3542                 mutex_exit(&ncec->ncec_lock);
3543                 nce_refhold(nce);
3544                 return (1);
3545         }
3546         mutex_exit(&ncec->ncec_lock);
3547         return (0);
3548 }
3549 
3550 /*
3551  * Update all NCE's that are not in fastpath mode and
3552  * have an nce_fp_mp that matches mp. mp->b_cont contains
3553  * the fastpath header.
3554  *
3555  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3556  */
3557 void
3558 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3559 {
3560         nce_fp_match_t nce_fp_marg;
3561         nce_t *nce;
3562         mblk_t *nce_fp_mp, *fp_mp;
3563 
3564         nce_fp_marg.nce_fp_match_res = NULL;
3565         nce_fp_marg.nce_fp_match_ack_mp = mp;
3566 
3567         nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3568 
3569         if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3570                 return;
3571 
3572         mutex_enter(&nce->nce_lock);
3573         nce_fp_mp = nce->nce_fp_mp;
3574 
3575         if (nce_fp_mp != NULL) {
3576                 fp_mp = mp->b_cont;
3577                 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3578                     nce_fp_mp->b_datap->db_lim) {
3579                         mutex_exit(&nce->nce_lock);
3580                         nce = nce_delete_then_add(nce);
3581                         if (nce == NULL) {
3582                                 return;
3583                         }
3584                         mutex_enter(&nce->nce_lock);
3585                         nce_fp_mp = nce->nce_fp_mp;
3586                 }
3587         }
3588 
3589         /* Matched - install mp as the fastpath mp */
3590         if (nce_fp_mp == NULL) {
3591                 fp_mp = dupb(mp->b_cont);
3592                 nce->nce_fp_mp = fp_mp;
3593         } else {
3594                 fp_mp = mp->b_cont;
3595                 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3596                 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3597                     + MBLKL(fp_mp);
3598         }
3599         mutex_exit(&nce->nce_lock);
3600         nce_refrele(nce);
3601 }
3602 
3603 /*
3604  * Return a pointer to a given option in the packet.
3605  * Assumes that option part of the packet have already been validated.
3606  */
3607 nd_opt_hdr_t *
3608 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3609 {
3610         while (optlen > 0) {
3611                 if (opt->nd_opt_type == opt_type)
3612                         return (opt);
3613                 optlen -= 8 * opt->nd_opt_len;
3614                 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3615         }
3616         return (NULL);
3617 }
3618 
3619 /*
3620  * Verify all option lengths present are > 0, also check to see
3621  * if the option lengths and packet length are consistent.
3622  */
3623 boolean_t
3624 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3625 {
3626         ASSERT(opt != NULL);
3627         while (optlen > 0) {
3628                 if (opt->nd_opt_len == 0)
3629                         return (B_FALSE);
3630                 optlen -= 8 * opt->nd_opt_len;
3631                 if (optlen < 0)
3632                         return (B_FALSE);
3633                 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3634         }
3635         return (B_TRUE);
3636 }
3637 
3638 /*
3639  * ncec_walk function.
3640  * Free a fraction of the NCE cache entries.
3641  *
3642  * A possible optimization here would be to use ncec_last where possible, and
3643  * delete the least-frequently used entry, which would require more complex
3644  * computation as we walk through the ncec's (e.g., track ncec entries by
3645  * order of ncec_last and/or maintain state)
3646  */
3647 static void
3648 ncec_cache_reclaim(ncec_t *ncec, void *arg)
3649 {
3650         ip_stack_t      *ipst = ncec->ncec_ipst;
3651         uint_t          fraction = *(uint_t *)arg;
3652         uint_t          rand;
3653 
3654         if ((ncec->ncec_flags &
3655             (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3656                 return;
3657         }
3658 
3659         rand = (uint_t)ddi_get_lbolt() +
3660             NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3661         if ((rand/fraction)*fraction == rand) {
3662                 IP_STAT(ipst, ip_nce_reclaim_deleted);
3663                 ncec_delete(ncec);
3664         }
3665 }
3666 
3667 /*
3668  * kmem_cache callback to free up memory.
3669  *
3670  * For now we just delete a fixed fraction.
3671  */
3672 static void
3673 ip_nce_reclaim_stack(ip_stack_t *ipst)
3674 {
3675         uint_t          fraction = ipst->ips_ip_nce_reclaim_fraction;
3676 
3677         IP_STAT(ipst, ip_nce_reclaim_calls);
3678 
3679         ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);
3680 
3681         /*
3682          * Walk all CONNs that can have a reference on an ire, ncec or dce.
3683          * Get them to update any stale references to drop any refholds they
3684          * have.
3685          */
3686         ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3687 }
3688 
3689 /*
3690  * Called by the memory allocator subsystem directly, when the system
3691  * is running low on memory.
3692  */
3693 /* ARGSUSED */
3694 void
3695 ip_nce_reclaim(void *args)
3696 {
3697         netstack_handle_t nh;
3698         netstack_t *ns;
3699         ip_stack_t *ipst;
3700 
3701         netstack_next_init(&nh);
3702         while ((ns = netstack_next(&nh)) != NULL) {
3703                 /*
3704                  * netstack_next() can return a netstack_t with a NULL
3705                  * netstack_ip at boot time.
3706                  */
3707                 if ((ipst = ns->netstack_ip) == NULL) {
3708                         netstack_rele(ns);
3709                         continue;
3710                 }
3711                 ip_nce_reclaim_stack(ipst);
3712                 netstack_rele(ns);
3713         }
3714         netstack_next_fini(&nh);
3715 }
3716 
3717 #ifdef DEBUG
3718 void
3719 ncec_trace_ref(ncec_t *ncec)
3720 {
3721         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3722 
3723         if (ncec->ncec_trace_disable)
3724                 return;
3725 
3726         if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3727                 ncec->ncec_trace_disable = B_TRUE;
3728                 ncec_trace_cleanup(ncec);
3729         }
3730 }
3731 
3732 void
3733 ncec_untrace_ref(ncec_t *ncec)
3734 {
3735         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3736 
3737         if (!ncec->ncec_trace_disable)
3738                 th_trace_unref(ncec);
3739 }
3740 
3741 static void
3742 ncec_trace_cleanup(const ncec_t *ncec)
3743 {
3744         th_trace_cleanup(ncec, ncec->ncec_trace_disable);
3745 }
3746 #endif
3747 
3748 /*
3749  * Called when address resolution fails due to a timeout.
3750  * Send an ICMP unreachable in response to all queued packets.
3751  */
3752 void
3753 arp_resolv_failed(ncec_t *ncec)
3754 {
3755         mblk_t  *mp, *nxt_mp;
3756         char    buf[INET6_ADDRSTRLEN];
3757         struct in_addr ipv4addr;
3758         ill_t *ill = ncec->ncec_ill;
3759         ip_stack_t *ipst = ncec->ncec_ipst;
3760         ip_recv_attr_t  iras;
3761 
3762         bzero(&iras, sizeof (iras));
3763         iras.ira_flags = IRAF_IS_IPV4;
3764         /*
3765          * we are setting the ira_rill to the ipmp_ill (instead of
3766          * the actual ill on which the packet was received), but this
3767          * is ok because we don't actually need the real ira_rill.
3768          * to send the icmp unreachable to the sender.
3769          */
3770         iras.ira_ill = iras.ira_rill = ill;
3771         iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3772         iras.ira_rifindex = iras.ira_ruifindex;
3773 
3774         IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3775         ip3dbg(("arp_resolv_failed: dst %s\n",
3776             inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3777         mutex_enter(&ncec->ncec_lock);
3778         mp = ncec->ncec_qd_mp;
3779         ncec->ncec_qd_mp = NULL;
3780         ncec->ncec_nprobes = 0;
3781         mutex_exit(&ncec->ncec_lock);
3782         while (mp != NULL) {
3783                 nxt_mp = mp->b_next;
3784                 mp->b_next = NULL;
3785 
3786                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3787                 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3788                     mp, ill);
3789                 if (ipst->ips_ip_arp_icmp_error) {
3790                         ip3dbg(("arp_resolv_failed: "
3791                             "Calling icmp_unreachable\n"));
3792                         icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3793                 } else {
3794                         freemsg(mp);
3795                 }
3796                 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3797                 mp = nxt_mp;
3798         }
3799         ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3800 }
3801 
3802 /*
3803  * if ill is an under_ill, translate it to the ipmp_ill and add the
3804  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3805  * one on the underlying in_ill) will be created for the
3806  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3807  */
3808 int
3809 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3810     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3811 {
3812         int     err;
3813         in6_addr_t addr6;
3814         ip_stack_t *ipst = ill->ill_ipst;
3815         nce_t   *nce, *upper_nce = NULL;
3816         ill_t   *in_ill = ill, *under = NULL;
3817         boolean_t need_ill_refrele = B_FALSE;
3818 
3819         if (flags & NCE_F_MCAST) {
3820                 /*
3821                  * hw_addr will be figured out in nce_set_multicast_v4;
3822                  * caller needs to pass in the cast_ill for ipmp
3823                  */
3824                 ASSERT(hw_addr == NULL);
3825                 ASSERT(!IS_IPMP(ill));
3826                 err = nce_set_multicast_v4(ill, addr, flags, newnce);
3827                 return (err);
3828         }
3829 
3830         if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3831                 ill = ipmp_ill_hold_ipmp_ill(ill);
3832                 if (ill == NULL)
3833                         return (ENXIO);
3834                 need_ill_refrele = B_TRUE;
3835         }
3836         if ((flags & NCE_F_BCAST) != 0) {
3837                 /*
3838                  * IPv4 broadcast ncec: compute the hwaddr.
3839                  */
3840                 if (IS_IPMP(ill)) {
3841                         under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3842                         if (under == NULL)  {
3843                                 if (need_ill_refrele)
3844                                         ill_refrele(ill);
3845                                 return (ENETDOWN);
3846                         }
3847                         hw_addr = under->ill_bcast_mp->b_rptr +
3848                             NCE_LL_ADDR_OFFSET(under);
3849                         hw_addr_len = under->ill_phys_addr_length;
3850                 } else {
3851                         hw_addr = ill->ill_bcast_mp->b_rptr +
3852                             NCE_LL_ADDR_OFFSET(ill),
3853                             hw_addr_len = ill->ill_phys_addr_length;
3854                 }
3855         }
3856 
3857         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3858         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3859         nce = nce_lookup_addr(ill, &addr6);
3860         if (nce == NULL) {
3861                 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3862                     state, &nce);
3863         } else {
3864                 err = EEXIST;
3865         }
3866         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3867         if (err == 0)
3868                 err = nce_add_v4_postprocess(nce);
3869 
3870         if (in_ill != ill && nce != NULL) {
3871                 nce_t *under_nce = NULL;
3872 
3873                 /*
3874                  * in_ill was the under_ill. Try to create the under_nce.
3875                  * Hold the ill_g_lock to prevent changes to group membership
3876                  * until we are done.
3877                  */
3878                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3879                 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3880                         DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3881                             ill_t *, ill);
3882                         rw_exit(&ipst->ips_ill_g_lock);
3883                         err = ENXIO;
3884                         nce_refrele(nce);
3885                         nce = NULL;
3886                         goto bail;
3887                 }
3888                 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3889                 if (under_nce == NULL) {
3890                         rw_exit(&ipst->ips_ill_g_lock);
3891                         err = EINVAL;
3892                         nce_refrele(nce);
3893                         nce = NULL;
3894                         goto bail;
3895                 }
3896                 rw_exit(&ipst->ips_ill_g_lock);
3897                 upper_nce = nce;
3898                 nce = under_nce; /* will be returned to caller */
3899                 if (NCE_ISREACHABLE(nce->nce_common))
3900                         nce_fastpath_trigger(under_nce);
3901         }
3902         if (nce != NULL) {
3903                 if (newnce != NULL)
3904                         *newnce = nce;
3905                 else
3906                         nce_refrele(nce);
3907         }
3908 bail:
3909         if (under != NULL)
3910                 ill_refrele(under);
3911         if (upper_nce != NULL)
3912                 nce_refrele(upper_nce);
3913         if (need_ill_refrele)
3914                 ill_refrele(ill);
3915 
3916         return (err);
3917 }
3918 
3919 /*
3920  * NDP Cache Entry creation routine for IPv4.
3921  * This routine must always be called with ndp4->ndp_g_lock held.
3922  * Prior to return, ncec_refcnt is incremented.
3923  *
3924  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3925  * are always added pointing at the ipmp_ill. Thus, when the ill passed
3926  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3927  * entries will be created, both pointing at the same ncec_t. The nce_t
3928  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3929  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3930  * Local addresses are always created on the ill passed to nce_add_v4.
3931  */
3932 int
3933 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3934     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3935 {
3936         int             err;
3937         boolean_t       is_multicast = (flags & NCE_F_MCAST);
3938         struct in6_addr addr6;
3939         nce_t           *nce;
3940 
3941         ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3942         ASSERT(!ill->ill_isv6);
3943         ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3944 
3945         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3946         err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3947             &nce);
3948         ASSERT(newnce != NULL);
3949         *newnce = nce;
3950         return (err);
3951 }
3952 
3953 /*
3954  * Post-processing routine to be executed after nce_add_v4(). This function
3955  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3956  * and must be called without any locks held.
3957  *
3958  * Always returns 0, but we return an int to keep this symmetric with the
3959  * IPv6 counter-part.
3960  */
3961 int
3962 nce_add_v4_postprocess(nce_t *nce)
3963 {
3964         ncec_t          *ncec = nce->nce_common;
3965         uint16_t        flags = ncec->ncec_flags;
3966         boolean_t       ndp_need_dad = B_FALSE;
3967         boolean_t       dropped;
3968         clock_t         delay;
3969         ip_stack_t      *ipst = ncec->ncec_ill->ill_ipst;
3970         uchar_t         *hw_addr = ncec->ncec_lladdr;
3971         boolean_t       trigger_fastpath = B_TRUE;
3972 
3973         /*
3974          * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3975          * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3976          * We call nce_fastpath from nce_update if the link layer address of
3977          * the peer changes from nce_update
3978          */
3979         if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3980             ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3981                 trigger_fastpath = B_FALSE;
3982 
3983         if (trigger_fastpath)
3984                 nce_fastpath_trigger(nce);
3985 
3986         if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3987                 /*
3988                  * Either the caller (by passing in ND_PROBE)
3989                  * or nce_add_common() (by the internally computed state
3990                  * based on ncec_addr and ill_net_type) has determined
3991                  * that this unicast entry needs DAD. Trigger DAD.
3992                  */
3993                 ndp_need_dad = B_TRUE;
3994         } else if (flags & NCE_F_UNSOL_ADV) {
3995                 /*
3996                  * We account for the transmit below by assigning one
3997                  * less than the ndd variable. Subsequent decrements
3998                  * are done in nce_timer.
3999                  */
4000                 mutex_enter(&ncec->ncec_lock);
4001                 ncec->ncec_unsolicit_count =
4002                     ipst->ips_ip_arp_publish_count - 1;
4003                 mutex_exit(&ncec->ncec_lock);
4004                 dropped = arp_announce(ncec);
4005                 mutex_enter(&ncec->ncec_lock);
4006                 if (dropped)
4007                         ncec->ncec_unsolicit_count++;
4008                 else
4009                         ncec->ncec_last_time_defended = ddi_get_lbolt();
4010                 if (ncec->ncec_unsolicit_count != 0) {
4011                         nce_start_timer(ncec,
4012                             ipst->ips_ip_arp_publish_interval);
4013                 }
4014                 mutex_exit(&ncec->ncec_lock);
4015         }
4016 
4017         /*
4018          * If ncec_xmit_interval is 0, user has configured us to send the first
4019          * probe right away.  Do so, and set up for the subsequent probes.
4020          */
4021         if (ndp_need_dad) {
4022                 mutex_enter(&ncec->ncec_lock);
4023                 if (ncec->ncec_pcnt == 0) {
4024                         /*
4025                          * DAD probes and announce can be
4026                          * administratively disabled by setting the
4027                          * probe_count to zero. Restart the timer in
4028                          * this case to mark the ipif as ready.
4029                          */
4030                         ncec->ncec_unsolicit_count = 0;
4031                         mutex_exit(&ncec->ncec_lock);
4032                         nce_restart_timer(ncec, 0);
4033                 } else {
4034                         mutex_exit(&ncec->ncec_lock);
4035                         delay = ((ncec->ncec_flags & NCE_F_FAST) ?
4036                             ipst->ips_arp_probe_delay :
4037                             ipst->ips_arp_fastprobe_delay);
4038                         nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
4039                 }
4040         }
4041         return (0);
4042 }
4043 
4044 /*
4045  * ncec_walk routine to update all entries that have a given destination or
4046  * gateway address and cached link layer (MAC) address.  This is used when ARP
4047  * informs us that a network-to-link-layer mapping may have changed.
4048  */
4049 void
4050 nce_update_hw_changed(ncec_t *ncec, void *arg)
4051 {
4052         nce_hw_map_t *hwm = arg;
4053         ipaddr_t ncec_addr;
4054 
4055         if (ncec->ncec_state != ND_REACHABLE)
4056                 return;
4057 
4058         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
4059         if (ncec_addr != hwm->hwm_addr)
4060                 return;
4061 
4062         mutex_enter(&ncec->ncec_lock);
4063         if (hwm->hwm_flags != 0)
4064                 ncec->ncec_flags = hwm->hwm_flags;
4065         nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
4066         mutex_exit(&ncec->ncec_lock);
4067 }
4068 
4069 void
4070 ncec_refhold(ncec_t *ncec)
4071 {
4072         mutex_enter(&(ncec)->ncec_lock);
4073         (ncec)->ncec_refcnt++;
4074         ASSERT((ncec)->ncec_refcnt != 0);
4075 #ifdef DEBUG
4076         ncec_trace_ref(ncec);
4077 #endif
4078         mutex_exit(&(ncec)->ncec_lock);
4079 }
4080 
4081 void
4082 ncec_refhold_notr(ncec_t *ncec)
4083 {
4084         mutex_enter(&(ncec)->ncec_lock);
4085         (ncec)->ncec_refcnt++;
4086         ASSERT((ncec)->ncec_refcnt != 0);
4087         mutex_exit(&(ncec)->ncec_lock);
4088 }
4089 
4090 static void
4091 ncec_refhold_locked(ncec_t *ncec)
4092 {
4093         ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
4094         (ncec)->ncec_refcnt++;
4095 #ifdef DEBUG
4096         ncec_trace_ref(ncec);
4097 #endif
4098 }
4099 
4100 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
4101 void
4102 ncec_refrele(ncec_t *ncec)
4103 {
4104         mutex_enter(&(ncec)->ncec_lock);
4105 #ifdef DEBUG
4106         ncec_untrace_ref(ncec);
4107 #endif
4108         ASSERT((ncec)->ncec_refcnt != 0);
4109         if (--(ncec)->ncec_refcnt == 0) {
4110                 ncec_inactive(ncec);
4111         } else {
4112                 mutex_exit(&(ncec)->ncec_lock);
4113         }
4114 }
4115 
4116 void
4117 ncec_refrele_notr(ncec_t *ncec)
4118 {
4119         mutex_enter(&(ncec)->ncec_lock);
4120         ASSERT((ncec)->ncec_refcnt != 0);
4121         if (--(ncec)->ncec_refcnt == 0) {
4122                 ncec_inactive(ncec);
4123         } else {
4124                 mutex_exit(&(ncec)->ncec_lock);
4125         }
4126 }
4127 
4128 /*
4129  * Common to IPv4 and IPv6.
4130  */
4131 void
4132 nce_restart_timer(ncec_t *ncec, uint_t ms)
4133 {
4134         timeout_id_t tid;
4135 
4136         ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
4137 
4138         /* First cancel any running timer */
4139         mutex_enter(&ncec->ncec_lock);
4140         tid = ncec->ncec_timeout_id;
4141         ncec->ncec_timeout_id = 0;
4142         if (tid != 0) {
4143                 mutex_exit(&ncec->ncec_lock);
4144                 (void) untimeout(tid);
4145                 mutex_enter(&ncec->ncec_lock);
4146         }
4147 
4148         /* Restart timer */
4149         nce_start_timer(ncec, ms);
4150         mutex_exit(&ncec->ncec_lock);
4151 }
4152 
4153 static void
4154 nce_start_timer(ncec_t *ncec, uint_t ms)
4155 {
4156         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4157         /*
4158          * Don't start the timer if the ncec has been deleted, or if the timer
4159          * is already running
4160          */
4161         if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
4162                 ncec->ncec_timeout_id = timeout(nce_timer, ncec,
4163                     MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
4164         }
4165 }
4166 
4167 int
4168 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
4169     uint16_t flags, nce_t **newnce)
4170 {
4171         uchar_t         *hw_addr;
4172         int             err = 0;
4173         ip_stack_t      *ipst = ill->ill_ipst;
4174         in6_addr_t      dst6;
4175         nce_t           *nce;
4176 
4177         ASSERT(!ill->ill_isv6);
4178 
4179         IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
4180         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
4181         if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
4182                 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4183                 goto done;
4184         }
4185         if (ill->ill_net_type == IRE_IF_RESOLVER) {
4186                 /*
4187                  * For IRE_IF_RESOLVER a hardware mapping can be
4188                  * generated, for IRE_IF_NORESOLVER, resolution cookie
4189                  * in the ill is copied in nce_add_v4().
4190                  */
4191                 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
4192                 if (hw_addr == NULL) {
4193                         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4194                         return (ENOMEM);
4195                 }
4196                 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
4197         } else {
4198                 /*
4199                  * IRE_IF_NORESOLVER type simply copies the resolution
4200                  * cookie passed in.  So no hw_addr is needed.
4201                  */
4202                 hw_addr = NULL;
4203         }
4204         ASSERT(flags & NCE_F_MCAST);
4205         ASSERT(flags & NCE_F_NONUD);
4206         /* nce_state will be computed by nce_add_common() */
4207         err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
4208             ND_UNCHANGED, &nce);
4209         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4210         if (err == 0)
4211                 err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
4212         if (hw_addr != NULL)
4213                 kmem_free(hw_addr, ill->ill_phys_addr_length);
4214         if (err != 0) {
4215                 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
4216                 return (err);
4217         }
4218 done:
4219         if (newnce != NULL)
4220                 *newnce = nce;
4221         else
4222                 nce_refrele(nce);
4223         return (0);
4224 }
4225 
4226 /*
4227  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
4228  * don't want to have to walk the list for every single one, so we gather up
4229  * batches at a time.
4230  */
/* Capacity of one batch of rescheduling candidates (see nce_resched_t). */
#define NCE_RESCHED_LIST_LEN    8

/*
 * Batch of ncecs gathered by ncec_reschedule() during one walk of an ill's
 * nce list.
 */
typedef struct {
        /* ill being walked; each visited ncec is asserted to belong to it */
        ill_t   *ncert_ill;
        /* number of valid entries in ncert_nces[] */
        uint_t  ncert_num;
        /* candidate ncecs; a reference is held on each stored entry */
        ncec_t  *ncert_nces[NCE_RESCHED_LIST_LEN];
} nce_resched_t;
4238 
4239 /*
4240  * Pick the longest waiting NCEs for defense.
4241  */
4242 /* ARGSUSED */
4243 static int
4244 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4245 {
4246         nce_resched_t *ncert = arg;
4247         ncec_t **ncecs;
4248         ncec_t **ncec_max;
4249         ncec_t *ncec_temp;
4250         ncec_t *ncec = nce->nce_common;
4251 
4252         ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4253         /*
4254          * Only reachable entries that are ready for announcement are eligible.
4255          */
4256         if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4257                 return (0);
4258         if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4259                 ncec_refhold(ncec);
4260                 ncert->ncert_nces[ncert->ncert_num++] = ncec;
4261         } else {
4262                 ncecs = ncert->ncert_nces;
4263                 ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4264                 ncec_refhold(ncec);
4265                 for (; ncecs < ncec_max; ncecs++) {
4266                         ASSERT(ncec != NULL);
4267                         if ((*ncecs)->ncec_last_time_defended >
4268                             ncec->ncec_last_time_defended) {
4269                                 ncec_temp = *ncecs;
4270                                 *ncecs = ncec;
4271                                 ncec = ncec_temp;
4272                         }
4273                 }
4274                 ncec_refrele(ncec);
4275         }
4276         return (0);
4277 }
4278 
4279 /*
4280  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4281  * doesn't happen very often (if at all), and thus it needn't be highly
4282  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4283  * outer loop is bounded by a constant rather than by the length of the list.)
4284  */
4285 static void
4286 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4287 {
4288         ncec_t          *ncec;
4289         ip_stack_t      *ipst = ill->ill_ipst;
4290         uint_t          i, defend_rate;
4291 
4292         i = ill->ill_defend_count;
4293         ill->ill_defend_count = 0;
4294         if (ill->ill_isv6)
4295                 defend_rate = ipst->ips_ndp_defend_rate;
4296         else
4297                 defend_rate = ipst->ips_arp_defend_rate;
4298         /* If none could be sitting around, then don't reschedule */
4299         if (i < defend_rate) {
4300                 DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4301                 return;
4302         }
4303         ncert->ncert_ill = ill;
4304         while (ill->ill_defend_count < defend_rate) {
4305                 nce_walk_common(ill, ncec_reschedule, ncert);
4306                 for (i = 0; i < ncert->ncert_num; i++) {
4307 
4308                         ncec = ncert->ncert_nces[i];
4309                         mutex_enter(&ncec->ncec_lock);
4310                         ncec->ncec_flags |= NCE_F_DELAYED;
4311                         mutex_exit(&ncec->ncec_lock);
4312                         /*
4313                          * we plan to schedule this ncec, so incr the
4314                          * defend_count in anticipation.
4315                          */
4316                         if (++ill->ill_defend_count >= defend_rate)
4317                                 break;
4318                 }
4319                 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4320                         break;
4321         }
4322 }
4323 
4324 /*
4325  * Check if the current rate-limiting parameters permit the sending
4326  * of another address defense announcement for both IPv4 and IPv6.
4327  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4328  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4329  * determines how many address defense announcements are permitted
4330  * in any `defense_perio' interval.
4331  */
4332 static boolean_t
4333 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4334 {
4335         clock_t         now = ddi_get_lbolt();
4336         ip_stack_t      *ipst = ill->ill_ipst;
4337         clock_t         start = ill->ill_defend_start;
4338         uint32_t        elapsed, defend_period, defend_rate;
4339         nce_resched_t   ncert;
4340         boolean_t       ret;
4341         int             i;
4342 
4343         if (ill->ill_isv6) {
4344                 defend_period = ipst->ips_ndp_defend_period;
4345                 defend_rate = ipst->ips_ndp_defend_rate;
4346         } else {
4347                 defend_period = ipst->ips_arp_defend_period;
4348                 defend_rate = ipst->ips_arp_defend_rate;
4349         }
4350         if (defend_rate == 0)
4351                 return (B_TRUE);
4352         bzero(&ncert, sizeof (ncert));
4353         mutex_enter(&ill->ill_lock);
4354         if (start > 0) {
4355                 elapsed = now - start;
4356                 if (elapsed > SEC_TO_TICK(defend_period)) {
4357                         ill->ill_defend_start = now;
4358                         /*
4359                          * nce_ill_reschedule will attempt to
4360                          * prevent starvation by reschduling the
4361                          * oldest entries, which are marked with
4362                          * the NCE_F_DELAYED flag.
4363                          */
4364                         nce_ill_reschedule(ill, &ncert);
4365                 }
4366         } else {
4367                 ill->ill_defend_start = now;
4368         }
4369         ASSERT(ill->ill_defend_count <= defend_rate);
4370         mutex_enter(&ncec->ncec_lock);
4371         if (ncec->ncec_flags & NCE_F_DELAYED) {
4372                 /*
4373                  * This ncec was rescheduled as one of the really old
4374                  * entries needing on-going defense. The
4375                  * ill_defend_count was already incremented in
4376                  * nce_ill_reschedule. Go ahead and send the announce.
4377                  */
4378                 ncec->ncec_flags &= ~NCE_F_DELAYED;
4379                 mutex_exit(&ncec->ncec_lock);
4380                 ret = B_FALSE;
4381                 goto done;
4382         }
4383         mutex_exit(&ncec->ncec_lock);
4384         if (ill->ill_defend_count < defend_rate)
4385                 ill->ill_defend_count++;
4386         if (ill->ill_defend_count == defend_rate) {
4387                 /*
4388                  * we are no longer allowed to send unbidden defense
4389                  * messages. Wait for rescheduling.
4390                  */
4391                 ret = B_TRUE;
4392         } else {
4393                 ret = B_FALSE;
4394         }
4395 done:
4396         mutex_exit(&ill->ill_lock);
4397         /*
4398          * After all the locks have been dropped we can restart nce timer,
4399          * and refrele the delayed ncecs
4400          */
4401         for (i = 0; i < ncert.ncert_num; i++) {
4402                 clock_t xmit_interval;
4403                 ncec_t  *tmp;
4404 
4405                 tmp = ncert.ncert_nces[i];
4406                 xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4407                     B_FALSE);
4408                 nce_restart_timer(tmp, xmit_interval);
4409                 ncec_refrele(tmp);
4410         }
4411         return (ret);
4412 }
4413 
4414 boolean_t
4415 ndp_announce(ncec_t *ncec)
4416 {
4417         return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4418             ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4419             nce_advert_flags(ncec)));
4420 }
4421 
/*
 * Choose the source address (and the ill it belongs to) for a resolution
 * or DAD message sent on behalf of `ncec'.  Works for both IPv4 (addresses
 * handled in IPv4-mapped form) and IPv6.
 *
 * `src' must point at an unspecified address on entry.  On the paths that
 * reach the end of the function it is overwritten with the chosen source;
 * on the two early "return (NULL)" paths it is left untouched.
 *
 * Returns the ill associated with the chosen source with a hold placed on
 * it (via ipmp_ipif_hold_bound_ill() for an IPMP ill, ill_refhold()
 * otherwise), or NULL if no ipif was selected.
 * NOTE(review): the caller presumably has to ill_refrele() a non-NULL
 * result -- confirm against callers.
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
        mblk_t          *mp;
        in6_addr_t      src6;
        ipaddr_t        src4;
        ill_t           *ill = ncec->ncec_ill;
        ill_t           *src_ill = NULL;
        ipif_t          *ipif = NULL;
        boolean_t       is_myaddr = NCE_MYADDR(ncec);
        boolean_t       isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

        ASSERT(src != NULL);
        ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
        src6 = *src;
        if (is_myaddr) {
                /* Our own address: the source is the ncec address itself. */
                src6 = ncec->ncec_addr;
                if (!isv6)
                        IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
        } else {
                /*
                 * try to find one from the outgoing packet.
                 */
                mutex_enter(&ncec->ncec_lock);
                mp = ncec->ncec_qd_mp;
                if (mp != NULL) {
                        if (isv6) {
                                ip6_t   *ip6h = (ip6_t *)mp->b_rptr;

                                src6 = ip6h->ip6_src;
                        } else {
                                ipha_t  *ipha = (ipha_t *)mp->b_rptr;

                                src4 = ipha->ipha_src;
                                IN6_IPADDR_TO_V4MAPPED(src4, &src6);
                        }
                }
                mutex_exit(&ncec->ncec_lock);
        }

        /*
         * For outgoing packets, if the src of outgoing packet is one
         * of the assigned interface addresses use it, otherwise we
         * will pick the source address below.
         * For local addresses (is_myaddr) doing DAD, NDP announce
         * messages are mcast. So we use the (IPMP) cast_ill or the
         * (non-IPMP) ncec_ill for these message types. The only case
         * of unicast DAD messages are for IPv6 ND probes, for which
         * we find the ipif_bound_ill corresponding to the ncec_addr.
         */
        if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
                if (isv6) {
                        ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
                            ill->ill_ipst);
                } else {
                        ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
                            ill->ill_ipst);
                }

                /*
                 * If no relevant ipif can be found, then it's not one of our
                 * addresses.  Reset to :: and try to find a src for the NS or
                 * ARP request using ipif_select_source_v[4,6]  below.
                 * If an ipif can be found, but it's not yet done with
                 * DAD verification, and we are not being invoked for
                 * DAD (i.e., !is_myaddr), then just postpone this
                 * transmission until later.
                 */
                if (ipif == NULL) {
                        src6 = ipv6_all_zeros;
                        src4 = INADDR_ANY;
                } else if (!ipif->ipif_addr_ready && !is_myaddr) {
                        DTRACE_PROBE2(nce__resolve__ipif__not__ready,
                            ncec_t *, ncec, ipif_t *, ipif);
                        ipif_refrele(ipif);
                        return (NULL);
                }
        }

        if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
                /*
                 * Pick a source address for this solicitation, but
                 * restrict the selection to addresses assigned to the
                 * output interface.  We do this because the destination will
                 * create a neighbor cache entry for the source address of
                 * this packet, so the source address had better be a valid
                 * neighbor.
                 */
                if (isv6) {
                        ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
                            B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
                            B_FALSE, NULL);
                } else {
                        ipaddr_t nce_addr;

                        IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
                        ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
                            B_FALSE, NULL);
                }
                /*
                 * Nothing usable on the IPMP ill itself: retry the
                 * selection against the ill returned by
                 * ipmp_ill_hold_xmit_ill().
                 */
                if (ipif == NULL && IS_IPMP(ill)) {
                        ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);

                        if (send_ill != NULL) {
                                if (isv6) {
                                        ipif = ipif_select_source_v6(send_ill,
                                            &ncec->ncec_addr, B_TRUE,
                                            IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
                                            B_FALSE, NULL);
                                } else {
                                        IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
                                            src4);
                                        ipif = ipif_select_source_v4(send_ill,
                                            src4, ALL_ZONES, B_TRUE, NULL);
                                }
                                ill_refrele(send_ill);
                        }
                }

                if (ipif == NULL) {
                        char buf[INET6_ADDRSTRLEN];

                        ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
                            inet_ntop((isv6 ? AF_INET6 : AF_INET),
                            (char *)&ncec->ncec_addr, buf, sizeof (buf))));
                        DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
                        return (NULL);
                }
                src6 = ipif->ipif_v6lcl_addr;
        }
        *src = src6;
        if (ipif != NULL) {
                /*
                 * Trade the ipif reference for a hold on the ill that is
                 * handed back to the caller.
                 */
                src_ill = ipif->ipif_ill;
                if (IS_IPMP(src_ill))
                        src_ill = ipmp_ipif_hold_bound_ill(ipif);
                else
                        ill_refhold(src_ill);
                ipif_refrele(ipif);
                DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
                    ill_t *, src_ill);
        }
        return (src_ill);
}
4564 
4565 void
4566 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4567     uchar_t *hwaddr, int hwaddr_len, int flags)
4568 {
4569         ill_t   *ill;
4570         ncec_t  *ncec;
4571         nce_t   *nce;
4572         uint16_t new_state;
4573 
4574         ill = (ipif ? ipif->ipif_ill : NULL);
4575         if (ill != NULL) {
4576                 /*
4577                  * only one ncec is possible
4578                  */
4579                 nce = nce_lookup_v4(ill, addr);
4580                 if (nce != NULL) {
4581                         ncec = nce->nce_common;
4582                         mutex_enter(&ncec->ncec_lock);
4583                         if (NCE_ISREACHABLE(ncec))
4584                                 new_state = ND_UNCHANGED;
4585                         else
4586                                 new_state = ND_STALE;
4587                         ncec->ncec_flags = flags;
4588                         nce_update(ncec, new_state, hwaddr);
4589                         mutex_exit(&ncec->ncec_lock);
4590                         nce_refrele(nce);
4591                         return;
4592                 }
4593         } else {
4594                 /*
4595                  * ill is wildcard; clean up all ncec's and ire's
4596                  * that match on addr.
4597                  */
4598                 nce_hw_map_t hwm;
4599 
4600                 hwm.hwm_addr = *addr;
4601                 hwm.hwm_hwlen = hwaddr_len;
4602                 hwm.hwm_hwaddr = hwaddr;
4603                 hwm.hwm_flags = flags;
4604 
4605                 ncec_walk_common(ipst->ips_ndp4, NULL,
4606                     nce_update_hw_changed, &hwm, B_TRUE);
4607         }
4608 }
4609 
4610 /*
4611  * Common function to add ncec entries.
4612  * we always add the ncec with ncec_ill == ill, and always create
4613  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4614  * ncec is !reachable.
4615  *
4616  * When the caller passes in an nce_state of ND_UNCHANGED,
4617  * nce_add_common() will determine the state of the created nce based
4618  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4619  * be created with state set to the passed in nce_state.
4620  */
4621 static int
4622 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4623     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4624 {
4625         static  ncec_t          nce_nil;
4626         uchar_t                 *template = NULL;
4627         int                     err;
4628         ncec_t                  *ncec;
4629         ncec_t                  **ncep;
4630         ip_stack_t              *ipst = ill->ill_ipst;
4631         uint16_t                state;
4632         boolean_t               fastprobe = B_FALSE;
4633         struct ndp_g_s          *ndp;
4634         nce_t                   *nce = NULL;
4635         list_t                  graveyard;
4636         mblk_t                  *dlur_mp = NULL;
4637 
4638         if (ill->ill_isv6)
4639                 ndp = ill->ill_ipst->ips_ndp6;
4640         else
4641                 ndp = ill->ill_ipst->ips_ndp4;
4642 
4643         *retnce = NULL;
4644 
4645         ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4646 
4647         if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4648                 ip0dbg(("nce_add_common: no addr\n"));
4649                 return (EINVAL);
4650         }
4651         if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4652                 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4653                 return (EINVAL);
4654         }
4655 
4656         if (ill->ill_isv6) {
4657                 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4658         } else {
4659                 ipaddr_t v4addr;
4660 
4661                 IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4662                 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4663         }
4664 
4665         /*
4666          * The caller has ensured that there is no nce on ill, but there could
4667          * still be an nce_common_t for the address, so that we find exisiting
4668          * ncec_t strucutures first, and atomically add a new nce_t if
4669          * one is found. The ndp_g_lock ensures that we don't cross threads
4670          * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4671          * compare for matches across the illgrp because this function is
4672          * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4673          * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4674          * appropriate.
4675          */
4676         ncec = *ncep;
4677         for (; ncec != NULL; ncec = ncec->ncec_next) {
4678                 if (ncec->ncec_ill == ill) {
4679                         if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4680                                 /*
4681                                  * We should never find *retnce to be
4682                                  * MYADDR, since the caller may then
4683                                  * incorrectly restart a DAD timer that's
4684                                  * already running.  However, if we are in
4685                                  * forwarding mode, and the interface is
4686                                  * moving in/out of groups, the data
4687                                  * path ire lookup (e.g., ire_revalidate_nce)
4688                                  * may  have determined that some destination
4689                                  * is offlink while the control path is adding
4690                                  * that address as a local address.
4691                                  * Recover from  this case by failing the
4692                                  * lookup
4693                                  */
4694                                 if (NCE_MYADDR(ncec))
4695                                         return (ENXIO);
4696                                 *retnce = nce_ill_lookup_then_add(ill, ncec);
4697                                 if (*retnce != NULL)
4698                                         break;
4699                         }
4700                 }
4701         }
4702         if (*retnce != NULL) /* caller must trigger fastpath on nce */
4703                 return (0);
4704 
4705         ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4706         if (ncec == NULL)
4707                 return (ENOMEM);
4708         *ncec = nce_nil;
4709         ncec->ncec_ill = ill;
4710         ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4711         ncec->ncec_flags = flags;
4712         ncec->ncec_ipst = ipst;      /* No netstack_hold */
4713 
4714         if (!ill->ill_isv6) {
4715                 ipaddr_t addr4;
4716 
4717                 /*
4718                  * DAD probe interval and probe count are set based on
4719                  * fast/slow probe settings. If the underlying link doesn't
4720                  * have reliably up/down notifications or if we're working
4721                  * with IPv4 169.254.0.0/16 Link Local Address space, then
4722                  * don't use the fast timers.  Otherwise, use them.
4723                  */
4724                 ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4725                 IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4726                 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4727                         fastprobe = B_TRUE;
4728                 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4729                     !IS_IPV4_LL_SPACE(&addr4)) {
4730                         ill_t *hwaddr_ill;
4731 
4732                         hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4733                             hw_addr_len);
4734                         if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4735                                 fastprobe = B_TRUE;
4736                 }
4737                 if (fastprobe) {
4738                         ncec->ncec_xmit_interval =
4739                             ipst->ips_arp_fastprobe_interval;
4740                         ncec->ncec_pcnt =
4741                             ipst->ips_arp_fastprobe_count;
4742                         ncec->ncec_flags |= NCE_F_FAST;
4743                 } else {
4744                         ncec->ncec_xmit_interval =
4745                             ipst->ips_arp_probe_interval;
4746                         ncec->ncec_pcnt =
4747                             ipst->ips_arp_probe_count;
4748                 }
4749                 if (NCE_PUBLISH(ncec)) {
4750                         ncec->ncec_unsolicit_count =
4751                             ipst->ips_ip_arp_publish_count;
4752                 }
4753         } else {
4754                 /*
4755                  * probe interval is constant: ILL_PROBE_INTERVAL
4756                  * probe count is constant: ND_MAX_UNICAST_SOLICIT
4757                  */
4758                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4759                 if (NCE_PUBLISH(ncec)) {
4760                         ncec->ncec_unsolicit_count =
4761                             ipst->ips_ip_ndp_unsolicit_count;
4762                 }
4763         }
4764         ncec->ncec_rcnt = ill->ill_xmit_count;
4765         ncec->ncec_addr = *addr;
4766         ncec->ncec_qd_mp = NULL;
4767         ncec->ncec_refcnt = 1; /* for ncec getting created */
4768         mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4769         ncec->ncec_trace_disable = B_FALSE;
4770 
4771         /*
4772          * ncec_lladdr holds link layer address
4773          */
4774         if (hw_addr_len > 0) {
4775                 template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4776                 if (template == NULL) {
4777                         err = ENOMEM;
4778                         goto err_ret;
4779                 }
4780                 ncec->ncec_lladdr = template;
4781                 ncec->ncec_lladdr_length = hw_addr_len;
4782                 bzero(ncec->ncec_lladdr, hw_addr_len);
4783         }
4784         if ((flags & NCE_F_BCAST) != 0) {
4785                 state = ND_REACHABLE;
4786                 ASSERT(hw_addr_len > 0);
4787         } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4788                 state = ND_INITIAL;
4789         } else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4790                 /*
4791                  * NORESOLVER entries are always created in the REACHABLE
4792                  * state.
4793                  */
4794                 state = ND_REACHABLE;
4795                 if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4796                     ill->ill_mactype != DL_IPV4 &&
4797                     ill->ill_mactype != DL_6TO4) {
4798                         /*
4799                          * We create a nce_res_mp with the IP nexthop address
4800                          * as the destination address if the physical length
4801                          * is exactly 4 bytes for point-to-multipoint links
4802                          * that do their own resolution from IP to link-layer
4803                          * address (e.g. IP over X.25).
4804                          */
4805                         bcopy((uchar_t *)addr,
4806                             ncec->ncec_lladdr, ill->ill_phys_addr_length);
4807                 }
4808                 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4809                     ill->ill_mactype != DL_IPV6) {
4810                         /*
4811                          * We create a nce_res_mp with the IP nexthop address
4812                          * as the destination address if the physical legnth
4813                          * is exactly 16 bytes for point-to-multipoint links
4814                          * that do their own resolution from IP to link-layer
4815                          * address.
4816                          */
4817                         bcopy((uchar_t *)addr,
4818                             ncec->ncec_lladdr, ill->ill_phys_addr_length);
4819                 }
4820                 /*
4821                  * Since NUD is not part of the base IPv4 protocol definition,
4822                  * IPv4 neighbor entries on NORESOLVER interfaces will never
4823                  * age, and are marked NCE_F_NONUD.
4824                  */
4825                 if (!ill->ill_isv6)
4826                         ncec->ncec_flags |= NCE_F_NONUD;
4827         } else if (ill->ill_net_type == IRE_LOOPBACK) {
4828                 state = ND_REACHABLE;
4829         }
4830 
4831         if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4832                 /*
4833                  * We are adding an ncec with a deterministic hw_addr,
4834                  * so the state can only be one of {REACHABLE, STALE, PROBE}.
4835                  *
4836                  * if we are adding a unicast ncec for the local address
4837                  * it would be REACHABLE; we would be adding a ND_STALE entry
4838                  * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4839                  * addresses are added in PROBE to trigger DAD.
4840                  */
4841                 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4842                     ill->ill_net_type == IRE_IF_NORESOLVER)
4843                         state = ND_REACHABLE;
4844                 else if (!NCE_PUBLISH(ncec))
4845                         state = ND_STALE;
4846                 else
4847                         state = ND_PROBE;
4848                 if (hw_addr != NULL)
4849                         nce_set_ll(ncec, hw_addr);
4850         }
4851         /* caller overrides internally computed state */
4852         if (nce_state != ND_UNCHANGED)
4853                 state = nce_state;
4854 
4855         if (state == ND_PROBE)
4856                 ncec->ncec_flags |= NCE_F_UNVERIFIED;
4857 
4858         ncec->ncec_state = state;
4859 
4860         if (state == ND_REACHABLE) {
4861                 ncec->ncec_last = ncec->ncec_init_time =
4862                     TICK_TO_MSEC(ddi_get_lbolt64());
4863         } else {
4864                 ncec->ncec_last = 0;
4865                 if (state == ND_INITIAL)
4866                         ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4867         }
4868         list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4869             offsetof(ncec_cb_t, ncec_cb_node));
4870         /*
4871          * have all the memory allocations out of the way before taking locks
4872          * and adding the nce.
4873          */
4874         nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4875         if (nce == NULL) {
4876                 err = ENOMEM;
4877                 goto err_ret;
4878         }
4879         if (ncec->ncec_lladdr != NULL ||
4880             ill->ill_net_type == IRE_IF_NORESOLVER) {
4881                 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4882                     ill->ill_phys_addr_length, ill->ill_sap,
4883                     ill->ill_sap_length);
4884                 if (dlur_mp == NULL) {
4885                         err = ENOMEM;
4886                         goto err_ret;
4887                 }
4888         }
4889 
4890         /*
4891          * Atomically ensure that the ill is not CONDEMNED, before
4892          * adding the NCE.
4893          */
4894         mutex_enter(&ill->ill_lock);
4895         if (ill->ill_state_flags & ILL_CONDEMNED) {
4896                 mutex_exit(&ill->ill_lock);
4897                 err = EINVAL;
4898                 goto err_ret;
4899         }
4900         if (!NCE_MYADDR(ncec) &&
4901             (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4902                 mutex_exit(&ill->ill_lock);
4903                 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4904                 err = EINVAL;
4905                 goto err_ret;
4906         }
4907         /*
4908          * Acquire the ncec_lock even before adding the ncec to the list
4909          * so that it cannot get deleted after the ncec is added, but
4910          * before we add the nce.
4911          */
4912         mutex_enter(&ncec->ncec_lock);
4913         if ((ncec->ncec_next = *ncep) != NULL)
4914                 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4915         *ncep = ncec;
4916         ncec->ncec_ptpn = ncep;
4917 
4918         /* Bump up the number of ncec's referencing this ill */
4919         DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4920             (char *), "ncec", (void *), ncec);
4921         ill->ill_ncec_cnt++;
4922         /*
4923          * Since we hold the ncec_lock at this time, the ncec cannot be
4924          * condemned, and we can safely add the nce.
4925          */
4926         list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
4927         *retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
4928         mutex_exit(&ncec->ncec_lock);
4929         mutex_exit(&ill->ill_lock);
4930         nce_graveyard_free(&graveyard);
4931 
4932         /* caller must trigger fastpath on *retnce */
4933         return (0);
4934 
4935 err_ret:
4936         if (ncec != NULL)
4937                 kmem_cache_free(ncec_cache, ncec);
4938         if (nce != NULL)
4939                 kmem_cache_free(nce_cache, nce);
4940         freemsg(dlur_mp);
4941         if (template != NULL)
4942                 kmem_free(template, ill->ill_phys_addr_length);
4943         return (err);
4944 }
4945 
4946 /*
4947  * take a ref on the nce
4948  */
4949 void
4950 nce_refhold(nce_t *nce)
4951 {
4952         mutex_enter(&nce->nce_lock);
4953         nce->nce_refcnt++;
4954         ASSERT((nce)->nce_refcnt != 0);
4955         mutex_exit(&nce->nce_lock);
4956 }
4957 
4958 /*
4959  * release a ref on the nce; In general, this
4960  * cannot be called with locks held because nce_inactive
4961  * may result in nce_inactive which will take the ill_lock,
4962  * do ipif_ill_refrele_tail etc. Thus the one exception
4963  * where this can be called with locks held is when the caller
4964  * is certain that the nce_refcnt is sufficient to prevent
4965  * the invocation of nce_inactive.
4966  */
4967 void
4968 nce_refrele(nce_t *nce)
4969 {
4970         ASSERT((nce)->nce_refcnt != 0);
4971         mutex_enter(&nce->nce_lock);
4972         if (--nce->nce_refcnt == 0)
4973                 nce_inactive(nce); /* destroys the mutex */
4974         else
4975                 mutex_exit(&nce->nce_lock);
4976 }
4977 
4978 /*
4979  * free the nce after all refs have gone away.
4980  */
4981 static void
4982 nce_inactive(nce_t *nce)
4983 {
4984         ill_t *ill = nce->nce_ill;
4985 
4986         ASSERT(nce->nce_refcnt == 0);
4987 
4988         ncec_refrele_notr(nce->nce_common);
4989         nce->nce_common = NULL;
4990         freemsg(nce->nce_fp_mp);
4991         freemsg(nce->nce_dlur_mp);
4992 
4993         mutex_enter(&ill->ill_lock);
4994         DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4995             (char *), "nce", (void *), nce);
4996         ill->ill_nce_cnt--;
4997         nce->nce_ill = NULL;
4998         /*
4999          * If the number of ncec's associated with this ill have dropped
5000          * to zero, check whether we need to restart any operation that
5001          * is waiting for this to happen.
5002          */
5003         if (ILL_DOWN_OK(ill)) {
5004                 /* ipif_ill_refrele_tail drops the ill_lock */
5005                 ipif_ill_refrele_tail(ill);
5006         } else {
5007                 mutex_exit(&ill->ill_lock);
5008         }
5009 
5010         mutex_destroy(&nce->nce_lock);
5011         kmem_cache_free(nce_cache, nce);
5012 }
5013 
5014 /*
5015  * Add an nce to the ill_nce list.
5016  *
5017  * Adding multicast NCEs is subject to a per-ill limit. This function returns
5018  * NULL if that's the case, and it may reap a number of multicast nces.
5019  * Callers (and upstack) must be able to cope with NULL returns.
5020  */
5021 static nce_t *
5022 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
5023     list_t *graveyard)
5024 {
5025         ASSERT(MUTEX_HELD(&ill->ill_lock));
5026 
5027         if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
5028                 if (nce_too_many_mcast(ill, graveyard)) {
5029                         kmem_cache_free(nce_cache, nce);
5030                         return (NULL);
5031                 }
5032                 ill->ill_mcast_nces++;
5033         }
5034 
5035         bzero(nce, sizeof (*nce));
5036         mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
5037         nce->nce_common = ncec;
5038         nce->nce_addr = ncec->ncec_addr;
5039         nce->nce_ill = ill;
5040         DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
5041             (char *), "nce", (void *), nce);
5042         ill->ill_nce_cnt++;
5043 
5044         nce->nce_refcnt = 1; /* for the thread */
5045         ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
5046         nce->nce_dlur_mp = dlur_mp;
5047 
5048         /* add nce to the ill's fastpath list.  */
5049         nce->nce_refcnt++; /* for the list */
5050         list_insert_head(&ill->ill_nce, nce);
5051         return (nce);
5052 }
5053 
5054 static nce_t *
5055 nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
5056 {
5057         nce_t   *nce;
5058         mblk_t  *dlur_mp = NULL;
5059 
5060         ASSERT(MUTEX_HELD(&ill->ill_lock));
5061         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
5062 
5063         nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
5064         if (nce == NULL)
5065                 return (NULL);
5066         if (ncec->ncec_lladdr != NULL ||
5067             ill->ill_net_type == IRE_IF_NORESOLVER) {
5068                 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
5069                     ill->ill_phys_addr_length, ill->ill_sap,
5070                     ill->ill_sap_length);
5071                 if (dlur_mp == NULL) {
5072                         kmem_cache_free(nce_cache, nce);
5073                         return (NULL);
5074                 }
5075         }
5076         /*
5077          * If nce_add_impl() returns NULL due to on multicast limiting, caller
5078          * will (correctly) assume ENOMEM.
5079          */
5080         return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
5081 }
5082 
5083 /*
5084  * remove the nce from the ill_faspath list
5085  */
5086 void
5087 nce_delete(nce_t *nce)
5088 {
5089         ill_t   *ill = nce->nce_ill;
5090 
5091         ASSERT(MUTEX_HELD(&ill->ill_lock));
5092 
5093         mutex_enter(&nce->nce_lock);
5094         if (nce->nce_is_condemned) {
5095                 /*
5096                  * some other thread has removed this nce from the ill_nce list
5097                  */
5098                 mutex_exit(&nce->nce_lock);
5099                 return;
5100         }
5101         nce->nce_is_condemned = B_TRUE;
5102         mutex_exit(&nce->nce_lock);
5103 
5104         /* Update the count of multicast NCEs. */
5105         if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
5106                 ill->ill_mcast_nces--;
5107 
5108         list_remove(&ill->ill_nce, nce);
5109         /*
5110          * even though we are holding the ill_lock, it is ok to
5111          * call nce_refrele here because we know that we should have
5112          * at least 2 refs on the nce: one for the thread, and one
5113          * for the list. The refrele below will release the one for
5114          * the list.
5115          */
5116         nce_refrele(nce);
5117 }
5118 
5119 nce_t *
5120 nce_lookup(ill_t *ill, const in6_addr_t *addr)
5121 {
5122         nce_t *nce = NULL;
5123 
5124         ASSERT(ill != NULL);
5125         ASSERT(MUTEX_HELD(&ill->ill_lock));
5126 
5127         for (nce = list_head(&ill->ill_nce); nce != NULL;
5128             nce = list_next(&ill->ill_nce, nce)) {
5129                 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
5130                         break;
5131         }
5132 
5133         /*
5134          * if we found the nce on the ill_nce list while holding
5135          * the ill_lock, then it cannot be condemned yet.
5136          */
5137         if (nce != NULL) {
5138                 ASSERT(!nce->nce_is_condemned);
5139                 nce_refhold(nce);
5140         }
5141         return (nce);
5142 }
5143 
5144 /*
5145  * Walk the ill_nce list on ill. The callback function func() cannot perform
5146  * any destructive actions.
5147  */
5148 static void
5149 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
5150 {
5151         nce_t *nce = NULL, *nce_next;
5152 
5153         ASSERT(MUTEX_HELD(&ill->ill_lock));
5154         for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
5155                 nce_next = list_next(&ill->ill_nce, nce);
5156                 if (func(ill, nce, arg) != 0)
5157                         break;
5158                 nce = nce_next;
5159         }
5160 }
5161 
5162 void
5163 nce_walk(ill_t *ill, pfi_t func, void *arg)
5164 {
5165         mutex_enter(&ill->ill_lock);
5166         nce_walk_common(ill, func, arg);
5167         mutex_exit(&ill->ill_lock);
5168 }
5169 
5170 void
5171 nce_flush(ill_t *ill, boolean_t flushall)
5172 {
5173         nce_t *nce, *nce_next;
5174         list_t dead;
5175 
5176         list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
5177         mutex_enter(&ill->ill_lock);
5178         for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
5179                 nce_next = list_next(&ill->ill_nce, nce);
5180                 if (!flushall && NCE_PUBLISH(nce->nce_common)) {
5181                         nce = nce_next;
5182                         continue;
5183                 }
5184                 /*
5185                  * nce_delete requires that the caller should either not
5186                  * be holding locks, or should hold a ref to ensure that
5187                  * we wont hit ncec_inactive. So take a ref and clean up
5188                  * after the list is flushed.
5189                  */
5190                 nce_refhold(nce);
5191                 nce_delete(nce);
5192                 list_insert_tail(&dead, nce);
5193                 nce = nce_next;
5194         }
5195         mutex_exit(&ill->ill_lock);
5196         while ((nce = list_head(&dead)) != NULL) {
5197                 list_remove(&dead, nce);
5198                 nce_refrele(nce);
5199         }
5200         ASSERT(list_is_empty(&dead));
5201         list_destroy(&dead);
5202 }
5203 
5204 /* Return an interval that is anywhere in the [1 .. intv] range */
5205 static clock_t
5206 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
5207 {
5208         clock_t rnd, frac;
5209 
5210         (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
5211         /* Note that clock_t is signed; must chop off bits */
5212         rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
5213         if (initial_time) {
5214                 if (intv <= 0)
5215                         intv = 1;
5216                 else
5217                         intv = (rnd % intv) + 1;
5218         } else {
5219                 /* Compute 'frac' as 20% of the configured interval */
5220                 if ((frac = intv / 5) <= 1)
5221                         frac = 2;
5222                 /* Set intv randomly in the range [intv-frac .. intv+frac] */
5223                 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
5224                         intv = 1;
5225         }
5226         return (intv);
5227 }
5228 
/*
 * Send out the packets that were queued on an IPMP ncec (ncec_qd_mp).
 *
 * The queue and the probe count are detached from the ncec under ncec_lock.
 * Each packet is then transmitted with ip_xmit() through an nce created by
 * nce_fastpath_create() on an ill chosen for that packet (send_ill).  A
 * packet for which no send_ill or no such nce can be obtained is freed and
 * counted in ipIfStatsOutDiscards.  Once the queue has been drained,
 * ncec_cb_dispatch() is called on the ncec.
 *
 * The ncec's ill must be an IPMP ill.
 */
void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
        mblk_t *mp;
        uint_t pkt_len;
        iaflags_t ixaflags = IXAF_NO_TRACE;
        nce_t *under_nce;
        ill_t   *ill = ncec->ncec_ill;
        boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
        ipif_t *src_ipif = NULL;
        ip_stack_t *ipst = ill->ill_ipst;
        ill_t *send_ill;
        uint_t nprobes;

        ASSERT(IS_IPMP(ill));

        /*
         * Take the whole queue (and the probe count) off the ncec so that
         * the packets can be processed without holding ncec_lock.
         */
        mutex_enter(&ncec->ncec_lock);
        nprobes = ncec->ncec_nprobes;
        mp = ncec->ncec_qd_mp;
        ncec->ncec_qd_mp = NULL;
        ncec->ncec_nprobes = 0;
        mutex_exit(&ncec->ncec_lock);

        while (mp != NULL) {
                mblk_t *nxt_mp;

                /* Unchain this packet from the b_next linked queue. */
                nxt_mp = mp->b_next;
                mp->b_next = NULL;
                /*
                 * Compute the packet length from the IP header and look up
                 * an ipif matching the packet's source address.
                 */
                if (isv6) {
                        ip6_t *ip6h = (ip6_t *)mp->b_rptr;

                        pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
                        src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
                            ill, ALL_ZONES, ipst);
                } else {
                        ipha_t *ipha = (ipha_t *)mp->b_rptr;

                        ixaflags |= IXAF_IS_IPV4;
                        pkt_len = ntohs(ipha->ipha_length);
                        src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
                            ill, ALL_ZONES, ipst);
                }

                /*
                 * find a new nce based on an under_ill. The first IPMP probe
                 * packet gets queued, so we could still find a src_ipif that
                 * matches an IPMP test address.
                 */
                if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
                        /*
                         * if src_ipif is null, this could be either a
                         * forwarded packet or a probe whose src got deleted.
                         * We identify the former case by looking for the
                         * ncec_nprobes: the first ncec_nprobes packets are
                         * probes;
                         */
                        if (src_ipif == NULL && nprobes > 0)
                                goto drop_pkt;

                        /*
                         * For forwarded packets, we use the ipmp rotor
                         * to find send_ill.
                         */
                        send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
                            B_TRUE);
                } else {
                        /* Send on the ill that owns the source address. */
                        send_ill = src_ipif->ipif_ill;
                        ill_refhold(send_ill);
                }

                DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
                    (ncec_t *), ncec, (ipif_t *),
                    src_ipif, (ill_t *), send_ill);

                if (send_ill == NULL) {
                        /* No ill to send on; release src_ipif and drop. */
                        if (src_ipif != NULL)
                                ipif_refrele(src_ipif);
                        goto drop_pkt;
                }
                /* create an under_nce on send_ill */
                rw_enter(&ipst->ips_ill_g_lock, RW_READER);
                if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
                        under_nce = nce_fastpath_create(send_ill, ncec);
                else
                        under_nce = NULL;
                rw_exit(&ipst->ips_ill_g_lock);
                if (under_nce != NULL && NCE_ISREACHABLE(ncec))
                        nce_fastpath_trigger(under_nce);

                /* The holds on send_ill and src_ipif are no longer needed. */
                ill_refrele(send_ill);
                if (src_ipif != NULL)
                        ipif_refrele(src_ipif);

                if (under_nce != NULL) {
                        /* Transmit, then release our hold on under_nce. */
                        (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
                            ALL_ZONES, 0, NULL);
                        nce_refrele(under_nce);
                        if (nprobes > 0)
                                nprobes--;
                        mp = nxt_mp;
                        continue;
                }
drop_pkt:
                /* Count the discard, free the packet and move on. */
                if (isv6) {
                        BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
                } else {
                        BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
                }
                ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
                freemsg(mp);
                if (nprobes > 0)
                        nprobes--;
                mp = nxt_mp;
        }
        ncec_cb_dispatch(ncec); /* complete callbacks */
}