1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * Copyright (c) 2018, Joyent, Inc.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/stropts.h>
  32 #include <sys/strsun.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/errno.h>
  35 #include <sys/dlpi.h>
  36 #include <sys/socket.h>
  37 #include <sys/ddi.h>
  38 #include <sys/sunddi.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/debug.h>
  41 #include <sys/vtrace.h>
  42 #include <sys/kmem.h>
  43 #include <sys/zone.h>
  44 #include <sys/ethernet.h>
  45 #include <sys/sdt.h>
  46 #include <sys/mac.h>
  47 
  48 #include <net/if.h>
  49 #include <net/if_types.h>
  50 #include <net/if_dl.h>
  51 #include <net/route.h>
  52 #include <netinet/in.h>
  53 #include <netinet/ip6.h>
  54 #include <netinet/icmp6.h>
  55 
  56 #include <inet/common.h>
  57 #include <inet/mi.h>
  58 #include <inet/mib2.h>
  59 #include <inet/nd.h>
  60 #include <inet/ip.h>
  61 #include <inet/ip_impl.h>
  62 #include <inet/ipclassifier.h>
  63 #include <inet/ip_if.h>
  64 #include <inet/ip_ire.h>
  65 #include <inet/ip_rts.h>
  66 #include <inet/ip6.h>
  67 #include <inet/ip_ndp.h>
  68 #include <inet/sctp_ip.h>
  69 #include <inet/ip_arp.h>
  70 #include <inet/ip2mac_impl.h>
  71 
  72 #define ANNOUNCE_INTERVAL(isv6) \
  73         (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
  74         ipst->ips_ip_arp_publish_interval)
  75 
  76 #define DEFENSE_INTERVAL(isv6) \
  77         (isv6 ? ipst->ips_ndp_defend_interval : \
  78         ipst->ips_arp_defend_interval)
  79 
  80 /* Non-tunable probe interval, based on link capabilities */
  81 #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)
  82 
  83 /*
  84  * The IPv4 Link Local address space is special; we do extra duplicate checking
  85  * there, as the entire assignment mechanism rests on random numbers.
  86  */
  87 #define IS_IPV4_LL_SPACE(ptr)   (((uchar_t *)ptr)[0] == 169 && \
  88                                 ((uchar_t *)ptr)[1] == 254)
  89 
/*
 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
 * in to the ncec*add* functions.  It is the bitwise OR of the NCE_F_* flags
 * listed below; any flag not listed here is not part of that set.
 *
 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
 * that we will respond to requests for the protocol address.
 */
#define NCE_EXTERNAL_FLAGS_MASK \
        (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
        NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
        NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
 102 
 103 /*
 104  * Lock ordering:
 105  *
 106  *      ndp_g_lock -> ill_lock -> ncec_lock
 107  *
 108  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 109  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
 110  * ncec_refcnt).
 111  */
 112 
/*
 * Forward declarations.  Everything here except nce_lookup() is declared
 * static, i.e. with internal linkage.  Some of these routines are defined
 * further down without repeating the `static' keyword; the earlier static
 * declaration still gives those definitions internal linkage.
 */
static  void    nce_cleanup_list(ncec_t *ncec);
static  void    nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
static  ncec_t  *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
    ncec_t *);
static  nce_t   *nce_lookup_addr(ill_t *, const in6_addr_t *);
static  int     nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
    uint16_t ncec_flags, nce_t **newnce);
static  int     nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t ncec_flags, nce_t **newnce);
static  boolean_t       ndp_xmit(ill_t *ill, uint32_t operation,
    uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static void     ncec_refhold_locked(ncec_t *);
static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
static  void    nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
static  int     nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
static nce_t *nce_add(ill_t *, ncec_t *);
static void nce_inactive(nce_t *);
/* Declared extern: looked up on a single ill; see its use in nce_lookup_addr */
extern nce_t    *nce_lookup(ill_t *, const in6_addr_t *);
static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
static int      nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static int      nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
    uint16_t, uint16_t, nce_t **);
static int  nce_add_v6_postprocess(nce_t *);
static int  nce_add_v4_postprocess(nce_t *);
static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
static clock_t nce_fuzz_interval(clock_t, boolean_t);
static void nce_resolv_ipmp_ok(ncec_t *);
static void nce_walk_common(ill_t *, pfi_t, void *);
static void nce_start_timer(ncec_t *, uint_t);
static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
static void nce_fastpath_trigger(nce_t *);
static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);

/* Only built (and only called) when DEBUG is defined. */
#ifdef DEBUG
static void     ncec_trace_cleanup(const ncec_t *);
#endif
 153 
/*
 * Address of the hash-bucket head for `addr' in the per-stack IPv4 neighbor
 * table (ips_ndp4).  The bucket index comes from IRE_ADDR_HASH() over
 * NCE_TABLE_SIZE buckets.
 */
#define NCE_HASH_PTR_V4(ipst, addr)                                     \
        (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * Address of the hash-bucket head for `addr' in the per-stack IPv6 neighbor
 * table (ips_ndp6).  The bucket index comes from NCE_ADDR_HASH_V6() over
 * NCE_TABLE_SIZE buckets.
 */
#define NCE_HASH_PTR_V6(ipst, addr)                              \
        (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
                NCE_TABLE_SIZE)]))

/* kmem caches for ncec_t and nce_t objects; defined outside this file. */
extern kmem_cache_t *ncec_cache;
extern kmem_cache_t *nce_cache;
 163 
 164 /*
 165  * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
 166  * If src_ill is not null, the ncec_addr is bound to src_ill. The
 167  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
 168  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
 169  * IPMP cast_ill (in the IPMP case).
 170  *
 171  * Note that the probe interval is based on the src_ill for IPv6, and
 172  * the ncec_xmit_interval for IPv4.
 173  */
 174 static void
 175 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
 176 {
 177         boolean_t dropped;
 178         uint32_t probe_interval;
 179 
 180         ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
 181         ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
 182         if (ncec->ncec_ipversion == IPV6_VERSION) {
 183                 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
 184                     ncec->ncec_lladdr, ncec->ncec_lladdr_length,
 185                     &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
 186                 probe_interval = ILL_PROBE_INTERVAL(src_ill);
 187         } else {
 188                 /* IPv4 DAD delay the initial probe. */
 189                 if (send_probe)
 190                         dropped = arp_probe(ncec);
 191                 else
 192                         dropped = B_TRUE;
 193                 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
 194                     !send_probe);
 195         }
 196         if (!dropped) {
 197                 mutex_enter(&ncec->ncec_lock);
 198                 ncec->ncec_pcnt--;
 199                 mutex_exit(&ncec->ncec_lock);
 200         }
 201         nce_restart_timer(ncec, probe_interval);
 202 }
 203 
 204 /*
 205  * Compute default flags to use for an advertisement of this ncec's address.
 206  */
 207 static int
 208 nce_advert_flags(const ncec_t *ncec)
 209 {
 210         int flag = 0;
 211 
 212         if (ncec->ncec_flags & NCE_F_ISROUTER)
 213                 flag |= NDP_ISROUTER;
 214         if (!(ncec->ncec_flags & NCE_F_ANYCAST))
 215                 flag |= NDP_ORIDE;
 216 
 217         return (flag);
 218 }
 219 
 220 /*
 221  * NDP Cache Entry creation routine.
 222  * This routine must always be called with ndp6->ndp_g_lock held.
 223  */
 224 int
 225 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
 226     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
 227 {
 228         int             err;
 229         nce_t           *nce;
 230 
 231         ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
 232         ASSERT(ill != NULL && ill->ill_isv6);
 233 
 234         err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
 235             &nce);
 236         if (err != 0)
 237                 return (err);
 238         ASSERT(newnce != NULL);
 239         *newnce = nce;
 240         return (err);
 241 }
 242 
 243 /*
 244  * Post-processing routine to be executed after nce_add_v6(). This function
 245  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
 246  * and must be called without any locks held.
 247  */
 248 int
 249 nce_add_v6_postprocess(nce_t *nce)
 250 {
 251         ncec_t          *ncec = nce->nce_common;
 252         boolean_t       dropped = B_FALSE;
 253         uchar_t         *hw_addr = ncec->ncec_lladdr;
 254         uint_t          hw_addr_len = ncec->ncec_lladdr_length;
 255         ill_t           *ill = ncec->ncec_ill;
 256         int             err = 0;
 257         uint16_t        flags = ncec->ncec_flags;
 258         ip_stack_t      *ipst = ill->ill_ipst;
 259         boolean_t       trigger_fastpath = B_TRUE;
 260 
 261         /*
 262          * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
 263          * we call nce_fastpath as soon as the ncec is resolved in nce_process.
 264          * We call nce_fastpath from nce_update if the link layer address of
 265          * the peer changes from nce_update
 266          */
 267         if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
 268             (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
 269                 trigger_fastpath = B_FALSE;
 270 
 271         if (trigger_fastpath)
 272                 nce_fastpath_trigger(nce);
 273         if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
 274                 ill_t *hwaddr_ill;
 275                 /*
 276                  * Unicast entry that needs DAD.
 277                  */
 278                 if (IS_IPMP(ill)) {
 279                         hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
 280                             hw_addr, hw_addr_len);
 281                 } else {
 282                         hwaddr_ill = ill;
 283                 }
 284                 nce_dad(ncec, hwaddr_ill, B_TRUE);
 285                 err = EINPROGRESS;
 286         } else if (flags & NCE_F_UNSOL_ADV) {
 287                 /*
 288                  * We account for the transmit below by assigning one
 289                  * less than the ndd variable. Subsequent decrements
 290                  * are done in nce_timer.
 291                  */
 292                 mutex_enter(&ncec->ncec_lock);
 293                 ncec->ncec_unsolicit_count =
 294                     ipst->ips_ip_ndp_unsolicit_count - 1;
 295                 mutex_exit(&ncec->ncec_lock);
 296                 dropped = ndp_xmit(ill,
 297                     ND_NEIGHBOR_ADVERT,
 298                     hw_addr,
 299                     hw_addr_len,
 300                     &ncec->ncec_addr,    /* Source and target of the adv */
 301                     &ipv6_all_hosts_mcast, /* Destination of the packet */
 302                     nce_advert_flags(ncec));
 303                 mutex_enter(&ncec->ncec_lock);
 304                 if (dropped)
 305                         ncec->ncec_unsolicit_count++;
 306                 else
 307                         ncec->ncec_last_time_defended = ddi_get_lbolt();
 308                 if (ncec->ncec_unsolicit_count != 0) {
 309                         nce_start_timer(ncec,
 310                             ipst->ips_ip_ndp_unsolicit_interval);
 311                 }
 312                 mutex_exit(&ncec->ncec_lock);
 313         }
 314         return (err);
 315 }
 316 
 317 /*
 318  * Atomically lookup and add (if needed) Neighbor Cache information for
 319  * an address.
 320  *
 321  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
 322  * are always added pointing at the ipmp_ill. Thus, when the ill passed
 323  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 324  * entries will be created, both pointing at the same ncec_t. The nce_t
 325  * entries will have their nce_ill set to the ipmp_ill and the under_ill
 326  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 327  * Local addresses are always created on the ill passed to nce_add_v6.
 328  */
 329 int
 330 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
 331     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
 332 {
 333         int             err = 0;
 334         ip_stack_t      *ipst = ill->ill_ipst;
 335         nce_t           *nce, *upper_nce = NULL;
 336         ill_t           *in_ill = ill;
 337         boolean_t       need_ill_refrele = B_FALSE;
 338 
 339         if (flags & NCE_F_MCAST) {
 340                 /*
 341                  * hw_addr will be figured out in nce_set_multicast_v6;
 342                  * caller has to select the cast_ill
 343                  */
 344                 ASSERT(hw_addr == NULL);
 345                 ASSERT(!IS_IPMP(ill));
 346                 err = nce_set_multicast_v6(ill, addr, flags, newnce);
 347                 return (err);
 348         }
 349         ASSERT(ill->ill_isv6);
 350         if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
 351                 ill = ipmp_ill_hold_ipmp_ill(ill);
 352                 if (ill == NULL)
 353                         return (ENXIO);
 354                 need_ill_refrele = B_TRUE;
 355         }
 356 
 357         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 358         nce = nce_lookup_addr(ill, addr);
 359         if (nce == NULL) {
 360                 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
 361                     &nce);
 362         } else {
 363                 err = EEXIST;
 364         }
 365         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 366         if (err == 0)
 367                 err = nce_add_v6_postprocess(nce);
 368         if (in_ill != ill && nce != NULL) {
 369                 nce_t *under_nce = NULL;
 370 
 371                 /*
 372                  * in_ill was the under_ill. Try to create the under_nce.
 373                  * Hold the ill_g_lock to prevent changes to group membership
 374                  * until we are done.
 375                  */
 376                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 377                 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
 378                         DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
 379                             ill_t *, ill);
 380                         rw_exit(&ipst->ips_ill_g_lock);
 381                         err = ENXIO;
 382                         nce_refrele(nce);
 383                         nce = NULL;
 384                         goto bail;
 385                 }
 386                 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
 387                 if (under_nce == NULL) {
 388                         rw_exit(&ipst->ips_ill_g_lock);
 389                         err = EINVAL;
 390                         nce_refrele(nce);
 391                         nce = NULL;
 392                         goto bail;
 393                 }
 394                 rw_exit(&ipst->ips_ill_g_lock);
 395                 upper_nce = nce;
 396                 nce = under_nce; /* will be returned to caller */
 397                 if (NCE_ISREACHABLE(nce->nce_common))
 398                         nce_fastpath_trigger(under_nce);
 399         }
 400         /* nce_refrele is deferred until the lock is dropped  */
 401         if (nce != NULL) {
 402                 if (newnce != NULL)
 403                         *newnce = nce;
 404                 else
 405                         nce_refrele(nce);
 406         }
 407 bail:
 408         if (upper_nce != NULL)
 409                 nce_refrele(upper_nce);
 410         if (need_ill_refrele)
 411                 ill_refrele(ill);
 412         return (err);
 413 }
 414 
 415 /*
 416  * Remove all the CONDEMNED nces from the appropriate hash table.
 417  * We create a private list of NCEs, these may have ires pointing
 418  * to them, so the list will be passed through to clean up dependent
 419  * ires and only then we can do ncec_refrele() which can make NCE inactive.
 420  */
 421 static void
 422 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
 423 {
 424         ncec_t *ncec1;
 425         ncec_t **ptpn;
 426 
 427         ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
 428         ASSERT(ndp->ndp_g_walker == 0);
 429         for (; ncec; ncec = ncec1) {
 430                 ncec1 = ncec->ncec_next;
 431                 mutex_enter(&ncec->ncec_lock);
 432                 if (NCE_ISCONDEMNED(ncec)) {
 433                         ptpn = ncec->ncec_ptpn;
 434                         ncec1 = ncec->ncec_next;
 435                         if (ncec1 != NULL)
 436                                 ncec1->ncec_ptpn = ptpn;
 437                         *ptpn = ncec1;
 438                         ncec->ncec_ptpn = NULL;
 439                         ncec->ncec_next = NULL;
 440                         ncec->ncec_next = *free_nce_list;
 441                         *free_nce_list = ncec;
 442                 }
 443                 mutex_exit(&ncec->ncec_lock);
 444         }
 445 }
 446 
/*
 * Delete an ncec. The caller must hold a reference on the ncec.
 *
 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
 *    will return this NCE. Also no new timeouts will
 *    be started (See nce_restart_timer).
 * 2. Cancel any currently running timeouts.
 * 3. If there is an ndp walker, return. The walker will do the cleanup.
 *    This ensures that walkers see a consistent list of NCEs while walking.
 * 4. Otherwise remove the NCE from the list of NCEs
 *
 * If the entry is already CONDEMNED when we get here, another thread is
 * deleting it and this call returns without doing anything.
 */
void
ncec_delete(ncec_t *ncec)
{
        ncec_t  **ptpn;
        ncec_t  *ncec1;
        int     ipversion = ncec->ncec_ipversion;
        ndp_g_t *ndp;
        ip_stack_t      *ipst = ncec->ncec_ipst;

        /* Pick the per-stack table (and its global lock) for this version */
        if (ipversion == IPV4_VERSION)
                ndp = ipst->ips_ndp4;
        else
                ndp = ipst->ips_ndp6;

        /* Serialize deletes */
        mutex_enter(&ncec->ncec_lock);
        if (NCE_ISCONDEMNED(ncec)) {
                /* Some other thread is doing the delete */
                mutex_exit(&ncec->ncec_lock);
                return;
        }
        /*
         * Caller has a refhold. Also 1 ref for being in the list. Thus
         * refcnt has to be >= 2
         */
        ASSERT(ncec->ncec_refcnt >= 2);
        ncec->ncec_flags |= NCE_F_CONDEMNED;
        mutex_exit(&ncec->ncec_lock);

        /*
         * Count how many condemned nces for kmem_cache callback; the
         * matching decrement is done in ncec_inactive().
         */
        atomic_inc_32(&ipst->ips_num_nce_condemned);
        nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);

        /* Complete any waiting callbacks */
        ncec_cb_dispatch(ncec);

        /*
         * Cancel any running timer. Timeout can't be restarted
         * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
         * Passing invalid timeout id is fine.
         */
        if (ncec->ncec_timeout_id != 0) {
                (void) untimeout(ncec->ncec_timeout_id);
                ncec->ncec_timeout_id = 0;
        }

        mutex_enter(&ndp->ndp_g_lock);
        if (ncec->ncec_ptpn == NULL) {
                /*
                 * The last ndp walker has already removed this ncec from
                 * the list after we marked the ncec CONDEMNED and before
                 * we grabbed the global lock.
                 */
                mutex_exit(&ndp->ndp_g_lock);
                return;
        }
        if (ndp->ndp_g_walker > 0) {
                /*
                 * Can't unlink. The walker will clean up
                 */
                ndp->ndp_g_walker_cleanup = B_TRUE;
                mutex_exit(&ndp->ndp_g_lock);
                return;
        }

        /*
         * Now remove the ncec from the list. nce_restart_timer won't restart
         * the timer since it is marked CONDEMNED.
         */
        ptpn = ncec->ncec_ptpn;
        ncec1 = ncec->ncec_next;
        if (ncec1 != NULL)
                ncec1->ncec_ptpn = ptpn;
        *ptpn = ncec1;
        ncec->ncec_ptpn = NULL;
        ncec->ncec_next = NULL;
        mutex_exit(&ndp->ndp_g_lock);

        /*
         * Removed from ncec_ptpn/ncec_next list: drop the reference that
         * the list held on the ncec.
         */
        ncec_refrele_notr(ncec);
}
 537 
 538 void
 539 ncec_inactive(ncec_t *ncec)
 540 {
 541         mblk_t          **mpp;
 542         ill_t           *ill = ncec->ncec_ill;
 543         ip_stack_t      *ipst = ncec->ncec_ipst;
 544 
 545         ASSERT(ncec->ncec_refcnt == 0);
 546         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
 547 
 548         /* Count how many condemned nces for kmem_cache callback */
 549         if (NCE_ISCONDEMNED(ncec))
 550                 atomic_add_32(&ipst->ips_num_nce_condemned, -1);
 551 
 552         /* Free all allocated messages */
 553         mpp = &ncec->ncec_qd_mp;
 554         while (*mpp != NULL) {
 555                 mblk_t  *mp;
 556 
 557                 mp = *mpp;
 558                 *mpp = mp->b_next;
 559 
 560                 inet_freemsg(mp);
 561         }
 562         /*
 563          * must have been cleaned up in ncec_delete
 564          */
 565         ASSERT(list_is_empty(&ncec->ncec_cb));
 566         list_destroy(&ncec->ncec_cb);
 567         /*
 568          * free the ncec_lladdr if one was allocated in nce_add_common()
 569          */
 570         if (ncec->ncec_lladdr_length > 0)
 571                 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
 572 
 573 #ifdef DEBUG
 574         ncec_trace_cleanup(ncec);
 575 #endif
 576 
 577         mutex_enter(&ill->ill_lock);
 578         DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
 579             (char *), "ncec", (void *), ncec);
 580         ill->ill_ncec_cnt--;
 581         ncec->ncec_ill = NULL;
 582         /*
 583          * If the number of ncec's associated with this ill have dropped
 584          * to zero, check whether we need to restart any operation that
 585          * is waiting for this to happen.
 586          */
 587         if (ILL_DOWN_OK(ill)) {
 588                 /* ipif_ill_refrele_tail drops the ill_lock */
 589                 ipif_ill_refrele_tail(ill);
 590         } else {
 591                 mutex_exit(&ill->ill_lock);
 592         }
 593 
 594         mutex_destroy(&ncec->ncec_lock);
 595         kmem_cache_free(ncec_cache, ncec);
 596 }
 597 
 598 /*
 599  * ncec_walk routine.  Delete the ncec if it is associated with the ill
 600  * that is going away.  Always called as a writer.
 601  */
 602 void
 603 ncec_delete_per_ill(ncec_t *ncec, void *arg)
 604 {
 605         if ((ncec != NULL) && ncec->ncec_ill == arg) {
 606                 ncec_delete(ncec);
 607         }
 608 }
 609 
 610 /*
 611  * Neighbor Cache cleanup logic for a list of ncec_t entries.
 612  */
 613 static void
 614 nce_cleanup_list(ncec_t *ncec)
 615 {
 616         ncec_t *ncec_next;
 617 
 618         ASSERT(ncec != NULL);
 619         while (ncec != NULL) {
 620                 ncec_next = ncec->ncec_next;
 621                 ncec->ncec_next = NULL;
 622 
 623                 /*
 624                  * It is possible for the last ndp walker (this thread)
 625                  * to come here after ncec_delete has marked the ncec CONDEMNED
 626                  * and before it has removed the ncec from the fastpath list
 627                  * or called untimeout. So we need to do it here. It is safe
 628                  * for both ncec_delete and this thread to do it twice or
 629                  * even simultaneously since each of the threads has a
 630                  * reference on the ncec.
 631                  */
 632                 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
 633                 /*
 634                  * Cancel any running timer. Timeout can't be restarted
 635                  * since CONDEMNED is set. The ncec_lock can't be
 636                  * held across untimeout though passing invalid timeout
 637                  * id is fine.
 638                  */
 639                 if (ncec->ncec_timeout_id != 0) {
 640                         (void) untimeout(ncec->ncec_timeout_id);
 641                         ncec->ncec_timeout_id = 0;
 642                 }
 643                 /* Removed from ncec_ptpn/ncec_next list */
 644                 ncec_refrele_notr(ncec);
 645                 ncec = ncec_next;
 646         }
 647 }
 648 
 649 /*
 650  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
 651  */
 652 boolean_t
 653 nce_restart_dad(ncec_t *ncec)
 654 {
 655         boolean_t started;
 656         ill_t *ill, *hwaddr_ill;
 657 
 658         if (ncec == NULL)
 659                 return (B_FALSE);
 660         ill = ncec->ncec_ill;
 661         mutex_enter(&ncec->ncec_lock);
 662         if (ncec->ncec_state == ND_PROBE) {
 663                 mutex_exit(&ncec->ncec_lock);
 664                 started = B_TRUE;
 665         } else if (ncec->ncec_state == ND_REACHABLE) {
 666                 ASSERT(ncec->ncec_lladdr != NULL);
 667                 ncec->ncec_state = ND_PROBE;
 668                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
 669                 /*
 670                  * Slight cheat here: we don't use the initial probe delay
 671                  * for IPv4 in this obscure case.
 672                  */
 673                 mutex_exit(&ncec->ncec_lock);
 674                 if (IS_IPMP(ill)) {
 675                         hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
 676                             ncec->ncec_lladdr, ncec->ncec_lladdr_length);
 677                 } else {
 678                         hwaddr_ill = ill;
 679                 }
 680                 nce_dad(ncec, hwaddr_ill, B_TRUE);
 681                 started = B_TRUE;
 682         } else {
 683                 mutex_exit(&ncec->ncec_lock);
 684                 started = B_FALSE;
 685         }
 686         return (started);
 687 }
 688 
 689 /*
 690  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
 691  * If one is found, the refcnt on the ncec will be incremented.
 692  */
 693 ncec_t *
 694 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
 695 {
 696         ncec_t          *ncec;
 697         ip_stack_t      *ipst = ill->ill_ipst;
 698 
 699         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 700         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 701 
 702         /* Get head of v6 hash table */
 703         ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
 704         ncec = ncec_lookup_illgrp(ill, addr, ncec);
 705         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 706         rw_exit(&ipst->ips_ill_g_lock);
 707         return (ncec);
 708 }
 709 /*
 710  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
 711  * If one is found, the refcnt on the ncec will be incremented.
 712  */
 713 ncec_t *
 714 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
 715 {
 716         ncec_t  *ncec = NULL;
 717         in6_addr_t addr6;
 718         ip_stack_t *ipst = ill->ill_ipst;
 719 
 720         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 721         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
 722 
 723         /* Get head of v4 hash table */
 724         ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
 725         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
 726         ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
 727         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
 728         rw_exit(&ipst->ips_ill_g_lock);
 729         return (ncec);
 730 }
 731 
 732 /*
 733  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
 734  * If an ncec is found, increment the hold count on that ncec.
 735  * The caller passes in the start of the appropriate hash table, and must
 736  * be holding the appropriate global lock (ndp_g_lock). In addition, since
 737  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
 738  * must be held as reader.
 739  *
 740  * This function always matches across the ipmp group.
 741  */
 742 ncec_t *
 743 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
 744 {
 745         ndp_g_t         *ndp;
 746         ip_stack_t      *ipst = ill->ill_ipst;
 747 
 748         if (ill->ill_isv6)
 749                 ndp = ipst->ips_ndp6;
 750         else
 751                 ndp = ipst->ips_ndp4;
 752 
 753         ASSERT(ill != NULL);
 754         ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
 755         if (IN6_IS_ADDR_UNSPECIFIED(addr))
 756                 return (NULL);
 757         for (; ncec != NULL; ncec = ncec->ncec_next) {
 758                 if (ncec->ncec_ill == ill ||
 759                     IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
 760                         if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
 761                                 mutex_enter(&ncec->ncec_lock);
 762                                 if (!NCE_ISCONDEMNED(ncec)) {
 763                                         ncec_refhold_locked(ncec);
 764                                         mutex_exit(&ncec->ncec_lock);
 765                                         break;
 766                                 }
 767                                 mutex_exit(&ncec->ncec_lock);
 768                         }
 769                 }
 770         }
 771         return (ncec);
 772 }
 773 
 774 /*
 775  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 776  * entries for ill only, i.e., when ill is part of an ipmp group,
 777  * nce_lookup_v4 will never try to match across the group.
 778  */
 779 nce_t *
 780 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
 781 {
 782         nce_t *nce;
 783         in6_addr_t addr6;
 784         ip_stack_t *ipst = ill->ill_ipst;
 785 
 786         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
 787         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
 788         nce = nce_lookup_addr(ill, &addr6);
 789         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
 790         return (nce);
 791 }
 792 
 793 /*
 794  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 795  * entries for ill only, i.e., when ill is part of an ipmp group,
 796  * nce_lookup_v6 will never try to match across the group.
 797  */
 798 nce_t *
 799 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
 800 {
 801         nce_t *nce;
 802         ip_stack_t *ipst = ill->ill_ipst;
 803 
 804         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 805         nce = nce_lookup_addr(ill, addr6);
 806         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 807         return (nce);
 808 }
 809 
 810 static nce_t *
 811 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
 812 {
 813         nce_t *nce;
 814 
 815         ASSERT(ill != NULL);
 816 #ifdef DEBUG
 817         if (ill->ill_isv6)
 818                 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
 819         else
 820                 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
 821 #endif
 822         mutex_enter(&ill->ill_lock);
 823         nce = nce_lookup(ill, addr);
 824         mutex_exit(&ill->ill_lock);
 825         return (nce);
 826 }
 827 
 828 
 829 /*
 830  * Router turned to host.  We need to make sure that cached copies of the ncec
 831  * are not used for forwarding packets if they were derived from the default
 832  * route, and that the default route itself is removed, as  required by
 833  * section 7.2.5 of RFC 2461.
 834  *
 835  * Note that the ncec itself probably has valid link-layer information for the
 836  * nexthop, so that there is no reason to delete the ncec, as long as the
 837  * ISROUTER flag is turned off.
 838  */
 839 static void
 840 ncec_router_to_host(ncec_t *ncec)
 841 {
 842         ire_t           *ire;
 843         ip_stack_t      *ipst = ncec->ncec_ipst;
 844 
 845         mutex_enter(&ncec->ncec_lock);
 846         ncec->ncec_flags &= ~NCE_F_ISROUTER;
 847         mutex_exit(&ncec->ncec_lock);
 848 
 849         ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
 850             &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
 851             MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
 852         if (ire != NULL) {
 853                 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
 854                 ire_delete(ire);
 855                 ire_refrele(ire);
 856         }
 857 }
 858 
 859 /*
 860  * Process passed in parameters either from an incoming packet or via
 861  * user ioctl.
 862  */
 863 void
 864 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
 865 {
 866         ill_t   *ill = ncec->ncec_ill;
 867         uint32_t hw_addr_len = ill->ill_phys_addr_length;
 868         boolean_t ll_updated = B_FALSE;
 869         boolean_t ll_changed;
 870         nce_t   *nce;
 871 
 872         ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
 873         /*
 874          * No updates of link layer address or the neighbor state is
 875          * allowed, when the cache is in NONUD state.  This still
 876          * allows for responding to reachability solicitation.
 877          */
 878         mutex_enter(&ncec->ncec_lock);
 879         if (ncec->ncec_state == ND_INCOMPLETE) {
 880                 if (hw_addr == NULL) {
 881                         mutex_exit(&ncec->ncec_lock);
 882                         return;
 883                 }
 884                 nce_set_ll(ncec, hw_addr);
 885                 /*
 886                  * Update ncec state and send the queued packets
 887                  * back to ip this time ire will be added.
 888                  */
 889                 if (flag & ND_NA_FLAG_SOLICITED) {
 890                         nce_update(ncec, ND_REACHABLE, NULL);
 891                 } else {
 892                         nce_update(ncec, ND_STALE, NULL);
 893                 }
 894                 mutex_exit(&ncec->ncec_lock);
 895                 nce = nce_fastpath(ncec, B_TRUE, NULL);
 896                 nce_resolv_ok(ncec);
 897                 if (nce != NULL)
 898                         nce_refrele(nce);
 899                 return;
 900         }
 901         ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
 902         if (!is_adv) {
 903                 /* If this is a SOLICITATION request only */
 904                 if (ll_changed)
 905                         nce_update(ncec, ND_STALE, hw_addr);
 906                 mutex_exit(&ncec->ncec_lock);
 907                 ncec_cb_dispatch(ncec);
 908                 return;
 909         }
 910         if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
 911                 /* If in any other state than REACHABLE, ignore */
 912                 if (ncec->ncec_state == ND_REACHABLE) {
 913                         nce_update(ncec, ND_STALE, NULL);
 914                 }
 915                 mutex_exit(&ncec->ncec_lock);
 916                 ncec_cb_dispatch(ncec);
 917                 return;
 918         } else {
 919                 if (ll_changed) {
 920                         nce_update(ncec, ND_UNCHANGED, hw_addr);
 921                         ll_updated = B_TRUE;
 922                 }
 923                 if (flag & ND_NA_FLAG_SOLICITED) {
 924                         nce_update(ncec, ND_REACHABLE, NULL);
 925                 } else {
 926                         if (ll_updated) {
 927                                 nce_update(ncec, ND_STALE, NULL);
 928                         }
 929                 }
 930                 mutex_exit(&ncec->ncec_lock);
 931                 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
 932                     NCE_F_ISROUTER)) {
 933                         ncec_router_to_host(ncec);
 934                 } else {
 935                         ncec_cb_dispatch(ncec);
 936                 }
 937         }
 938 }
 939 
 940 /*
 941  * Pass arg1 to the cbf supplied, along with each ncec in existence.
 942  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
 943  * walking the hash list.
 944  */
 945 void
 946 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
 947     void *arg1, boolean_t trace)
 948 {
 949         ncec_t  *ncec;
 950         ncec_t  *ncec1;
 951         ncec_t  **ncep;
 952         ncec_t  *free_nce_list = NULL;
 953 
 954         mutex_enter(&ndp->ndp_g_lock);
 955         /* Prevent ncec_delete from unlink and free of NCE */
 956         ndp->ndp_g_walker++;
 957         mutex_exit(&ndp->ndp_g_lock);
 958         for (ncep = ndp->nce_hash_tbl;
 959             ncep < A_END(ndp->nce_hash_tbl); ncep++) {
 960                 for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
 961                         ncec1 = ncec->ncec_next;
 962                         if (ill == NULL || ncec->ncec_ill == ill) {
 963                                 if (trace) {
 964                                         ncec_refhold(ncec);
 965                                         (*cbf)(ncec, arg1);
 966                                         ncec_refrele(ncec);
 967                                 } else {
 968                                         ncec_refhold_notr(ncec);
 969                                         (*cbf)(ncec, arg1);
 970                                         ncec_refrele_notr(ncec);
 971                                 }
 972                         }
 973                 }
 974         }
 975         mutex_enter(&ndp->ndp_g_lock);
 976         ndp->ndp_g_walker--;
 977         if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
 978                 /* Time to delete condemned entries */
 979                 for (ncep = ndp->nce_hash_tbl;
 980                     ncep < A_END(ndp->nce_hash_tbl); ncep++) {
 981                         ncec = *ncep;
 982                         if (ncec != NULL) {
 983                                 nce_remove(ndp, ncec, &free_nce_list);
 984                         }
 985                 }
 986                 ndp->ndp_g_walker_cleanup = B_FALSE;
 987         }
 988 
 989         mutex_exit(&ndp->ndp_g_lock);
 990 
 991         if (free_nce_list != NULL) {
 992                 nce_cleanup_list(free_nce_list);
 993         }
 994 }
 995 
 996 /*
 997  * Walk everything.
 998  * Note that ill can be NULL hence can't derive the ipst from it.
 999  */
1000 void
1001 ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
1002 {
1003         ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1004         ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1005 }
1006 
1007 /*
1008  * For each interface an entry is added for the unspecified multicast group.
1009  * Here that mapping is used to form the multicast cache entry for a particular
1010  * multicast destination.
1011  */
1012 static int
1013 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1014     uint16_t flags, nce_t **newnce)
1015 {
1016         uchar_t         *hw_addr;
1017         int             err = 0;
1018         ip_stack_t      *ipst = ill->ill_ipst;
1019         nce_t           *nce;
1020 
1021         ASSERT(ill != NULL);
1022         ASSERT(ill->ill_isv6);
1023         ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1024 
1025         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1026         nce = nce_lookup_addr(ill, dst);
1027         if (nce != NULL) {
1028                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1029                 goto done;
1030         }
1031         if (ill->ill_net_type == IRE_IF_RESOLVER) {
1032                 /*
1033                  * For IRE_IF_RESOLVER a hardware mapping can be
1034                  * generated.
1035                  */
1036                 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1037                 if (hw_addr == NULL) {
1038                         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1039                         return (ENOMEM);
1040                 }
1041                 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1042         } else {
1043                 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1044                 hw_addr = NULL;
1045         }
1046         ASSERT((flags & NCE_F_MCAST) != 0);
1047         ASSERT((flags & NCE_F_NONUD) != 0);
1048         /* nce_state will be computed by nce_add_common() */
1049         err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1050             ND_UNCHANGED, &nce);
1051         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1052         if (err == 0)
1053                 err = nce_add_v6_postprocess(nce);
1054         if (hw_addr != NULL)
1055                 kmem_free(hw_addr, ill->ill_nd_lla_len);
1056         if (err != 0) {
1057                 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1058                 return (err);
1059         }
1060 done:
1061         ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1062         if (newnce != NULL)
1063                 *newnce = nce;
1064         else
1065                 nce_refrele(nce);
1066         return (0);
1067 }
1068 
1069 /*
1070  * Return the link layer address, and any flags of a ncec.
1071  */
1072 int
1073 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1074 {
1075         ncec_t          *ncec;
1076         in6_addr_t      *addr;
1077         sin6_t          *sin6;
1078 
1079         ASSERT(ill != NULL && ill->ill_isv6);
1080         sin6 = (sin6_t *)&lnr->lnr_addr;
1081         addr =  &sin6->sin6_addr;
1082 
1083         /*
1084          * NOTE: if the ill is an IPMP interface, then match against the whole
1085          * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1086          * addresses for the data addresses on an IPMP interface even though
1087          * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1088          */
1089         ncec = ncec_lookup_illgrp_v6(ill, addr);
1090         if (ncec == NULL)
1091                 return (ESRCH);
1092         /* If no link layer address is available yet, return ESRCH */
1093         if (!NCE_ISREACHABLE(ncec)) {
1094                 ncec_refrele(ncec);
1095                 return (ESRCH);
1096         }
1097         lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1098         bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1099             lnr->lnr_hdw_len);
1100         if (ncec->ncec_flags & NCE_F_ISROUTER)
1101                 lnr->lnr_flags = NDF_ISROUTER_ON;
1102         if (ncec->ncec_flags & NCE_F_ANYCAST)
1103                 lnr->lnr_flags |= NDF_ANYCAST_ON;
1104         if (ncec->ncec_flags & NCE_F_STATIC)
1105                 lnr->lnr_flags |= NDF_STATIC;
1106         ncec_refrele(ncec);
1107         return (0);
1108 }
1109 
1110 /*
1111  * Finish setting up the Enable/Disable multicast for the driver.
1112  */
1113 mblk_t *
1114 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1115     uint32_t hw_addr_offset, mblk_t *mp)
1116 {
1117         uchar_t         *hw_addr;
1118         ipaddr_t        v4group;
1119         uchar_t         *addr;
1120 
1121         ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1122         if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1123                 IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1124 
1125                 ASSERT(CLASSD(v4group));
1126                 ASSERT(!(ill->ill_isv6));
1127 
1128                 addr = (uchar_t *)&v4group;
1129         } else {
1130                 ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1131                 ASSERT(ill->ill_isv6);
1132 
1133                 addr = (uchar_t *)v6group;
1134         }
1135         hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1136         if (hw_addr == NULL) {
1137                 ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1138                 freemsg(mp);
1139                 return (NULL);
1140         }
1141 
1142         ip_mcast_mapping(ill, addr, hw_addr);
1143         return (mp);
1144 }
1145 
1146 void
1147 ip_ndp_resolve(ncec_t *ncec)
1148 {
1149         in_addr_t       sender4 = INADDR_ANY;
1150         in6_addr_t      sender6 = ipv6_all_zeros;
1151         ill_t           *src_ill;
1152         uint32_t        ms;
1153 
1154         src_ill = nce_resolve_src(ncec, &sender6);
1155         if (src_ill == NULL) {
1156                 /* Make sure we try again later */
1157                 ms = ncec->ncec_ill->ill_reachable_retrans_time;
1158                 nce_restart_timer(ncec, (clock_t)ms);
1159                 return;
1160         }
1161         if (ncec->ncec_ipversion == IPV4_VERSION)
1162                 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1163         mutex_enter(&ncec->ncec_lock);
1164         if (ncec->ncec_ipversion == IPV6_VERSION)
1165                 ms = ndp_solicit(ncec, sender6, src_ill);
1166         else
1167                 ms = arp_request(ncec, sender4, src_ill);
1168         mutex_exit(&ncec->ncec_lock);
1169         if (ms == 0) {
1170                 if (ncec->ncec_state != ND_REACHABLE) {
1171                         if (ncec->ncec_ipversion == IPV6_VERSION)
1172                                 ndp_resolv_failed(ncec);
1173                         else
1174                                 arp_resolv_failed(ncec);
1175                         ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1176                         nce_make_unreachable(ncec);
1177                         ncec_delete(ncec);
1178                 }
1179         } else {
1180                 nce_restart_timer(ncec, (clock_t)ms);
1181         }
1182 done:
1183         ill_refrele(src_ill);
1184 }
1185 
1186 /*
1187  * Send an IPv6 neighbor solicitation.
1188  * Returns number of milliseconds after which we should either rexmit or abort.
1189  * Return of zero means we should abort.
1190  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1191  * The optional source address is used as a hint to ndp_solicit for
1192  * which source to use in the packet.
1193  *
1194  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1195  * the packet.
1196  */
1197 uint32_t
1198 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1199 {
1200         in6_addr_t      dst;
1201         boolean_t       dropped = B_FALSE;
1202 
1203         ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1204         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1205 
1206         if (ncec->ncec_rcnt == 0)
1207                 return (0);
1208 
1209         dst = ncec->ncec_addr;
1210         ncec->ncec_rcnt--;
1211         mutex_exit(&ncec->ncec_lock);
1212         dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1213             ill->ill_phys_addr_length, &src, &dst, 0);
1214         mutex_enter(&ncec->ncec_lock);
1215         if (dropped)
1216                 ncec->ncec_rcnt++;
1217         return (ncec->ncec_ill->ill_reachable_retrans_time);
1218 }
1219 
1220 /*
1221  * Attempt to recover an address on an interface that's been marked as a
1222  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1223  * no easy way to just probe the address and have the right thing happen if
1224  * it's no longer in use.  Instead, we just bring it up normally and allow the
1225  * regular interface start-up logic to probe for a remaining duplicate and take
1226  * us back down if necessary.
1227  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1228  * ip_ndp_excl.
1229  */
1230 /* ARGSUSED */
1231 void
1232 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1233 {
1234         ill_t   *ill = rq->q_ptr;
1235         ipif_t  *ipif;
1236         in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1237         in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1238         boolean_t addr_equal;
1239 
1240         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1241                 /*
1242                  * We do not support recovery of proxy ARP'd interfaces,
1243                  * because the system lacks a complete proxy ARP mechanism.
1244                  */
1245                 if (ill->ill_isv6) {
1246                         addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1247                             addr6);
1248                 } else {
1249                         addr_equal = (ipif->ipif_lcl_addr == *addr4);
1250                 }
1251 
1252                 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1253                         continue;
1254 
1255                 /*
1256                  * If we have already recovered or if the interface is going
1257                  * away, then ignore.
1258                  */
1259                 mutex_enter(&ill->ill_lock);
1260                 if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1261                     (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1262                         mutex_exit(&ill->ill_lock);
1263                         continue;
1264                 }
1265 
1266                 ipif->ipif_flags &= ~IPIF_DUPLICATE;
1267                 ill->ill_ipif_dup_count--;
1268                 mutex_exit(&ill->ill_lock);
1269                 ipif->ipif_was_dup = B_TRUE;
1270 
1271                 if (ill->ill_isv6) {
1272                         VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1273                         (void) ipif_up_done_v6(ipif);
1274                 } else {
1275                         VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1276                             EINPROGRESS);
1277                         (void) ipif_up_done(ipif);
1278                 }
1279         }
1280         freeb(mp);
1281 }
1282 
1283 /*
1284  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1285  * As long as someone else holds the address, the interface will stay down.
1286  * When that conflict goes away, the interface is brought back up.  This is
1287  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1288  * server will recover from a failure.
1289  *
1290  * For DHCP and temporary addresses, recovery is not done in the kernel.
1291  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1292  *
1293  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1294  */
1295 void
1296 ipif_dup_recovery(void *arg)
1297 {
1298         ipif_t *ipif = arg;
1299 
1300         ipif->ipif_recovery_id = 0;
1301         if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1302                 return;
1303 
1304         /*
1305          * No lock, because this is just an optimization.
1306          */
1307         if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1308                 return;
1309 
1310         /* If the link is down, we'll retry this later */
1311         if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1312                 return;
1313 
1314         ipif_do_recovery(ipif);
1315 }
1316 
1317 /*
1318  * Perform interface recovery by forcing the duplicate interfaces up and
1319  * allowing the system to determine which ones should stay up.
1320  *
1321  * Called both by recovery timer expiry and link-up notification.
1322  */
1323 void
1324 ipif_do_recovery(ipif_t *ipif)
1325 {
1326         ill_t *ill = ipif->ipif_ill;
1327         mblk_t *mp;
1328         ip_stack_t *ipst = ill->ill_ipst;
1329         size_t mp_size;
1330 
1331         if (ipif->ipif_isv6)
1332                 mp_size = sizeof (ipif->ipif_v6lcl_addr);
1333         else
1334                 mp_size = sizeof (ipif->ipif_lcl_addr);
1335         mp = allocb(mp_size, BPRI_MED);
1336         if (mp == NULL) {
1337                 mutex_enter(&ill->ill_lock);
1338                 if (ipst->ips_ip_dup_recovery > 0 &&
1339                     ipif->ipif_recovery_id == 0 &&
1340                     !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1341                         ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1342                             ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1343                 }
1344                 mutex_exit(&ill->ill_lock);
1345         } else {
1346                 /*
1347                  * A recovery timer may still be running if we got here from
1348                  * ill_restart_dad(); cancel that timer.
1349                  */
1350                 if (ipif->ipif_recovery_id != 0)
1351                         (void) untimeout(ipif->ipif_recovery_id);
1352                 ipif->ipif_recovery_id = 0;
1353 
1354                 if (ipif->ipif_isv6) {
1355                         bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1356                             sizeof (ipif->ipif_v6lcl_addr));
1357                 } else  {
1358                         bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1359                             sizeof (ipif->ipif_lcl_addr));
1360                 }
1361                 ill_refhold(ill);
1362                 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1363                     B_FALSE);
1364         }
1365 }
1366 
1367 /*
1368  * Find the MAC and IP addresses in an NA/NS message.
1369  */
1370 static void
1371 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1372     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1373 {
1374         icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1375         nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1376         uchar_t *addr;
1377         int alen;
1378 
1379         /* icmp_inbound_v6 ensures this */
1380         ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1381 
1382         addr = ira->ira_l2src;
1383         alen = ill->ill_phys_addr_length;
1384         if (alen > 0) {
1385                 *haddr = addr;
1386                 *haddrlenp = alen;
1387         } else {
1388                 *haddr = NULL;
1389                 *haddrlenp = 0;
1390         }
1391 
1392         /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1393         *targp = ns->nd_ns_target;
1394 }
1395 
1396 /*
1397  * This is for exclusive changes due to NDP duplicate address detection
1398  * failure.
1399  */
1400 /* ARGSUSED */
1401 static void
1402 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1403 {
1404         ill_t   *ill = rq->q_ptr;
1405         ipif_t  *ipif;
1406         uchar_t *haddr;
1407         uint_t  haddrlen;
1408         ip_stack_t *ipst = ill->ill_ipst;
1409         in6_addr_t targ;
1410         ip_recv_attr_t iras;
1411         mblk_t  *attrmp;
1412 
1413         attrmp = mp;
1414         mp = mp->b_cont;
1415         attrmp->b_cont = NULL;
1416         if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1417                 /* The ill or ip_stack_t disappeared on us */
1418                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1419                 ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1420                 freemsg(mp);
1421                 ira_cleanup(&iras, B_TRUE);
1422                 return;
1423         }
1424 
1425         ASSERT(ill == iras.ira_rill);
1426 
1427         ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1428         if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1429                 /*
1430                  * Ignore conflicts generated by misbehaving switches that
1431                  * just reflect our own messages back to us.  For IPMP, we may
1432                  * see reflections across any ill in the illgrp.
1433                  *
1434                  * RFC2462 and revisions tried to detect both the case
1435                  * when a statically configured IPv6 address is a duplicate,
1436                  * and the case when the L2 address itself is a duplicate. The
1437                  * later is important because, with stateles address autoconf,
1438                  * if the L2 address is a duplicate, the resulting IPv6
1439                  * address(es) would also be duplicates. We rely on DAD of the
1440                  * IPv6 address itself to detect the latter case.
1441                  */
1442                 /* For an under ill_grp can change under lock */
1443                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1444                 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1445                     IS_UNDER_IPMP(ill) &&
1446                     ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1447                     haddrlen) != NULL) {
1448                         rw_exit(&ipst->ips_ill_g_lock);
1449                         goto ignore_conflict;
1450                 }
1451                 rw_exit(&ipst->ips_ill_g_lock);
1452         }
1453 
1454         /*
1455          * Look up the appropriate ipif.
1456          */
1457         ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1458         if (ipif == NULL)
1459                 goto ignore_conflict;
1460 
1461         /* Reload the ill to match the ipif */
1462         ill = ipif->ipif_ill;
1463 
1464         /* If it's already duplicate or ineligible, then don't do anything. */
1465         if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1466                 ipif_refrele(ipif);
1467                 goto ignore_conflict;
1468         }
1469 
1470         /*
1471          * If this is a failure during duplicate recovery, then don't
1472          * complain.  It may take a long time to recover.
1473          */
1474         if (!ipif->ipif_was_dup) {
1475                 char ibuf[LIFNAMSIZ];
1476                 char hbuf[MAC_STR_LEN];
1477                 char sbuf[INET6_ADDRSTRLEN];
1478 
1479                 ipif_get_name(ipif, ibuf, sizeof (ibuf));
1480                 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1481                     " disabled", ibuf,
1482                     inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1483                     mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1484         }
1485         mutex_enter(&ill->ill_lock);
1486         ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1487         ipif->ipif_flags |= IPIF_DUPLICATE;
1488         ill->ill_ipif_dup_count++;
1489         mutex_exit(&ill->ill_lock);
1490         (void) ipif_down(ipif, NULL, NULL);
1491         (void) ipif_down_tail(ipif);
1492         mutex_enter(&ill->ill_lock);
1493         if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1494             ill->ill_net_type == IRE_IF_RESOLVER &&
1495             !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1496             ipst->ips_ip_dup_recovery > 0) {
1497                 ASSERT(ipif->ipif_recovery_id == 0);
1498                 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1499                     ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1500         }
1501         mutex_exit(&ill->ill_lock);
1502         ipif_refrele(ipif);
1503 
1504 ignore_conflict:
1505         freemsg(mp);
1506         ira_cleanup(&iras, B_TRUE);
1507 }
1508 
1509 /*
1510  * Handle failure by tearing down the ipifs with the specified address.  Note
1511  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1512  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1513  * we start a timer on the ipif.
1514  * Caller has to free mp;
1515  */
1516 static void
1517 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1518 {
1519         const uchar_t   *haddr;
1520         ill_t           *ill = ira->ira_rill;
1521 
1522         /*
1523          * Ignore conflicts generated by misbehaving switches that just
1524          * reflect our own messages back to us.
1525          */
1526 
1527         /* icmp_inbound_v6 ensures this */
1528         ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1529         haddr = ira->ira_l2src;
1530         if (haddr != NULL &&
1531             bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1532                 return;
1533         }
1534 
1535         if ((mp = copymsg(mp)) != NULL) {
1536                 mblk_t  *attrmp;
1537 
1538                 attrmp = ip_recv_attr_to_mblk(ira);
1539                 if (attrmp == NULL) {
1540                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1541                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
1542                         freemsg(mp);
1543                 } else {
1544                         ASSERT(attrmp->b_cont == NULL);
1545                         attrmp->b_cont = mp;
1546                         mp = attrmp;
1547                         ill_refhold(ill);
1548                         qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1549                             B_FALSE);
1550                 }
1551         }
1552 }
1553 
1554 /*
1555  * Handle a discovered conflict: some other system is advertising that it owns
1556  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1557  * interface.
1558  *
1559  * Handles both IPv4 and IPv6
1560  */
1561 boolean_t
1562 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1563 {
1564         ipif_t          *ipif;
1565         clock_t         now;
1566         uint_t          maxdefense;
1567         uint_t          defs;
1568         ill_t           *ill = ira->ira_ill;
1569         ip_stack_t      *ipst = ill->ill_ipst;
1570         uint32_t        elapsed;
1571         boolean_t       isv6 = ill->ill_isv6;
1572         ipaddr_t        ncec_addr;
1573 
1574         if (isv6) {
1575                 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1576                     ipst);
1577         } else {
1578                 if (arp_no_defense) {
1579                         /*
1580                          * Yes, there is a conflict, but no, we do not
1581                          * defend ourself.
1582                          */
1583                         return (B_TRUE);
1584                 }
1585                 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1586                 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1587                     ipst);
1588         }
1589         if (ipif == NULL)
1590                 return (B_FALSE);
1591 
1592         /*
1593          * First, figure out if this address is disposable.
1594          */
1595         if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1596                 maxdefense = ipst->ips_ip_max_temp_defend;
1597         else
1598                 maxdefense = ipst->ips_ip_max_defend;
1599 
1600         /*
1601          * Now figure out how many times we've defended ourselves.  Ignore
1602          * defenses that happened long in the past.
1603          */
1604         now = ddi_get_lbolt();
1605         elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1606         mutex_enter(&ncec->ncec_lock);
1607         if ((defs = ncec->ncec_defense_count) > 0 &&
1608             elapsed > ipst->ips_ip_defend_interval) {
1609                 /*
1610                  * ip_defend_interval has elapsed.
1611                  * reset the defense count.
1612                  */
1613                 ncec->ncec_defense_count = defs = 0;
1614         }
1615         ncec->ncec_defense_count++;
1616         ncec->ncec_last_time_defended = now;
1617         mutex_exit(&ncec->ncec_lock);
1618         ipif_refrele(ipif);
1619 
1620         /*
1621          * If we've defended ourselves too many times already, then give up and
1622          * tear down the interface(s) using this address.
1623          * Otherwise, caller has to defend by sending out an announce.
1624          */
1625         if (defs >= maxdefense) {
1626                 if (isv6)
1627                         ndp_failure(mp, ira);
1628                 else
1629                         arp_failure(mp, ira);
1630         } else {
1631                 return (B_TRUE); /* caller must defend this address */
1632         }
1633         return (B_FALSE);
1634 }
1635 
1636 /*
1637  * Handle reception of Neighbor Solicitation messages.
1638  */
1639 static void
1640 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1641 {
1642         ill_t           *ill = ira->ira_ill, *under_ill;
1643         nd_neighbor_solicit_t *ns;
1644         uint32_t        hlen = ill->ill_phys_addr_length;
1645         uchar_t         *haddr = NULL;
1646         icmp6_t         *icmp_nd;
1647         ip6_t           *ip6h;
1648         ncec_t          *our_ncec = NULL;
1649         in6_addr_t      target;
1650         in6_addr_t      src;
1651         int             len;
1652         int             flag = 0;
1653         nd_opt_hdr_t    *opt = NULL;
1654         boolean_t       bad_solicit = B_FALSE;
1655         mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
1656         boolean_t       need_ill_refrele = B_FALSE;
1657 
1658         ip6h = (ip6_t *)mp->b_rptr;
1659         icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1660         len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1661         src = ip6h->ip6_src;
1662         ns = (nd_neighbor_solicit_t *)icmp_nd;
1663         target = ns->nd_ns_target;
1664         if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1665             IN6_IS_ADDR_LOOPBACK(&target)) {
1666                 if (ip_debug > 2) {
1667                         /* ip1dbg */
1668                         pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1669                             AF_INET6, &target);
1670                 }
1671                 bad_solicit = B_TRUE;
1672                 goto done;
1673         }
1674         if (len > sizeof (nd_neighbor_solicit_t)) {
1675                 /* Options present */
1676                 opt = (nd_opt_hdr_t *)&ns[1];
1677                 len -= sizeof (nd_neighbor_solicit_t);
1678                 if (!ndp_verify_optlen(opt, len)) {
1679                         ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1680                         bad_solicit = B_TRUE;
1681                         goto done;
1682                 }
1683         }
1684         if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1685                 /* Check to see if this is a valid DAD solicitation */
1686                 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1687                         if (ip_debug > 2) {
1688                                 /* ip1dbg */
1689                                 pr_addr_dbg("ndp_input_solicit: IPv6 "
1690                                     "Destination is not solicited node "
1691                                     "multicast %s\n", AF_INET6,
1692                                     &ip6h->ip6_dst);
1693                         }
1694                         bad_solicit = B_TRUE;
1695                         goto done;
1696                 }
1697         }
1698 
1699         /*
1700          * NOTE: with IPMP, it's possible the nominated multicast ill (which
1701          * received this packet if it's multicast) is not the ill tied to
1702          * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1703          * to ensure we find the associated NCE.
1704          */
1705         our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1706         /*
1707          * If this is a valid Solicitation for an address we are publishing,
1708          * then a PUBLISH entry should exist in the cache
1709          */
1710         if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1711                 ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1712                     "ifname=%s ", ill->ill_name));
1713                 if (ip_debug > 2) {
1714                         /* ip1dbg */
1715                         pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1716                 }
1717                 if (our_ncec == NULL)
1718                         bad_solicit = B_TRUE;
1719                 goto done;
1720         }
1721 
1722         /* At this point we should have a verified NS per spec */
1723         if (opt != NULL) {
1724                 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1725                 if (opt != NULL) {
1726                         haddr = (uchar_t *)&opt[1];
1727                         if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1728                             hlen == 0) {
1729                                 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1730                                 bad_solicit = B_TRUE;
1731                                 goto done;
1732                         }
1733                 }
1734         }
1735 
1736         /* If sending directly to peer, set the unicast flag */
1737         if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1738                 flag |= NDP_UNICAST;
1739 
1740         /*
1741          * Create/update the entry for the soliciting node on the ipmp_ill.
1742          * or respond to outstanding queries, don't if
1743          * the source is unspecified address.
1744          */
1745         if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1746                 int     err;
1747                 nce_t   *nnce;
1748 
1749                 ASSERT(ill->ill_isv6);
1750                 /*
1751                  * Regular solicitations *must* include the Source Link-Layer
1752                  * Address option.  Ignore messages that do not.
1753                  */
1754                 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1755                         ip1dbg(("ndp_input_solicit: source link-layer address "
1756                             "option missing with a specified source.\n"));
1757                         bad_solicit = B_TRUE;
1758                         goto done;
1759                 }
1760 
1761                 /*
1762                  * This is a regular solicitation.  If we're still in the
1763                  * process of verifying the address, then don't respond at all
1764                  * and don't keep track of the sender.
1765                  */
1766                 if (our_ncec->ncec_state == ND_PROBE)
1767                         goto done;
1768 
1769                 /*
1770                  * If the solicitation doesn't have sender hardware address
1771                  * (legal for unicast solicitation), then process without
1772                  * installing the return NCE.  Either we already know it, or
1773                  * we'll be forced to look it up when (and if) we reply to the
1774                  * packet.
1775                  */
1776                 if (haddr == NULL)
1777                         goto no_source;
1778 
1779                 under_ill = ill;
1780                 if (IS_UNDER_IPMP(under_ill)) {
1781                         ill = ipmp_ill_hold_ipmp_ill(under_ill);
1782                         if (ill == NULL)
1783                                 ill = under_ill;
1784                         else
1785                                 need_ill_refrele = B_TRUE;
1786                 }
1787                 err = nce_lookup_then_add_v6(ill,
1788                     haddr, hlen,
1789                     &src,   /* Soliciting nodes address */
1790                     0,
1791                     ND_STALE,
1792                     &nnce);
1793 
1794                 if (need_ill_refrele) {
1795                         ill_refrele(ill);
1796                         ill = under_ill;
1797                         need_ill_refrele =  B_FALSE;
1798                 }
1799                 switch (err) {
1800                 case 0:
1801                         /* done with this entry */
1802                         nce_refrele(nnce);
1803                         break;
1804                 case EEXIST:
1805                         /*
1806                          * B_FALSE indicates this is not an an advertisement.
1807                          */
1808                         nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1809                         nce_refrele(nnce);
1810                         break;
1811                 default:
1812                         ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1813                             err));
1814                         goto done;
1815                 }
1816 no_source:
1817                 flag |= NDP_SOLICITED;
1818         } else {
1819                 /*
1820                  * No source link layer address option should be present in a
1821                  * valid DAD request.
1822                  */
1823                 if (haddr != NULL) {
1824                         ip1dbg(("ndp_input_solicit: source link-layer address "
1825                             "option present with an unspecified source.\n"));
1826                         bad_solicit = B_TRUE;
1827                         goto done;
1828                 }
1829                 if (our_ncec->ncec_state == ND_PROBE) {
1830                         /*
1831                          * Internally looped-back probes will have
1832                          * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1833                          * transmissions.
1834                          */
1835                         if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1836                                 /*
1837                                  * If someone else is probing our address, then
1838                                  * we've crossed wires.  Declare failure.
1839                                  */
1840                                 ndp_failure(mp, ira);
1841                         }
1842                         goto done;
1843                 }
1844                 /*
1845                  * This is a DAD probe.  Multicast the advertisement to the
1846                  * all-nodes address.
1847                  */
1848                 src = ipv6_all_hosts_mcast;
1849         }
1850         flag |= nce_advert_flags(our_ncec);
1851         (void) ndp_xmit(ill,
1852             ND_NEIGHBOR_ADVERT,
1853             our_ncec->ncec_lladdr,
1854             our_ncec->ncec_lladdr_length,
1855             &target,        /* Source and target of the advertisement pkt */
1856             &src,   /* IP Destination (source of original pkt) */
1857             flag);
1858 done:
1859         if (bad_solicit)
1860                 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1861         if (our_ncec != NULL)
1862                 ncec_refrele(our_ncec);
1863 }
1864 
1865 /*
1866  * Handle reception of Neighbor Solicitation messages
1867  */
1868 void
1869 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1870 {
1871         ill_t           *ill = ira->ira_ill;
1872         nd_neighbor_advert_t *na;
1873         uint32_t        hlen = ill->ill_phys_addr_length;
1874         uchar_t         *haddr = NULL;
1875         icmp6_t         *icmp_nd;
1876         ip6_t           *ip6h;
1877         ncec_t          *dst_ncec = NULL;
1878         in6_addr_t      target;
1879         nd_opt_hdr_t    *opt = NULL;
1880         int             len;
1881         ip_stack_t      *ipst = ill->ill_ipst;
1882         mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
1883 
1884         ip6h = (ip6_t *)mp->b_rptr;
1885         icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1886         len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1887         na = (nd_neighbor_advert_t *)icmp_nd;
1888 
1889         if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1890             (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1891                 ip1dbg(("ndp_input_advert: Target is multicast but the "
1892                     "solicited flag is not zero\n"));
1893                 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1894                 return;
1895         }
1896         target = na->nd_na_target;
1897         if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1898             IN6_IS_ADDR_LOOPBACK(&target)) {
1899                 if (ip_debug > 2) {
1900                         /* ip1dbg */
1901                         pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1902                             AF_INET6, &target);
1903                 }
1904                 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1905                 return;
1906         }
1907         if (len > sizeof (nd_neighbor_advert_t)) {
1908                 opt = (nd_opt_hdr_t *)&na[1];
1909                 if (!ndp_verify_optlen(opt,
1910                     len - sizeof (nd_neighbor_advert_t))) {
1911                         ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1912                         BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1913                         return;
1914                 }
1915                 /* At this point we have a verified NA per spec */
1916                 len -= sizeof (nd_neighbor_advert_t);
1917                 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1918                 if (opt != NULL) {
1919                         haddr = (uchar_t *)&opt[1];
1920                         if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1921                             hlen == 0) {
1922                                 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1923                                 BUMP_MIB(mib,
1924                                     ipv6IfIcmpInBadNeighborAdvertisements);
1925                                 return;
1926                         }
1927                 }
1928         }
1929 
1930         /*
1931          * NOTE: we match across the illgrp since we need to do DAD for all of
1932          * our local addresses, and those are spread across all the active
1933          * ills in the group.
1934          */
1935         if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1936                 return;
1937 
1938         if (NCE_PUBLISH(dst_ncec)) {
1939                 /*
1940                  * Someone just advertised an addresses that we publish. First,
1941                  * check it it was us -- if so, we can safely ignore it.
1942                  * We don't get the haddr from the ira_l2src because, in the
1943                  * case that the packet originated from us, on an IPMP group,
1944                  * the ira_l2src may would be the link-layer address of the
1945                  * cast_ill used to send the packet, which may not be the same
1946                  * as the dst_ncec->ncec_lladdr of the address.
1947                  */
1948                 if (haddr != NULL) {
1949                         if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1950                                 goto out;
1951 
1952                         if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1953                                 goto out;   /* from us -- no conflict */
1954 
1955                         /*
1956                          * If we're in an IPMP group, check if this is an echo
1957                          * from another ill in the group.  Use the double-
1958                          * checked locking pattern to avoid grabbing
1959                          * ill_g_lock in the non-IPMP case.
1960                          */
1961                         if (IS_UNDER_IPMP(ill)) {
1962                                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1963                                 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1964                                     ill->ill_grp, haddr, hlen) != NULL) {
1965                                         rw_exit(&ipst->ips_ill_g_lock);
1966                                         goto out;
1967                                 }
1968                                 rw_exit(&ipst->ips_ill_g_lock);
1969                         }
1970                 }
1971 
1972                 /*
1973                  * This appears to be a real conflict.  If we're trying to
1974                  * configure this NCE (ND_PROBE), then shut it down.
1975                  * Otherwise, handle the discovered conflict.
1976                  */
1977                 if (dst_ncec->ncec_state == ND_PROBE) {
1978                         ndp_failure(mp, ira);
1979                 } else {
1980                         if (ip_nce_conflict(mp, ira, dst_ncec)) {
1981                                 char hbuf[MAC_STR_LEN];
1982                                 char sbuf[INET6_ADDRSTRLEN];
1983 
1984                                 cmn_err(CE_WARN,
1985                                     "node '%s' is using %s on %s",
1986                                     inet_ntop(AF_INET6, &target, sbuf,
1987                                     sizeof (sbuf)),
1988                                     haddr == NULL ? "<none>" :
1989                                     mac_colon_addr(haddr, hlen, hbuf,
1990                                     sizeof (hbuf)), ill->ill_name);
1991                                 /*
1992                                  * RFC 4862, Section 5.4.4 does not mandate
1993                                  * any specific behavior when an NA matches
1994                                  * a non-tentative address assigned to the
1995                                  * receiver. We make the choice of defending
1996                                  * our address, based on the assumption that
1997                                  * the sender has not detected the Duplicate.
1998                                  *
1999                                  * ncec_last_time_defended has been adjusted
2000                                  * in ip_nce_conflict()
2001                                  */
2002                                 (void) ndp_announce(dst_ncec);
2003                         }
2004                 }
2005         } else {
2006                 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2007                         dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2008 
2009                 /* B_TRUE indicates this an advertisement */
2010                 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2011         }
2012 out:
2013         ncec_refrele(dst_ncec);
2014 }
2015 
2016 /*
2017  * Process NDP neighbor solicitation/advertisement messages.
2018  * The checksum has already checked o.k before reaching here.
2019  * Information about the datalink header is contained in ira_l2src, but
2020  * that should be ignored for loopback packets.
2021  */
2022 void
2023 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2024 {
2025         ill_t           *ill = ira->ira_rill;
2026         icmp6_t         *icmp_nd;
2027         ip6_t           *ip6h;
2028         int             len;
2029         mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
2030         ill_t           *orig_ill = NULL;
2031 
2032         /*
2033          * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2034          * and make it be the IPMP upper so avoid being confused by a packet
2035          * addressed to a unicast address on a different ill.
2036          */
2037         if (IS_UNDER_IPMP(ill)) {
2038                 orig_ill = ill;
2039                 ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2040                 if (ill == NULL) {
2041                         ill = orig_ill;
2042                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2043                         ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2044                             mp, ill);
2045                         freemsg(mp);
2046                         return;
2047                 }
2048                 ASSERT(ill != orig_ill);
2049                 orig_ill = ira->ira_ill;
2050                 ira->ira_ill = ill;
2051                 mib = ill->ill_icmp6_mib;
2052         }
2053         if (!pullupmsg(mp, -1)) {
2054                 ip1dbg(("ndp_input: pullupmsg failed\n"));
2055                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2056                 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2057                 goto done;
2058         }
2059         ip6h = (ip6_t *)mp->b_rptr;
2060         if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2061                 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2062                 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2063                 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2064                 goto done;
2065         }
2066         /*
2067          * NDP does not accept any extension headers between the
2068          * IP header and the ICMP header since e.g. a routing
2069          * header could be dangerous.
2070          * This assumes that any AH or ESP headers are removed
2071          * by ip prior to passing the packet to ndp_input.
2072          */
2073         if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2074                 ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2075                     ip6h->ip6_nxt));
2076                 ip_drop_input("Wrong next header", mp, ill);
2077                 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2078                 goto done;
2079         }
2080         icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2081         ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2082             icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2083         if (icmp_nd->icmp6_code != 0) {
2084                 ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2085                 ip_drop_input("code non-zero", mp, ill);
2086                 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2087                 goto done;
2088         }
2089         len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2090         /*
2091          * Make sure packet length is large enough for either
2092          * a NS or a NA icmp packet.
2093          */
2094         if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2095                 ip1dbg(("ndp_input: packet too short\n"));
2096                 ip_drop_input("packet too short", mp, ill);
2097                 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2098                 goto done;
2099         }
2100         if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2101                 ndp_input_solicit(mp, ira);
2102         } else {
2103                 ndp_input_advert(mp, ira);
2104         }
2105 done:
2106         freemsg(mp);
2107         if (orig_ill != NULL) {
2108                 ill_refrele(ill);
2109                 ira->ira_ill = orig_ill;
2110         }
2111 }
2112 
2113 /*
2114  * ndp_xmit is called to form and transmit a ND solicitation or
2115  * advertisement ICMP packet.
2116  *
2117  * If the source address is unspecified and this isn't a probe (used for
2118  * duplicate address detection), an appropriate source address and link layer
2119  * address will be chosen here.  The link layer address option is included if
2120  * the source is specified (i.e., all non-probe packets), and omitted (per the
2121  * specification) otherwise.
2122  *
2123  * It returns B_FALSE only if it does a successful put() to the
2124  * corresponding ill's ill_wq otherwise returns B_TRUE.
2125  */
2126 static boolean_t
2127 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2128     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2129 {
2130         uint32_t        len;
2131         icmp6_t         *icmp6;
2132         mblk_t          *mp;
2133         ip6_t           *ip6h;
2134         nd_opt_hdr_t    *opt;
2135         uint_t          plen;
2136         zoneid_t        zoneid = GLOBAL_ZONEID;
2137         ill_t           *hwaddr_ill = ill;
2138         ip_xmit_attr_t  ixas;
2139         ip_stack_t      *ipst = ill->ill_ipst;
2140         boolean_t       need_refrele = B_FALSE;
2141         boolean_t       probe = B_FALSE;
2142 
2143         if (IS_UNDER_IPMP(ill)) {
2144                 probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2145                 /*
2146                  * We send non-probe packets on the upper IPMP interface.
2147                  * ip_output_simple() will use cast_ill for sending any
2148                  * multicast packets. Note that we can't follow the same
2149                  * logic for probe packets because all interfaces in the ipmp
2150                  * group may have failed, so that we really want to only try
2151                  * to send the ND packet on the ill corresponding to the src
2152                  * address.
2153                  */
2154                 if (!probe) {
2155                         ill = ipmp_ill_hold_ipmp_ill(ill);
2156                         if (ill != NULL)
2157                                 need_refrele = B_TRUE;
2158                         else
2159                                 ill = hwaddr_ill;
2160                 }
2161         }
2162 
2163         /*
2164          * If we have a unspecified source(sender) address, select a
2165          * proper source address for the solicitation here itself so
2166          * that we can initialize the h/w address correctly.
2167          *
2168          * If the sender is specified then we use this address in order
2169          * to lookup the zoneid before calling ip_output_v6(). This is to
2170          * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2171          * by IP (we cannot guarantee that the global zone has an interface
2172          * route to the destination).
2173          *
2174          * Note that the NA never comes here with the unspecified source
2175          * address.
2176          */
2177 
2178         /*
2179          * Probes will have unspec src at this point.
2180          */
2181         if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2182                 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2183                 /*
2184                  * It's possible for ipif_lookup_addr_zoneid_v6() to return
2185                  * ALL_ZONES if it cannot find a matching ipif for the address
2186                  * we are trying to use. In this case we err on the side of
2187                  * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2188                  */
2189                 if (zoneid == ALL_ZONES)
2190                         zoneid = GLOBAL_ZONEID;
2191         }
2192 
2193         plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2194         len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2195         mp = allocb(len,  BPRI_LO);
2196         if (mp == NULL) {
2197                 if (need_refrele)
2198                         ill_refrele(ill);
2199                 return (B_TRUE);
2200         }
2201 
2202         bzero((char *)mp->b_rptr, len);
2203         mp->b_wptr = mp->b_rptr + len;
2204 
2205         bzero(&ixas, sizeof (ixas));
2206         ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2207 
2208         ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2209         ixas.ixa_ipst = ipst;
2210         ixas.ixa_cred = kcred;
2211         ixas.ixa_cpid = NOPID;
2212         ixas.ixa_tsl = NULL;
2213         ixas.ixa_zoneid = zoneid;
2214 
2215         ip6h = (ip6_t *)mp->b_rptr;
2216         ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2217         ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2218         ip6h->ip6_nxt = IPPROTO_ICMPV6;
2219         ip6h->ip6_hops = IPV6_MAX_HOPS;
2220         ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2221         ip6h->ip6_dst = *target;
2222         icmp6 = (icmp6_t *)&ip6h[1];
2223 
2224         if (hw_addr_len != 0) {
2225                 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2226                     sizeof (nd_neighbor_advert_t));
2227         } else {
2228                 opt = NULL;
2229         }
2230         if (operation == ND_NEIGHBOR_SOLICIT) {
2231                 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2232 
2233                 if (opt != NULL && !(flag & NDP_PROBE)) {
2234                         /*
2235                          * Note that we don't send out SLLA for ND probes
2236                          * per RFC 4862, even though we do send out the src
2237                          * haddr for IPv4 DAD probes, even though both IPv4
2238                          * and IPv6 go out with the unspecified/INADDR_ANY
2239                          * src IP addr.
2240                          */
2241                         opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2242                 }
2243                 ip6h->ip6_src = *sender;
2244                 ns->nd_ns_target = *target;
2245                 if (!(flag & NDP_UNICAST)) {
2246                         /* Form multicast address of the target */
2247                         ip6h->ip6_dst = ipv6_solicited_node_mcast;
2248                         ip6h->ip6_dst.s6_addr32[3] |=
2249                             ns->nd_ns_target.s6_addr32[3];
2250                 }
2251         } else {
2252                 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2253 
2254                 ASSERT(!(flag & NDP_PROBE));
2255                 if (opt != NULL)
2256                         opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2257                 ip6h->ip6_src = *sender;
2258                 na->nd_na_target = *sender;
2259                 if (flag & NDP_ISROUTER)
2260                         na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2261                 if (flag & NDP_SOLICITED)
2262                         na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2263                 if (flag & NDP_ORIDE)
2264                         na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2265         }
2266 
2267         if (!(flag & NDP_PROBE)) {
2268                 if (hw_addr != NULL && opt != NULL) {
2269                         /* Fill in link layer address and option len */
2270                         opt->nd_opt_len = (uint8_t)plen;
2271                         bcopy(hw_addr, &opt[1], hw_addr_len);
2272                 }
2273         }
2274         if (opt != NULL && opt->nd_opt_type == 0) {
2275                 /* If there's no link layer address option, then strip it. */
2276                 len -= plen * 8;
2277                 mp->b_wptr = mp->b_rptr + len;
2278                 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2279         }
2280 
2281         icmp6->icmp6_type = (uint8_t)operation;
2282         icmp6->icmp6_code = 0;
2283         /*
2284          * Prepare for checksum by putting icmp length in the icmp
2285          * checksum field. The checksum is calculated in ip_output.c.
2286          */
2287         icmp6->icmp6_cksum = ip6h->ip6_plen;
2288 
2289         (void) ip_output_simple(mp, &ixas);
2290         ixa_cleanup(&ixas);
2291         if (need_refrele)
2292                 ill_refrele(ill);
2293         return (B_FALSE);
2294 }
2295 
/*
 * Mark the given ncec as ND_UNREACHABLE.
 *
 * This is meant to be used before ncec_delete() marks the entry
 * NCE_F_CONDEMNED.  The datapath uses the ND_UNREACHABLE state as an
 * indication that there is a problem with the neighbor (as opposed to an
 * NCE that was just reclaimed due to lack of memory).
 *
 * The state is changed under ncec_lock, which is acquired and released
 * here, so the caller must not already hold it.
 *
 * Note that static ARP entries never become unreachable.
 * NOTE(review): this function does not itself check NCE_F_STATIC, so
 * presumably callers are expected to filter static entries -- confirm.
 */
void
nce_make_unreachable(ncec_t *ncec)
{
        mutex_enter(&ncec->ncec_lock);
        ncec->ncec_state = ND_UNREACHABLE;
        mutex_exit(&ncec->ncec_lock);
}
2310 
2311 /*
2312  * NCE retransmit timer. Common to IPv4 and IPv6.
2313  * This timer goes off when:
2314  * a. It is time to retransmit a resolution for resolver.
2315  * b. It is time to send reachability probes.
2316  */
2317 void
2318 nce_timer(void *arg)
2319 {
2320         ncec_t          *ncec = arg;
2321         ill_t           *ill = ncec->ncec_ill, *src_ill;
2322         char            addrbuf[INET6_ADDRSTRLEN];
2323         boolean_t       dropped = B_FALSE;
2324         ip_stack_t      *ipst = ncec->ncec_ipst;
2325         boolean_t       isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2326         in_addr_t       sender4 = INADDR_ANY;
2327         in6_addr_t      sender6 = ipv6_all_zeros;
2328 
2329         /*
2330          * The timer has to be cancelled by ncec_delete before doing the final
2331          * refrele. So the NCE is guaranteed to exist when the timer runs
2332          * until it clears the timeout_id. Before clearing the timeout_id
2333          * bump up the refcnt so that we can continue to use the ncec
2334          */
2335         ASSERT(ncec != NULL);
2336         mutex_enter(&ncec->ncec_lock);
2337         ncec_refhold_locked(ncec);
2338         ncec->ncec_timeout_id = 0;
2339         mutex_exit(&ncec->ncec_lock);
2340 
2341         src_ill = nce_resolve_src(ncec, &sender6);
2342         /* if we could not find a sender address, return */
2343         if (src_ill == NULL) {
2344                 if (!isv6) {
2345                         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2346                         ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2347                             &sender4, addrbuf, sizeof (addrbuf))));
2348                 } else {
2349                         ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2350                             &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2351                 }
2352                 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2353                 ncec_refrele(ncec);
2354                 return;
2355         }
2356         if (!isv6)
2357                 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2358 
2359         mutex_enter(&ncec->ncec_lock);
2360         /*
2361          * Check the reachability state.
2362          */
2363         switch (ncec->ncec_state) {
2364         case ND_DELAY:
2365                 ASSERT(ncec->ncec_lladdr != NULL);
2366                 ncec->ncec_state = ND_PROBE;
2367                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2368                 if (isv6) {
2369                         mutex_exit(&ncec->ncec_lock);
2370                         dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2371                             src_ill->ill_phys_addr,
2372                             src_ill->ill_phys_addr_length,
2373                             &sender6, &ncec->ncec_addr,
2374                             NDP_UNICAST);
2375                 } else {
2376                         dropped = (arp_request(ncec, sender4, src_ill) == 0);
2377                         mutex_exit(&ncec->ncec_lock);
2378                 }
2379                 if (!dropped) {
2380                         mutex_enter(&ncec->ncec_lock);
2381                         ncec->ncec_pcnt--;
2382                         mutex_exit(&ncec->ncec_lock);
2383                 }
2384                 if (ip_debug > 3) {
2385                         /* ip2dbg */
2386                         pr_addr_dbg("nce_timer: state for %s changed "
2387                             "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2388                 }
2389                 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2390                 break;
2391         case ND_PROBE:
2392                 /* must be retransmit timer */
2393                 ASSERT(ncec->ncec_pcnt >= -1);
2394                 if (ncec->ncec_pcnt > 0) {
2395                         /*
2396                          * As per RFC2461, the ncec gets deleted after
2397                          * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2398                          * Note that the first unicast solicitation is sent
2399                          * during the DELAY state.
2400                          */
2401                         ip2dbg(("nce_timer: pcount=%x dst %s\n",
2402                             ncec->ncec_pcnt,
2403                             inet_ntop((isv6? AF_INET6 : AF_INET),
2404                             &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2405                         if (NCE_PUBLISH(ncec)) {
2406                                 mutex_exit(&ncec->ncec_lock);
2407                                 /*
2408                                  * send out a probe; note that src_ill
2409                                  * is ignored by nce_dad() for all
2410                                  * DAD message types other than IPv6
2411                                  * unicast probes
2412                                  */
2413                                 nce_dad(ncec, src_ill, B_TRUE);
2414                         } else {
2415                                 ASSERT(src_ill != NULL);
2416                                 if (isv6) {
2417                                         mutex_exit(&ncec->ncec_lock);
2418                                         dropped = ndp_xmit(src_ill,
2419                                             ND_NEIGHBOR_SOLICIT,
2420                                             src_ill->ill_phys_addr,
2421                                             src_ill->ill_phys_addr_length,
2422                                             &sender6, &ncec->ncec_addr,
2423                                             NDP_UNICAST);
2424                                 } else {
2425                                         /*
2426                                          * since the nce is REACHABLE,
2427                                          * the ARP request will be sent out
2428                                          * as a link-layer unicast.
2429                                          */
2430                                         dropped = (arp_request(ncec, sender4,
2431                                             src_ill) == 0);
2432                                         mutex_exit(&ncec->ncec_lock);
2433                                 }
2434                                 if (!dropped) {
2435                                         mutex_enter(&ncec->ncec_lock);
2436                                         ncec->ncec_pcnt--;
2437                                         mutex_exit(&ncec->ncec_lock);
2438                                 }
2439                                 nce_restart_timer(ncec,
2440                                     ill->ill_reachable_retrans_time);
2441                         }
2442                 } else if (ncec->ncec_pcnt < 0) {
2443                         /* No hope, delete the ncec */
2444                         /* Tell datapath it went bad */
2445                         ncec->ncec_state = ND_UNREACHABLE;
2446                         mutex_exit(&ncec->ncec_lock);
2447                         if (ip_debug > 2) {
2448                                 /* ip1dbg */
2449                                 pr_addr_dbg("nce_timer: Delete NCE for"
2450                                     " dst %s\n", (isv6? AF_INET6: AF_INET),
2451                                     &ncec->ncec_addr);
2452                         }
2453                         /* if static ARP can't delete. */
2454                         if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2455                                 ncec_delete(ncec);
2456 
2457                 } else if (!NCE_PUBLISH(ncec)) {
2458                         /*
2459                          * Probe count is 0 for a dynamic entry (one that we
2460                          * ourselves are not publishing). We should never get
2461                          * here if NONUD was requested, hence the ASSERT below.
2462                          */
2463                         ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2464                         ip2dbg(("nce_timer: pcount=%x dst %s\n",
2465                             ncec->ncec_pcnt, inet_ntop(AF_INET6,
2466                             &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2467                         ncec->ncec_pcnt--;
2468                         mutex_exit(&ncec->ncec_lock);
2469                         /* Wait one interval before killing */
2470                         nce_restart_timer(ncec,
2471                             ill->ill_reachable_retrans_time);
2472                 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2473                         ipif_t *ipif;
2474                         ipaddr_t ncec_addr;
2475 
2476                         /*
2477                          * We're done probing, and we can now declare this
2478                          * address to be usable.  Let IP know that it's ok to
2479                          * use.
2480                          */
2481                         ncec->ncec_state = ND_REACHABLE;
2482                         ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2483                         mutex_exit(&ncec->ncec_lock);
2484                         if (isv6) {
2485                                 ipif = ipif_lookup_addr_exact_v6(
2486                                     &ncec->ncec_addr, ill, ipst);
2487                         } else {
2488                                 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2489                                     ncec_addr);
2490                                 ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2491                                     ipst);
2492                         }
2493                         if (ipif != NULL) {
2494                                 if (ipif->ipif_was_dup) {
2495                                         char ibuf[LIFNAMSIZ];
2496                                         char sbuf[INET6_ADDRSTRLEN];
2497 
2498                                         ipif->ipif_was_dup = B_FALSE;
2499                                         (void) inet_ntop(AF_INET6,
2500                                             &ipif->ipif_v6lcl_addr,
2501                                             sbuf, sizeof (sbuf));
2502                                         ipif_get_name(ipif, ibuf,
2503                                             sizeof (ibuf));
2504                                         cmn_err(CE_NOTE, "recovered address "
2505                                             "%s on %s", sbuf, ibuf);
2506                                 }
2507                                 if ((ipif->ipif_flags & IPIF_UP) &&
2508                                     !ipif->ipif_addr_ready)
2509                                         ipif_up_notify(ipif);
2510                                 ipif->ipif_addr_ready = 1;
2511                                 ipif_refrele(ipif);
2512                         }
2513                         if (!isv6 && arp_no_defense)
2514                                 break;
2515                         /* Begin defending our new address */
2516                         if (ncec->ncec_unsolicit_count > 0) {
2517                                 ncec->ncec_unsolicit_count--;
2518                                 if (isv6) {
2519                                         dropped = ndp_announce(ncec);
2520                                 } else {
2521                                         dropped = arp_announce(ncec);
2522                                 }
2523 
2524                                 if (dropped)
2525                                         ncec->ncec_unsolicit_count++;
2526                                 else
2527                                         ncec->ncec_last_time_defended =
2528                                             ddi_get_lbolt();
2529                         }
2530                         if (ncec->ncec_unsolicit_count > 0) {
2531                                 nce_restart_timer(ncec,
2532                                     ANNOUNCE_INTERVAL(isv6));
2533                         } else if (DEFENSE_INTERVAL(isv6) != 0) {
2534                                 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2535                         }
2536                 } else {
2537                         /*
2538                          * This is an address we're probing to be our own, but
2539                          * the ill is down.  Wait until it comes back before
2540                          * doing anything, but switch to reachable state so
2541                          * that the restart will work.
2542                          */
2543                         ncec->ncec_state = ND_REACHABLE;
2544                         mutex_exit(&ncec->ncec_lock);
2545                 }
2546                 break;
2547         case ND_INCOMPLETE: {
2548                 mblk_t  *mp, *nextmp;
2549                 mblk_t  **prevmpp;
2550 
2551                 /*
2552                  * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2553                  * for any IPMP probe packets, and toss them.  IPMP probe
2554                  * packets will always be at the head of ncec_qd_mp, so that
2555                  * we can stop at the first queued ND packet that is
2556                  * not a probe packet.
2557                  */
2558                 prevmpp = &ncec->ncec_qd_mp;
2559                 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2560                         nextmp = mp->b_next;
2561 
2562                         if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2563                                 inet_freemsg(mp);
2564                                 ncec->ncec_nprobes--;
2565                                 *prevmpp = nextmp;
2566                         } else {
2567                                 prevmpp = &mp->b_next;
2568                         }
2569                 }
2570 
2571                 /*
2572                  * Must be resolver's retransmit timer.
2573                  */
2574                 mutex_exit(&ncec->ncec_lock);
2575                 ip_ndp_resolve(ncec);
2576                 break;
2577         }
2578         case ND_REACHABLE:
2579                 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2580                     ncec->ncec_unsolicit_count != 0) ||
2581                     (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2582                         if (ncec->ncec_unsolicit_count > 0) {
2583                                 ncec->ncec_unsolicit_count--;
2584                                 mutex_exit(&ncec->ncec_lock);
2585                                 /*
2586                                  * When we get to zero announcements left,
2587                                  * switch to address defense
2588                                  */
2589                         } else {
2590                                 boolean_t rate_limit;
2591 
2592                                 mutex_exit(&ncec->ncec_lock);
2593                                 rate_limit = ill_defend_rate_limit(ill, ncec);
2594                                 if (rate_limit) {
2595                                         nce_restart_timer(ncec,
2596                                             DEFENSE_INTERVAL(isv6));
2597                                         break;
2598                                 }
2599                         }
2600                         if (isv6) {
2601                                 dropped = ndp_announce(ncec);
2602                         } else {
2603                                 dropped = arp_announce(ncec);
2604                         }
2605                         mutex_enter(&ncec->ncec_lock);
2606                         if (dropped) {
2607                                 ncec->ncec_unsolicit_count++;
2608                         } else {
2609                                 ncec->ncec_last_time_defended =
2610                                     ddi_get_lbolt();
2611                         }
2612                         mutex_exit(&ncec->ncec_lock);
2613                         if (ncec->ncec_unsolicit_count != 0) {
2614                                 nce_restart_timer(ncec,
2615                                     ANNOUNCE_INTERVAL(isv6));
2616                         } else {
2617                                 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2618                         }
2619                 } else {
2620                         mutex_exit(&ncec->ncec_lock);
2621                 }
2622                 break;
2623         default:
2624                 mutex_exit(&ncec->ncec_lock);
2625                 break;
2626         }
2627 done:
2628         ncec_refrele(ncec);
2629         ill_refrele(src_ill);
2630 }
2631 
2632 /*
2633  * Set a link layer address from the ll_addr passed in.
2634  * Copy SAP from ill.
2635  */
2636 static void
2637 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2638 {
2639         ill_t   *ill = ncec->ncec_ill;
2640 
2641         ASSERT(ll_addr != NULL);
2642         if (ill->ill_phys_addr_length > 0) {
2643                 /*
2644                  * The bcopy() below used to be called for the physical address
2645                  * length rather than the link layer address length. For
2646                  * ethernet and many other media, the phys_addr and lla are
2647                  * identical.
2648                  *
2649                  * The phys_addr and lla may not be the same for devices that
2650                  * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2651                  * no known instances of these.
2652                  *
2653                  * For PPP or other interfaces with a zero length
2654                  * physical address, don't do anything here.
2655                  * The bcopy() with a zero phys_addr length was previously
2656                  * a no-op for interfaces with a zero-length physical address.
2657                  * Using the lla for them would change the way they operate.
2658                  * Doing nothing in such cases preserves expected behavior.
2659                  */
2660                 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2661         }
2662 }
2663 
2664 boolean_t
2665 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2666     uint32_t ll_addr_len)
2667 {
2668         ASSERT(ncec->ncec_lladdr != NULL);
2669         if (ll_addr == NULL)
2670                 return (B_FALSE);
2671         if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2672                 return (B_TRUE);
2673         return (B_FALSE);
2674 }
2675 
2676 /*
2677  * Updates the link layer address or the reachability state of
2678  * a cache entry.  Reset probe counter if needed.
2679  */
2680 void
2681 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2682 {
2683         ill_t   *ill = ncec->ncec_ill;
2684         boolean_t need_stop_timer = B_FALSE;
2685         boolean_t need_fastpath_update = B_FALSE;
2686         nce_t   *nce = NULL;
2687         timeout_id_t tid;
2688 
2689         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2690         /*
2691          * If this interface does not do NUD, there is no point
2692          * in allowing an update to the cache entry.  Although
2693          * we will respond to NS.
2694          * The only time we accept an update for a resolver when
2695          * NUD is turned off is when it has just been created.
2696          * Non-Resolvers will always be created as REACHABLE.
2697          */
2698         if (new_state != ND_UNCHANGED) {
2699                 if ((ncec->ncec_flags & NCE_F_NONUD) &&
2700                     (ncec->ncec_state != ND_INCOMPLETE))
2701                         return;
2702                 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2703                 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2704                 need_stop_timer = B_TRUE;
2705                 if (new_state == ND_REACHABLE)
2706                         ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2707                 else {
2708                         /* We force NUD in this case */
2709                         ncec->ncec_last = 0;
2710                 }
2711                 ncec->ncec_state = new_state;
2712                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2713                 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2714                     new_state == ND_INCOMPLETE);
2715         }
2716         if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2717                 tid = ncec->ncec_timeout_id;
2718                 ncec->ncec_timeout_id = 0;
2719         }
2720         /*
2721          * Re-trigger fastpath probe and
2722          * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2723          * whatever packets that happens to be transmitting at the time.
2724          */
2725         if (new_ll_addr != NULL) {
2726                 bcopy(new_ll_addr, ncec->ncec_lladdr,
2727                     ill->ill_phys_addr_length);
2728                 need_fastpath_update = B_TRUE;
2729         }
2730         mutex_exit(&ncec->ncec_lock);
2731         if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2732                 if (tid != 0)
2733                         (void) untimeout(tid);
2734         }
2735         if (need_fastpath_update) {
2736                 /*
2737                  * Delete any existing existing dlur_mp and fp_mp information.
2738                  * For IPMP interfaces, all underlying ill's must be checked
2739                  * and purged.
2740                  */
2741                 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2742                 /*
2743                  * add the new dlur_mp and fp_mp
2744                  */
2745                 nce = nce_fastpath(ncec, B_TRUE, NULL);
2746                 if (nce != NULL)
2747                         nce_refrele(nce);
2748         }
2749         mutex_enter(&ncec->ncec_lock);
2750 }
2751 
2752 static void
2753 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2754 {
2755         uint_t  count = 0;
2756         mblk_t  **mpp, *tmp;
2757 
2758         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2759 
2760         for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2761                 if (++count > ncec->ncec_ill->ill_max_buf) {
2762                         tmp = ncec->ncec_qd_mp->b_next;
2763                         ncec->ncec_qd_mp->b_next = NULL;
2764                         /*
2765                          * if we never create data addrs on the under_ill
2766                          * does this matter?
2767                          */
2768                         BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2769                             ipIfStatsOutDiscards);
2770                         ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2771                             ncec->ncec_ill);
2772                         freemsg(ncec->ncec_qd_mp);
2773                         ncec->ncec_qd_mp = tmp;
2774                 }
2775         }
2776 
2777         if (head_insert) {
2778                 ncec->ncec_nprobes++;
2779                 mp->b_next = ncec->ncec_qd_mp;
2780                 ncec->ncec_qd_mp = mp;
2781         } else {
2782                 *mpp = mp;
2783         }
2784 }
2785 
/*
 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
 * queued at the head or tail of the queue based on the input argument
 * 'head_insert'. The caller should specify this argument as B_TRUE if this
 * packet is an IPMP probe packet, in which case the following happens:
 *
 *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
 *      (non-ipmp_probe) load-spreading case where the source address of the
 *      ND packet is not tied to ncec_ill. If the ill bound to the source
 *      address cannot receive, the response to the ND packet will not be
 *      received.  However, if ND packets for ncec_ill's probes are queued
 *      behind that ND packet, those probes will also fail to be sent, and
 *      thus in.mpathd will erroneously conclude that ncec_ill has also
 *      failed.
 *
 *   2. Drop the ipmp_probe packet in nce_timer() if the ND did not succeed on
 *      the first attempt.  This ensures that ND problems do not manifest as
 *      probe RTT spikes.
 *
 * We achieve this by inserting ipmp_probe() packets at the head of the
 * nce_queue.
 *
 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
 *
 * The caller must hold ncec_lock.  The actual queueing is done by
 * nce_queue_mp_common().
 */
void
nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
        ASSERT(MUTEX_HELD(&ncec->ncec_lock));
        nce_queue_mp_common(ncec, mp, head_insert);
}
2816 
2817 /*
2818  * Called when address resolution failed due to a timeout.
2819  * Send an ICMP unreachable in response to all queued packets.
2820  */
2821 void
2822 ndp_resolv_failed(ncec_t *ncec)
2823 {
2824         mblk_t  *mp, *nxt_mp;
2825         char    buf[INET6_ADDRSTRLEN];
2826         ill_t *ill = ncec->ncec_ill;
2827         ip_recv_attr_t  iras;
2828 
2829         bzero(&iras, sizeof (iras));
2830         iras.ira_flags = 0;
2831         /*
2832          * we are setting the ira_rill to the ipmp_ill (instead of
2833          * the actual ill on which the packet was received), but this
2834          * is ok because we don't actually need the real ira_rill.
2835          * to send the icmp unreachable to the sender.
2836          */
2837         iras.ira_ill = iras.ira_rill = ill;
2838         iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2839         iras.ira_rifindex = iras.ira_ruifindex;
2840 
2841         ip1dbg(("ndp_resolv_failed: dst %s\n",
2842             inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2843         mutex_enter(&ncec->ncec_lock);
2844         mp = ncec->ncec_qd_mp;
2845         ncec->ncec_qd_mp = NULL;
2846         ncec->ncec_nprobes = 0;
2847         mutex_exit(&ncec->ncec_lock);
2848         while (mp != NULL) {
2849                 nxt_mp = mp->b_next;
2850                 mp->b_next = NULL;
2851 
2852                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2853                 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2854                     mp, ill);
2855                 icmp_unreachable_v6(mp,
2856                     ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2857                 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2858                 mp = nxt_mp;
2859         }
2860         ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2861 }
2862 
2863 /*
2864  * Handle the completion of NDP and ARP resolution.
2865  */
2866 void
2867 nce_resolv_ok(ncec_t *ncec)
2868 {
2869         mblk_t *mp;
2870         uint_t pkt_len;
2871         iaflags_t ixaflags = IXAF_NO_TRACE;
2872         nce_t *nce;
2873         ill_t   *ill = ncec->ncec_ill;
2874         boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2875         ip_stack_t *ipst = ill->ill_ipst;
2876 
2877         if (IS_IPMP(ncec->ncec_ill)) {
2878                 nce_resolv_ipmp_ok(ncec);
2879                 return;
2880         }
2881         /* non IPMP case */
2882 
2883         mutex_enter(&ncec->ncec_lock);
2884         ASSERT(ncec->ncec_nprobes == 0);
2885         mp = ncec->ncec_qd_mp;
2886         ncec->ncec_qd_mp = NULL;
2887         mutex_exit(&ncec->ncec_lock);
2888 
2889         while (mp != NULL) {
2890                 mblk_t *nxt_mp;
2891 
2892                 if (ill->ill_isv6) {
2893                         ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2894 
2895                         pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2896                 } else {
2897                         ipha_t *ipha = (ipha_t *)mp->b_rptr;
2898 
2899                         ixaflags |= IXAF_IS_IPV4;
2900                         pkt_len = ntohs(ipha->ipha_length);
2901                 }
2902                 nxt_mp = mp->b_next;
2903                 mp->b_next = NULL;
2904                 /*
2905                  * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2906                  * longer available, but it's ok to drop this flag because TCP
2907                  * has its own flow-control in effect, so TCP packets
2908                  * are not likely to get here when flow-control is in effect.
2909                  */
2910                 mutex_enter(&ill->ill_lock);
2911                 nce = nce_lookup(ill, &ncec->ncec_addr);
2912                 mutex_exit(&ill->ill_lock);
2913 
2914                 if (nce == NULL) {
2915                         if (isv6) {
2916                                 BUMP_MIB(&ipst->ips_ip6_mib,
2917                                     ipIfStatsOutDiscards);
2918                         } else {
2919                                 BUMP_MIB(&ipst->ips_ip_mib,
2920                                     ipIfStatsOutDiscards);
2921                         }
2922                         ip_drop_output("ipIfStatsOutDiscards - no nce",
2923                             mp, NULL);
2924                         freemsg(mp);
2925                 } else {
2926                         /*
2927                          * We don't know the zoneid, but
2928                          * ip_xmit does not care since IXAF_NO_TRACE
2929                          * is set. (We traced the packet the first
2930                          * time through ip_xmit.)
2931                          */
2932                         (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2933                             ALL_ZONES, 0, NULL);
2934                         nce_refrele(nce);
2935                 }
2936                 mp = nxt_mp;
2937         }
2938 
2939         ncec_cb_dispatch(ncec); /* complete callbacks */
2940 }
2941 
2942 /*
2943  * Called by SIOCSNDP* ioctl to add/change an ncec entry
2944  * and the corresponding attributes.
2945  * Disallow states other than ND_REACHABLE or ND_STALE.
2946  */
2947 int
2948 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2949 {
2950         sin6_t          *sin6;
2951         in6_addr_t      *addr;
2952         ncec_t          *ncec;
2953         nce_t           *nce;
2954         int             err = 0;
2955         uint16_t        new_flags = 0;
2956         uint16_t        old_flags = 0;
2957         int             inflags = lnr->lnr_flags;
2958         ip_stack_t      *ipst = ill->ill_ipst;
2959         boolean_t       do_postprocess = B_FALSE;
2960 
2961         ASSERT(ill->ill_isv6);
2962         if ((lnr->lnr_state_create != ND_REACHABLE) &&
2963             (lnr->lnr_state_create != ND_STALE))
2964                 return (EINVAL);
2965 
2966         sin6 = (sin6_t *)&lnr->lnr_addr;
2967         addr = &sin6->sin6_addr;
2968 
2969         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2970         ASSERT(!IS_UNDER_IPMP(ill));
2971         nce = nce_lookup_addr(ill, addr);
2972         if (nce != NULL)
2973                 new_flags = nce->nce_common->ncec_flags;
2974 
2975         switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2976         case NDF_ISROUTER_ON:
2977                 new_flags |= NCE_F_ISROUTER;
2978                 break;
2979         case NDF_ISROUTER_OFF:
2980                 new_flags &= ~NCE_F_ISROUTER;
2981                 break;
2982         case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2983                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2984                 if (nce != NULL)
2985                         nce_refrele(nce);
2986                 return (EINVAL);
2987         }
2988         if (inflags & NDF_STATIC)
2989                 new_flags |= NCE_F_STATIC;
2990 
2991         switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2992         case NDF_ANYCAST_ON:
2993                 new_flags |= NCE_F_ANYCAST;
2994                 break;
2995         case NDF_ANYCAST_OFF:
2996                 new_flags &= ~NCE_F_ANYCAST;
2997                 break;
2998         case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2999                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3000                 if (nce != NULL)
3001                         nce_refrele(nce);
3002                 return (EINVAL);
3003         }
3004 
3005         if (nce == NULL) {
3006                 err = nce_add_v6(ill,
3007                     (uchar_t *)lnr->lnr_hdw_addr,
3008                     ill->ill_phys_addr_length,
3009                     addr,
3010                     new_flags,
3011                     lnr->lnr_state_create,
3012                     &nce);
3013                 if (err != 0) {
3014                         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3015                         ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3016                         return (err);
3017                 } else {
3018                         do_postprocess = B_TRUE;
3019                 }
3020         }
3021         ncec = nce->nce_common;
3022         old_flags = ncec->ncec_flags;
3023         if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3024                 ncec_router_to_host(ncec);
3025                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3026                 if (do_postprocess)
3027                         err = nce_add_v6_postprocess(nce);
3028                 nce_refrele(nce);
3029                 return (0);
3030         }
3031         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3032 
3033         if (do_postprocess)
3034                 err = nce_add_v6_postprocess(nce);
3035         /*
3036          * err cannot be anything other than 0 because we don't support
3037          * proxy arp of static addresses.
3038          */
3039         ASSERT(err == 0);
3040 
3041         mutex_enter(&ncec->ncec_lock);
3042         ncec->ncec_flags = new_flags;
3043         mutex_exit(&ncec->ncec_lock);
3044         /*
3045          * Note that we ignore the state at this point, which
3046          * should be either STALE or REACHABLE.  Instead we let
3047          * the link layer address passed in to determine the state
3048          * much like incoming packets.
3049          */
3050         nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3051         nce_refrele(nce);
3052         return (0);
3053 }
3054 
3055 /*
3056  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3057  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3058  * be held to ensure that they are in the same group.
3059  */
3060 static nce_t *
3061 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3062 {
3063 
3064         nce_t *nce;
3065 
3066         nce = nce_ill_lookup_then_add(ill, ncec);
3067 
3068         if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3069                 return (nce);
3070 
3071         /*
3072          * hold the ncec_lock to synchronize with nce_update() so that,
3073          * at the end of this function, the contents of nce_dlur_mp are
3074          * consistent with ncec->ncec_lladdr, even though some intermediate
3075          * packet may have been sent out with a mangled address, which would
3076          * only be a transient condition.
3077          */
3078         mutex_enter(&ncec->ncec_lock);
3079         if (ncec->ncec_lladdr != NULL) {
3080                 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3081                     NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3082         } else {
3083                 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3084                     ill->ill_sap_length);
3085         }
3086         mutex_exit(&ncec->ncec_lock);
3087         return (nce);
3088 }
3089 
3090 /*
3091  * we make nce_fp_mp to have an M_DATA prepend.
3092  * The caller ensures there is hold on ncec for this function.
3093  * Note that since ill_fastpath_probe() copies the mblk there is
3094  * no need to hold the nce or ncec beyond this function.
3095  *
3096  * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3097  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3098  * and will be returned back by this function, so that no extra nce_refrele
3099  * is required for the caller. The calls from nce_add_common() use this
3100  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3101  * nce_refrele of the returned nce (when it is non-null).
3102  */
3103 nce_t *
3104 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3105 {
3106         nce_t *nce;
3107         ill_t *ill = ncec->ncec_ill;
3108 
3109         ASSERT(ill != NULL);
3110 
3111         if (IS_IPMP(ill) && trigger_fp_req) {
3112                 trigger_fp_req = B_FALSE;
3113                 ipmp_ncec_refresh_nce(ncec);
3114         }
3115 
3116         /*
3117          * If the caller already has the nce corresponding to the ill, use
3118          * that one. Otherwise we have to lookup/add the nce. Calls from
3119          * nce_add_common() fall in the former category, and have just done
3120          * the nce lookup/add that can be reused.
3121          */
3122         if (ncec_nce == NULL)
3123                 nce = nce_fastpath_create(ill, ncec);
3124         else
3125                 nce = ncec_nce;
3126 
3127         if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3128                 return (nce);
3129 
3130         if (trigger_fp_req)
3131                 nce_fastpath_trigger(nce);
3132         return (nce);
3133 }
3134 
3135 /*
3136  * Trigger fastpath on nce. No locks may be held.
3137  */
3138 static void
3139 nce_fastpath_trigger(nce_t *nce)
3140 {
3141         int res;
3142         ill_t *ill = nce->nce_ill;
3143         ncec_t *ncec = nce->nce_common;
3144 
3145         res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3146         /*
3147          * EAGAIN is an indication of a transient error
3148          * i.e. allocation failure etc. leave the ncec in the list it
3149          * will be updated when another probe happens for another ire
3150          * if not it will be taken out of the list when the ire is
3151          * deleted.
3152          */
3153         if (res != 0 && res != EAGAIN && res != ENOTSUP)
3154                 nce_fastpath_list_delete(ill, ncec, NULL);
3155 }
3156 
3157 /*
3158  * Add ncec to the nce fastpath list on ill.
3159  */
3160 static nce_t *
3161 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3162 {
3163         nce_t *nce = NULL;
3164 
3165         ASSERT(MUTEX_HELD(&ill->ill_lock));
3166         /*
3167          * Atomically ensure that the ill is not CONDEMNED and is not going
3168          * down, before adding the NCE.
3169          */
3170         if (ill->ill_state_flags & ILL_CONDEMNED)
3171                 return (NULL);
3172         mutex_enter(&ncec->ncec_lock);
3173         /*
3174          * if ncec has not been deleted and
3175          * is not already in the list add it.
3176          */
3177         if (!NCE_ISCONDEMNED(ncec)) {
3178                 nce = nce_lookup(ill, &ncec->ncec_addr);
3179                 if (nce != NULL)
3180                         goto done;
3181                 nce = nce_add(ill, ncec);
3182         }
3183 done:
3184         mutex_exit(&ncec->ncec_lock);
3185         return (nce);
3186 }
3187 
3188 nce_t *
3189 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3190 {
3191         nce_t *nce;
3192 
3193         mutex_enter(&ill->ill_lock);
3194         nce = nce_ill_lookup_then_add_locked(ill, ncec);
3195         mutex_exit(&ill->ill_lock);
3196         return (nce);
3197 }
3198 
3199 
3200 /*
3201  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3202  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3203  * entry after all locks have been dropped.
3204  */
3205 void
3206 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3207 {
3208         nce_t *nce;
3209 
3210         ASSERT(ill != NULL);
3211 
3212         /* delete any nces referencing the ncec from underlying ills */
3213         if (IS_IPMP(ill))
3214                 ipmp_ncec_delete_nce(ncec);
3215 
3216         /* now the ill itself */
3217         mutex_enter(&ill->ill_lock);
3218         for (nce = list_head(&ill->ill_nce); nce != NULL;
3219             nce = list_next(&ill->ill_nce, nce)) {
3220                 if (nce->nce_common == ncec) {
3221                         nce_refhold(nce);
3222                         nce_delete(nce);
3223                         break;
3224                 }
3225         }
3226         mutex_exit(&ill->ill_lock);
3227         if (nce != NULL) {
3228                 if (dead == NULL)
3229                         nce_refrele(nce);
3230                 else
3231                         list_insert_tail(dead, nce);
3232         }
3233 }
3234 
3235 /*
3236  * when the fastpath response does not fit in the datab
3237  * associated with the existing nce_fp_mp, we delete and
3238  * add the nce to retrigger fastpath based on the information
3239  * in the ncec_t.
3240  */
3241 static nce_t *
3242 nce_delete_then_add(nce_t *nce)
3243 {
3244         ill_t           *ill = nce->nce_ill;
3245         nce_t           *newnce = NULL;
3246 
3247         ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3248             (void *)nce, ill->ill_name));
3249         mutex_enter(&ill->ill_lock);
3250         mutex_enter(&nce->nce_common->ncec_lock);
3251         nce_delete(nce);
3252         /*
3253          * Make sure that ncec is not condemned before adding. We hold the
3254          * ill_lock and ncec_lock to synchronize with ncec_delete() and
3255          * ipmp_ncec_delete_nce()
3256          */
3257         if (!NCE_ISCONDEMNED(nce->nce_common))
3258                 newnce = nce_add(ill, nce->nce_common);
3259         mutex_exit(&nce->nce_common->ncec_lock);
3260         mutex_exit(&ill->ill_lock);
3261         nce_refrele(nce);
3262         return (newnce); /* could be null if nomem */
3263 }
3264 
/*
 * Argument block passed through nce_walk() to nce_fastpath_match_dlur():
 * it carries the fastpath ack in and the matching nce (if any) back out.
 */
typedef struct nce_fp_match_s {
	nce_t	*nce_fp_match_res;	/* out: refheld match, or NULL */
	mblk_t	*nce_fp_match_ack_mp;	/* in: mblk of the fastpath ack */
} nce_fp_match_t;
3269 
3270 /* ARGSUSED */
3271 static int
3272 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3273 {
3274         nce_fp_match_t  *nce_fp_marg = arg;
3275         ncec_t          *ncec = nce->nce_common;
3276         mblk_t          *mp = nce_fp_marg->nce_fp_match_ack_mp;
3277         uchar_t *mp_rptr, *ud_mp_rptr;
3278         mblk_t          *ud_mp = nce->nce_dlur_mp;
3279         ptrdiff_t       cmplen;
3280 
3281         /*
3282          * mp is the mp associated with the fastpath ack.
3283          * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3284          * under consideration. If the contents match, then the
3285          * fastpath ack is used to update the nce.
3286          */
3287         if (ud_mp == NULL)
3288                 return (0);
3289         mp_rptr = mp->b_rptr;
3290         cmplen = mp->b_wptr - mp_rptr;
3291         ASSERT(cmplen >= 0);
3292 
3293         ud_mp_rptr = ud_mp->b_rptr;
3294         /*
3295          * The ncec is locked here to prevent any other threads from accessing
3296          * and changing nce_dlur_mp when the address becomes resolved to an
3297          * lla while we're in the middle of looking at and comparing the
3298          * hardware address (lla). It is also locked to prevent multiple
3299          * threads in nce_fastpath() from examining nce_dlur_mp at the same
3300          * time.
3301          */
3302         mutex_enter(&ncec->ncec_lock);
3303         if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3304             bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3305                 nce_fp_marg->nce_fp_match_res = nce;
3306                 mutex_exit(&ncec->ncec_lock);
3307                 nce_refhold(nce);
3308                 return (1);
3309         }
3310         mutex_exit(&ncec->ncec_lock);
3311         return (0);
3312 }
3313 
3314 /*
3315  * Update all NCE's that are not in fastpath mode and
3316  * have an nce_fp_mp that matches mp. mp->b_cont contains
3317  * the fastpath header.
3318  *
3319  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3320  */
3321 void
3322 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3323 {
3324         nce_fp_match_t nce_fp_marg;
3325         nce_t *nce;
3326         mblk_t *nce_fp_mp, *fp_mp;
3327 
3328         nce_fp_marg.nce_fp_match_res = NULL;
3329         nce_fp_marg.nce_fp_match_ack_mp = mp;
3330 
3331         nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3332 
3333         if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3334                 return;
3335 
3336         mutex_enter(&nce->nce_lock);
3337         nce_fp_mp = nce->nce_fp_mp;
3338 
3339         if (nce_fp_mp != NULL) {
3340                 fp_mp = mp->b_cont;
3341                 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3342                     nce_fp_mp->b_datap->db_lim) {
3343                         mutex_exit(&nce->nce_lock);
3344                         nce = nce_delete_then_add(nce);
3345                         if (nce == NULL) {
3346                                 return;
3347                         }
3348                         mutex_enter(&nce->nce_lock);
3349                         nce_fp_mp = nce->nce_fp_mp;
3350                 }
3351         }
3352 
3353         /* Matched - install mp as the fastpath mp */
3354         if (nce_fp_mp == NULL) {
3355                 fp_mp = dupb(mp->b_cont);
3356                 nce->nce_fp_mp = fp_mp;
3357         } else {
3358                 fp_mp = mp->b_cont;
3359                 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3360                 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3361                     + MBLKL(fp_mp);
3362         }
3363         mutex_exit(&nce->nce_lock);
3364         nce_refrele(nce);
3365 }
3366 
3367 /*
3368  * Return a pointer to a given option in the packet.
3369  * Assumes that option part of the packet have already been validated.
3370  */
3371 nd_opt_hdr_t *
3372 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3373 {
3374         while (optlen > 0) {
3375                 if (opt->nd_opt_type == opt_type)
3376                         return (opt);
3377                 optlen -= 8 * opt->nd_opt_len;
3378                 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3379         }
3380         return (NULL);
3381 }
3382 
3383 /*
3384  * Verify all option lengths present are > 0, also check to see
3385  * if the option lengths and packet length are consistent.
3386  */
3387 boolean_t
3388 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3389 {
3390         ASSERT(opt != NULL);
3391         while (optlen > 0) {
3392                 if (opt->nd_opt_len == 0)
3393                         return (B_FALSE);
3394                 optlen -= 8 * opt->nd_opt_len;
3395                 if (optlen < 0)
3396                         return (B_FALSE);
3397                 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3398         }
3399         return (B_TRUE);
3400 }
3401 
3402 /*
3403  * ncec_walk function.
3404  * Free a fraction of the NCE cache entries.
3405  *
3406  * A possible optimization here would be to use ncec_last where possible, and
3407  * delete the least-frequently used entry, which would require more complex
3408  * computation as we walk through the ncec's (e.g., track ncec entries by
3409  * order of ncec_last and/or maintain state)
3410  */
3411 static void
3412 ncec_cache_reclaim(ncec_t *ncec, void *arg)
3413 {
3414         ip_stack_t      *ipst = ncec->ncec_ipst;
3415         uint_t          fraction = *(uint_t *)arg;
3416         uint_t          rand;
3417 
3418         if ((ncec->ncec_flags &
3419             (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3420                 return;
3421         }
3422 
3423         rand = (uint_t)ddi_get_lbolt() +
3424             NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3425         if ((rand/fraction)*fraction == rand) {
3426                 IP_STAT(ipst, ip_nce_reclaim_deleted);
3427                 ncec_delete(ncec);
3428         }
3429 }
3430 
3431 /*
3432  * kmem_cache callback to free up memory.
3433  *
3434  * For now we just delete a fixed fraction.
3435  */
3436 static void
3437 ip_nce_reclaim_stack(ip_stack_t *ipst)
3438 {
3439         uint_t          fraction = ipst->ips_ip_nce_reclaim_fraction;
3440 
3441         IP_STAT(ipst, ip_nce_reclaim_calls);
3442 
3443         ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);
3444 
3445         /*
3446          * Walk all CONNs that can have a reference on an ire, ncec or dce.
3447          * Get them to update any stale references to drop any refholds they
3448          * have.
3449          */
3450         ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3451 }
3452 
3453 /*
3454  * Called by the memory allocator subsystem directly, when the system
3455  * is running low on memory.
3456  */
3457 /* ARGSUSED */
3458 void
3459 ip_nce_reclaim(void *args)
3460 {
3461         netstack_handle_t nh;
3462         netstack_t *ns;
3463         ip_stack_t *ipst;
3464 
3465         netstack_next_init(&nh);
3466         while ((ns = netstack_next(&nh)) != NULL) {
3467                 /*
3468                  * netstack_next() can return a netstack_t with a NULL
3469                  * netstack_ip at boot time.
3470                  */
3471                 if ((ipst = ns->netstack_ip) == NULL) {
3472                         netstack_rele(ns);
3473                         continue;
3474                 }
3475                 ip_nce_reclaim_stack(ipst);
3476                 netstack_rele(ns);
3477         }
3478         netstack_next_fini(&nh);
3479 }
3480 
3481 #ifdef DEBUG
3482 void
3483 ncec_trace_ref(ncec_t *ncec)
3484 {
3485         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3486 
3487         if (ncec->ncec_trace_disable)
3488                 return;
3489 
3490         if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3491                 ncec->ncec_trace_disable = B_TRUE;
3492                 ncec_trace_cleanup(ncec);
3493         }
3494 }
3495 
3496 void
3497 ncec_untrace_ref(ncec_t *ncec)
3498 {
3499         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3500 
3501         if (!ncec->ncec_trace_disable)
3502                 th_trace_unref(ncec);
3503 }
3504 
/*
 * Hand the trace state of ncec to th_trace_cleanup(), passing along
 * whether tracing has been disabled for this ncec (DEBUG only).
 */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
3510 #endif
3511 
3512 /*
3513  * Called when address resolution fails due to a timeout.
3514  * Send an ICMP unreachable in response to all queued packets.
3515  */
3516 void
3517 arp_resolv_failed(ncec_t *ncec)
3518 {
3519         mblk_t  *mp, *nxt_mp;
3520         char    buf[INET6_ADDRSTRLEN];
3521         struct in_addr ipv4addr;
3522         ill_t *ill = ncec->ncec_ill;
3523         ip_stack_t *ipst = ncec->ncec_ipst;
3524         ip_recv_attr_t  iras;
3525 
3526         bzero(&iras, sizeof (iras));
3527         iras.ira_flags = IRAF_IS_IPV4;
3528         /*
3529          * we are setting the ira_rill to the ipmp_ill (instead of
3530          * the actual ill on which the packet was received), but this
3531          * is ok because we don't actually need the real ira_rill.
3532          * to send the icmp unreachable to the sender.
3533          */
3534         iras.ira_ill = iras.ira_rill = ill;
3535         iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3536         iras.ira_rifindex = iras.ira_ruifindex;
3537 
3538         IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3539         ip3dbg(("arp_resolv_failed: dst %s\n",
3540             inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3541         mutex_enter(&ncec->ncec_lock);
3542         mp = ncec->ncec_qd_mp;
3543         ncec->ncec_qd_mp = NULL;
3544         ncec->ncec_nprobes = 0;
3545         mutex_exit(&ncec->ncec_lock);
3546         while (mp != NULL) {
3547                 nxt_mp = mp->b_next;
3548                 mp->b_next = NULL;
3549 
3550                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3551                 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3552                     mp, ill);
3553                 if (ipst->ips_ip_arp_icmp_error) {
3554                         ip3dbg(("arp_resolv_failed: "
3555                             "Calling icmp_unreachable\n"));
3556                         icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3557                 } else {
3558                         freemsg(mp);
3559                 }
3560                 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3561                 mp = nxt_mp;
3562         }
3563         ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3564 }
3565 
3566 /*
3567  * if ill is an under_ill, translate it to the ipmp_ill and add the
3568  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3569  * one on the underlying in_ill) will be created for the
3570  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3571  */
3572 int
3573 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3574     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3575 {
3576         int     err;
3577         in6_addr_t addr6;
3578         ip_stack_t *ipst = ill->ill_ipst;
3579         nce_t   *nce, *upper_nce = NULL;
3580         ill_t   *in_ill = ill, *under = NULL;
3581         boolean_t need_ill_refrele = B_FALSE;
3582 
3583         if (flags & NCE_F_MCAST) {
3584                 /*
3585                  * hw_addr will be figured out in nce_set_multicast_v4;
3586                  * caller needs to pass in the cast_ill for ipmp
3587                  */
3588                 ASSERT(hw_addr == NULL);
3589                 ASSERT(!IS_IPMP(ill));
3590                 err = nce_set_multicast_v4(ill, addr, flags, newnce);
3591                 return (err);
3592         }
3593 
3594         if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3595                 ill = ipmp_ill_hold_ipmp_ill(ill);
3596                 if (ill == NULL)
3597                         return (ENXIO);
3598                 need_ill_refrele = B_TRUE;
3599         }
3600         if ((flags & NCE_F_BCAST) != 0) {
3601                 /*
3602                  * IPv4 broadcast ncec: compute the hwaddr.
3603                  */
3604                 if (IS_IPMP(ill)) {
3605                         under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3606                         if (under == NULL)  {
3607                                 if (need_ill_refrele)
3608                                         ill_refrele(ill);
3609                                 return (ENETDOWN);
3610                         }
3611                         hw_addr = under->ill_bcast_mp->b_rptr +
3612                             NCE_LL_ADDR_OFFSET(under);
3613                         hw_addr_len = under->ill_phys_addr_length;
3614                 } else {
3615                         hw_addr = ill->ill_bcast_mp->b_rptr +
3616                             NCE_LL_ADDR_OFFSET(ill),
3617                             hw_addr_len = ill->ill_phys_addr_length;
3618                 }
3619         }
3620 
3621         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3622         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3623         nce = nce_lookup_addr(ill, &addr6);
3624         if (nce == NULL) {
3625                 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3626                     state, &nce);
3627         } else {
3628                 err = EEXIST;
3629         }
3630         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3631         if (err == 0)
3632                 err = nce_add_v4_postprocess(nce);
3633 
3634         if (in_ill != ill && nce != NULL) {
3635                 nce_t *under_nce = NULL;
3636 
3637                 /*
3638                  * in_ill was the under_ill. Try to create the under_nce.
3639                  * Hold the ill_g_lock to prevent changes to group membership
3640                  * until we are done.
3641                  */
3642                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3643                 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3644                         DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3645                             ill_t *, ill);
3646                         rw_exit(&ipst->ips_ill_g_lock);
3647                         err = ENXIO;
3648                         nce_refrele(nce);
3649                         nce = NULL;
3650                         goto bail;
3651                 }
3652                 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3653                 if (under_nce == NULL) {
3654                         rw_exit(&ipst->ips_ill_g_lock);
3655                         err = EINVAL;
3656                         nce_refrele(nce);
3657                         nce = NULL;
3658                         goto bail;
3659                 }
3660                 rw_exit(&ipst->ips_ill_g_lock);
3661                 upper_nce = nce;
3662                 nce = under_nce; /* will be returned to caller */
3663                 if (NCE_ISREACHABLE(nce->nce_common))
3664                         nce_fastpath_trigger(under_nce);
3665         }
3666         if (nce != NULL) {
3667                 if (newnce != NULL)
3668                         *newnce = nce;
3669                 else
3670                         nce_refrele(nce);
3671         }
3672 bail:
3673         if (under != NULL)
3674                 ill_refrele(under);
3675         if (upper_nce != NULL)
3676                 nce_refrele(upper_nce);
3677         if (need_ill_refrele)
3678                 ill_refrele(ill);
3679 
3680         return (err);
3681 }
3682 
3683 /*
3684  * NDP Cache Entry creation routine for IPv4.
3685  * This routine must always be called with ndp4->ndp_g_lock held.
3686  * Prior to return, ncec_refcnt is incremented.
3687  *
3688  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3689  * are always added pointing at the ipmp_ill. Thus, when the ill passed
3690  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3691  * entries will be created, both pointing at the same ncec_t. The nce_t
3692  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3693  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3694  * Local addresses are always created on the ill passed to nce_add_v4.
3695  */
3696 int
3697 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3698     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3699 {
3700         int             err;
3701         boolean_t       is_multicast = (flags & NCE_F_MCAST);
3702         struct in6_addr addr6;
3703         nce_t           *nce;
3704 
3705         ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3706         ASSERT(!ill->ill_isv6);
3707         ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3708 
3709         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3710         err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3711             &nce);
3712         ASSERT(newnce != NULL);
3713         *newnce = nce;
3714         return (err);
3715 }
3716 
3717 /*
3718  * Post-processing routine to be executed after nce_add_v4(). This function
3719  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3720  * and must be called without any locks held.
3721  *
3722  * Always returns 0, but we return an int to keep this symmetric with the
3723  * IPv6 counter-part.
3724  */
3725 int
3726 nce_add_v4_postprocess(nce_t *nce)
3727 {
3728         ncec_t          *ncec = nce->nce_common;
3729         uint16_t        flags = ncec->ncec_flags;
3730         boolean_t       ndp_need_dad = B_FALSE;
3731         boolean_t       dropped;
3732         clock_t         delay;
3733         ip_stack_t      *ipst = ncec->ncec_ill->ill_ipst;
3734         uchar_t         *hw_addr = ncec->ncec_lladdr;
3735         boolean_t       trigger_fastpath = B_TRUE;
3736 
3737         /*
3738          * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3739          * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3740          * We call nce_fastpath from nce_update if the link layer address of
3741          * the peer changes from nce_update
3742          */
3743         if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3744             ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3745                 trigger_fastpath = B_FALSE;
3746 
3747         if (trigger_fastpath)
3748                 nce_fastpath_trigger(nce);
3749 
3750         if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3751                 /*
3752                  * Either the caller (by passing in ND_PROBE)
3753                  * or nce_add_common() (by the internally computed state
3754                  * based on ncec_addr and ill_net_type) has determined
3755                  * that this unicast entry needs DAD. Trigger DAD.
3756                  */
3757                 ndp_need_dad = B_TRUE;
3758         } else if (flags & NCE_F_UNSOL_ADV) {
3759                 /*
3760                  * We account for the transmit below by assigning one
3761                  * less than the ndd variable. Subsequent decrements
3762                  * are done in nce_timer.
3763                  */
3764                 mutex_enter(&ncec->ncec_lock);
3765                 ncec->ncec_unsolicit_count =
3766                     ipst->ips_ip_arp_publish_count - 1;
3767                 mutex_exit(&ncec->ncec_lock);
3768                 dropped = arp_announce(ncec);
3769                 mutex_enter(&ncec->ncec_lock);
3770                 if (dropped)
3771                         ncec->ncec_unsolicit_count++;
3772                 else
3773                         ncec->ncec_last_time_defended = ddi_get_lbolt();
3774                 if (ncec->ncec_unsolicit_count != 0) {
3775                         nce_start_timer(ncec,
3776                             ipst->ips_ip_arp_publish_interval);
3777                 }
3778                 mutex_exit(&ncec->ncec_lock);
3779         }
3780 
3781         /*
3782          * If ncec_xmit_interval is 0, user has configured us to send the first
3783          * probe right away.  Do so, and set up for the subsequent probes.
3784          */
3785         if (ndp_need_dad) {
3786                 mutex_enter(&ncec->ncec_lock);
3787                 if (ncec->ncec_pcnt == 0) {
3788                         /*
3789                          * DAD probes and announce can be
3790                          * administratively disabled by setting the
3791                          * probe_count to zero. Restart the timer in
3792                          * this case to mark the ipif as ready.
3793                          */
3794                         ncec->ncec_unsolicit_count = 0;
3795                         mutex_exit(&ncec->ncec_lock);
3796                         nce_restart_timer(ncec, 0);
3797                 } else {
3798                         mutex_exit(&ncec->ncec_lock);
3799                         delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3800                             ipst->ips_arp_probe_delay :
3801                             ipst->ips_arp_fastprobe_delay);
3802                         nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3803                 }
3804         }
3805         return (0);
3806 }
3807 
3808 /*
3809  * ncec_walk routine to update all entries that have a given destination or
3810  * gateway address and cached link layer (MAC) address.  This is used when ARP
3811  * informs us that a network-to-link-layer mapping may have changed.
3812  */
3813 void
3814 nce_update_hw_changed(ncec_t *ncec, void *arg)
3815 {
3816         nce_hw_map_t *hwm = arg;
3817         ipaddr_t ncec_addr;
3818 
3819         if (ncec->ncec_state != ND_REACHABLE)
3820                 return;
3821 
3822         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3823         if (ncec_addr != hwm->hwm_addr)
3824                 return;
3825 
3826         mutex_enter(&ncec->ncec_lock);
3827         if (hwm->hwm_flags != 0)
3828                 ncec->ncec_flags = hwm->hwm_flags;
3829         nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3830         mutex_exit(&ncec->ncec_lock);
3831 }
3832 
3833 void
3834 ncec_refhold(ncec_t *ncec)
3835 {
3836         mutex_enter(&(ncec)->ncec_lock);
3837         (ncec)->ncec_refcnt++;
3838         ASSERT((ncec)->ncec_refcnt != 0);
3839 #ifdef DEBUG
3840         ncec_trace_ref(ncec);
3841 #endif
3842         mutex_exit(&(ncec)->ncec_lock);
3843 }
3844 
3845 void
3846 ncec_refhold_notr(ncec_t *ncec)
3847 {
3848         mutex_enter(&(ncec)->ncec_lock);
3849         (ncec)->ncec_refcnt++;
3850         ASSERT((ncec)->ncec_refcnt != 0);
3851         mutex_exit(&(ncec)->ncec_lock);
3852 }
3853 
3854 static void
3855 ncec_refhold_locked(ncec_t *ncec)
3856 {
3857         ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3858         (ncec)->ncec_refcnt++;
3859 #ifdef DEBUG
3860         ncec_trace_ref(ncec);
3861 #endif
3862 }
3863 
3864 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3865 void
3866 ncec_refrele(ncec_t *ncec)
3867 {
3868         mutex_enter(&(ncec)->ncec_lock);
3869 #ifdef DEBUG
3870         ncec_untrace_ref(ncec);
3871 #endif
3872         ASSERT((ncec)->ncec_refcnt != 0);
3873         if (--(ncec)->ncec_refcnt == 0) {
3874                 ncec_inactive(ncec);
3875         } else {
3876                 mutex_exit(&(ncec)->ncec_lock);
3877         }
3878 }
3879 
3880 void
3881 ncec_refrele_notr(ncec_t *ncec)
3882 {
3883         mutex_enter(&(ncec)->ncec_lock);
3884         ASSERT((ncec)->ncec_refcnt != 0);
3885         if (--(ncec)->ncec_refcnt == 0) {
3886                 ncec_inactive(ncec);
3887         } else {
3888                 mutex_exit(&(ncec)->ncec_lock);
3889         }
3890 }
3891 
3892 /*
3893  * Common to IPv4 and IPv6.
3894  */
3895 void
3896 nce_restart_timer(ncec_t *ncec, uint_t ms)
3897 {
3898         timeout_id_t tid;
3899 
3900         ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3901 
3902         /* First cancel any running timer */
3903         mutex_enter(&ncec->ncec_lock);
3904         tid = ncec->ncec_timeout_id;
3905         ncec->ncec_timeout_id = 0;
3906         if (tid != 0) {
3907                 mutex_exit(&ncec->ncec_lock);
3908                 (void) untimeout(tid);
3909                 mutex_enter(&ncec->ncec_lock);
3910         }
3911 
3912         /* Restart timer */
3913         nce_start_timer(ncec, ms);
3914         mutex_exit(&ncec->ncec_lock);
3915 }
3916 
3917 static void
3918 nce_start_timer(ncec_t *ncec, uint_t ms)
3919 {
3920         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3921         /*
3922          * Don't start the timer if the ncec has been deleted, or if the timer
3923          * is already running
3924          */
3925         if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3926                 ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3927                     MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3928         }
3929 }
3930 
3931 int
3932 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3933     uint16_t flags, nce_t **newnce)
3934 {
3935         uchar_t         *hw_addr;
3936         int             err = 0;
3937         ip_stack_t      *ipst = ill->ill_ipst;
3938         in6_addr_t      dst6;
3939         nce_t           *nce;
3940 
3941         ASSERT(!ill->ill_isv6);
3942 
3943         IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3944         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3945         if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3946                 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3947                 goto done;
3948         }
3949         if (ill->ill_net_type == IRE_IF_RESOLVER) {
3950                 /*
3951                  * For IRE_IF_RESOLVER a hardware mapping can be
3952                  * generated, for IRE_IF_NORESOLVER, resolution cookie
3953                  * in the ill is copied in nce_add_v4().
3954                  */
3955                 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3956                 if (hw_addr == NULL) {
3957                         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3958                         return (ENOMEM);
3959                 }
3960                 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3961         } else {
3962                 /*
3963                  * IRE_IF_NORESOLVER type simply copies the resolution
3964                  * cookie passed in.  So no hw_addr is needed.
3965                  */
3966                 hw_addr = NULL;
3967         }
3968         ASSERT(flags & NCE_F_MCAST);
3969         ASSERT(flags & NCE_F_NONUD);
3970         /* nce_state will be computed by nce_add_common() */
3971         err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3972             ND_UNCHANGED, &nce);
3973         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3974         if (err == 0)
3975                 err = nce_add_v4_postprocess(nce);
3976         if (hw_addr != NULL)
3977                 kmem_free(hw_addr, ill->ill_phys_addr_length);
3978         if (err != 0) {
3979                 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3980                 return (err);
3981         }
3982 done:
3983         if (newnce != NULL)
3984                 *newnce = nce;
3985         else
3986                 nce_refrele(nce);
3987         return (0);
3988 }
3989 
3990 /*
3991  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
3992  * don't want to have to walk the list for every single one, so we gather up
3993  * batches at a time.
3994  */
3995 #define NCE_RESCHED_LIST_LEN    8
3996 
3997 typedef struct {
3998         ill_t   *ncert_ill;
3999         uint_t  ncert_num;
4000         ncec_t  *ncert_nces[NCE_RESCHED_LIST_LEN];
4001 } nce_resched_t;
4002 
4003 /*
4004  * Pick the longest waiting NCEs for defense.
4005  */
4006 /* ARGSUSED */
4007 static int
4008 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4009 {
4010         nce_resched_t *ncert = arg;
4011         ncec_t **ncecs;
4012         ncec_t **ncec_max;
4013         ncec_t *ncec_temp;
4014         ncec_t *ncec = nce->nce_common;
4015 
4016         ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4017         /*
4018          * Only reachable entries that are ready for announcement are eligible.
4019          */
4020         if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4021                 return (0);
4022         if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4023                 ncec_refhold(ncec);
4024                 ncert->ncert_nces[ncert->ncert_num++] = ncec;
4025         } else {
4026                 ncecs = ncert->ncert_nces;
4027                 ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4028                 ncec_refhold(ncec);
4029                 for (; ncecs < ncec_max; ncecs++) {
4030                         ASSERT(ncec != NULL);
4031                         if ((*ncecs)->ncec_last_time_defended >
4032                             ncec->ncec_last_time_defended) {
4033                                 ncec_temp = *ncecs;
4034                                 *ncecs = ncec;
4035                                 ncec = ncec_temp;
4036                         }
4037                 }
4038                 ncec_refrele(ncec);
4039         }
4040         return (0);
4041 }
4042 
4043 /*
4044  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4045  * doesn't happen very often (if at all), and thus it needn't be highly
4046  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4047  * outer loop is bounded by a constant rather than by the length of the list.)
4048  */
4049 static void
4050 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4051 {
4052         ncec_t          *ncec;
4053         ip_stack_t      *ipst = ill->ill_ipst;
4054         uint_t          i, defend_rate;
4055 
4056         i = ill->ill_defend_count;
4057         ill->ill_defend_count = 0;
4058         if (ill->ill_isv6)
4059                 defend_rate = ipst->ips_ndp_defend_rate;
4060         else
4061                 defend_rate = ipst->ips_arp_defend_rate;
4062         /* If none could be sitting around, then don't reschedule */
4063         if (i < defend_rate) {
4064                 DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4065                 return;
4066         }
4067         ncert->ncert_ill = ill;
4068         while (ill->ill_defend_count < defend_rate) {
4069                 nce_walk_common(ill, ncec_reschedule, ncert);
4070                 for (i = 0; i < ncert->ncert_num; i++) {
4071 
4072                         ncec = ncert->ncert_nces[i];
4073                         mutex_enter(&ncec->ncec_lock);
4074                         ncec->ncec_flags |= NCE_F_DELAYED;
4075                         mutex_exit(&ncec->ncec_lock);
4076                         /*
4077                          * we plan to schedule this ncec, so incr the
4078                          * defend_count in anticipation.
4079                          */
4080                         if (++ill->ill_defend_count >= defend_rate)
4081                                 break;
4082                 }
4083                 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4084                         break;
4085         }
4086 }
4087 
4088 /*
4089  * Check if the current rate-limiting parameters permit the sending
4090  * of another address defense announcement for both IPv4 and IPv6.
4091  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4092  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4093  * determines how many address defense announcements are permitted
4094  * in any `defense_perio' interval.
4095  */
4096 static boolean_t
4097 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4098 {
4099         clock_t         now = ddi_get_lbolt();
4100         ip_stack_t      *ipst = ill->ill_ipst;
4101         clock_t         start = ill->ill_defend_start;
4102         uint32_t        elapsed, defend_period, defend_rate;
4103         nce_resched_t   ncert;
4104         boolean_t       ret;
4105         int             i;
4106 
4107         if (ill->ill_isv6) {
4108                 defend_period = ipst->ips_ndp_defend_period;
4109                 defend_rate = ipst->ips_ndp_defend_rate;
4110         } else {
4111                 defend_period = ipst->ips_arp_defend_period;
4112                 defend_rate = ipst->ips_arp_defend_rate;
4113         }
4114         if (defend_rate == 0)
4115                 return (B_TRUE);
4116         bzero(&ncert, sizeof (ncert));
4117         mutex_enter(&ill->ill_lock);
4118         if (start > 0) {
4119                 elapsed = now - start;
4120                 if (elapsed > SEC_TO_TICK(defend_period)) {
4121                         ill->ill_defend_start = now;
4122                         /*
4123                          * nce_ill_reschedule will attempt to
4124                          * prevent starvation by reschduling the
4125                          * oldest entries, which are marked with
4126                          * the NCE_F_DELAYED flag.
4127                          */
4128                         nce_ill_reschedule(ill, &ncert);
4129                 }
4130         } else {
4131                 ill->ill_defend_start = now;
4132         }
4133         ASSERT(ill->ill_defend_count <= defend_rate);
4134         mutex_enter(&ncec->ncec_lock);
4135         if (ncec->ncec_flags & NCE_F_DELAYED) {
4136                 /*
4137                  * This ncec was rescheduled as one of the really old
4138                  * entries needing on-going defense. The
4139                  * ill_defend_count was already incremented in
4140                  * nce_ill_reschedule. Go ahead and send the announce.
4141                  */
4142                 ncec->ncec_flags &= ~NCE_F_DELAYED;
4143                 mutex_exit(&ncec->ncec_lock);
4144                 ret = B_FALSE;
4145                 goto done;
4146         }
4147         mutex_exit(&ncec->ncec_lock);
4148         if (ill->ill_defend_count < defend_rate)
4149                 ill->ill_defend_count++;
4150         if (ill->ill_defend_count == defend_rate) {
4151                 /*
4152                  * we are no longer allowed to send unbidden defense
4153                  * messages. Wait for rescheduling.
4154                  */
4155                 ret = B_TRUE;
4156         } else {
4157                 ret = B_FALSE;
4158         }
4159 done:
4160         mutex_exit(&ill->ill_lock);
4161         /*
4162          * After all the locks have been dropped we can restart nce timer,
4163          * and refrele the delayed ncecs
4164          */
4165         for (i = 0; i < ncert.ncert_num; i++) {
4166                 clock_t xmit_interval;
4167                 ncec_t  *tmp;
4168 
4169                 tmp = ncert.ncert_nces[i];
4170                 xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4171                     B_FALSE);
4172                 nce_restart_timer(tmp, xmit_interval);
4173                 ncec_refrele(tmp);
4174         }
4175         return (ret);
4176 }
4177 
4178 boolean_t
4179 ndp_announce(ncec_t *ncec)
4180 {
4181         return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4182             ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4183             nce_advert_flags(ncec)));
4184 }
4185 
4186 ill_t *
4187 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4188 {
4189         mblk_t          *mp;
4190         in6_addr_t      src6;
4191         ipaddr_t        src4;
4192         ill_t           *ill = ncec->ncec_ill;
4193         ill_t           *src_ill = NULL;
4194         ipif_t          *ipif = NULL;
4195         boolean_t       is_myaddr = NCE_MYADDR(ncec);
4196         boolean_t       isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4197 
4198         ASSERT(src != NULL);
4199         ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4200         src6 = *src;
4201         if (is_myaddr) {
4202                 src6 = ncec->ncec_addr;
4203                 if (!isv6)
4204                         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4205         } else {
4206                 /*
4207                  * try to find one from the outgoing packet.
4208                  */
4209                 mutex_enter(&ncec->ncec_lock);
4210                 mp = ncec->ncec_qd_mp;
4211                 if (mp != NULL) {
4212                         if (isv6) {
4213                                 ip6_t   *ip6h = (ip6_t *)mp->b_rptr;
4214 
4215                                 src6 = ip6h->ip6_src;
4216                         } else {
4217                                 ipha_t  *ipha = (ipha_t *)mp->b_rptr;
4218 
4219                                 src4 = ipha->ipha_src;
4220                                 IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4221                         }
4222                 }
4223                 mutex_exit(&ncec->ncec_lock);
4224         }
4225 
4226         /*
4227          * For outgoing packets, if the src of outgoing packet is one
4228          * of the assigned interface addresses use it, otherwise we
4229          * will pick the source address below.
4230          * For local addresses (is_myaddr) doing DAD, NDP announce
4231          * messages are mcast. So we use the (IPMP) cast_ill or the
4232          * (non-IPMP) ncec_ill for these message types. The only case
4233          * of unicast DAD messages are for IPv6 ND probes, for which
4234          * we find the ipif_bound_ill corresponding to the ncec_addr.
4235          */
4236         if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4237                 if (isv6) {
4238                         ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4239                             ill->ill_ipst);
4240                 } else {
4241                         ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4242                             ill->ill_ipst);
4243                 }
4244 
4245                 /*
4246                  * If no relevant ipif can be found, then it's not one of our
4247                  * addresses.  Reset to :: and try to find a src for the NS or
4248                  * ARP request using ipif_select_source_v[4,6]  below.
4249                  * If an ipif can be found, but it's not yet done with
4250                  * DAD verification, and we are not being invoked for
4251                  * DAD (i.e., !is_myaddr), then just postpone this
4252                  * transmission until later.
4253                  */
4254                 if (ipif == NULL) {
4255                         src6 = ipv6_all_zeros;
4256                         src4 = INADDR_ANY;
4257                 } else if (!ipif->ipif_addr_ready && !is_myaddr) {
4258                         DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4259                             ncec_t *, ncec, ipif_t *, ipif);
4260                         ipif_refrele(ipif);
4261                         return (NULL);
4262                 }
4263         }
4264 
4265         if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4266                 /*
4267                  * Pick a source address for this solicitation, but
4268                  * restrict the selection to addresses assigned to the
4269                  * output interface.  We do this because the destination will
4270                  * create a neighbor cache entry for the source address of
4271                  * this packet, so the source address had better be a valid
4272                  * neighbor.
4273                  */
4274                 if (isv6) {
4275                         ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4276                             B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4277                             B_FALSE, NULL);
4278                 } else {
4279                         ipaddr_t nce_addr;
4280 
4281                         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4282                         ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4283                             B_FALSE, NULL);
4284                 }
4285                 if (ipif == NULL && IS_IPMP(ill)) {
4286                         ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);
4287 
4288                         if (send_ill != NULL) {
4289                                 if (isv6) {
4290                                         ipif = ipif_select_source_v6(send_ill,
4291                                             &ncec->ncec_addr, B_TRUE,
4292                                             IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4293                                             B_FALSE, NULL);
4294                                 } else {
4295                                         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4296                                             src4);
4297                                         ipif = ipif_select_source_v4(send_ill,
4298                                             src4, ALL_ZONES, B_TRUE, NULL);
4299                                 }
4300                                 ill_refrele(send_ill);
4301                         }
4302                 }
4303 
4304                 if (ipif == NULL) {
4305                         char buf[INET6_ADDRSTRLEN];
4306 
4307                         ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4308                             inet_ntop((isv6 ? AF_INET6 : AF_INET),
4309                             (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4310                         DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4311                         return (NULL);
4312                 }
4313                 src6 = ipif->ipif_v6lcl_addr;
4314         }
4315         *src = src6;
4316         if (ipif != NULL) {
4317                 src_ill = ipif->ipif_ill;
4318                 if (IS_IPMP(src_ill))
4319                         src_ill = ipmp_ipif_hold_bound_ill(ipif);
4320                 else
4321                         ill_refhold(src_ill);
4322                 ipif_refrele(ipif);
4323                 DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4324                     ill_t *, src_ill);
4325         }
4326         return (src_ill);
4327 }
4328 
4329 void
4330 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4331     uchar_t *hwaddr, int hwaddr_len, int flags)
4332 {
4333         ill_t   *ill;
4334         ncec_t  *ncec;
4335         nce_t   *nce;
4336         uint16_t new_state;
4337 
4338         ill = (ipif ? ipif->ipif_ill : NULL);
4339         if (ill != NULL) {
4340                 /*
4341                  * only one ncec is possible
4342                  */
4343                 nce = nce_lookup_v4(ill, addr);
4344                 if (nce != NULL) {
4345                         ncec = nce->nce_common;
4346                         mutex_enter(&ncec->ncec_lock);
4347                         if (NCE_ISREACHABLE(ncec))
4348                                 new_state = ND_UNCHANGED;
4349                         else
4350                                 new_state = ND_STALE;
4351                         ncec->ncec_flags = flags;
4352                         nce_update(ncec, new_state, hwaddr);
4353                         mutex_exit(&ncec->ncec_lock);
4354                         nce_refrele(nce);
4355                         return;
4356                 }
4357         } else {
4358                 /*
4359                  * ill is wildcard; clean up all ncec's and ire's
4360                  * that match on addr.
4361                  */
4362                 nce_hw_map_t hwm;
4363 
4364                 hwm.hwm_addr = *addr;
4365                 hwm.hwm_hwlen = hwaddr_len;
4366                 hwm.hwm_hwaddr = hwaddr;
4367                 hwm.hwm_flags = flags;
4368 
4369                 ncec_walk_common(ipst->ips_ndp4, NULL,
4370                     nce_update_hw_changed, &hwm, B_TRUE);
4371         }
4372 }
4373 
4374 /*
4375  * Common function to add ncec entries.
4376  * we always add the ncec with ncec_ill == ill, and always create
4377  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4378  * ncec is !reachable.
4379  *
4380  * When the caller passes in an nce_state of ND_UNCHANGED,
4381  * nce_add_common() will determine the state of the created nce based
4382  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4383  * be created with state set to the passed in nce_state.
4384  */
4385 static int
4386 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4387     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4388 {
4389         static  ncec_t          nce_nil;
4390         uchar_t                 *template = NULL;
4391         int                     err;
4392         ncec_t                  *ncec;
4393         ncec_t                  **ncep;
4394         ip_stack_t              *ipst = ill->ill_ipst;
4395         uint16_t                state;
4396         boolean_t               fastprobe = B_FALSE;
4397         struct ndp_g_s          *ndp;
4398         nce_t                   *nce = NULL;
4399         mblk_t                  *dlur_mp = NULL;
4400 
4401         if (ill->ill_isv6)
4402                 ndp = ill->ill_ipst->ips_ndp6;
4403         else
4404                 ndp = ill->ill_ipst->ips_ndp4;
4405 
4406         *retnce = NULL;
4407 
4408         ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4409 
4410         if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4411                 ip0dbg(("nce_add_common: no addr\n"));
4412                 return (EINVAL);
4413         }
4414         if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4415                 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4416                 return (EINVAL);
4417         }
4418 
4419         if (ill->ill_isv6) {
4420                 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4421         } else {
4422                 ipaddr_t v4addr;
4423 
4424                 IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4425                 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4426         }
4427 
4428         /*
4429          * The caller has ensured that there is no nce on ill, but there could
4430          * still be an nce_common_t for the address, so that we find exisiting
4431          * ncec_t strucutures first, and atomically add a new nce_t if
4432          * one is found. The ndp_g_lock ensures that we don't cross threads
4433          * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4434          * compare for matches across the illgrp because this function is
4435          * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4436          * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4437          * appropriate.
4438          */
4439         ncec = *ncep;
4440         for (; ncec != NULL; ncec = ncec->ncec_next) {
4441                 if (ncec->ncec_ill == ill) {
4442                         if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4443                                 /*
4444                                  * We should never find *retnce to be
4445                                  * MYADDR, since the caller may then
4446                                  * incorrectly restart a DAD timer that's
4447                                  * already running.  However, if we are in
4448                                  * forwarding mode, and the interface is
4449                                  * moving in/out of groups, the data
4450                                  * path ire lookup (e.g., ire_revalidate_nce)
4451                                  * may  have determined that some destination
4452                                  * is offlink while the control path is adding
4453                                  * that address as a local address.
4454                                  * Recover from  this case by failing the
4455                                  * lookup
4456                                  */
4457                                 if (NCE_MYADDR(ncec))
4458                                         return (ENXIO);
4459                                 *retnce = nce_ill_lookup_then_add(ill, ncec);
4460                                 if (*retnce != NULL)
4461                                         break;
4462                         }
4463                 }
4464         }
4465         if (*retnce != NULL) /* caller must trigger fastpath on nce */
4466                 return (0);
4467 
4468         ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4469         if (ncec == NULL)
4470                 return (ENOMEM);
4471         *ncec = nce_nil;
4472         ncec->ncec_ill = ill;
4473         ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4474         ncec->ncec_flags = flags;
4475         ncec->ncec_ipst = ipst;      /* No netstack_hold */
4476 
4477         if (!ill->ill_isv6) {
4478                 ipaddr_t addr4;
4479 
4480                 /*
4481                  * DAD probe interval and probe count are set based on
4482                  * fast/slow probe settings. If the underlying link doesn't
4483                  * have reliably up/down notifications or if we're working
4484                  * with IPv4 169.254.0.0/16 Link Local Address space, then
4485                  * don't use the fast timers.  Otherwise, use them.
4486                  */
4487                 ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4488                 IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4489                 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4490                         fastprobe = B_TRUE;
4491                 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4492                     !IS_IPV4_LL_SPACE(&addr4)) {
4493                         ill_t *hwaddr_ill;
4494 
4495                         hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4496                             hw_addr_len);
4497                         if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4498                                 fastprobe = B_TRUE;
4499                 }
4500                 if (fastprobe) {
4501                         ncec->ncec_xmit_interval =
4502                             ipst->ips_arp_fastprobe_interval;
4503                         ncec->ncec_pcnt =
4504                             ipst->ips_arp_fastprobe_count;
4505                         ncec->ncec_flags |= NCE_F_FAST;
4506                 } else {
4507                         ncec->ncec_xmit_interval =
4508                             ipst->ips_arp_probe_interval;
4509                         ncec->ncec_pcnt =
4510                             ipst->ips_arp_probe_count;
4511                 }
4512                 if (NCE_PUBLISH(ncec)) {
4513                         ncec->ncec_unsolicit_count =
4514                             ipst->ips_ip_arp_publish_count;
4515                 }
4516         } else {
4517                 /*
4518                  * probe interval is constant: ILL_PROBE_INTERVAL
4519                  * probe count is constant: ND_MAX_UNICAST_SOLICIT
4520                  */
4521                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4522                 if (NCE_PUBLISH(ncec)) {
4523                         ncec->ncec_unsolicit_count =
4524                             ipst->ips_ip_ndp_unsolicit_count;
4525                 }
4526         }
4527         ncec->ncec_rcnt = ill->ill_xmit_count;
4528         ncec->ncec_addr = *addr;
4529         ncec->ncec_qd_mp = NULL;
4530         ncec->ncec_refcnt = 1; /* for ncec getting created */
4531         mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4532         ncec->ncec_trace_disable = B_FALSE;
4533 
4534         /*
4535          * ncec_lladdr holds link layer address
4536          */
4537         if (hw_addr_len > 0) {
4538                 template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4539                 if (template == NULL) {
4540                         err = ENOMEM;
4541                         goto err_ret;
4542                 }
4543                 ncec->ncec_lladdr = template;
4544                 ncec->ncec_lladdr_length = hw_addr_len;
4545                 bzero(ncec->ncec_lladdr, hw_addr_len);
4546         }
4547         if ((flags & NCE_F_BCAST) != 0) {
4548                 state = ND_REACHABLE;
4549                 ASSERT(hw_addr_len > 0);
4550         } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4551                 state = ND_INITIAL;
4552         } else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4553                 /*
4554                  * NORESOLVER entries are always created in the REACHABLE
4555                  * state.
4556                  */
4557                 state = ND_REACHABLE;
4558                 if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4559                     ill->ill_mactype != DL_IPV4 &&
4560                     ill->ill_mactype != DL_6TO4) {
4561                         /*
4562                          * We create a nce_res_mp with the IP nexthop address
4563                          * as the destination address if the physical length
4564                          * is exactly 4 bytes for point-to-multipoint links
4565                          * that do their own resolution from IP to link-layer
4566                          * address (e.g. IP over X.25).
4567                          */
4568                         bcopy((uchar_t *)addr,
4569                             ncec->ncec_lladdr, ill->ill_phys_addr_length);
4570                 }
4571                 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4572                     ill->ill_mactype != DL_IPV6) {
4573                         /*
4574                          * We create a nce_res_mp with the IP nexthop address
4575                          * as the destination address if the physical length
4576                          * is exactly 16 bytes for point-to-multipoint links
4577                          * that do their own resolution from IP to link-layer
4578                          * address.
4579                          */
4580                         bcopy((uchar_t *)addr,
4581                             ncec->ncec_lladdr, ill->ill_phys_addr_length);
4582                 }
4583                 /*
4584                  * Since NUD is not part of the base IPv4 protocol definition,
4585                  * IPv4 neighbor entries on NORESOLVER interfaces will never
4586                  * age, and are marked NCE_F_NONUD.
4587                  */
4588                 if (!ill->ill_isv6)
4589                         ncec->ncec_flags |= NCE_F_NONUD;
4590         } else if (ill->ill_net_type == IRE_LOOPBACK) {
4591                 state = ND_REACHABLE;
4592         }
4593 
4594         if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4595                 /*
4596                  * We are adding an ncec with a deterministic hw_addr,
4597                  * so the state can only be one of {REACHABLE, STALE, PROBE}.
4598                  *
4599                  * if we are adding a unicast ncec for the local address
4600                  * it would be REACHABLE; we would be adding a ND_STALE entry
4601                  * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4602                  * addresses are added in PROBE to trigger DAD.
4603                  */
4604                 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4605                     ill->ill_net_type == IRE_IF_NORESOLVER)
4606                         state = ND_REACHABLE;
4607                 else if (!NCE_PUBLISH(ncec))
4608                         state = ND_STALE;
4609                 else
4610                         state = ND_PROBE;
4611                 if (hw_addr != NULL)
4612                         nce_set_ll(ncec, hw_addr);
4613         }
4614         /* caller overrides internally computed state */
4615         if (nce_state != ND_UNCHANGED)
4616                 state = nce_state;
4617 
4618         if (state == ND_PROBE)
4619                 ncec->ncec_flags |= NCE_F_UNVERIFIED;
4620 
4621         ncec->ncec_state = state;
4622 
4623         if (state == ND_REACHABLE) {
4624                 ncec->ncec_last = ncec->ncec_init_time =
4625                     TICK_TO_MSEC(ddi_get_lbolt64());
4626         } else {
4627                 ncec->ncec_last = 0;
4628                 if (state == ND_INITIAL)
4629                         ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4630         }
4631         list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4632             offsetof(ncec_cb_t, ncec_cb_node));
4633         /*
4634          * have all the memory allocations out of the way before taking locks
4635          * and adding the nce.
4636          */
4637         nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4638         if (nce == NULL) {
4639                 err = ENOMEM;
4640                 goto err_ret;
4641         }
4642         if (ncec->ncec_lladdr != NULL ||
4643             ill->ill_net_type == IRE_IF_NORESOLVER) {
4644                 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4645                     ill->ill_phys_addr_length, ill->ill_sap,
4646                     ill->ill_sap_length);
4647                 if (dlur_mp == NULL) {
4648                         err = ENOMEM;
4649                         goto err_ret;
4650                 }
4651         }
4652 
4653         /*
4654          * Atomically ensure that the ill is not CONDEMNED, before
4655          * adding the NCE.
4656          */
4657         mutex_enter(&ill->ill_lock);
4658         if (ill->ill_state_flags & ILL_CONDEMNED) {
4659                 mutex_exit(&ill->ill_lock);
4660                 err = EINVAL;
4661                 goto err_ret;
4662         }
4663         if (!NCE_MYADDR(ncec) &&
4664             (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4665                 mutex_exit(&ill->ill_lock);
4666                 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4667                 err = EINVAL;
4668                 goto err_ret;
4669         }
4670         /*
4671          * Acquire the ncec_lock even before adding the ncec to the list
4672          * so that it cannot get deleted after the ncec is added, but
4673          * before we add the nce.
4674          */
4675         mutex_enter(&ncec->ncec_lock);
4676         if ((ncec->ncec_next = *ncep) != NULL)
4677                 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4678         *ncep = ncec;
4679         ncec->ncec_ptpn = ncep;
4680 
4681         /* Bump up the number of ncec's referencing this ill */
4682         DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4683             (char *), "ncec", (void *), ncec);
4684         ill->ill_ncec_cnt++;
4685         /*
4686          * Since we hold the ncec_lock at this time, the ncec cannot be
4687          * condemned, and we can safely add the nce.
4688          */
4689         *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4690         mutex_exit(&ncec->ncec_lock);
4691         mutex_exit(&ill->ill_lock);
4692 
4693         /* caller must trigger fastpath on *retnce */
4694         return (0);
4695 
4696 err_ret:
4697         if (ncec != NULL)
4698                 kmem_cache_free(ncec_cache, ncec);
4699         if (nce != NULL)
4700                 kmem_cache_free(nce_cache, nce);
4701         freemsg(dlur_mp);
4702         if (template != NULL)
4703                 kmem_free(template, ill->ill_phys_addr_length);
4704         return (err);
4705 }
4706 
4707 /*
4708  * take a ref on the nce
4709  */
4710 void
4711 nce_refhold(nce_t *nce)
4712 {
4713         mutex_enter(&nce->nce_lock);
4714         nce->nce_refcnt++;
4715         ASSERT((nce)->nce_refcnt != 0);
4716         mutex_exit(&nce->nce_lock);
4717 }
4718 
4719 /*
4720  * release a ref on the nce; In general, this
4721  * cannot be called with locks held because nce_inactive
4722  * may result in nce_inactive which will take the ill_lock,
4723  * do ipif_ill_refrele_tail etc. Thus the one exception
4724  * where this can be called with locks held is when the caller
4725  * is certain that the nce_refcnt is sufficient to prevent
4726  * the invocation of nce_inactive.
4727  */
4728 void
4729 nce_refrele(nce_t *nce)
4730 {
4731         ASSERT((nce)->nce_refcnt != 0);
4732         mutex_enter(&nce->nce_lock);
4733         if (--nce->nce_refcnt == 0)
4734                 nce_inactive(nce); /* destroys the mutex */
4735         else
4736                 mutex_exit(&nce->nce_lock);
4737 }
4738 
4739 /*
4740  * free the nce after all refs have gone away.
4741  */
4742 static void
4743 nce_inactive(nce_t *nce)
4744 {
4745         ill_t *ill = nce->nce_ill;
4746 
4747         ASSERT(nce->nce_refcnt == 0);
4748 
4749         ncec_refrele_notr(nce->nce_common);
4750         nce->nce_common = NULL;
4751         freemsg(nce->nce_fp_mp);
4752         freemsg(nce->nce_dlur_mp);
4753 
4754         mutex_enter(&ill->ill_lock);
4755         DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4756             (char *), "nce", (void *), nce);
4757         ill->ill_nce_cnt--;
4758         nce->nce_ill = NULL;
4759         /*
4760          * If the number of ncec's associated with this ill have dropped
4761          * to zero, check whether we need to restart any operation that
4762          * is waiting for this to happen.
4763          */
4764         if (ILL_DOWN_OK(ill)) {
4765                 /* ipif_ill_refrele_tail drops the ill_lock */
4766                 ipif_ill_refrele_tail(ill);
4767         } else {
4768                 mutex_exit(&ill->ill_lock);
4769         }
4770 
4771         mutex_destroy(&nce->nce_lock);
4772         kmem_cache_free(nce_cache, nce);
4773 }
4774 
4775 /*
4776  * Add an nce to the ill_nce list.
4777  */
4778 static nce_t *
4779 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4780 {
4781         bzero(nce, sizeof (*nce));
4782         mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4783         nce->nce_common = ncec;
4784         nce->nce_addr = ncec->ncec_addr;
4785         nce->nce_ill = ill;
4786         DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4787             (char *), "nce", (void *), nce);
4788         ill->ill_nce_cnt++;
4789 
4790         nce->nce_refcnt = 1; /* for the thread */
4791         ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4792         nce->nce_dlur_mp = dlur_mp;
4793 
4794         /* add nce to the ill's fastpath list.  */
4795         nce->nce_refcnt++; /* for the list */
4796         list_insert_head(&ill->ill_nce, nce);
4797         return (nce);
4798 }
4799 
4800 static nce_t *
4801 nce_add(ill_t *ill, ncec_t *ncec)
4802 {
4803         nce_t   *nce;
4804         mblk_t  *dlur_mp = NULL;
4805 
4806         ASSERT(MUTEX_HELD(&ill->ill_lock));
4807         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4808 
4809         nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4810         if (nce == NULL)
4811                 return (NULL);
4812         if (ncec->ncec_lladdr != NULL ||
4813             ill->ill_net_type == IRE_IF_NORESOLVER) {
4814                 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4815                     ill->ill_phys_addr_length, ill->ill_sap,
4816                     ill->ill_sap_length);
4817                 if (dlur_mp == NULL) {
4818                         kmem_cache_free(nce_cache, nce);
4819                         return (NULL);
4820                 }
4821         }
4822         return (nce_add_impl(ill, ncec, nce, dlur_mp));
4823 }
4824 
4825 /*
4826  * remove the nce from the ill_faspath list
4827  */
4828 void
4829 nce_delete(nce_t *nce)
4830 {
4831         ill_t   *ill = nce->nce_ill;
4832 
4833         ASSERT(MUTEX_HELD(&ill->ill_lock));
4834 
4835         mutex_enter(&nce->nce_lock);
4836         if (nce->nce_is_condemned) {
4837                 /*
4838                  * some other thread has removed this nce from the ill_nce list
4839                  */
4840                 mutex_exit(&nce->nce_lock);
4841                 return;
4842         }
4843         nce->nce_is_condemned = B_TRUE;
4844         mutex_exit(&nce->nce_lock);
4845 
4846         list_remove(&ill->ill_nce, nce);
4847         /*
4848          * even though we are holding the ill_lock, it is ok to
4849          * call nce_refrele here because we know that we should have
4850          * at least 2 refs on the nce: one for the thread, and one
4851          * for the list. The refrele below will release the one for
4852          * the list.
4853          */
4854         nce_refrele(nce);
4855 }
4856 
4857 nce_t *
4858 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4859 {
4860         nce_t *nce = NULL;
4861 
4862         ASSERT(ill != NULL);
4863         ASSERT(MUTEX_HELD(&ill->ill_lock));
4864 
4865         for (nce = list_head(&ill->ill_nce); nce != NULL;
4866             nce = list_next(&ill->ill_nce, nce)) {
4867                 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4868                         break;
4869         }
4870 
4871         /*
4872          * if we found the nce on the ill_nce list while holding
4873          * the ill_lock, then it cannot be condemned yet.
4874          */
4875         if (nce != NULL) {
4876                 ASSERT(!nce->nce_is_condemned);
4877                 nce_refhold(nce);
4878         }
4879         return (nce);
4880 }
4881 
4882 /*
4883  * Walk the ill_nce list on ill. The callback function func() cannot perform
4884  * any destructive actions.
4885  */
4886 static void
4887 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4888 {
4889         nce_t *nce = NULL, *nce_next;
4890 
4891         ASSERT(MUTEX_HELD(&ill->ill_lock));
4892         for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4893                 nce_next = list_next(&ill->ill_nce, nce);
4894                 if (func(ill, nce, arg) != 0)
4895                         break;
4896                 nce = nce_next;
4897         }
4898 }
4899 
4900 void
4901 nce_walk(ill_t *ill, pfi_t func, void *arg)
4902 {
4903         mutex_enter(&ill->ill_lock);
4904         nce_walk_common(ill, func, arg);
4905         mutex_exit(&ill->ill_lock);
4906 }
4907 
4908 void
4909 nce_flush(ill_t *ill, boolean_t flushall)
4910 {
4911         nce_t *nce, *nce_next;
4912         list_t dead;
4913 
4914         list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4915         mutex_enter(&ill->ill_lock);
4916         for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4917                 nce_next = list_next(&ill->ill_nce, nce);
4918                 if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4919                         nce = nce_next;
4920                         continue;
4921                 }
4922                 /*
4923                  * nce_delete requires that the caller should either not
4924                  * be holding locks, or should hold a ref to ensure that
4925                  * we wont hit ncec_inactive. So take a ref and clean up
4926                  * after the list is flushed.
4927                  */
4928                 nce_refhold(nce);
4929                 nce_delete(nce);
4930                 list_insert_tail(&dead, nce);
4931                 nce = nce_next;
4932         }
4933         mutex_exit(&ill->ill_lock);
4934         while ((nce = list_head(&dead)) != NULL) {
4935                 list_remove(&dead, nce);
4936                 nce_refrele(nce);
4937         }
4938         ASSERT(list_is_empty(&dead));
4939         list_destroy(&dead);
4940 }
4941 
4942 /* Return an interval that is anywhere in the [1 .. intv] range */
4943 static clock_t
4944 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4945 {
4946         clock_t rnd, frac;
4947 
4948         (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4949         /* Note that clock_t is signed; must chop off bits */
4950         rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4951         if (initial_time) {
4952                 if (intv <= 0)
4953                         intv = 1;
4954                 else
4955                         intv = (rnd % intv) + 1;
4956         } else {
4957                 /* Compute 'frac' as 20% of the configured interval */
4958                 if ((frac = intv / 5) <= 1)
4959                         frac = 2;
4960                 /* Set intv randomly in the range [intv-frac .. intv+frac] */
4961                 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4962                         intv = 1;
4963         }
4964         return (intv);
4965 }
4966 
4967 void
4968 nce_resolv_ipmp_ok(ncec_t *ncec)
4969 {
4970         mblk_t *mp;
4971         uint_t pkt_len;
4972         iaflags_t ixaflags = IXAF_NO_TRACE;
4973         nce_t *under_nce;
4974         ill_t   *ill = ncec->ncec_ill;
4975         boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4976         ipif_t *src_ipif = NULL;
4977         ip_stack_t *ipst = ill->ill_ipst;
4978         ill_t *send_ill;
4979         uint_t nprobes;
4980 
4981         ASSERT(IS_IPMP(ill));
4982 
4983         mutex_enter(&ncec->ncec_lock);
4984         nprobes = ncec->ncec_nprobes;
4985         mp = ncec->ncec_qd_mp;
4986         ncec->ncec_qd_mp = NULL;
4987         ncec->ncec_nprobes = 0;
4988         mutex_exit(&ncec->ncec_lock);
4989 
4990         while (mp != NULL) {
4991                 mblk_t *nxt_mp;
4992 
4993                 nxt_mp = mp->b_next;
4994                 mp->b_next = NULL;
4995                 if (isv6) {
4996                         ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4997 
4998                         pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4999                         src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
5000                             ill, ALL_ZONES, ipst);
5001                 } else {
5002                         ipha_t *ipha = (ipha_t *)mp->b_rptr;
5003 
5004                         ixaflags |= IXAF_IS_IPV4;
5005                         pkt_len = ntohs(ipha->ipha_length);
5006                         src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5007                             ill, ALL_ZONES, ipst);
5008                 }
5009 
5010                 /*
5011                  * find a new nce based on an under_ill. The first IPMP probe
5012                  * packet gets queued, so we could still find a src_ipif that
5013                  * matches an IPMP test address.
5014                  */
5015                 if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5016                         /*
5017                          * if src_ipif is null, this could be either a
5018                          * forwarded packet or a probe whose src got deleted.
5019                          * We identify the former case by looking for the
5020                          * ncec_nprobes: the first ncec_nprobes packets are
5021                          * probes;
5022                          */
5023                         if (src_ipif == NULL && nprobes > 0)
5024                                 goto drop_pkt;
5025 
5026                         /*
5027                          * For forwarded packets, we use the ipmp rotor
5028                          * to find send_ill.
5029                          */
5030                         send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
5031                             B_TRUE);
5032                 } else {
5033                         send_ill = src_ipif->ipif_ill;
5034                         ill_refhold(send_ill);
5035                 }
5036 
5037                 DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5038                     (ncec_t *), ncec, (ipif_t *),
5039                     src_ipif, (ill_t *), send_ill);
5040 
5041                 if (send_ill == NULL) {
5042                         if (src_ipif != NULL)
5043                                 ipif_refrele(src_ipif);
5044                         goto drop_pkt;
5045                 }
5046                 /* create an under_nce on send_ill */
5047                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5048                 if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5049                         under_nce = nce_fastpath_create(send_ill, ncec);
5050                 else
5051                         under_nce = NULL;
5052                 rw_exit(&ipst->ips_ill_g_lock);
5053                 if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5054                         nce_fastpath_trigger(under_nce);
5055 
5056                 ill_refrele(send_ill);
5057                 if (src_ipif != NULL)
5058                         ipif_refrele(src_ipif);
5059 
5060                 if (under_nce != NULL) {
5061                         (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5062                             ALL_ZONES, 0, NULL);
5063                         nce_refrele(under_nce);
5064                         if (nprobes > 0)
5065                                 nprobes--;
5066                         mp = nxt_mp;
5067                         continue;
5068                 }
5069 drop_pkt:
5070                 if (isv6) {
5071                         BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5072                 } else {
5073                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5074                 }
5075                 ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5076                 freemsg(mp);
5077                 if (nprobes > 0)
5078                         nprobes--;
5079                 mp = nxt_mp;
5080         }
5081         ncec_cb_dispatch(ncec); /* complete callbacks */
5082 }