Print this page
    
10472 Limit number of multicast NCEs
Reviewed by: Cody Peter Mello <melloc@writev.io>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/inet/ip/ip_ndp.c
          +++ new/usr/src/uts/common/inet/ip/ip_ndp.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  
    | 
      ↓ open down ↓ | 
    15 lines elided | 
    
      ↑ open up ↑ | 
  
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   */
  24   24  
  25   25  /*
  26      - * Copyright (c) 2018, Joyent, Inc.
       26 + * Copyright (c) 2019, Joyent, Inc.
  27   27   */
  28   28  
  29   29  #include <sys/types.h>
  30   30  #include <sys/stream.h>
  31   31  #include <sys/stropts.h>
  32   32  #include <sys/strsun.h>
  33   33  #include <sys/sysmacros.h>
  34   34  #include <sys/errno.h>
  35   35  #include <sys/dlpi.h>
  36   36  #include <sys/socket.h>
  37   37  #include <sys/ddi.h>
  38   38  #include <sys/sunddi.h>
  39   39  #include <sys/cmn_err.h>
  40   40  #include <sys/debug.h>
  41   41  #include <sys/vtrace.h>
  42   42  #include <sys/kmem.h>
  43   43  #include <sys/zone.h>
  44   44  #include <sys/ethernet.h>
  45   45  #include <sys/sdt.h>
  46   46  #include <sys/mac.h>
  47   47  
  48   48  #include <net/if.h>
  49   49  #include <net/if_types.h>
  50   50  #include <net/if_dl.h>
  51   51  #include <net/route.h>
  52   52  #include <netinet/in.h>
  53   53  #include <netinet/ip6.h>
  54   54  #include <netinet/icmp6.h>
  55   55  
  56   56  #include <inet/common.h>
  57   57  #include <inet/mi.h>
  58   58  #include <inet/mib2.h>
  59   59  #include <inet/nd.h>
  60   60  #include <inet/ip.h>
  61   61  #include <inet/ip_impl.h>
  62   62  #include <inet/ipclassifier.h>
  63   63  #include <inet/ip_if.h>
  64   64  #include <inet/ip_ire.h>
  65   65  #include <inet/ip_rts.h>
  66   66  #include <inet/ip6.h>
  67   67  #include <inet/ip_ndp.h>
  68   68  #include <inet/sctp_ip.h>
  69   69  #include <inet/ip_arp.h>
  70   70  #include <inet/ip2mac_impl.h>
  71   71  
  72   72  #define ANNOUNCE_INTERVAL(isv6) \
  73   73          (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
  74   74          ipst->ips_ip_arp_publish_interval)
  75   75  
  76   76  #define DEFENSE_INTERVAL(isv6) \
  77   77          (isv6 ? ipst->ips_ndp_defend_interval : \
  78   78          ipst->ips_arp_defend_interval)
  79   79  
  80   80  /* Non-tunable probe interval, based on link capabilities */
  81   81  #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)
  82   82  
  83   83  /*
  84   84   * The IPv4 Link Local address space is special; we do extra duplicate checking
  85   85   * there, as the entire assignment mechanism rests on random numbers.
  86   86   */
  87   87  #define IS_IPV4_LL_SPACE(ptr)   (((uchar_t *)ptr)[0] == 169 && \
  88   88                                  ((uchar_t *)ptr)[1] == 254)
  89   89  
  90   90  /*
  91   91   * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
  92   92   * in to the ncec*add* functions.
  93   93   *
  94   94   * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
  95   95   * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
  96   96   * that we will respond to requests for the protocol address.
  97   97   */
  98   98  #define NCE_EXTERNAL_FLAGS_MASK \
  99   99          (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
 100  100          NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
 101  101          NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
 102  102  
 103  103  /*
 104  104   * Lock ordering:
 105  105   *
 106  106   *      ndp_g_lock -> ill_lock -> ncec_lock
 107  107   *
 108  108   * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 109  109   * ncec_next.  ncec_lock protects the contents of the NCE (particularly
 110  110   * ncec_refcnt).
 111  111   */
 112  112  
 113  113  static  void    nce_cleanup_list(ncec_t *ncec);
 114  114  static  void    nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
 115  115  static  ncec_t  *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
 116  116      ncec_t *);
 117  117  static  nce_t   *nce_lookup_addr(ill_t *, const in6_addr_t *);
 118  118  static  int     nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
 119  119      uint16_t ncec_flags, nce_t **newnce);
  
    | 
      ↓ open down ↓ | 
    83 lines elided | 
    
      ↑ open up ↑ | 
  
 120  120  static  int     nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
 121  121      uint16_t ncec_flags, nce_t **newnce);
 122  122  static  boolean_t       ndp_xmit(ill_t *ill, uint32_t operation,
 123  123      uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
 124  124      const in6_addr_t *target, int flag);
 125  125  static void     ncec_refhold_locked(ncec_t *);
 126  126  static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
 127  127  static  void    nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
 128  128  static  int     nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 129  129      uint16_t, uint16_t, nce_t **);
 130      -static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
 131      -static nce_t *nce_add(ill_t *, ncec_t *);
      130 +static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
      131 +static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
 132  132  static void nce_inactive(nce_t *);
 133  133  extern nce_t    *nce_lookup(ill_t *, const in6_addr_t *);
 134  134  static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
 135  135  static int      nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 136  136      uint16_t, uint16_t, nce_t **);
 137  137  static int      nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
 138  138      uint16_t, uint16_t, nce_t **);
 139  139  static int  nce_add_v6_postprocess(nce_t *);
 140  140  static int  nce_add_v4_postprocess(nce_t *);
 141  141  static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
 142  142  static clock_t nce_fuzz_interval(clock_t, boolean_t);
 143  143  static void nce_resolv_ipmp_ok(ncec_t *);
 144  144  static void nce_walk_common(ill_t *, pfi_t, void *);
 145  145  static void nce_start_timer(ncec_t *, uint_t);
 146  146  static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
 147  147  static void nce_fastpath_trigger(nce_t *);
 148  148  static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
 149  149  
 150  150  #ifdef DEBUG
 151  151  static void     ncec_trace_cleanup(const ncec_t *);
 152  152  #endif
 153  153  
 154  154  #define NCE_HASH_PTR_V4(ipst, addr)                                     \
 155  155          (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
 156  156  
 157  157  #define NCE_HASH_PTR_V6(ipst, addr)                              \
 158  158          (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
 159  159                  NCE_TABLE_SIZE)]))
 160  160  
 161  161  extern kmem_cache_t *ncec_cache;
 162  162  extern kmem_cache_t *nce_cache;
 163  163  
 164  164  /*
 165  165   * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
 166  166   * If src_ill is not null, the ncec_addr is bound to src_ill. The
 167  167   * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
 168  168   * the probe is sent on the ncec_ill (in the non-IPMP case) or the
 169  169   * IPMP cast_ill (in the IPMP case).
 170  170   *
 171  171   * Note that the probe interval is based on the src_ill for IPv6, and
 172  172   * the ncec_xmit_interval for IPv4.
 173  173   */
 174  174  static void
 175  175  nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
 176  176  {
 177  177          boolean_t dropped;
 178  178          uint32_t probe_interval;
 179  179  
 180  180          ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
 181  181          ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
 182  182          if (ncec->ncec_ipversion == IPV6_VERSION) {
 183  183                  dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
 184  184                      ncec->ncec_lladdr, ncec->ncec_lladdr_length,
 185  185                      &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
 186  186                  probe_interval = ILL_PROBE_INTERVAL(src_ill);
 187  187          } else {
 188  188                  /* IPv4 DAD delay the initial probe. */
 189  189                  if (send_probe)
 190  190                          dropped = arp_probe(ncec);
 191  191                  else
 192  192                          dropped = B_TRUE;
 193  193                  probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
 194  194                      !send_probe);
 195  195          }
 196  196          if (!dropped) {
 197  197                  mutex_enter(&ncec->ncec_lock);
 198  198                  ncec->ncec_pcnt--;
 199  199                  mutex_exit(&ncec->ncec_lock);
 200  200          }
 201  201          nce_restart_timer(ncec, probe_interval);
 202  202  }
 203  203  
 204  204  /*
 205  205   * Compute default flags to use for an advertisement of this ncec's address.
 206  206   */
 207  207  static int
 208  208  nce_advert_flags(const ncec_t *ncec)
 209  209  {
 210  210          int flag = 0;
 211  211  
 212  212          if (ncec->ncec_flags & NCE_F_ISROUTER)
 213  213                  flag |= NDP_ISROUTER;
 214  214          if (!(ncec->ncec_flags & NCE_F_ANYCAST))
 215  215                  flag |= NDP_ORIDE;
 216  216  
 217  217          return (flag);
 218  218  }
 219  219  
 220  220  /*
 221  221   * NDP Cache Entry creation routine.
 222  222   * This routine must always be called with ndp6->ndp_g_lock held.
 223  223   */
 224  224  int
 225  225  nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
 226  226      const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
 227  227  {
 228  228          int             err;
 229  229          nce_t           *nce;
 230  230  
 231  231          ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
 232  232          ASSERT(ill != NULL && ill->ill_isv6);
 233  233  
 234  234          err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
 235  235              &nce);
 236  236          if (err != 0)
 237  237                  return (err);
 238  238          ASSERT(newnce != NULL);
 239  239          *newnce = nce;
 240  240          return (err);
 241  241  }
 242  242  
 243  243  /*
 244  244   * Post-processing routine to be executed after nce_add_v6(). This function
 245  245   * triggers fastpath (if appropriate) and DAD on the newly added nce entry
 246  246   * and must be called without any locks held.
 247  247   */
 248  248  int
 249  249  nce_add_v6_postprocess(nce_t *nce)
 250  250  {
 251  251          ncec_t          *ncec = nce->nce_common;
 252  252          boolean_t       dropped = B_FALSE;
 253  253          uchar_t         *hw_addr = ncec->ncec_lladdr;
 254  254          uint_t          hw_addr_len = ncec->ncec_lladdr_length;
 255  255          ill_t           *ill = ncec->ncec_ill;
 256  256          int             err = 0;
 257  257          uint16_t        flags = ncec->ncec_flags;
 258  258          ip_stack_t      *ipst = ill->ill_ipst;
 259  259          boolean_t       trigger_fastpath = B_TRUE;
 260  260  
 261  261          /*
 262  262           * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
 263  263           * we call nce_fastpath as soon as the ncec is resolved in nce_process.
 264  264           * We call nce_fastpath from nce_update if the link layer address of
 265  265           * the peer changes from nce_update
 266  266           */
 267  267          if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
 268  268              (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
 269  269                  trigger_fastpath = B_FALSE;
 270  270  
 271  271          if (trigger_fastpath)
 272  272                  nce_fastpath_trigger(nce);
 273  273          if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
 274  274                  ill_t *hwaddr_ill;
 275  275                  /*
 276  276                   * Unicast entry that needs DAD.
 277  277                   */
 278  278                  if (IS_IPMP(ill)) {
 279  279                          hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
 280  280                              hw_addr, hw_addr_len);
 281  281                  } else {
 282  282                          hwaddr_ill = ill;
 283  283                  }
 284  284                  nce_dad(ncec, hwaddr_ill, B_TRUE);
 285  285                  err = EINPROGRESS;
 286  286          } else if (flags & NCE_F_UNSOL_ADV) {
 287  287                  /*
 288  288                   * We account for the transmit below by assigning one
 289  289                   * less than the ndd variable. Subsequent decrements
 290  290                   * are done in nce_timer.
 291  291                   */
 292  292                  mutex_enter(&ncec->ncec_lock);
 293  293                  ncec->ncec_unsolicit_count =
 294  294                      ipst->ips_ip_ndp_unsolicit_count - 1;
 295  295                  mutex_exit(&ncec->ncec_lock);
 296  296                  dropped = ndp_xmit(ill,
 297  297                      ND_NEIGHBOR_ADVERT,
 298  298                      hw_addr,
 299  299                      hw_addr_len,
 300  300                      &ncec->ncec_addr,   /* Source and target of the adv */
 301  301                      &ipv6_all_hosts_mcast, /* Destination of the packet */
 302  302                      nce_advert_flags(ncec));
 303  303                  mutex_enter(&ncec->ncec_lock);
 304  304                  if (dropped)
 305  305                          ncec->ncec_unsolicit_count++;
 306  306                  else
 307  307                          ncec->ncec_last_time_defended = ddi_get_lbolt();
 308  308                  if (ncec->ncec_unsolicit_count != 0) {
 309  309                          nce_start_timer(ncec,
 310  310                              ipst->ips_ip_ndp_unsolicit_interval);
 311  311                  }
 312  312                  mutex_exit(&ncec->ncec_lock);
 313  313          }
 314  314          return (err);
 315  315  }
 316  316  
 317  317  /*
 318  318   * Atomically lookup and add (if needed) Neighbor Cache information for
 319  319   * an address.
 320  320   *
 321  321   * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
 322  322   * are always added pointing at the ipmp_ill. Thus, when the ill passed
 323  323   * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 324  324   * entries will be created, both pointing at the same ncec_t. The nce_t
 325  325   * entries will have their nce_ill set to the ipmp_ill and the under_ill
 326  326   * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 327  327   * Local addresses are always created on the ill passed to nce_add_v6.
 328  328   */
 329  329  int
 330  330  nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
 331  331      const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
 332  332  {
 333  333          int             err = 0;
 334  334          ip_stack_t      *ipst = ill->ill_ipst;
 335  335          nce_t           *nce, *upper_nce = NULL;
 336  336          ill_t           *in_ill = ill;
 337  337          boolean_t       need_ill_refrele = B_FALSE;
 338  338  
 339  339          if (flags & NCE_F_MCAST) {
 340  340                  /*
 341  341                   * hw_addr will be figured out in nce_set_multicast_v6;
 342  342                   * caller has to select the cast_ill
 343  343                   */
 344  344                  ASSERT(hw_addr == NULL);
 345  345                  ASSERT(!IS_IPMP(ill));
 346  346                  err = nce_set_multicast_v6(ill, addr, flags, newnce);
 347  347                  return (err);
 348  348          }
 349  349          ASSERT(ill->ill_isv6);
 350  350          if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
 351  351                  ill = ipmp_ill_hold_ipmp_ill(ill);
 352  352                  if (ill == NULL)
 353  353                          return (ENXIO);
 354  354                  need_ill_refrele = B_TRUE;
 355  355          }
 356  356  
 357  357          mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 358  358          nce = nce_lookup_addr(ill, addr);
 359  359          if (nce == NULL) {
 360  360                  err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
 361  361                      &nce);
 362  362          } else {
 363  363                  err = EEXIST;
 364  364          }
 365  365          mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 366  366          if (err == 0)
 367  367                  err = nce_add_v6_postprocess(nce);
 368  368          if (in_ill != ill && nce != NULL) {
 369  369                  nce_t *under_nce = NULL;
 370  370  
 371  371                  /*
 372  372                   * in_ill was the under_ill. Try to create the under_nce.
 373  373                   * Hold the ill_g_lock to prevent changes to group membership
 374  374                   * until we are done.
 375  375                   */
 376  376                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 377  377                  if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
 378  378                          DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
 379  379                              ill_t *, ill);
 380  380                          rw_exit(&ipst->ips_ill_g_lock);
 381  381                          err = ENXIO;
 382  382                          nce_refrele(nce);
 383  383                          nce = NULL;
 384  384                          goto bail;
 385  385                  }
 386  386                  under_nce = nce_fastpath_create(in_ill, nce->nce_common);
 387  387                  if (under_nce == NULL) {
 388  388                          rw_exit(&ipst->ips_ill_g_lock);
 389  389                          err = EINVAL;
 390  390                          nce_refrele(nce);
 391  391                          nce = NULL;
 392  392                          goto bail;
 393  393                  }
 394  394                  rw_exit(&ipst->ips_ill_g_lock);
 395  395                  upper_nce = nce;
 396  396                  nce = under_nce; /* will be returned to caller */
 397  397                  if (NCE_ISREACHABLE(nce->nce_common))
 398  398                          nce_fastpath_trigger(under_nce);
 399  399          }
 400  400          /* nce_refrele is deferred until the lock is dropped  */
 401  401          if (nce != NULL) {
 402  402                  if (newnce != NULL)
 403  403                          *newnce = nce;
 404  404                  else
 405  405                          nce_refrele(nce);
 406  406          }
 407  407  bail:
 408  408          if (upper_nce != NULL)
 409  409                  nce_refrele(upper_nce);
 410  410          if (need_ill_refrele)
 411  411                  ill_refrele(ill);
 412  412          return (err);
 413  413  }
 414  414  
 415  415  /*
 416  416   * Remove all the CONDEMNED nces from the appropriate hash table.
 417  417   * We create a private list of NCEs, these may have ires pointing
 418  418   * to them, so the list will be passed through to clean up dependent
 419  419   * ires and only then we can do ncec_refrele() which can make NCE inactive.
 420  420   */
 421  421  static void
 422  422  nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
 423  423  {
 424  424          ncec_t *ncec1;
 425  425          ncec_t **ptpn;
 426  426  
 427  427          ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
 428  428          ASSERT(ndp->ndp_g_walker == 0);
 429  429          for (; ncec; ncec = ncec1) {
 430  430                  ncec1 = ncec->ncec_next;
 431  431                  mutex_enter(&ncec->ncec_lock);
 432  432                  if (NCE_ISCONDEMNED(ncec)) {
 433  433                          ptpn = ncec->ncec_ptpn;
 434  434                          ncec1 = ncec->ncec_next;
 435  435                          if (ncec1 != NULL)
 436  436                                  ncec1->ncec_ptpn = ptpn;
 437  437                          *ptpn = ncec1;
 438  438                          ncec->ncec_ptpn = NULL;
 439  439                          ncec->ncec_next = NULL;
 440  440                          ncec->ncec_next = *free_nce_list;
 441  441                          *free_nce_list = ncec;
 442  442                  }
 443  443                  mutex_exit(&ncec->ncec_lock);
 444  444          }
 445  445  }
 446  446  
 447  447  /*
 448  448   * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
 449  449   *    will return this NCE. Also no new timeouts will
 450  450   *    be started (See nce_restart_timer).
 451  451   * 2. Cancel any currently running timeouts.
 452  452   * 3. If there is an ndp walker, return. The walker will do the cleanup.
 453  453   *    This ensures that walkers see a consistent list of NCEs while walking.
 454  454   * 4. Otherwise remove the NCE from the list of NCEs
 455  455   */
 456  456  void
 457  457  ncec_delete(ncec_t *ncec)
 458  458  {
 459  459          ncec_t  **ptpn;
 460  460          ncec_t  *ncec1;
 461  461          int     ipversion = ncec->ncec_ipversion;
 462  462          ndp_g_t *ndp;
 463  463          ip_stack_t      *ipst = ncec->ncec_ipst;
 464  464  
 465  465          if (ipversion == IPV4_VERSION)
 466  466                  ndp = ipst->ips_ndp4;
 467  467          else
 468  468                  ndp = ipst->ips_ndp6;
 469  469  
 470  470          /* Serialize deletes */
 471  471          mutex_enter(&ncec->ncec_lock);
 472  472          if (NCE_ISCONDEMNED(ncec)) {
 473  473                  /* Some other thread is doing the delete */
 474  474                  mutex_exit(&ncec->ncec_lock);
 475  475                  return;
 476  476          }
 477  477          /*
 478  478           * Caller has a refhold. Also 1 ref for being in the list. Thus
 479  479           * refcnt has to be >= 2
 480  480           */
 481  481          ASSERT(ncec->ncec_refcnt >= 2);
 482  482          ncec->ncec_flags |= NCE_F_CONDEMNED;
 483  483          mutex_exit(&ncec->ncec_lock);
 484  484  
 485  485          /* Count how many condemned ires for kmem_cache callback */
 486  486          atomic_inc_32(&ipst->ips_num_nce_condemned);
 487  487          nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
 488  488  
 489  489          /* Complete any waiting callbacks */
 490  490          ncec_cb_dispatch(ncec);
 491  491  
 492  492          /*
 493  493           * Cancel any running timer. Timeout can't be restarted
 494  494           * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
 495  495           * Passing invalid timeout id is fine.
 496  496           */
 497  497          if (ncec->ncec_timeout_id != 0) {
 498  498                  (void) untimeout(ncec->ncec_timeout_id);
 499  499                  ncec->ncec_timeout_id = 0;
 500  500          }
 501  501  
 502  502          mutex_enter(&ndp->ndp_g_lock);
 503  503          if (ncec->ncec_ptpn == NULL) {
 504  504                  /*
 505  505                   * The last ndp walker has already removed this ncec from
 506  506                   * the list after we marked the ncec CONDEMNED and before
 507  507                   * we grabbed the global lock.
 508  508                   */
 509  509                  mutex_exit(&ndp->ndp_g_lock);
 510  510                  return;
 511  511          }
 512  512          if (ndp->ndp_g_walker > 0) {
 513  513                  /*
 514  514                   * Can't unlink. The walker will clean up
 515  515                   */
 516  516                  ndp->ndp_g_walker_cleanup = B_TRUE;
 517  517                  mutex_exit(&ndp->ndp_g_lock);
 518  518                  return;
 519  519          }
 520  520  
 521  521          /*
 522  522           * Now remove the ncec from the list. nce_restart_timer won't restart
 523  523           * the timer since it is marked CONDEMNED.
 524  524           */
 525  525          ptpn = ncec->ncec_ptpn;
 526  526          ncec1 = ncec->ncec_next;
 527  527          if (ncec1 != NULL)
 528  528                  ncec1->ncec_ptpn = ptpn;
 529  529          *ptpn = ncec1;
 530  530          ncec->ncec_ptpn = NULL;
 531  531          ncec->ncec_next = NULL;
 532  532          mutex_exit(&ndp->ndp_g_lock);
 533  533  
 534  534          /* Removed from ncec_ptpn/ncec_next list */
 535  535          ncec_refrele_notr(ncec);
 536  536  }
 537  537  
 538  538  void
 539  539  ncec_inactive(ncec_t *ncec)
 540  540  {
 541  541          mblk_t          **mpp;
 542  542          ill_t           *ill = ncec->ncec_ill;
 543  543          ip_stack_t      *ipst = ncec->ncec_ipst;
 544  544  
 545  545          ASSERT(ncec->ncec_refcnt == 0);
 546  546          ASSERT(MUTEX_HELD(&ncec->ncec_lock));
 547  547  
 548  548          /* Count how many condemned nces for kmem_cache callback */
 549  549          if (NCE_ISCONDEMNED(ncec))
 550  550                  atomic_add_32(&ipst->ips_num_nce_condemned, -1);
 551  551  
 552  552          /* Free all allocated messages */
 553  553          mpp = &ncec->ncec_qd_mp;
 554  554          while (*mpp != NULL) {
 555  555                  mblk_t  *mp;
 556  556  
 557  557                  mp = *mpp;
 558  558                  *mpp = mp->b_next;
 559  559  
 560  560                  inet_freemsg(mp);
 561  561          }
 562  562          /*
 563  563           * must have been cleaned up in ncec_delete
 564  564           */
 565  565          ASSERT(list_is_empty(&ncec->ncec_cb));
 566  566          list_destroy(&ncec->ncec_cb);
 567  567          /*
 568  568           * free the ncec_lladdr if one was allocated in nce_add_common()
 569  569           */
 570  570          if (ncec->ncec_lladdr_length > 0)
 571  571                  kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
 572  572  
 573  573  #ifdef DEBUG
 574  574          ncec_trace_cleanup(ncec);
 575  575  #endif
 576  576  
 577  577          mutex_enter(&ill->ill_lock);
 578  578          DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
 579  579              (char *), "ncec", (void *), ncec);
 580  580          ill->ill_ncec_cnt--;
 581  581          ncec->ncec_ill = NULL;
 582  582          /*
 583  583           * If the number of ncec's associated with this ill have dropped
 584  584           * to zero, check whether we need to restart any operation that
 585  585           * is waiting for this to happen.
 586  586           */
 587  587          if (ILL_DOWN_OK(ill)) {
 588  588                  /* ipif_ill_refrele_tail drops the ill_lock */
 589  589                  ipif_ill_refrele_tail(ill);
 590  590          } else {
 591  591                  mutex_exit(&ill->ill_lock);
 592  592          }
 593  593  
 594  594          mutex_destroy(&ncec->ncec_lock);
 595  595          kmem_cache_free(ncec_cache, ncec);
 596  596  }
 597  597  
 598  598  /*
 599  599   * ncec_walk routine.  Delete the ncec if it is associated with the ill
 600  600   * that is going away.  Always called as a writer.
 601  601   */
 602  602  void
 603  603  ncec_delete_per_ill(ncec_t *ncec, void *arg)
 604  604  {
 605  605          if ((ncec != NULL) && ncec->ncec_ill == arg) {
 606  606                  ncec_delete(ncec);
 607  607          }
 608  608  }
 609  609  
 610  610  /*
 611  611   * Neighbor Cache cleanup logic for a list of ncec_t entries.
 612  612   */
 613  613  static void
 614  614  nce_cleanup_list(ncec_t *ncec)
 615  615  {
 616  616          ncec_t *ncec_next;
 617  617  
 618  618          ASSERT(ncec != NULL);
 619  619          while (ncec != NULL) {
 620  620                  ncec_next = ncec->ncec_next;
 621  621                  ncec->ncec_next = NULL;
 622  622  
 623  623                  /*
 624  624                   * It is possible for the last ndp walker (this thread)
 625  625                   * to come here after ncec_delete has marked the ncec CONDEMNED
 626  626                   * and before it has removed the ncec from the fastpath list
 627  627                   * or called untimeout. So we need to do it here. It is safe
 628  628                   * for both ncec_delete and this thread to do it twice or
 629  629                   * even simultaneously since each of the threads has a
 630  630                   * reference on the ncec.
 631  631                   */
 632  632                  nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
 633  633                  /*
 634  634                   * Cancel any running timer. Timeout can't be restarted
 635  635                   * since CONDEMNED is set. The ncec_lock can't be
 636  636                   * held across untimeout though passing invalid timeout
 637  637                   * id is fine.
 638  638                   */
 639  639                  if (ncec->ncec_timeout_id != 0) {
 640  640                          (void) untimeout(ncec->ncec_timeout_id);
 641  641                          ncec->ncec_timeout_id = 0;
 642  642                  }
 643  643                  /* Removed from ncec_ptpn/ncec_next list */
 644  644                  ncec_refrele_notr(ncec);
 645  645                  ncec = ncec_next;
 646  646          }
 647  647  }
 648  648  
 649  649  /*
 650  650   * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
 651  651   */
 652  652  boolean_t
 653  653  nce_restart_dad(ncec_t *ncec)
 654  654  {
 655  655          boolean_t started;
 656  656          ill_t *ill, *hwaddr_ill;
 657  657  
 658  658          if (ncec == NULL)
 659  659                  return (B_FALSE);
 660  660          ill = ncec->ncec_ill;
 661  661          mutex_enter(&ncec->ncec_lock);
 662  662          if (ncec->ncec_state == ND_PROBE) {
 663  663                  mutex_exit(&ncec->ncec_lock);
 664  664                  started = B_TRUE;
 665  665          } else if (ncec->ncec_state == ND_REACHABLE) {
 666  666                  ASSERT(ncec->ncec_lladdr != NULL);
 667  667                  ncec->ncec_state = ND_PROBE;
 668  668                  ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
 669  669                  /*
 670  670                   * Slight cheat here: we don't use the initial probe delay
 671  671                   * for IPv4 in this obscure case.
 672  672                   */
 673  673                  mutex_exit(&ncec->ncec_lock);
 674  674                  if (IS_IPMP(ill)) {
 675  675                          hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
 676  676                              ncec->ncec_lladdr, ncec->ncec_lladdr_length);
 677  677                  } else {
 678  678                          hwaddr_ill = ill;
 679  679                  }
 680  680                  nce_dad(ncec, hwaddr_ill, B_TRUE);
 681  681                  started = B_TRUE;
 682  682          } else {
 683  683                  mutex_exit(&ncec->ncec_lock);
 684  684                  started = B_FALSE;
 685  685          }
 686  686          return (started);
 687  687  }
 688  688  
 689  689  /*
 690  690   * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
 691  691   * If one is found, the refcnt on the ncec will be incremented.
 692  692   */
 693  693  ncec_t *
 694  694  ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
 695  695  {
 696  696          ncec_t          *ncec;
 697  697          ip_stack_t      *ipst = ill->ill_ipst;
 698  698  
 699  699          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 700  700          mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 701  701  
 702  702          /* Get head of v6 hash table */
 703  703          ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
 704  704          ncec = ncec_lookup_illgrp(ill, addr, ncec);
 705  705          mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 706  706          rw_exit(&ipst->ips_ill_g_lock);
 707  707          return (ncec);
 708  708  }
 709  709  /*
 710  710   * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
 711  711   * If one is found, the refcnt on the ncec will be incremented.
 712  712   */
 713  713  ncec_t *
 714  714  ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
 715  715  {
 716  716          ncec_t  *ncec = NULL;
 717  717          in6_addr_t addr6;
 718  718          ip_stack_t *ipst = ill->ill_ipst;
 719  719  
 720  720          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 721  721          mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
 722  722  
 723  723          /* Get head of v4 hash table */
 724  724          ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
 725  725          IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
 726  726          ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
 727  727          mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
 728  728          rw_exit(&ipst->ips_ill_g_lock);
 729  729          return (ncec);
 730  730  }
 731  731  
 732  732  /*
 733  733   * Cache entry lookup.  Try to find an ncec matching the parameters passed.
 734  734   * If an ncec is found, increment the hold count on that ncec.
 735  735   * The caller passes in the start of the appropriate hash table, and must
 736  736   * be holding the appropriate global lock (ndp_g_lock). In addition, since
 737  737   * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
 738  738   * must be held as reader.
 739  739   *
 740  740   * This function always matches across the ipmp group.
 741  741   */
 742  742  ncec_t *
 743  743  ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
 744  744  {
 745  745          ndp_g_t         *ndp;
 746  746          ip_stack_t      *ipst = ill->ill_ipst;
 747  747  
 748  748          if (ill->ill_isv6)
 749  749                  ndp = ipst->ips_ndp6;
 750  750          else
 751  751                  ndp = ipst->ips_ndp4;
 752  752  
 753  753          ASSERT(ill != NULL);
 754  754          ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
 755  755          if (IN6_IS_ADDR_UNSPECIFIED(addr))
 756  756                  return (NULL);
 757  757          for (; ncec != NULL; ncec = ncec->ncec_next) {
 758  758                  if (ncec->ncec_ill == ill ||
 759  759                      IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
 760  760                          if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
 761  761                                  mutex_enter(&ncec->ncec_lock);
 762  762                                  if (!NCE_ISCONDEMNED(ncec)) {
 763  763                                          ncec_refhold_locked(ncec);
 764  764                                          mutex_exit(&ncec->ncec_lock);
 765  765                                          break;
 766  766                                  }
 767  767                                  mutex_exit(&ncec->ncec_lock);
 768  768                          }
 769  769                  }
 770  770          }
 771  771          return (ncec);
 772  772  }
 773  773  
 774  774  /*
 775  775   * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 776  776   * entries for ill only, i.e., when ill is part of an ipmp group,
 777  777   * nce_lookup_v4 will never try to match across the group.
 778  778   */
 779  779  nce_t *
 780  780  nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
 781  781  {
 782  782          nce_t *nce;
 783  783          in6_addr_t addr6;
 784  784          ip_stack_t *ipst = ill->ill_ipst;
 785  785  
 786  786          mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
 787  787          IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
 788  788          nce = nce_lookup_addr(ill, &addr6);
 789  789          mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
 790  790          return (nce);
 791  791  }
 792  792  
 793  793  /*
 794  794   * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 795  795   * entries for ill only, i.e., when ill is part of an ipmp group,
 796  796   * nce_lookup_v6 will never try to match across the group.
 797  797   */
 798  798  nce_t *
 799  799  nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
 800  800  {
 801  801          nce_t *nce;
 802  802          ip_stack_t *ipst = ill->ill_ipst;
 803  803  
 804  804          mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 805  805          nce = nce_lookup_addr(ill, addr6);
 806  806          mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 807  807          return (nce);
 808  808  }
 809  809  
 810  810  static nce_t *
 811  811  nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
 812  812  {
 813  813          nce_t *nce;
 814  814  
 815  815          ASSERT(ill != NULL);
 816  816  #ifdef DEBUG
 817  817          if (ill->ill_isv6)
 818  818                  ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
 819  819          else
 820  820                  ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
 821  821  #endif
 822  822          mutex_enter(&ill->ill_lock);
 823  823          nce = nce_lookup(ill, addr);
 824  824          mutex_exit(&ill->ill_lock);
 825  825          return (nce);
 826  826  }
 827  827  
 828  828  
 829  829  /*
 830  830   * Router turned to host.  We need to make sure that cached copies of the ncec
 831  831   * are not used for forwarding packets if they were derived from the default
 832  832   * route, and that the default route itself is removed, as  required by
 833  833   * section 7.2.5 of RFC 2461.
 834  834   *
 835  835   * Note that the ncec itself probably has valid link-layer information for the
 836  836   * nexthop, so that there is no reason to delete the ncec, as long as the
 837  837   * ISROUTER flag is turned off.
 838  838   */
 839  839  static void
 840  840  ncec_router_to_host(ncec_t *ncec)
 841  841  {
 842  842          ire_t           *ire;
 843  843          ip_stack_t      *ipst = ncec->ncec_ipst;
 844  844  
 845  845          mutex_enter(&ncec->ncec_lock);
 846  846          ncec->ncec_flags &= ~NCE_F_ISROUTER;
 847  847          mutex_exit(&ncec->ncec_lock);
 848  848  
 849  849          ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
 850  850              &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
 851  851              MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
 852  852          if (ire != NULL) {
 853  853                  ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
 854  854                  ire_delete(ire);
 855  855                  ire_refrele(ire);
 856  856          }
 857  857  }
 858  858  
 859  859  /*
 860  860   * Process passed in parameters either from an incoming packet or via
 861  861   * user ioctl.
 862  862   */
 863  863  void
 864  864  nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
 865  865  {
 866  866          ill_t   *ill = ncec->ncec_ill;
 867  867          uint32_t hw_addr_len = ill->ill_phys_addr_length;
 868  868          boolean_t ll_updated = B_FALSE;
 869  869          boolean_t ll_changed;
 870  870          nce_t   *nce;
 871  871  
 872  872          ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
 873  873          /*
 874  874           * No updates of link layer address or the neighbor state is
 875  875           * allowed, when the cache is in NONUD state.  This still
 876  876           * allows for responding to reachability solicitation.
 877  877           */
 878  878          mutex_enter(&ncec->ncec_lock);
 879  879          if (ncec->ncec_state == ND_INCOMPLETE) {
 880  880                  if (hw_addr == NULL) {
 881  881                          mutex_exit(&ncec->ncec_lock);
 882  882                          return;
 883  883                  }
 884  884                  nce_set_ll(ncec, hw_addr);
 885  885                  /*
 886  886                   * Update ncec state and send the queued packets
 887  887                   * back to ip this time ire will be added.
 888  888                   */
 889  889                  if (flag & ND_NA_FLAG_SOLICITED) {
 890  890                          nce_update(ncec, ND_REACHABLE, NULL);
 891  891                  } else {
 892  892                          nce_update(ncec, ND_STALE, NULL);
 893  893                  }
 894  894                  mutex_exit(&ncec->ncec_lock);
 895  895                  nce = nce_fastpath(ncec, B_TRUE, NULL);
 896  896                  nce_resolv_ok(ncec);
 897  897                  if (nce != NULL)
 898  898                          nce_refrele(nce);
 899  899                  return;
 900  900          }
 901  901          ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
 902  902          if (!is_adv) {
 903  903                  /* If this is a SOLICITATION request only */
 904  904                  if (ll_changed)
 905  905                          nce_update(ncec, ND_STALE, hw_addr);
 906  906                  mutex_exit(&ncec->ncec_lock);
 907  907                  ncec_cb_dispatch(ncec);
 908  908                  return;
 909  909          }
 910  910          if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
 911  911                  /* If in any other state than REACHABLE, ignore */
 912  912                  if (ncec->ncec_state == ND_REACHABLE) {
 913  913                          nce_update(ncec, ND_STALE, NULL);
 914  914                  }
 915  915                  mutex_exit(&ncec->ncec_lock);
 916  916                  ncec_cb_dispatch(ncec);
 917  917                  return;
 918  918          } else {
 919  919                  if (ll_changed) {
 920  920                          nce_update(ncec, ND_UNCHANGED, hw_addr);
 921  921                          ll_updated = B_TRUE;
 922  922                  }
 923  923                  if (flag & ND_NA_FLAG_SOLICITED) {
 924  924                          nce_update(ncec, ND_REACHABLE, NULL);
 925  925                  } else {
 926  926                          if (ll_updated) {
 927  927                                  nce_update(ncec, ND_STALE, NULL);
 928  928                          }
 929  929                  }
 930  930                  mutex_exit(&ncec->ncec_lock);
 931  931                  if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
 932  932                      NCE_F_ISROUTER)) {
 933  933                          ncec_router_to_host(ncec);
 934  934                  } else {
 935  935                          ncec_cb_dispatch(ncec);
 936  936                  }
 937  937          }
 938  938  }
 939  939  
 940  940  /*
 941  941   * Pass arg1 to the cbf supplied, along with each ncec in existence.
 942  942   * ncec_walk() places a REFHOLD on the ncec and drops the lock when
 943  943   * walking the hash list.
 944  944   */
 945  945  void
 946  946  ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
 947  947      void *arg1, boolean_t trace)
 948  948  {
 949  949          ncec_t  *ncec;
 950  950          ncec_t  *ncec1;
 951  951          ncec_t  **ncep;
 952  952          ncec_t  *free_nce_list = NULL;
 953  953  
 954  954          mutex_enter(&ndp->ndp_g_lock);
 955  955          /* Prevent ncec_delete from unlink and free of NCE */
 956  956          ndp->ndp_g_walker++;
 957  957          mutex_exit(&ndp->ndp_g_lock);
 958  958          for (ncep = ndp->nce_hash_tbl;
 959  959              ncep < A_END(ndp->nce_hash_tbl); ncep++) {
 960  960                  for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
 961  961                          ncec1 = ncec->ncec_next;
 962  962                          if (ill == NULL || ncec->ncec_ill == ill) {
 963  963                                  if (trace) {
 964  964                                          ncec_refhold(ncec);
 965  965                                          (*cbf)(ncec, arg1);
 966  966                                          ncec_refrele(ncec);
 967  967                                  } else {
 968  968                                          ncec_refhold_notr(ncec);
 969  969                                          (*cbf)(ncec, arg1);
 970  970                                          ncec_refrele_notr(ncec);
 971  971                                  }
 972  972                          }
 973  973                  }
 974  974          }
 975  975          mutex_enter(&ndp->ndp_g_lock);
 976  976          ndp->ndp_g_walker--;
 977  977          if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
 978  978                  /* Time to delete condemned entries */
 979  979                  for (ncep = ndp->nce_hash_tbl;
 980  980                      ncep < A_END(ndp->nce_hash_tbl); ncep++) {
 981  981                          ncec = *ncep;
 982  982                          if (ncec != NULL) {
 983  983                                  nce_remove(ndp, ncec, &free_nce_list);
 984  984                          }
 985  985                  }
 986  986                  ndp->ndp_g_walker_cleanup = B_FALSE;
 987  987          }
 988  988  
 989  989          mutex_exit(&ndp->ndp_g_lock);
 990  990  
 991  991          if (free_nce_list != NULL) {
 992  992                  nce_cleanup_list(free_nce_list);
 993  993          }
 994  994  }
 995  995  
 996  996  /*
 997  997   * Walk everything.
  
    | 
      ↓ open down ↓ | 
    856 lines elided | 
    
      ↑ open up ↑ | 
  
 998  998   * Note that ill can be NULL hence can't derive the ipst from it.
 999  999   */
1000 1000  void
1001 1001  ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
1002 1002  {
1003 1003          ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1004 1004          ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1005 1005  }
1006 1006  
1007 1007  /*
     1008 + * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast
     1009 + * NCEs, and the number to reclaim if we hit the limit.  Used by
     1010 + * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
     1011 + * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
     1012 + */
     1013 +
     1014 +/* Maximum number of multicast NCEs on an ill. */
     1015 +uint_t ip_max_ill_mcast_nces = 16384;
     1016 +/*
     1017 + * Number of NCEs to delete if we hit the maximum above.  0 means *don't* and
     1018 + * return an error.  Non-zero means delete so many, and if the number is >=
     1019 + * the max above, that means delete them all.
     1020 + */
     1021 +uint_t ip_ill_mcast_reclaim = 256;
     1022 +
     1023 +/*
     1024 + * Encapsulate multicast ill capping in a function, for easier DTrace
     1025 + * detections.  Return a list of refheld NCEs to destroy-via-refrele.  That
     1026 + * list can be NULL, but can only be non-NULL if we successfully reclaimed.
     1027 + *
     1028 + * NOTE:  This function must be called while holding the ill_lock AND
     1029 + * JUST PRIOR to making the insertion into the ill_nce list.
     1030 + *
     1031 + * We can't release the ones we delete ourselves because the ill_lock is held
     1032 + * by the caller. They are, instead, passed back in a list_t for deletion
     1033 + * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
     1034 + *
     1035 + * While this covers nce_t, ncec_t gets done even further down the road.  See
     1036 + * nce_graveyard_free() for why.
     1037 + */
     1038 +static boolean_t
     1039 +nce_too_many_mcast(ill_t *ill, list_t *graveyard)
     1040 +{
     1041 +        uint_t reclaim_count, max_count, reclaimed = 0;
     1042 +        boolean_t too_many;
     1043 +        nce_t *nce, *deadman;
     1044 +
     1045 +        ASSERT(graveyard != NULL);
     1046 +        ASSERT(list_is_empty(graveyard));
     1047 +        ASSERT(MUTEX_HELD(&ill->ill_lock));
     1048 +
     1049 +        /*
     1050 +         * NOTE: Some grinning weirdo may have lowered the global max beyond
     1051 +         * what this ill currently has.  The behavior in this case will be
     1052 +         * trim-back just by the reclaim amount for any new ones.
     1053 +         */
     1054 +        max_count = ip_max_ill_mcast_nces;
     1055 +        reclaim_count = min(ip_ill_mcast_reclaim, max_count);
     1056 +
     1057 +        /* All good? */
     1058 +        if (ill->ill_mcast_nces < max_count)
     1059 +                return (B_FALSE);       /* Yes, all good. */
     1060 +
     1061 +        if (reclaim_count == 0)
     1062 +                return (B_TRUE);        /* Don't bother - we're stuck. */
     1063 +
     1064 +        /* We need to reclaim now.  Exploit our held ill_lock. */
     1065 +
     1066 +        /*
     1067 +         * Start at the tail and work backwards, new nces are head-inserted,
     1068 +         * so we'll be reaping the oldest entries.
     1069 +         */
     1070 +        nce = list_tail(&ill->ill_nce);
     1071 +        while (reclaimed < reclaim_count) {
     1072 +                /* Skip ahead to a multicast NCE. */
     1073 +                while (nce != NULL &&
     1074 +                    (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
     1075 +                        nce = list_prev(&ill->ill_nce, nce);
     1076 +                }
     1077 +                if (nce == NULL)
     1078 +                        break;
     1079 +
     1080 +                /*
     1081 +                 * NOTE: For now, we just delete the first one(s) we find.
     1082 +                 * This is not optimal, and may require some inspection of nce
     1083 +                 * & its ncec to be better.
     1084 +                 */
     1085 +                deadman = nce;
     1086 +                nce = list_prev(&ill->ill_nce, nce);
     1087 +
     1088 +                /* nce_delete() requires caller holds... */
     1089 +                nce_refhold(deadman);
     1090 +                nce_delete(deadman);    /* Bumps down ill_mcast_nces. */
     1091 +
     1092 +                /* Link the dead ones singly, still refheld... */
     1093 +                list_insert_tail(graveyard, deadman);
     1094 +                reclaimed++;
     1095 +        }
     1096 +
     1097 +        if (reclaimed != reclaim_count) {
     1098 +                /* We didn't have enough to reach reclaim_count. Why?!? */
     1099 +                DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
     1100 +                    uint_t, reclaimed, uint_t, reclaim_count);
     1101 +
     1102 +                /* In case for some REALLY weird reason we found none! */
     1103 +                too_many = (reclaimed == 0);
     1104 +        } else {
     1105 +                too_many = B_FALSE;
     1106 +        }
     1107 +
     1108 +        return (too_many);
     1109 +}
     1110 +
     1111 +static void
     1112 +ncec_mcast_reap_one(ncec_t *ncec, void *arg)
     1113 +{
     1114 +        boolean_t reapit;
     1115 +        ill_t *ill = (ill_t *)arg;
     1116 +
     1117 +        /* Obvious no-lock-needed checks... */
     1118 +        if (ncec == NULL || ncec->ncec_ill != ill ||
     1119 +            (ncec->ncec_flags & NCE_F_MCAST) == 0)
     1120 +                return;
     1121 +
     1122 +        mutex_enter(&ncec->ncec_lock);
     1123 +        /*
     1124 +         * It's refheld by the walk infrastructure. It has one reference for
     1125 +         * being in the ndp_g_hash, and if an nce_t exists, that's one more.
     1126 +         * We want ones without an nce_t, so 2 is the magic number.  If it's
     1127 +         * LESS than 2, we have much bigger problems anyway.
     1128 +         */
     1129 +        ASSERT(ncec->ncec_refcnt >= 2);
     1130 +        reapit = (ncec->ncec_refcnt == 2);
     1131 +        mutex_exit(&ncec->ncec_lock);
     1132 +
     1133 +        if (reapit) {
     1134 +                IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
     1135 +                ncec_delete(ncec);
     1136 +        }
     1137 +}
     1138 +
     1139 +/*
     1140 + * Attempt to reap stray multicast ncec_t structures left in the wake of
     1141 + * nce_graveyard_free(). This is a taskq servicing routine, as it's well
     1142 + * outside any netstack-global locks being held - ndp_g_lock in this case.  We
     1143 + * have a reference hold on the ill, which will prevent any unplumbing races.
     1144 + */
     1145 +static void
     1146 +ncec_mcast_reap(void *arg)
     1147 +{
     1148 +        ill_t *ill = (ill_t *)arg;
     1149 +
     1150 +        IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
     1151 +        ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
     1152 +        mutex_enter(&ill->ill_lock);
     1153 +        ill->ill_mcast_ncec_cleanup = B_FALSE;
     1154 +        /*
     1155 +         * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
     1156 +         * below for why.
     1157 +         */
     1158 +        ill->ill_refcnt--;
     1159 +        if (ill->ill_refcnt == 0)
     1160 +                ipif_ill_refrele_tail(ill);     /* Drops ill_lock. */
     1161 +        else
     1162 +                mutex_exit(&ill->ill_lock);
     1163 +}
     1164 +
     1165 +/*
     1166 + * Free a list (including handling an empty list or NULL list) of
     1167 + * reference-held NCEs that were reaped from a nce_too_many_mcast()
     1168 + * call. Separate because the caller must have dropped ndp_g_lock first.
     1169 + *
     1170 + * This also schedules a taskq task to unlink underlying NCECs from the
     1171 + * ndp_g_hash, which are protected by ndp_g_lock.
     1172 + */
     1173 +static void
     1174 +nce_graveyard_free(list_t *graveyard)
     1175 +{
     1176 +        nce_t *deadman, *current;
     1177 +        ill_t *ill;
     1178 +        boolean_t doit;
     1179 +
     1180 +        if (graveyard == NULL)
     1181 +                return;
     1182 +
     1183 +        current = list_head(graveyard);
     1184 +        if (current == NULL) {
     1185 +                list_destroy(graveyard);
     1186 +                return;
     1187 +        }
     1188 +
     1189 +        ill = current->nce_ill;
     1190 +        /*
     1191 +         * Normally one should ill_refhold(ill) here.  There's no _notr()
     1192 +         * variant like there is for ire_t, dce_t, or even ncec_t, but this is
     1193 +         * the ONLY case that'll break the mh_trace that IP debugging uses for
     1194 +         * reference counts (i.e. they assume same thread releases as
     1195 +         * holds). Instead, we inline ill_refhold() here.  We must do the same
     1196 +         * in the release done by the ncec_mcast_reap() above.
     1197 +         */
     1198 +        mutex_enter(&ill->ill_lock);
     1199 +        ill->ill_refcnt++;
     1200 +        mutex_exit(&ill->ill_lock);
     1201 +
     1202 +        while (current != NULL) {
     1203 +                ASSERT3P(ill, ==, current->nce_ill);
     1204 +                deadman = current;
     1205 +                current = list_next(graveyard, deadman);
     1206 +                list_remove(graveyard, deadman);
     1207 +                ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
     1208 +                    0);
     1209 +                nce_refrele(deadman);
     1210 +        }
     1211 +        list_destroy(graveyard);
     1212 +
     1213 +        mutex_enter(&ill->ill_lock);
     1214 +        if (ill->ill_mcast_ncec_cleanup)
     1215 +                doit = B_FALSE;
     1216 +        else {
     1217 +                ill->ill_mcast_ncec_cleanup = B_TRUE;
     1218 +                doit = B_TRUE;
     1219 +        }
     1220 +        mutex_exit(&ill->ill_lock);
     1221 +        if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
     1222 +            ill, TQ_NOSLEEP) == NULL) {
     1223 +                mutex_enter(&ill->ill_lock);
     1224 +                if (doit) {
     1225 +                        IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
     1226 +                        ill->ill_mcast_ncec_cleanup = B_FALSE;
     1227 +                }
     1228 +                /* There's no _notr() for ill_refrele(), so inline it here. */
     1229 +                ill->ill_refcnt--;
     1230 +                if (ill->ill_refcnt == 0)
     1231 +                        ipif_ill_refrele_tail(ill);     /* Drops ill_lock */
     1232 +                else
     1233 +                        mutex_exit(&ill->ill_lock);
     1234 +        }
     1235 +}
     1236 +
     1237 +/*
1008 1238   * For each interface an entry is added for the unspecified multicast group.
1009 1239   * Here that mapping is used to form the multicast cache entry for a particular
1010 1240   * multicast destination.
1011 1241   */
1012 1242  static int
1013 1243  nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1014 1244      uint16_t flags, nce_t **newnce)
1015 1245  {
1016 1246          uchar_t         *hw_addr;
1017 1247          int             err = 0;
1018 1248          ip_stack_t      *ipst = ill->ill_ipst;
1019 1249          nce_t           *nce;
1020 1250  
1021 1251          ASSERT(ill != NULL);
1022 1252          ASSERT(ill->ill_isv6);
1023 1253          ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1024 1254  
1025 1255          mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1026 1256          nce = nce_lookup_addr(ill, dst);
1027 1257          if (nce != NULL) {
1028 1258                  mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1029 1259                  goto done;
1030 1260          }
1031 1261          if (ill->ill_net_type == IRE_IF_RESOLVER) {
1032 1262                  /*
1033 1263                   * For IRE_IF_RESOLVER a hardware mapping can be
1034 1264                   * generated.
1035 1265                   */
1036 1266                  hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1037 1267                  if (hw_addr == NULL) {
1038 1268                          mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1039 1269                          return (ENOMEM);
1040 1270                  }
1041 1271                  ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1042 1272          } else {
  
    | 
      ↓ open down ↓ | 
    25 lines elided | 
    
      ↑ open up ↑ | 
  
1043 1273                  /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1044 1274                  hw_addr = NULL;
1045 1275          }
1046 1276          ASSERT((flags & NCE_F_MCAST) != 0);
1047 1277          ASSERT((flags & NCE_F_NONUD) != 0);
1048 1278          /* nce_state will be computed by nce_add_common() */
1049 1279          err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1050 1280              ND_UNCHANGED, &nce);
1051 1281          mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1052 1282          if (err == 0)
1053      -                err = nce_add_v6_postprocess(nce);
     1283 +                err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
1054 1284          if (hw_addr != NULL)
1055 1285                  kmem_free(hw_addr, ill->ill_nd_lla_len);
1056 1286          if (err != 0) {
1057 1287                  ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1058 1288                  return (err);
1059 1289          }
1060 1290  done:
1061 1291          ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1062 1292          if (newnce != NULL)
1063 1293                  *newnce = nce;
1064 1294          else
1065 1295                  nce_refrele(nce);
1066 1296          return (0);
1067 1297  }
1068 1298  
1069 1299  /*
1070 1300   * Return the link layer address, and any flags of a ncec.
1071 1301   */
1072 1302  int
1073 1303  ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1074 1304  {
1075 1305          ncec_t          *ncec;
1076 1306          in6_addr_t      *addr;
1077 1307          sin6_t          *sin6;
1078 1308  
1079 1309          ASSERT(ill != NULL && ill->ill_isv6);
1080 1310          sin6 = (sin6_t *)&lnr->lnr_addr;
1081 1311          addr =  &sin6->sin6_addr;
1082 1312  
1083 1313          /*
1084 1314           * NOTE: if the ill is an IPMP interface, then match against the whole
1085 1315           * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1086 1316           * addresses for the data addresses on an IPMP interface even though
1087 1317           * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1088 1318           */
1089 1319          ncec = ncec_lookup_illgrp_v6(ill, addr);
1090 1320          if (ncec == NULL)
1091 1321                  return (ESRCH);
1092 1322          /* If no link layer address is available yet, return ESRCH */
1093 1323          if (!NCE_ISREACHABLE(ncec)) {
1094 1324                  ncec_refrele(ncec);
1095 1325                  return (ESRCH);
1096 1326          }
1097 1327          lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1098 1328          bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1099 1329              lnr->lnr_hdw_len);
1100 1330          if (ncec->ncec_flags & NCE_F_ISROUTER)
1101 1331                  lnr->lnr_flags = NDF_ISROUTER_ON;
1102 1332          if (ncec->ncec_flags & NCE_F_ANYCAST)
1103 1333                  lnr->lnr_flags |= NDF_ANYCAST_ON;
1104 1334          if (ncec->ncec_flags & NCE_F_STATIC)
1105 1335                  lnr->lnr_flags |= NDF_STATIC;
1106 1336          ncec_refrele(ncec);
1107 1337          return (0);
1108 1338  }
1109 1339  
1110 1340  /*
1111 1341   * Finish setting up the Enable/Disable multicast for the driver.
1112 1342   */
1113 1343  mblk_t *
1114 1344  ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1115 1345      uint32_t hw_addr_offset, mblk_t *mp)
1116 1346  {
1117 1347          uchar_t         *hw_addr;
1118 1348          ipaddr_t        v4group;
1119 1349          uchar_t         *addr;
1120 1350  
1121 1351          ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1122 1352          if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1123 1353                  IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1124 1354  
1125 1355                  ASSERT(CLASSD(v4group));
1126 1356                  ASSERT(!(ill->ill_isv6));
1127 1357  
1128 1358                  addr = (uchar_t *)&v4group;
1129 1359          } else {
1130 1360                  ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1131 1361                  ASSERT(ill->ill_isv6);
1132 1362  
1133 1363                  addr = (uchar_t *)v6group;
1134 1364          }
1135 1365          hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1136 1366          if (hw_addr == NULL) {
1137 1367                  ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1138 1368                  freemsg(mp);
1139 1369                  return (NULL);
1140 1370          }
1141 1371  
1142 1372          ip_mcast_mapping(ill, addr, hw_addr);
1143 1373          return (mp);
1144 1374  }
1145 1375  
1146 1376  void
1147 1377  ip_ndp_resolve(ncec_t *ncec)
1148 1378  {
1149 1379          in_addr_t       sender4 = INADDR_ANY;
1150 1380          in6_addr_t      sender6 = ipv6_all_zeros;
1151 1381          ill_t           *src_ill;
1152 1382          uint32_t        ms;
1153 1383  
1154 1384          src_ill = nce_resolve_src(ncec, &sender6);
1155 1385          if (src_ill == NULL) {
1156 1386                  /* Make sure we try again later */
1157 1387                  ms = ncec->ncec_ill->ill_reachable_retrans_time;
1158 1388                  nce_restart_timer(ncec, (clock_t)ms);
1159 1389                  return;
1160 1390          }
1161 1391          if (ncec->ncec_ipversion == IPV4_VERSION)
1162 1392                  IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1163 1393          mutex_enter(&ncec->ncec_lock);
1164 1394          if (ncec->ncec_ipversion == IPV6_VERSION)
1165 1395                  ms = ndp_solicit(ncec, sender6, src_ill);
1166 1396          else
1167 1397                  ms = arp_request(ncec, sender4, src_ill);
1168 1398          mutex_exit(&ncec->ncec_lock);
1169 1399          if (ms == 0) {
1170 1400                  if (ncec->ncec_state != ND_REACHABLE) {
1171 1401                          if (ncec->ncec_ipversion == IPV6_VERSION)
1172 1402                                  ndp_resolv_failed(ncec);
1173 1403                          else
1174 1404                                  arp_resolv_failed(ncec);
1175 1405                          ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1176 1406                          nce_make_unreachable(ncec);
1177 1407                          ncec_delete(ncec);
1178 1408                  }
1179 1409          } else {
1180 1410                  nce_restart_timer(ncec, (clock_t)ms);
1181 1411          }
1182 1412  done:
1183 1413          ill_refrele(src_ill);
1184 1414  }
1185 1415  
1186 1416  /*
1187 1417   * Send an IPv6 neighbor solicitation.
1188 1418   * Returns number of milliseconds after which we should either rexmit or abort.
1189 1419   * Return of zero means we should abort.
1190 1420   * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1191 1421   * The optional source address is used as a hint to ndp_solicit for
1192 1422   * which source to use in the packet.
1193 1423   *
1194 1424   * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1195 1425   * the packet.
1196 1426   */
1197 1427  uint32_t
1198 1428  ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1199 1429  {
1200 1430          in6_addr_t      dst;
1201 1431          boolean_t       dropped = B_FALSE;
1202 1432  
1203 1433          ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1204 1434          ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1205 1435  
1206 1436          if (ncec->ncec_rcnt == 0)
1207 1437                  return (0);
1208 1438  
1209 1439          dst = ncec->ncec_addr;
1210 1440          ncec->ncec_rcnt--;
1211 1441          mutex_exit(&ncec->ncec_lock);
1212 1442          dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1213 1443              ill->ill_phys_addr_length, &src, &dst, 0);
1214 1444          mutex_enter(&ncec->ncec_lock);
1215 1445          if (dropped)
1216 1446                  ncec->ncec_rcnt++;
1217 1447          return (ncec->ncec_ill->ill_reachable_retrans_time);
1218 1448  }
1219 1449  
1220 1450  /*
1221 1451   * Attempt to recover an address on an interface that's been marked as a
1222 1452   * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1223 1453   * no easy way to just probe the address and have the right thing happen if
1224 1454   * it's no longer in use.  Instead, we just bring it up normally and allow the
1225 1455   * regular interface start-up logic to probe for a remaining duplicate and take
1226 1456   * us back down if necessary.
1227 1457   * Neither DHCP nor temporary addresses arrive here; they're excluded by
1228 1458   * ip_ndp_excl.
1229 1459   */
1230 1460  /* ARGSUSED */
1231 1461  void
1232 1462  ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1233 1463  {
1234 1464          ill_t   *ill = rq->q_ptr;
1235 1465          ipif_t  *ipif;
1236 1466          in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1237 1467          in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1238 1468          boolean_t addr_equal;
1239 1469  
1240 1470          for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1241 1471                  /*
1242 1472                   * We do not support recovery of proxy ARP'd interfaces,
1243 1473                   * because the system lacks a complete proxy ARP mechanism.
1244 1474                   */
1245 1475                  if (ill->ill_isv6) {
1246 1476                          addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1247 1477                              addr6);
1248 1478                  } else {
1249 1479                          addr_equal = (ipif->ipif_lcl_addr == *addr4);
1250 1480                  }
1251 1481  
1252 1482                  if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1253 1483                          continue;
1254 1484  
1255 1485                  /*
1256 1486                   * If we have already recovered or if the interface is going
1257 1487                   * away, then ignore.
1258 1488                   */
1259 1489                  mutex_enter(&ill->ill_lock);
1260 1490                  if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1261 1491                      (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1262 1492                          mutex_exit(&ill->ill_lock);
1263 1493                          continue;
1264 1494                  }
1265 1495  
1266 1496                  ipif->ipif_flags &= ~IPIF_DUPLICATE;
1267 1497                  ill->ill_ipif_dup_count--;
1268 1498                  mutex_exit(&ill->ill_lock);
1269 1499                  ipif->ipif_was_dup = B_TRUE;
1270 1500  
1271 1501                  if (ill->ill_isv6) {
1272 1502                          VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1273 1503                          (void) ipif_up_done_v6(ipif);
1274 1504                  } else {
1275 1505                          VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1276 1506                              EINPROGRESS);
1277 1507                          (void) ipif_up_done(ipif);
1278 1508                  }
1279 1509          }
1280 1510          freeb(mp);
1281 1511  }
1282 1512  
1283 1513  /*
1284 1514   * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1285 1515   * As long as someone else holds the address, the interface will stay down.
1286 1516   * When that conflict goes away, the interface is brought back up.  This is
1287 1517   * done so that accidental shutdowns of addresses aren't made permanent.  Your
1288 1518   * server will recover from a failure.
1289 1519   *
1290 1520   * For DHCP and temporary addresses, recovery is not done in the kernel.
1291 1521   * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1292 1522   *
1293 1523   * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1294 1524   */
1295 1525  void
1296 1526  ipif_dup_recovery(void *arg)
1297 1527  {
1298 1528          ipif_t *ipif = arg;
1299 1529  
1300 1530          ipif->ipif_recovery_id = 0;
1301 1531          if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1302 1532                  return;
1303 1533  
1304 1534          /*
1305 1535           * No lock, because this is just an optimization.
1306 1536           */
1307 1537          if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1308 1538                  return;
1309 1539  
1310 1540          /* If the link is down, we'll retry this later */
1311 1541          if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1312 1542                  return;
1313 1543  
1314 1544          ipif_do_recovery(ipif);
1315 1545  }
1316 1546  
1317 1547  /*
1318 1548   * Perform interface recovery by forcing the duplicate interfaces up and
1319 1549   * allowing the system to determine which ones should stay up.
1320 1550   *
1321 1551   * Called both by recovery timer expiry and link-up notification.
1322 1552   */
1323 1553  void
1324 1554  ipif_do_recovery(ipif_t *ipif)
1325 1555  {
1326 1556          ill_t *ill = ipif->ipif_ill;
1327 1557          mblk_t *mp;
1328 1558          ip_stack_t *ipst = ill->ill_ipst;
1329 1559          size_t mp_size;
1330 1560  
1331 1561          if (ipif->ipif_isv6)
1332 1562                  mp_size = sizeof (ipif->ipif_v6lcl_addr);
1333 1563          else
1334 1564                  mp_size = sizeof (ipif->ipif_lcl_addr);
1335 1565          mp = allocb(mp_size, BPRI_MED);
1336 1566          if (mp == NULL) {
1337 1567                  mutex_enter(&ill->ill_lock);
1338 1568                  if (ipst->ips_ip_dup_recovery > 0 &&
1339 1569                      ipif->ipif_recovery_id == 0 &&
1340 1570                      !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1341 1571                          ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1342 1572                              ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1343 1573                  }
1344 1574                  mutex_exit(&ill->ill_lock);
1345 1575          } else {
1346 1576                  /*
1347 1577                   * A recovery timer may still be running if we got here from
1348 1578                   * ill_restart_dad(); cancel that timer.
1349 1579                   */
1350 1580                  if (ipif->ipif_recovery_id != 0)
1351 1581                          (void) untimeout(ipif->ipif_recovery_id);
1352 1582                  ipif->ipif_recovery_id = 0;
1353 1583  
1354 1584                  if (ipif->ipif_isv6) {
1355 1585                          bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1356 1586                              sizeof (ipif->ipif_v6lcl_addr));
1357 1587                  } else  {
1358 1588                          bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1359 1589                              sizeof (ipif->ipif_lcl_addr));
1360 1590                  }
1361 1591                  ill_refhold(ill);
1362 1592                  qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1363 1593                      B_FALSE);
1364 1594          }
1365 1595  }
1366 1596  
1367 1597  /*
1368 1598   * Find the MAC and IP addresses in an NA/NS message.
1369 1599   */
1370 1600  static void
1371 1601  ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1372 1602      in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1373 1603  {
1374 1604          icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1375 1605          nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1376 1606          uchar_t *addr;
1377 1607          int alen;
1378 1608  
1379 1609          /* icmp_inbound_v6 ensures this */
1380 1610          ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1381 1611  
1382 1612          addr = ira->ira_l2src;
1383 1613          alen = ill->ill_phys_addr_length;
1384 1614          if (alen > 0) {
1385 1615                  *haddr = addr;
1386 1616                  *haddrlenp = alen;
1387 1617          } else {
1388 1618                  *haddr = NULL;
1389 1619                  *haddrlenp = 0;
1390 1620          }
1391 1621  
1392 1622          /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1393 1623          *targp = ns->nd_ns_target;
1394 1624  }
1395 1625  
1396 1626  /*
1397 1627   * This is for exclusive changes due to NDP duplicate address detection
1398 1628   * failure.
1399 1629   */
1400 1630  /* ARGSUSED */
1401 1631  static void
1402 1632  ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1403 1633  {
1404 1634          ill_t   *ill = rq->q_ptr;
1405 1635          ipif_t  *ipif;
1406 1636          uchar_t *haddr;
1407 1637          uint_t  haddrlen;
1408 1638          ip_stack_t *ipst = ill->ill_ipst;
1409 1639          in6_addr_t targ;
1410 1640          ip_recv_attr_t iras;
1411 1641          mblk_t  *attrmp;
1412 1642  
1413 1643          attrmp = mp;
1414 1644          mp = mp->b_cont;
1415 1645          attrmp->b_cont = NULL;
1416 1646          if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1417 1647                  /* The ill or ip_stack_t disappeared on us */
1418 1648                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1419 1649                  ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1420 1650                  freemsg(mp);
1421 1651                  ira_cleanup(&iras, B_TRUE);
1422 1652                  return;
1423 1653          }
1424 1654  
1425 1655          ASSERT(ill == iras.ira_rill);
1426 1656  
1427 1657          ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1428 1658          if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1429 1659                  /*
1430 1660                   * Ignore conflicts generated by misbehaving switches that
1431 1661                   * just reflect our own messages back to us.  For IPMP, we may
1432 1662                   * see reflections across any ill in the illgrp.
1433 1663                   *
1434 1664                   * RFC2462 and revisions tried to detect both the case
1435 1665                   * when a statically configured IPv6 address is a duplicate,
1436 1666                   * and the case when the L2 address itself is a duplicate. The
1437 1667                   * later is important because, with stateles address autoconf,
1438 1668                   * if the L2 address is a duplicate, the resulting IPv6
1439 1669                   * address(es) would also be duplicates. We rely on DAD of the
1440 1670                   * IPv6 address itself to detect the latter case.
1441 1671                   */
1442 1672                  /* For an under ill_grp can change under lock */
1443 1673                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1444 1674                  if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1445 1675                      IS_UNDER_IPMP(ill) &&
1446 1676                      ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1447 1677                      haddrlen) != NULL) {
1448 1678                          rw_exit(&ipst->ips_ill_g_lock);
1449 1679                          goto ignore_conflict;
1450 1680                  }
1451 1681                  rw_exit(&ipst->ips_ill_g_lock);
1452 1682          }
1453 1683  
1454 1684          /*
1455 1685           * Look up the appropriate ipif.
1456 1686           */
1457 1687          ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1458 1688          if (ipif == NULL)
1459 1689                  goto ignore_conflict;
1460 1690  
1461 1691          /* Reload the ill to match the ipif */
1462 1692          ill = ipif->ipif_ill;
1463 1693  
1464 1694          /* If it's already duplicate or ineligible, then don't do anything. */
1465 1695          if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1466 1696                  ipif_refrele(ipif);
1467 1697                  goto ignore_conflict;
1468 1698          }
1469 1699  
1470 1700          /*
1471 1701           * If this is a failure during duplicate recovery, then don't
1472 1702           * complain.  It may take a long time to recover.
1473 1703           */
1474 1704          if (!ipif->ipif_was_dup) {
1475 1705                  char ibuf[LIFNAMSIZ];
1476 1706                  char hbuf[MAC_STR_LEN];
1477 1707                  char sbuf[INET6_ADDRSTRLEN];
1478 1708  
1479 1709                  ipif_get_name(ipif, ibuf, sizeof (ibuf));
1480 1710                  cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1481 1711                      " disabled", ibuf,
1482 1712                      inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1483 1713                      mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1484 1714          }
1485 1715          mutex_enter(&ill->ill_lock);
1486 1716          ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1487 1717          ipif->ipif_flags |= IPIF_DUPLICATE;
1488 1718          ill->ill_ipif_dup_count++;
1489 1719          mutex_exit(&ill->ill_lock);
1490 1720          (void) ipif_down(ipif, NULL, NULL);
1491 1721          (void) ipif_down_tail(ipif);
1492 1722          mutex_enter(&ill->ill_lock);
1493 1723          if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1494 1724              ill->ill_net_type == IRE_IF_RESOLVER &&
1495 1725              !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1496 1726              ipst->ips_ip_dup_recovery > 0) {
1497 1727                  ASSERT(ipif->ipif_recovery_id == 0);
1498 1728                  ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1499 1729                      ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1500 1730          }
1501 1731          mutex_exit(&ill->ill_lock);
1502 1732          ipif_refrele(ipif);
1503 1733  
1504 1734  ignore_conflict:
1505 1735          freemsg(mp);
1506 1736          ira_cleanup(&iras, B_TRUE);
1507 1737  }
1508 1738  
1509 1739  /*
1510 1740   * Handle failure by tearing down the ipifs with the specified address.  Note
1511 1741   * that tearing down the ipif also means deleting the ncec through ipif_down, so
1512 1742   * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1513 1743   * we start a timer on the ipif.
1514 1744   * Caller has to free mp;
1515 1745   */
1516 1746  static void
1517 1747  ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1518 1748  {
1519 1749          const uchar_t   *haddr;
1520 1750          ill_t           *ill = ira->ira_rill;
1521 1751  
1522 1752          /*
1523 1753           * Ignore conflicts generated by misbehaving switches that just
1524 1754           * reflect our own messages back to us.
1525 1755           */
1526 1756  
1527 1757          /* icmp_inbound_v6 ensures this */
1528 1758          ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1529 1759          haddr = ira->ira_l2src;
1530 1760          if (haddr != NULL &&
1531 1761              bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1532 1762                  return;
1533 1763          }
1534 1764  
1535 1765          if ((mp = copymsg(mp)) != NULL) {
1536 1766                  mblk_t  *attrmp;
1537 1767  
1538 1768                  attrmp = ip_recv_attr_to_mblk(ira);
1539 1769                  if (attrmp == NULL) {
1540 1770                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1541 1771                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
1542 1772                          freemsg(mp);
1543 1773                  } else {
1544 1774                          ASSERT(attrmp->b_cont == NULL);
1545 1775                          attrmp->b_cont = mp;
1546 1776                          mp = attrmp;
1547 1777                          ill_refhold(ill);
1548 1778                          qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1549 1779                              B_FALSE);
1550 1780                  }
1551 1781          }
1552 1782  }
1553 1783  
1554 1784  /*
1555 1785   * Handle a discovered conflict: some other system is advertising that it owns
1556 1786   * one of our IP addresses.  We need to defend ourselves, or just shut down the
1557 1787   * interface.
1558 1788   *
1559 1789   * Handles both IPv4 and IPv6
1560 1790   */
1561 1791  boolean_t
1562 1792  ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1563 1793  {
1564 1794          ipif_t          *ipif;
1565 1795          clock_t         now;
1566 1796          uint_t          maxdefense;
1567 1797          uint_t          defs;
1568 1798          ill_t           *ill = ira->ira_ill;
1569 1799          ip_stack_t      *ipst = ill->ill_ipst;
1570 1800          uint32_t        elapsed;
1571 1801          boolean_t       isv6 = ill->ill_isv6;
1572 1802          ipaddr_t        ncec_addr;
1573 1803  
1574 1804          if (isv6) {
1575 1805                  ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1576 1806                      ipst);
1577 1807          } else {
1578 1808                  if (arp_no_defense) {
1579 1809                          /*
1580 1810                           * Yes, there is a conflict, but no, we do not
1581 1811                           * defend ourself.
1582 1812                           */
1583 1813                          return (B_TRUE);
1584 1814                  }
1585 1815                  IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1586 1816                  ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1587 1817                      ipst);
1588 1818          }
1589 1819          if (ipif == NULL)
1590 1820                  return (B_FALSE);
1591 1821  
1592 1822          /*
1593 1823           * First, figure out if this address is disposable.
1594 1824           */
1595 1825          if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1596 1826                  maxdefense = ipst->ips_ip_max_temp_defend;
1597 1827          else
1598 1828                  maxdefense = ipst->ips_ip_max_defend;
1599 1829  
1600 1830          /*
1601 1831           * Now figure out how many times we've defended ourselves.  Ignore
1602 1832           * defenses that happened long in the past.
1603 1833           */
1604 1834          now = ddi_get_lbolt();
1605 1835          elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1606 1836          mutex_enter(&ncec->ncec_lock);
1607 1837          if ((defs = ncec->ncec_defense_count) > 0 &&
1608 1838              elapsed > ipst->ips_ip_defend_interval) {
1609 1839                  /*
1610 1840                   * ip_defend_interval has elapsed.
1611 1841                   * reset the defense count.
1612 1842                   */
1613 1843                  ncec->ncec_defense_count = defs = 0;
1614 1844          }
1615 1845          ncec->ncec_defense_count++;
1616 1846          ncec->ncec_last_time_defended = now;
1617 1847          mutex_exit(&ncec->ncec_lock);
1618 1848          ipif_refrele(ipif);
1619 1849  
1620 1850          /*
1621 1851           * If we've defended ourselves too many times already, then give up and
1622 1852           * tear down the interface(s) using this address.
1623 1853           * Otherwise, caller has to defend by sending out an announce.
1624 1854           */
1625 1855          if (defs >= maxdefense) {
1626 1856                  if (isv6)
1627 1857                          ndp_failure(mp, ira);
1628 1858                  else
1629 1859                          arp_failure(mp, ira);
1630 1860          } else {
1631 1861                  return (B_TRUE); /* caller must defend this address */
1632 1862          }
1633 1863          return (B_FALSE);
1634 1864  }
1635 1865  
1636 1866  /*
1637 1867   * Handle reception of Neighbor Solicitation messages.
1638 1868   */
1639 1869  static void
1640 1870  ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1641 1871  {
1642 1872          ill_t           *ill = ira->ira_ill, *under_ill;
1643 1873          nd_neighbor_solicit_t *ns;
1644 1874          uint32_t        hlen = ill->ill_phys_addr_length;
1645 1875          uchar_t         *haddr = NULL;
1646 1876          icmp6_t         *icmp_nd;
1647 1877          ip6_t           *ip6h;
1648 1878          ncec_t          *our_ncec = NULL;
1649 1879          in6_addr_t      target;
1650 1880          in6_addr_t      src;
1651 1881          int             len;
1652 1882          int             flag = 0;
1653 1883          nd_opt_hdr_t    *opt = NULL;
1654 1884          boolean_t       bad_solicit = B_FALSE;
1655 1885          mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
1656 1886          boolean_t       need_ill_refrele = B_FALSE;
1657 1887  
1658 1888          ip6h = (ip6_t *)mp->b_rptr;
1659 1889          icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1660 1890          len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1661 1891          src = ip6h->ip6_src;
1662 1892          ns = (nd_neighbor_solicit_t *)icmp_nd;
1663 1893          target = ns->nd_ns_target;
1664 1894          if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1665 1895              IN6_IS_ADDR_LOOPBACK(&target)) {
1666 1896                  if (ip_debug > 2) {
1667 1897                          /* ip1dbg */
1668 1898                          pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1669 1899                              AF_INET6, &target);
1670 1900                  }
1671 1901                  bad_solicit = B_TRUE;
1672 1902                  goto done;
1673 1903          }
1674 1904          if (len > sizeof (nd_neighbor_solicit_t)) {
1675 1905                  /* Options present */
1676 1906                  opt = (nd_opt_hdr_t *)&ns[1];
1677 1907                  len -= sizeof (nd_neighbor_solicit_t);
1678 1908                  if (!ndp_verify_optlen(opt, len)) {
1679 1909                          ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1680 1910                          bad_solicit = B_TRUE;
1681 1911                          goto done;
1682 1912                  }
1683 1913          }
1684 1914          if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1685 1915                  /* Check to see if this is a valid DAD solicitation */
1686 1916                  if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1687 1917                          if (ip_debug > 2) {
1688 1918                                  /* ip1dbg */
1689 1919                                  pr_addr_dbg("ndp_input_solicit: IPv6 "
1690 1920                                      "Destination is not solicited node "
1691 1921                                      "multicast %s\n", AF_INET6,
1692 1922                                      &ip6h->ip6_dst);
1693 1923                          }
1694 1924                          bad_solicit = B_TRUE;
1695 1925                          goto done;
1696 1926                  }
1697 1927          }
1698 1928  
1699 1929          /*
1700 1930           * NOTE: with IPMP, it's possible the nominated multicast ill (which
1701 1931           * received this packet if it's multicast) is not the ill tied to
1702 1932           * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1703 1933           * to ensure we find the associated NCE.
1704 1934           */
1705 1935          our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1706 1936          /*
1707 1937           * If this is a valid Solicitation for an address we are publishing,
1708 1938           * then a PUBLISH entry should exist in the cache
1709 1939           */
1710 1940          if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1711 1941                  ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1712 1942                      "ifname=%s ", ill->ill_name));
1713 1943                  if (ip_debug > 2) {
1714 1944                          /* ip1dbg */
1715 1945                          pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1716 1946                  }
1717 1947                  if (our_ncec == NULL)
1718 1948                          bad_solicit = B_TRUE;
1719 1949                  goto done;
1720 1950          }
1721 1951  
1722 1952          /* At this point we should have a verified NS per spec */
1723 1953          if (opt != NULL) {
1724 1954                  opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1725 1955                  if (opt != NULL) {
1726 1956                          haddr = (uchar_t *)&opt[1];
1727 1957                          if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1728 1958                              hlen == 0) {
1729 1959                                  ip1dbg(("ndp_input_advert: bad SLLA\n"));
1730 1960                                  bad_solicit = B_TRUE;
1731 1961                                  goto done;
1732 1962                          }
1733 1963                  }
1734 1964          }
1735 1965  
1736 1966          /* If sending directly to peer, set the unicast flag */
1737 1967          if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1738 1968                  flag |= NDP_UNICAST;
1739 1969  
1740 1970          /*
1741 1971           * Create/update the entry for the soliciting node on the ipmp_ill.
1742 1972           * or respond to outstanding queries, don't if
1743 1973           * the source is unspecified address.
1744 1974           */
1745 1975          if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1746 1976                  int     err;
1747 1977                  nce_t   *nnce;
1748 1978  
1749 1979                  ASSERT(ill->ill_isv6);
1750 1980                  /*
1751 1981                   * Regular solicitations *must* include the Source Link-Layer
1752 1982                   * Address option.  Ignore messages that do not.
1753 1983                   */
1754 1984                  if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1755 1985                          ip1dbg(("ndp_input_solicit: source link-layer address "
1756 1986                              "option missing with a specified source.\n"));
1757 1987                          bad_solicit = B_TRUE;
1758 1988                          goto done;
1759 1989                  }
1760 1990  
1761 1991                  /*
1762 1992                   * This is a regular solicitation.  If we're still in the
1763 1993                   * process of verifying the address, then don't respond at all
1764 1994                   * and don't keep track of the sender.
1765 1995                   */
1766 1996                  if (our_ncec->ncec_state == ND_PROBE)
1767 1997                          goto done;
1768 1998  
1769 1999                  /*
1770 2000                   * If the solicitation doesn't have sender hardware address
1771 2001                   * (legal for unicast solicitation), then process without
1772 2002                   * installing the return NCE.  Either we already know it, or
1773 2003                   * we'll be forced to look it up when (and if) we reply to the
1774 2004                   * packet.
1775 2005                   */
1776 2006                  if (haddr == NULL)
1777 2007                          goto no_source;
1778 2008  
1779 2009                  under_ill = ill;
1780 2010                  if (IS_UNDER_IPMP(under_ill)) {
1781 2011                          ill = ipmp_ill_hold_ipmp_ill(under_ill);
1782 2012                          if (ill == NULL)
1783 2013                                  ill = under_ill;
1784 2014                          else
1785 2015                                  need_ill_refrele = B_TRUE;
1786 2016                  }
1787 2017                  err = nce_lookup_then_add_v6(ill,
1788 2018                      haddr, hlen,
1789 2019                      &src,       /* Soliciting nodes address */
1790 2020                      0,
1791 2021                      ND_STALE,
1792 2022                      &nnce);
1793 2023  
1794 2024                  if (need_ill_refrele) {
1795 2025                          ill_refrele(ill);
1796 2026                          ill = under_ill;
1797 2027                          need_ill_refrele =  B_FALSE;
1798 2028                  }
1799 2029                  switch (err) {
1800 2030                  case 0:
1801 2031                          /* done with this entry */
1802 2032                          nce_refrele(nnce);
1803 2033                          break;
1804 2034                  case EEXIST:
1805 2035                          /*
1806 2036                           * B_FALSE indicates this is not an advertisement.
1807 2037                           */
1808 2038                          nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1809 2039                          nce_refrele(nnce);
1810 2040                          break;
1811 2041                  default:
1812 2042                          ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1813 2043                              err));
1814 2044                          goto done;
1815 2045                  }
1816 2046  no_source:
1817 2047                  flag |= NDP_SOLICITED;
1818 2048          } else {
1819 2049                  /*
1820 2050                   * No source link layer address option should be present in a
1821 2051                   * valid DAD request.
1822 2052                   */
1823 2053                  if (haddr != NULL) {
1824 2054                          ip1dbg(("ndp_input_solicit: source link-layer address "
1825 2055                              "option present with an unspecified source.\n"));
1826 2056                          bad_solicit = B_TRUE;
1827 2057                          goto done;
1828 2058                  }
1829 2059                  if (our_ncec->ncec_state == ND_PROBE) {
1830 2060                          /*
1831 2061                           * Internally looped-back probes will have
1832 2062                           * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1833 2063                           * transmissions.
1834 2064                           */
1835 2065                          if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1836 2066                                  /*
1837 2067                                   * If someone else is probing our address, then
1838 2068                                   * we've crossed wires.  Declare failure.
1839 2069                                   */
1840 2070                                  ndp_failure(mp, ira);
1841 2071                          }
1842 2072                          goto done;
1843 2073                  }
1844 2074                  /*
1845 2075                   * This is a DAD probe.  Multicast the advertisement to the
1846 2076                   * all-nodes address.
1847 2077                   */
1848 2078                  src = ipv6_all_hosts_mcast;
1849 2079          }
1850 2080          flag |= nce_advert_flags(our_ncec);
1851 2081          (void) ndp_xmit(ill,
1852 2082              ND_NEIGHBOR_ADVERT,
1853 2083              our_ncec->ncec_lladdr,
1854 2084              our_ncec->ncec_lladdr_length,
1855 2085              &target,    /* Source and target of the advertisement pkt */
1856 2086              &src,       /* IP Destination (source of original pkt) */
1857 2087              flag);
1858 2088  done:
1859 2089          if (bad_solicit)
1860 2090                  BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1861 2091          if (our_ncec != NULL)
1862 2092                  ncec_refrele(our_ncec);
1863 2093  }
1864 2094  
1865 2095  /*
1866 2096   * Handle reception of Neighbor Advertisement messages.
            *
            * The advertisement is dropped, and
            * ipv6IfIcmpInBadNeighborAdvertisements is bumped, when the IP
            * destination is multicast but the Solicited flag is set, when the
            * target is a multicast, v4-mapped or loopback address, or when the
            * options fail validation.  If no NCE is found for the target the
            * message is ignored.  An advertisement for an address that we
            * publish is treated as a potential address conflict; any other
            * advertisement is handed to nce_process().
            *
            * This function does not call freemsg() on mp.
1867 2097   */
1868 2098  void
1869 2099  ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1870 2100  {
1871 2101          ill_t           *ill = ira->ira_ill;
1872 2102          nd_neighbor_advert_t *na;
1873 2103          uint32_t        hlen = ill->ill_phys_addr_length;
1874 2104          uchar_t         *haddr = NULL;
1875 2105          icmp6_t         *icmp_nd;
1876 2106          ip6_t           *ip6h;
1877 2107          ncec_t          *dst_ncec = NULL;
1878 2108          in6_addr_t      target;
1879 2109          nd_opt_hdr_t    *opt = NULL;
1880 2110          int             len;
1881 2111          ip_stack_t      *ipst = ill->ill_ipst;
1882 2112          mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
1883 2113  
1884 2114          ip6h = (ip6_t *)mp->b_rptr;
1885 2115          icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1886 2116          len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1887 2117          na = (nd_neighbor_advert_t *)icmp_nd;
1888 2118  
1889 2119          if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1890 2120              (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1891 2121                  ip1dbg(("ndp_input_advert: Destination is multicast but the "
1892 2122                      "solicited flag is not zero\n"));
1893 2123                  BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1894 2124                  return;
1895 2125          }
1896 2126          target = na->nd_na_target;
1897 2127          if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1898 2128              IN6_IS_ADDR_LOOPBACK(&target)) {
1899 2129                  if (ip_debug > 2) {
1900 2130                          /* ip1dbg */
1901 2131                          pr_addr_dbg("ndp_input_advert: Martian Target %s\n",
1902 2132                              AF_INET6, &target);
1903 2133                  }
1904 2134                  BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1905 2135                  return;
1906 2136          }
1907 2137          if (len > sizeof (nd_neighbor_advert_t)) {
1908 2138                  opt = (nd_opt_hdr_t *)&na[1];
1909 2139                  if (!ndp_verify_optlen(opt,
1910 2140                      len - sizeof (nd_neighbor_advert_t))) {
1911 2141                          ip1dbg(("ndp_input_advert: cannot verify TLLA\n"));
1912 2142                          BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1913 2143                          return;
1914 2144                  }
1915 2145                  /* At this point we have a verified NA per spec */
1916 2146                  len -= sizeof (nd_neighbor_advert_t);
1917 2147                  opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1918 2148                  if (opt != NULL) {
1919 2149                          haddr = (uchar_t *)&opt[1];
                                   /*
                                    * nd_opt_len is in units of 8 octets and
                                    * includes the option header; the remainder must
                                    * be able to hold a link-layer address of hlen.
                                    */
1920 2150                          if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1921 2151                              hlen == 0) {
1922 2152                                  ip1dbg(("ndp_input_advert: bad TLLA\n"));
1923 2153                                  BUMP_MIB(mib,
1924 2154                                      ipv6IfIcmpInBadNeighborAdvertisements);
1925 2155                                  return;
1926 2156                          }
1927 2157                  }
1928 2158          }
1929 2159  
1930 2160          /*
1931 2161           * NOTE: we match across the illgrp since we need to do DAD for all of
1932 2162           * our local addresses, and those are spread across all the active
1933 2163           * ills in the group.
1934 2164           */
1935 2165          if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1936 2166                  return;
1937 2167  
1938 2168          if (NCE_PUBLISH(dst_ncec)) {
1939 2169                  /*
1940 2170                   * Someone just advertised an address that we publish. First,
1941 2171                   * check if it was us -- if so, we can safely ignore it.
1942 2172                   * We don't get the haddr from the ira_l2src because, in the
1943 2173                   * case that the packet originated from us, on an IPMP group,
1944 2174                   * the ira_l2src would be the link-layer address of the
1945 2175                   * cast_ill used to send the packet, which may not be the same
1946 2176                   * as the dst_ncec->ncec_lladdr of the address.
1947 2177                   */
1948 2178                  if (haddr != NULL) {
1949 2179                          if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1950 2180                                  goto out;
1951 2181  
1952 2182                          if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1953 2183                                  goto out;   /* from us -- no conflict */
1954 2184  
1955 2185                          /*
1956 2186                           * If we're in an IPMP group, check if this is an echo
1957 2187                           * from another ill in the group.  Use the double-
1958 2188                           * checked locking pattern to avoid grabbing
1959 2189                           * ill_g_lock in the non-IPMP case.
1960 2190                           */
1961 2191                          if (IS_UNDER_IPMP(ill)) {
1962 2192                                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1963 2193                                  if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1964 2194                                      ill->ill_grp, haddr, hlen) != NULL) {
1965 2195                                          rw_exit(&ipst->ips_ill_g_lock);
1966 2196                                          goto out;
1967 2197                                  }
1968 2198                                  rw_exit(&ipst->ips_ill_g_lock);
1969 2199                          }
1970 2200                  }
1971 2201  
1972 2202                  /*
1973 2203                   * This appears to be a real conflict.  If we're trying to
1974 2204                   * configure this NCE (ND_PROBE), then shut it down.
1975 2205                   * Otherwise, handle the discovered conflict.
1976 2206                   */
1977 2207                  if (dst_ncec->ncec_state == ND_PROBE) {
1978 2208                          ndp_failure(mp, ira);
1979 2209                  } else {
1980 2210                          if (ip_nce_conflict(mp, ira, dst_ncec)) {
1981 2211                                  char hbuf[MAC_STR_LEN];
1982 2212                                  char sbuf[INET6_ADDRSTRLEN];
1983 2213  
1984 2214                                  cmn_err(CE_WARN,
1985 2215                                      "node '%s' is using %s on %s",
1986 2216                                      inet_ntop(AF_INET6, &target, sbuf,
1987 2217                                      sizeof (sbuf)),
1988 2218                                      haddr == NULL ? "<none>" :
1989 2219                                      mac_colon_addr(haddr, hlen, hbuf,
1990 2220                                      sizeof (hbuf)), ill->ill_name);
1991 2221                                  /*
1992 2222                                   * RFC 4862, Section 5.4.4 does not mandate
1993 2223                                   * any specific behavior when an NA matches
1994 2224                                   * a non-tentative address assigned to the
1995 2225                                   * receiver. We make the choice of defending
1996 2226                                   * our address, based on the assumption that
1997 2227                                   * the sender has not detected the Duplicate.
1998 2228                                   *
1999 2229                                   * ncec_last_time_defended has been adjusted
2000 2230                                   * in ip_nce_conflict()
2001 2231                                   */
2002 2232                                  (void) ndp_announce(dst_ncec);
2003 2233                          }
2004 2234                  }
2005 2235          } else {
2006 2236                  if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2007 2237                          dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2008 2238  
2009 2239                  /* B_TRUE indicates this is an advertisement */
2010 2240                  nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2011 2241          }
2012 2242  out:
                   /* Drop the reference obtained from ncec_lookup_illgrp_v6(). */
2013 2243          ncec_refrele(dst_ncec);
2014 2244  }
2015 2245  
2016 2246  /*
2017 2247   * Process NDP neighbor solicitation/advertisement messages.
2018 2248   * The checksum has already checked o.k before reaching here.
2019 2249   * Information about the datalink header is contained in ira_l2src, but
2020 2250   * that should be ignored for loopback packets.
            *
            * The message is validated (hop limit, next header, ICMPv6 code and
            * minimum length) and then dispatched to ndp_input_solicit() or
            * ndp_input_advert().  mp is always freed before returning.  When the
            * receiving ill is under IPMP, ira->ira_ill is temporarily pointed at
            * the IPMP ill and restored on the way out.
2021 2251   */
2022 2252  void
2023 2253  ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2024 2254  {
2025 2255          ill_t           *ill = ira->ira_rill;
2026 2256          icmp6_t         *icmp_nd;
2027 2257          ip6_t           *ip6h;
2028 2258          int             len;
2029 2259          mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
2030 2260          ill_t           *orig_ill = NULL;
2031 2261  
2032 2262          /*
2033 2263           * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2034 2264           * and make it be the IPMP upper to avoid being confused by a packet
2035 2265           * addressed to a unicast address on a different ill.
2036 2266           */
2037 2267          if (IS_UNDER_IPMP(ill)) {
2038 2268                  orig_ill = ill;
2039 2269                  ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2040 2270                  if (ill == NULL) {
2041 2271                          ill = orig_ill;
2042 2272                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2043 2273                          ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2044 2274                              mp, ill);
2045 2275                          freemsg(mp);
2046 2276                          return;
2047 2277                  }
2048 2278                  ASSERT(ill != orig_ill);
                           /*
                            * From here on orig_ill holds the caller's ira_ill so
                            * that it can be restored at "done"; a non-NULL orig_ill
                            * also signals that the IPMP ill hold must be released.
                            */
2049 2279                  orig_ill = ira->ira_ill;
2050 2280                  ira->ira_ill = ill;
2051 2281                  mib = ill->ill_icmp6_mib;
2052 2282          }
2053 2283          if (!pullupmsg(mp, -1)) {
2054 2284                  ip1dbg(("ndp_input: pullupmsg failed\n"));
2055 2285                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2056 2286                  ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2057 2287                  goto done;
2058 2288          }
2059 2289          ip6h = (ip6_t *)mp->b_rptr;
2060 2290          if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2061 2291                  ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2062 2292                  ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2063 2293                  BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2064 2294                  goto done;
2065 2295          }
2066 2296          /*
2067 2297           * NDP does not accept any extension headers between the
2068 2298           * IP header and the ICMP header since e.g. a routing
2069 2299           * header could be dangerous.
2070 2300           * This assumes that any AH or ESP headers are removed
2071 2301           * by ip prior to passing the packet to ndp_input.
2072 2302           */
2073 2303          if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2074 2304                  ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2075 2305                      ip6h->ip6_nxt));
2076 2306                  ip_drop_input("Wrong next header", mp, ill);
2077 2307                  BUMP_MIB(mib, ipv6IfIcmpInErrors);
2078 2308                  goto done;
2079 2309          }
2080 2310          icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2081 2311          ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2082 2312              icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2083 2313          if (icmp_nd->icmp6_code != 0) {
2084 2314                  ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2085 2315                  ip_drop_input("code non-zero", mp, ill);
2086 2316                  BUMP_MIB(mib, ipv6IfIcmpInErrors);
2087 2317                  goto done;
2088 2318          }
2089 2319          len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2090 2320          /*
2091 2321           * Make sure packet length is large enough for either
2092 2322           * a NS or a NA icmp packet.
                    *
                    * NOTE(review): len is an int compared against a size_t
                    * expression; presumably callers guarantee at least IPV6_HDR_LEN
                    * bytes so that len is never negative -- confirm.
2093 2323           */
2094 2324          if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2095 2325                  ip1dbg(("ndp_input: packet too short\n"));
2096 2326                  ip_drop_input("packet too short", mp, ill);
2097 2327                  BUMP_MIB(mib, ipv6IfIcmpInErrors);
2098 2328                  goto done;
2099 2329          }
2100 2330          if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2101 2331                  ndp_input_solicit(mp, ira);
2102 2332          } else {
2103 2333                  ndp_input_advert(mp, ira);
2104 2334          }
2105 2335  done:
2106 2336          freemsg(mp);
                   /*
                    * orig_ill is non-NULL only when the IPMP ill was held above:
                    * release that hold and put back the original ira_ill.
                    */
2107 2337          if (orig_ill != NULL) {
2108 2338                  ill_refrele(ill);
2109 2339                  ira->ira_ill = orig_ill;
2110 2340          }
2111 2341  }
2112 2342  
2113 2343  /*
2114 2344   * ndp_xmit is called to form and transmit a ND solicitation or
2115 2345   * advertisement ICMP packet.
2116 2346   *
2117 2347   * If the source address is unspecified and this isn't a probe (used for
2118 2348   * duplicate address detection), an appropriate source address and link layer
2119 2349   * address will be chosen here.  The link layer address option is included if
2120 2350   * the source is specified (i.e., all non-probe packets), and omitted (per the
2121 2351   * specification) otherwise.
            * NOTE(review): no source address selection is visible in this function;
            * `sender' is copied into the packet as given -- confirm that callers
            * resolve the source before calling.
            *
            * For ND_NEIGHBOR_SOLICIT, `sender' is the IP source and `target' is the
            * address being solicited; the IP destination is the solicited-node
            * multicast address formed from `target' unless NDP_UNICAST is set, in
            * which case it is `target' itself.  For any other operation (neighbor
            * advertisement), `sender' is both the IP source and the advertised
            * target, and `target' is the IP destination.
2122 2352   *
2123 2353   * It returns B_TRUE only if the packet could not be allocated; otherwise
2124 2354   * it returns B_FALSE once the packet has been handed to ip_output_simple()
            * (whose return value is ignored).
2125 2355   */
2126 2356  static boolean_t
2127 2357  ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2128 2358      const in6_addr_t *sender, const in6_addr_t *target, int flag)
2129 2359  {
2130 2360          uint32_t        len;
2131 2361          icmp6_t         *icmp6;
2132 2362          mblk_t          *mp;
2133 2363          ip6_t           *ip6h;
2134 2364          nd_opt_hdr_t    *opt;
2135 2365          uint_t          plen;
2136 2366          zoneid_t        zoneid = GLOBAL_ZONEID;
2137 2367          ill_t           *hwaddr_ill = ill;
2138 2368          ip_xmit_attr_t  ixas;
2139 2369          ip_stack_t      *ipst = ill->ill_ipst;
2140 2370          boolean_t       need_refrele = B_FALSE;
2141 2371          boolean_t       probe = B_FALSE;
2142 2372  
2143 2373          if (IS_UNDER_IPMP(ill)) {
2144 2374                  probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2145 2375                  /*
2146 2376                   * We send non-probe packets on the upper IPMP interface.
2147 2377                   * ip_output_simple() will use cast_ill for sending any
2148 2378                   * multicast packets. Note that we can't follow the same
2149 2379                   * logic for probe packets because all interfaces in the ipmp
2150 2380                   * group may have failed, so that we really want to only try
2151 2381                   * to send the ND packet on the ill corresponding to the src
2152 2382                   * address.
2153 2383                   */
2154 2384                  if (!probe) {
2155 2385                          ill = ipmp_ill_hold_ipmp_ill(ill);
2156 2386                          if (ill != NULL)
2157 2387                                  need_refrele = B_TRUE;
2158 2388                          else
2159 2389                                  ill = hwaddr_ill;
2160 2390                  }
2161 2391          }
2162 2392  
2163 2393          /*
2164 2394           * If we have a unspecified source(sender) address, select a
2165 2395           * proper source address for the solicitation here itself so
2166 2396           * that we can initialize the h/w address correctly.
2167 2397           *
2168 2398           * If the sender is specified then we use this address in order
2169 2399           * to lookup the zoneid before calling ip_output_simple(). This is to
2170 2400           * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2171 2401           * by IP (we cannot guarantee that the global zone has an interface
2172 2402           * route to the destination).
2173 2403           *
2174 2404           * Note that the NA never comes here with the unspecified source
2175 2405           * address.
                    *
                    * NOTE(review): the code below only looks up the zoneid; it does
                    * not select a source address when `sender' is unspecified.
2176 2406           */
2177 2407  
2178 2408          /*
2179 2409           * Probes will have unspec src at this point.
2180 2410           */
2181 2411          if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2182 2412                  zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2183 2413                  /*
2184 2414                   * It's possible for ipif_lookup_addr_zoneid_v6() to return
2185 2415                   * ALL_ZONES if it cannot find a matching ipif for the address
2186 2416                   * we are trying to use. In this case we err on the side of
2187 2417                   * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2188 2418                   */
2189 2419                  if (zoneid == ALL_ZONES)
2190 2420                          zoneid = GLOBAL_ZONEID;
2191 2421          }
2192 2422  
                   /*
                    * plen is the link-layer address option size in 8-octet units,
                    * rounded up.  The buffer is sized for the option even if it is
                    * stripped again below.
                    */
2193 2423          plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2194 2424          len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2195 2425          mp = allocb(len,  BPRI_LO);
2196 2426          if (mp == NULL) {
2197 2427                  if (need_refrele)
2198 2428                          ill_refrele(ill);
2199 2429                  return (B_TRUE);
2200 2430          }
2201 2431  
2202 2432          bzero((char *)mp->b_rptr, len);
2203 2433          mp->b_wptr = mp->b_rptr + len;
2204 2434  
2205 2435          bzero(&ixas, sizeof (ixas));
2206 2436          ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2207 2437  
2208 2438          ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2209 2439          ixas.ixa_ipst = ipst;
2210 2440          ixas.ixa_cred = kcred;
2211 2441          ixas.ixa_cpid = NOPID;
2212 2442          ixas.ixa_tsl = NULL;
2213 2443          ixas.ixa_zoneid = zoneid;
2214 2444  
2215 2445          ip6h = (ip6_t *)mp->b_rptr;
2216 2446          ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2217 2447          ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2218 2448          ip6h->ip6_nxt = IPPROTO_ICMPV6;
2219 2449          ip6h->ip6_hops = IPV6_MAX_HOPS;
2220 2450          ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2221 2451          ip6h->ip6_dst = *target;
2222 2452          icmp6 = (icmp6_t *)&ip6h[1];
2223 2453  
                   /* opt stays NULL when the caller supplied no link-layer address. */
2224 2454          if (hw_addr_len != 0) {
2225 2455                  opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2226 2456                      sizeof (nd_neighbor_advert_t));
2227 2457          } else {
2228 2458                  opt = NULL;
2229 2459          }
2230 2460          if (operation == ND_NEIGHBOR_SOLICIT) {
2231 2461                  nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2232 2462  
2233 2463                  if (opt != NULL && !(flag & NDP_PROBE)) {
2234 2464                          /*
2235 2465                           * Note that we don't send out SLLA for ND probes
2236 2466                           * per RFC 4862, even though we do send out the src
2237 2467                           * haddr for IPv4 DAD probes, even though both IPv4
2238 2468                           * and IPv6 go out with the unspecified/INADDR_ANY
2239 2469                           * src IP addr.
2240 2470                           */
2241 2471                          opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2242 2472                  }
2243 2473                  ip6h->ip6_src = *sender;
2244 2474                  ns->nd_ns_target = *target;
2245 2475                  if (!(flag & NDP_UNICAST)) {
2246 2476                          /* Form multicast address of the target */
2247 2477                          ip6h->ip6_dst = ipv6_solicited_node_mcast;
2248 2478                          ip6h->ip6_dst.s6_addr32[3] |=
2249 2479                              ns->nd_ns_target.s6_addr32[3];
2250 2480                  }
2251 2481          } else {
2252 2482                  nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2253 2483  
2254 2484                  ASSERT(!(flag & NDP_PROBE));
2255 2485                  if (opt != NULL)
2256 2486                          opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2257 2487                  ip6h->ip6_src = *sender;
2258 2488                  na->nd_na_target = *sender;
2259 2489                  if (flag & NDP_ISROUTER)
2260 2490                          na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2261 2491                  if (flag & NDP_SOLICITED)
2262 2492                          na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2263 2493                  if (flag & NDP_ORIDE)
2264 2494                          na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2265 2495          }
2266 2496  
2267 2497          if (!(flag & NDP_PROBE)) {
2268 2498                  if (hw_addr != NULL && opt != NULL) {
2269 2499                          /* Fill in link layer address and option len */
2270 2500                          opt->nd_opt_len = (uint8_t)plen;
2271 2501                          bcopy(hw_addr, &opt[1], hw_addr_len);
2272 2502                  }
2273 2503          }
                   /*
                    * The buffer was zeroed, so an option type of 0 means no option
                    * type was assigned above (the probe case).
                    */
2274 2504          if (opt != NULL && opt->nd_opt_type == 0) {
2275 2505                  /* If there's no link layer address option, then strip it. */
2276 2506                  len -= plen * 8;
2277 2507                  mp->b_wptr = mp->b_rptr + len;
2278 2508                  ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2279 2509          }
2280 2510  
2281 2511          icmp6->icmp6_type = (uint8_t)operation;
2282 2512          icmp6->icmp6_code = 0;
2283 2513          /*
2284 2514           * Prepare for checksum by putting icmp length in the icmp
2285 2515           * checksum field. The checksum is calculated in ip_output.c.
2286 2516           */
2287 2517          icmp6->icmp6_cksum = ip6h->ip6_plen;
2288 2518  
2289 2519          (void) ip_output_simple(mp, &ixas);
2290 2520          ixa_cleanup(&ixas);
2291 2521          if (need_refrele)
2292 2522                  ill_refrele(ill);
2293 2523          return (B_FALSE);
2294 2524  }
2295 2525  
2296 2526  /*
2297 2527   * Used to set ND_UNREACHABLE before ncec_delete marks it NCE_F_CONDEMNED.
2298 2528   * The datapath uses this as an indication that there
2299 2529   * is a problem (as opposed to a NCE that was just
2300 2530   * reclaimed due to lack of memory).
2301 2531   * Note that static ARP entries never become unreachable.
            *
            * The state change is made while holding ncec_lock.
2302 2532   */
2303 2533  void
2304 2534  nce_make_unreachable(ncec_t *ncec)
2305 2535  {
2306 2536          mutex_enter(&ncec->ncec_lock);
2307 2537          ncec->ncec_state = ND_UNREACHABLE;
2308 2538          mutex_exit(&ncec->ncec_lock);
2309 2539  }
2310 2540  
2311 2541  /*
2312 2542   * NCE retransmit timer. Common to IPv4 and IPv6.
2313 2543   * This timer goes off when:
2314 2544   * a. It is time to retransmit a resolution for resolver.
2315 2545   * b. It is time to send reachability probes.
2316 2546   */
2317 2547  void
2318 2548  nce_timer(void *arg)
2319 2549  {
2320 2550          ncec_t          *ncec = arg;
2321 2551          ill_t           *ill = ncec->ncec_ill, *src_ill;
2322 2552          char            addrbuf[INET6_ADDRSTRLEN];
2323 2553          boolean_t       dropped = B_FALSE;
2324 2554          ip_stack_t      *ipst = ncec->ncec_ipst;
2325 2555          boolean_t       isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2326 2556          in_addr_t       sender4 = INADDR_ANY;
2327 2557          in6_addr_t      sender6 = ipv6_all_zeros;
2328 2558  
2329 2559          /*
2330 2560           * The timer has to be cancelled by ncec_delete before doing the final
2331 2561           * refrele. So the NCE is guaranteed to exist when the timer runs
2332 2562           * until it clears the timeout_id. Before clearing the timeout_id
2333 2563           * bump up the refcnt so that we can continue to use the ncec
2334 2564           */
2335 2565          ASSERT(ncec != NULL);
2336 2566          mutex_enter(&ncec->ncec_lock);
2337 2567          ncec_refhold_locked(ncec);
2338 2568          ncec->ncec_timeout_id = 0;
2339 2569          mutex_exit(&ncec->ncec_lock);
2340 2570  
2341 2571          src_ill = nce_resolve_src(ncec, &sender6);
2342 2572          /* if we could not find a sender address, return */
2343 2573          if (src_ill == NULL) {
2344 2574                  if (!isv6) {
2345 2575                          IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2346 2576                          ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2347 2577                              &sender4, addrbuf, sizeof (addrbuf))));
2348 2578                  } else {
2349 2579                          ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2350 2580                              &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2351 2581                  }
2352 2582                  nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2353 2583                  ncec_refrele(ncec);
2354 2584                  return;
2355 2585          }
2356 2586          if (!isv6)
2357 2587                  IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2358 2588  
2359 2589          mutex_enter(&ncec->ncec_lock);
2360 2590          /*
2361 2591           * Check the reachability state.
2362 2592           */
2363 2593          switch (ncec->ncec_state) {
2364 2594          case ND_DELAY:
2365 2595                  ASSERT(ncec->ncec_lladdr != NULL);
2366 2596                  ncec->ncec_state = ND_PROBE;
2367 2597                  ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2368 2598                  if (isv6) {
2369 2599                          mutex_exit(&ncec->ncec_lock);
2370 2600                          dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2371 2601                              src_ill->ill_phys_addr,
2372 2602                              src_ill->ill_phys_addr_length,
2373 2603                              &sender6, &ncec->ncec_addr,
2374 2604                              NDP_UNICAST);
2375 2605                  } else {
2376 2606                          dropped = (arp_request(ncec, sender4, src_ill) == 0);
2377 2607                          mutex_exit(&ncec->ncec_lock);
2378 2608                  }
2379 2609                  if (!dropped) {
2380 2610                          mutex_enter(&ncec->ncec_lock);
2381 2611                          ncec->ncec_pcnt--;
2382 2612                          mutex_exit(&ncec->ncec_lock);
2383 2613                  }
2384 2614                  if (ip_debug > 3) {
2385 2615                          /* ip2dbg */
2386 2616                          pr_addr_dbg("nce_timer: state for %s changed "
2387 2617                              "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2388 2618                  }
2389 2619                  nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2390 2620                  break;
2391 2621          case ND_PROBE:
2392 2622                  /* must be retransmit timer */
2393 2623                  ASSERT(ncec->ncec_pcnt >= -1);
2394 2624                  if (ncec->ncec_pcnt > 0) {
2395 2625                          /*
2396 2626                           * As per RFC2461, the ncec gets deleted after
2397 2627                           * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2398 2628                           * Note that the first unicast solicitation is sent
2399 2629                           * during the DELAY state.
2400 2630                           */
2401 2631                          ip2dbg(("nce_timer: pcount=%x dst %s\n",
2402 2632                              ncec->ncec_pcnt,
2403 2633                              inet_ntop((isv6? AF_INET6 : AF_INET),
2404 2634                              &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2405 2635                          if (NCE_PUBLISH(ncec)) {
2406 2636                                  mutex_exit(&ncec->ncec_lock);
2407 2637                                  /*
2408 2638                                   * send out a probe; note that src_ill
2409 2639                                   * is ignored by nce_dad() for all
2410 2640                                   * DAD message types other than IPv6
2411 2641                                   * unicast probes
2412 2642                                   */
2413 2643                                  nce_dad(ncec, src_ill, B_TRUE);
2414 2644                          } else {
2415 2645                                  ASSERT(src_ill != NULL);
2416 2646                                  if (isv6) {
2417 2647                                          mutex_exit(&ncec->ncec_lock);
2418 2648                                          dropped = ndp_xmit(src_ill,
2419 2649                                              ND_NEIGHBOR_SOLICIT,
2420 2650                                              src_ill->ill_phys_addr,
2421 2651                                              src_ill->ill_phys_addr_length,
2422 2652                                              &sender6, &ncec->ncec_addr,
2423 2653                                              NDP_UNICAST);
2424 2654                                  } else {
2425 2655                                          /*
2426 2656                                           * since the nce is REACHABLE,
2427 2657                                           * the ARP request will be sent out
2428 2658                                           * as a link-layer unicast.
2429 2659                                           */
2430 2660                                          dropped = (arp_request(ncec, sender4,
2431 2661                                              src_ill) == 0);
2432 2662                                          mutex_exit(&ncec->ncec_lock);
2433 2663                                  }
2434 2664                                  if (!dropped) {
2435 2665                                          mutex_enter(&ncec->ncec_lock);
2436 2666                                          ncec->ncec_pcnt--;
2437 2667                                          mutex_exit(&ncec->ncec_lock);
2438 2668                                  }
2439 2669                                  nce_restart_timer(ncec,
2440 2670                                      ill->ill_reachable_retrans_time);
2441 2671                          }
2442 2672                  } else if (ncec->ncec_pcnt < 0) {
2443 2673                          /* No hope, delete the ncec */
2444 2674                          /* Tell datapath it went bad */
2445 2675                          ncec->ncec_state = ND_UNREACHABLE;
2446 2676                          mutex_exit(&ncec->ncec_lock);
2447 2677                          if (ip_debug > 2) {
2448 2678                                  /* ip1dbg */
2449 2679                                  pr_addr_dbg("nce_timer: Delete NCE for"
2450 2680                                      " dst %s\n", (isv6? AF_INET6: AF_INET),
2451 2681                                      &ncec->ncec_addr);
2452 2682                          }
2453 2683                          /* if static ARP can't delete. */
2454 2684                          if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2455 2685                                  ncec_delete(ncec);
2456 2686  
2457 2687                  } else if (!NCE_PUBLISH(ncec)) {
2458 2688                          /*
2459 2689                           * Probe count is 0 for a dynamic entry (one that we
2460 2690                           * ourselves are not publishing). We should never get
2461 2691                           * here if NONUD was requested, hence the ASSERT below.
2462 2692                           */
2463 2693                          ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2464 2694                          ip2dbg(("nce_timer: pcount=%x dst %s\n",
2465 2695                              ncec->ncec_pcnt, inet_ntop(AF_INET6,
2466 2696                              &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2467 2697                          ncec->ncec_pcnt--;
2468 2698                          mutex_exit(&ncec->ncec_lock);
2469 2699                          /* Wait one interval before killing */
2470 2700                          nce_restart_timer(ncec,
2471 2701                              ill->ill_reachable_retrans_time);
2472 2702                  } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2473 2703                          ipif_t *ipif;
2474 2704                          ipaddr_t ncec_addr;
2475 2705  
2476 2706                          /*
2477 2707                           * We're done probing, and we can now declare this
2478 2708                           * address to be usable.  Let IP know that it's ok to
2479 2709                           * use.
2480 2710                           */
2481 2711                          ncec->ncec_state = ND_REACHABLE;
2482 2712                          ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2483 2713                          mutex_exit(&ncec->ncec_lock);
2484 2714                          if (isv6) {
2485 2715                                  ipif = ipif_lookup_addr_exact_v6(
2486 2716                                      &ncec->ncec_addr, ill, ipst);
2487 2717                          } else {
2488 2718                                  IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2489 2719                                      ncec_addr);
2490 2720                                  ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2491 2721                                      ipst);
2492 2722                          }
2493 2723                          if (ipif != NULL) {
2494 2724                                  if (ipif->ipif_was_dup) {
2495 2725                                          char ibuf[LIFNAMSIZ];
2496 2726                                          char sbuf[INET6_ADDRSTRLEN];
2497 2727  
2498 2728                                          ipif->ipif_was_dup = B_FALSE;
2499 2729                                          (void) inet_ntop(AF_INET6,
2500 2730                                              &ipif->ipif_v6lcl_addr,
2501 2731                                              sbuf, sizeof (sbuf));
2502 2732                                          ipif_get_name(ipif, ibuf,
2503 2733                                              sizeof (ibuf));
2504 2734                                          cmn_err(CE_NOTE, "recovered address "
2505 2735                                              "%s on %s", sbuf, ibuf);
2506 2736                                  }
2507 2737                                  if ((ipif->ipif_flags & IPIF_UP) &&
2508 2738                                      !ipif->ipif_addr_ready)
2509 2739                                          ipif_up_notify(ipif);
2510 2740                                  ipif->ipif_addr_ready = 1;
2511 2741                                  ipif_refrele(ipif);
2512 2742                          }
2513 2743                          if (!isv6 && arp_no_defense)
2514 2744                                  break;
2515 2745                          /* Begin defending our new address */
2516 2746                          if (ncec->ncec_unsolicit_count > 0) {
2517 2747                                  ncec->ncec_unsolicit_count--;
2518 2748                                  if (isv6) {
2519 2749                                          dropped = ndp_announce(ncec);
2520 2750                                  } else {
2521 2751                                          dropped = arp_announce(ncec);
2522 2752                                  }
2523 2753  
2524 2754                                  if (dropped)
2525 2755                                          ncec->ncec_unsolicit_count++;
2526 2756                                  else
2527 2757                                          ncec->ncec_last_time_defended =
2528 2758                                              ddi_get_lbolt();
2529 2759                          }
2530 2760                          if (ncec->ncec_unsolicit_count > 0) {
2531 2761                                  nce_restart_timer(ncec,
2532 2762                                      ANNOUNCE_INTERVAL(isv6));
2533 2763                          } else if (DEFENSE_INTERVAL(isv6) != 0) {
2534 2764                                  nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2535 2765                          }
2536 2766                  } else {
2537 2767                          /*
2538 2768                           * This is an address we're probing to be our own, but
2539 2769                           * the ill is down.  Wait until it comes back before
2540 2770                           * doing anything, but switch to reachable state so
2541 2771                           * that the restart will work.
2542 2772                           */
2543 2773                          ncec->ncec_state = ND_REACHABLE;
2544 2774                          mutex_exit(&ncec->ncec_lock);
2545 2775                  }
2546 2776                  break;
2547 2777          case ND_INCOMPLETE: {
2548 2778                  mblk_t  *mp, *nextmp;
2549 2779                  mblk_t  **prevmpp;
2550 2780  
2551 2781                  /*
2552 2782                   * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2553 2783                   * for any IPMP probe packets, and toss them.  IPMP probe
2554 2784                   * packets will always be at the head of ncec_qd_mp, so that
2555 2785                   * we can stop at the first queued ND packet that is
2556 2786                   * not a probe packet.
2557 2787                   */
2558 2788                  prevmpp = &ncec->ncec_qd_mp;
2559 2789                  for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2560 2790                          nextmp = mp->b_next;
2561 2791  
2562 2792                          if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2563 2793                                  inet_freemsg(mp);
2564 2794                                  ncec->ncec_nprobes--;
2565 2795                                  *prevmpp = nextmp;
2566 2796                          } else {
2567 2797                                  prevmpp = &mp->b_next;
2568 2798                          }
2569 2799                  }
2570 2800  
2571 2801                  /*
2572 2802                   * Must be resolver's retransmit timer.
2573 2803                   */
2574 2804                  mutex_exit(&ncec->ncec_lock);
2575 2805                  ip_ndp_resolve(ncec);
2576 2806                  break;
2577 2807          }
2578 2808          case ND_REACHABLE:
2579 2809                  if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2580 2810                      ncec->ncec_unsolicit_count != 0) ||
2581 2811                      (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2582 2812                          if (ncec->ncec_unsolicit_count > 0) {
2583 2813                                  ncec->ncec_unsolicit_count--;
2584 2814                                  mutex_exit(&ncec->ncec_lock);
2585 2815                                  /*
2586 2816                                   * When we get to zero announcements left,
2587 2817                                   * switch to address defense
2588 2818                                   */
2589 2819                          } else {
2590 2820                                  boolean_t rate_limit;
2591 2821  
2592 2822                                  mutex_exit(&ncec->ncec_lock);
2593 2823                                  rate_limit = ill_defend_rate_limit(ill, ncec);
2594 2824                                  if (rate_limit) {
2595 2825                                          nce_restart_timer(ncec,
2596 2826                                              DEFENSE_INTERVAL(isv6));
2597 2827                                          break;
2598 2828                                  }
2599 2829                          }
2600 2830                          if (isv6) {
2601 2831                                  dropped = ndp_announce(ncec);
2602 2832                          } else {
2603 2833                                  dropped = arp_announce(ncec);
2604 2834                          }
2605 2835                          mutex_enter(&ncec->ncec_lock);
2606 2836                          if (dropped) {
2607 2837                                  ncec->ncec_unsolicit_count++;
2608 2838                          } else {
2609 2839                                  ncec->ncec_last_time_defended =
2610 2840                                      ddi_get_lbolt();
2611 2841                          }
2612 2842                          mutex_exit(&ncec->ncec_lock);
2613 2843                          if (ncec->ncec_unsolicit_count != 0) {
2614 2844                                  nce_restart_timer(ncec,
2615 2845                                      ANNOUNCE_INTERVAL(isv6));
2616 2846                          } else {
2617 2847                                  nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2618 2848                          }
2619 2849                  } else {
2620 2850                          mutex_exit(&ncec->ncec_lock);
2621 2851                  }
2622 2852                  break;
2623 2853          default:
2624 2854                  mutex_exit(&ncec->ncec_lock);
2625 2855                  break;
2626 2856          }
2627 2857  done:
2628 2858          ncec_refrele(ncec);
2629 2859          ill_refrele(src_ill);
2630 2860  }
2631 2861  
2632 2862  /*
2633 2863   * Set a link layer address from the ll_addr passed in.
2634 2864   * Copy SAP from ill.
2635 2865   */
2636 2866  static void
2637 2867  nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2638 2868  {
2639 2869          ill_t   *ill = ncec->ncec_ill;
2640 2870  
2641 2871          ASSERT(ll_addr != NULL);
2642 2872          if (ill->ill_phys_addr_length > 0) {
2643 2873                  /*
2644 2874                   * The bcopy() below used to be called for the physical address
2645 2875                   * length rather than the link layer address length. For
2646 2876                   * ethernet and many other media, the phys_addr and lla are
2647 2877                   * identical.
2648 2878                   *
2649 2879                   * The phys_addr and lla may not be the same for devices that
2650 2880                   * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2651 2881                   * no known instances of these.
2652 2882                   *
2653 2883                   * For PPP or other interfaces with a zero length
2654 2884                   * physical address, don't do anything here.
2655 2885                   * The bcopy() with a zero phys_addr length was previously
2656 2886                   * a no-op for interfaces with a zero-length physical address.
2657 2887                   * Using the lla for them would change the way they operate.
2658 2888                   * Doing nothing in such cases preserves expected behavior.
2659 2889                   */
2660 2890                  bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2661 2891          }
2662 2892  }
2663 2893  
2664 2894  boolean_t
2665 2895  nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2666 2896      uint32_t ll_addr_len)
2667 2897  {
2668 2898          ASSERT(ncec->ncec_lladdr != NULL);
2669 2899          if (ll_addr == NULL)
2670 2900                  return (B_FALSE);
2671 2901          if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2672 2902                  return (B_TRUE);
2673 2903          return (B_FALSE);
2674 2904  }
2675 2905  
2676 2906  /*
2677 2907   * Updates the link layer address or the reachability state of
2678 2908   * a cache entry.  Reset probe counter if needed.
2679 2909   */
2680 2910  void
2681 2911  nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2682 2912  {
2683 2913          ill_t   *ill = ncec->ncec_ill;
2684 2914          boolean_t need_stop_timer = B_FALSE;
2685 2915          boolean_t need_fastpath_update = B_FALSE;
2686 2916          nce_t   *nce = NULL;
2687 2917          timeout_id_t tid;
2688 2918  
2689 2919          ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2690 2920          /*
2691 2921           * If this interface does not do NUD, there is no point
2692 2922           * in allowing an update to the cache entry.  Although
2693 2923           * we will respond to NS.
2694 2924           * The only time we accept an update for a resolver when
2695 2925           * NUD is turned off is when it has just been created.
2696 2926           * Non-Resolvers will always be created as REACHABLE.
2697 2927           */
2698 2928          if (new_state != ND_UNCHANGED) {
2699 2929                  if ((ncec->ncec_flags & NCE_F_NONUD) &&
2700 2930                      (ncec->ncec_state != ND_INCOMPLETE))
2701 2931                          return;
2702 2932                  ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2703 2933                  ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2704 2934                  need_stop_timer = B_TRUE;
2705 2935                  if (new_state == ND_REACHABLE)
2706 2936                          ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2707 2937                  else {
2708 2938                          /* We force NUD in this case */
2709 2939                          ncec->ncec_last = 0;
2710 2940                  }
2711 2941                  ncec->ncec_state = new_state;
2712 2942                  ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2713 2943                  ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2714 2944                      new_state == ND_INCOMPLETE);
2715 2945          }
2716 2946          if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2717 2947                  tid = ncec->ncec_timeout_id;
2718 2948                  ncec->ncec_timeout_id = 0;
2719 2949          }
2720 2950          /*
2721 2951           * Re-trigger fastpath probe and
2722 2952           * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2723 2953           * whatever packets that happens to be transmitting at the time.
2724 2954           */
2725 2955          if (new_ll_addr != NULL) {
2726 2956                  bcopy(new_ll_addr, ncec->ncec_lladdr,
2727 2957                      ill->ill_phys_addr_length);
2728 2958                  need_fastpath_update = B_TRUE;
2729 2959          }
2730 2960          mutex_exit(&ncec->ncec_lock);
2731 2961          if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2732 2962                  if (tid != 0)
2733 2963                          (void) untimeout(tid);
2734 2964          }
2735 2965          if (need_fastpath_update) {
2736 2966                  /*
2737 2967                   * Delete any existing existing dlur_mp and fp_mp information.
2738 2968                   * For IPMP interfaces, all underlying ill's must be checked
2739 2969                   * and purged.
2740 2970                   */
2741 2971                  nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2742 2972                  /*
2743 2973                   * add the new dlur_mp and fp_mp
2744 2974                   */
2745 2975                  nce = nce_fastpath(ncec, B_TRUE, NULL);
2746 2976                  if (nce != NULL)
2747 2977                          nce_refrele(nce);
2748 2978          }
2749 2979          mutex_enter(&ncec->ncec_lock);
2750 2980  }
2751 2981  
2752 2982  static void
2753 2983  nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2754 2984  {
2755 2985          uint_t  count = 0;
2756 2986          mblk_t  **mpp, *tmp;
2757 2987  
2758 2988          ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2759 2989  
2760 2990          for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2761 2991                  if (++count > ncec->ncec_ill->ill_max_buf) {
2762 2992                          tmp = ncec->ncec_qd_mp->b_next;
2763 2993                          ncec->ncec_qd_mp->b_next = NULL;
2764 2994                          /*
2765 2995                           * if we never create data addrs on the under_ill
2766 2996                           * does this matter?
2767 2997                           */
2768 2998                          BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2769 2999                              ipIfStatsOutDiscards);
2770 3000                          ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2771 3001                              ncec->ncec_ill);
2772 3002                          freemsg(ncec->ncec_qd_mp);
2773 3003                          ncec->ncec_qd_mp = tmp;
2774 3004                  }
2775 3005          }
2776 3006  
2777 3007          if (head_insert) {
2778 3008                  ncec->ncec_nprobes++;
2779 3009                  mp->b_next = ncec->ncec_qd_mp;
2780 3010                  ncec->ncec_qd_mp = mp;
2781 3011          } else {
2782 3012                  *mpp = mp;
2783 3013          }
2784 3014  }
2785 3015  
2786 3016  /*
2787 3017   * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2788 3018   * queued at the head or tail of the queue based on the input argument
2789 3019   * 'head_insert'. The caller should specify this argument as B_TRUE if this
2790 3020   * packet is an IPMP probe packet, in which case the following happens:
2791 3021   *
2792 3022   *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
2793 3023   *      (non-ipmp_probe) load-speading case where the source address of the ND
2794 3024   *      packet is not tied to ncec_ill. If the ill bound to the source address
2795 3025   *      cannot receive, the response to the ND packet will not be received.
2796 3026   *      However, if ND packets for ncec_ill's probes are queued behind that ND
2797 3027   *      packet, those probes will also fail to be sent, and thus in.mpathd will
2798 3028   *       erroneously conclude that ncec_ill has also failed.
2799 3029   *
2800 3030   *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
2801 3031   *      the first attempt.  This ensures that ND problems do not manifest as
2802 3032   *      probe RTT spikes.
2803 3033   *
2804 3034   * We achieve this by inserting ipmp_probe() packets at the head of the
2805 3035   * nce_queue.
2806 3036   *
2807 3037   * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2808 3038   * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2809 3039   */
2810 3040  void
2811 3041  nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2812 3042  {
2813 3043          ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2814 3044          nce_queue_mp_common(ncec, mp, head_insert);
2815 3045  }
2816 3046  
2817 3047  /*
2818 3048   * Called when address resolution failed due to a timeout.
2819 3049   * Send an ICMP unreachable in response to all queued packets.
2820 3050   */
2821 3051  void
2822 3052  ndp_resolv_failed(ncec_t *ncec)
2823 3053  {
2824 3054          mblk_t  *mp, *nxt_mp;
2825 3055          char    buf[INET6_ADDRSTRLEN];
2826 3056          ill_t *ill = ncec->ncec_ill;
2827 3057          ip_recv_attr_t  iras;
2828 3058  
2829 3059          bzero(&iras, sizeof (iras));
2830 3060          iras.ira_flags = 0;
2831 3061          /*
2832 3062           * we are setting the ira_rill to the ipmp_ill (instead of
2833 3063           * the actual ill on which the packet was received), but this
2834 3064           * is ok because we don't actually need the real ira_rill.
2835 3065           * to send the icmp unreachable to the sender.
2836 3066           */
2837 3067          iras.ira_ill = iras.ira_rill = ill;
2838 3068          iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2839 3069          iras.ira_rifindex = iras.ira_ruifindex;
2840 3070  
2841 3071          ip1dbg(("ndp_resolv_failed: dst %s\n",
2842 3072              inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2843 3073          mutex_enter(&ncec->ncec_lock);
2844 3074          mp = ncec->ncec_qd_mp;
2845 3075          ncec->ncec_qd_mp = NULL;
2846 3076          ncec->ncec_nprobes = 0;
2847 3077          mutex_exit(&ncec->ncec_lock);
2848 3078          while (mp != NULL) {
2849 3079                  nxt_mp = mp->b_next;
2850 3080                  mp->b_next = NULL;
2851 3081  
2852 3082                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2853 3083                  ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2854 3084                      mp, ill);
2855 3085                  icmp_unreachable_v6(mp,
2856 3086                      ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2857 3087                  ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2858 3088                  mp = nxt_mp;
2859 3089          }
2860 3090          ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2861 3091  }
2862 3092  
2863 3093  /*
2864 3094   * Handle the completion of NDP and ARP resolution.
2865 3095   */
2866 3096  void
2867 3097  nce_resolv_ok(ncec_t *ncec)
2868 3098  {
2869 3099          mblk_t *mp;
2870 3100          uint_t pkt_len;
2871 3101          iaflags_t ixaflags = IXAF_NO_TRACE;
2872 3102          nce_t *nce;
2873 3103          ill_t   *ill = ncec->ncec_ill;
2874 3104          boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2875 3105          ip_stack_t *ipst = ill->ill_ipst;
2876 3106  
2877 3107          if (IS_IPMP(ncec->ncec_ill)) {
2878 3108                  nce_resolv_ipmp_ok(ncec);
2879 3109                  return;
2880 3110          }
2881 3111          /* non IPMP case */
2882 3112  
2883 3113          mutex_enter(&ncec->ncec_lock);
2884 3114          ASSERT(ncec->ncec_nprobes == 0);
2885 3115          mp = ncec->ncec_qd_mp;
2886 3116          ncec->ncec_qd_mp = NULL;
2887 3117          mutex_exit(&ncec->ncec_lock);
2888 3118  
2889 3119          while (mp != NULL) {
2890 3120                  mblk_t *nxt_mp;
2891 3121  
2892 3122                  if (ill->ill_isv6) {
2893 3123                          ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2894 3124  
2895 3125                          pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2896 3126                  } else {
2897 3127                          ipha_t *ipha = (ipha_t *)mp->b_rptr;
2898 3128  
2899 3129                          ixaflags |= IXAF_IS_IPV4;
2900 3130                          pkt_len = ntohs(ipha->ipha_length);
2901 3131                  }
2902 3132                  nxt_mp = mp->b_next;
2903 3133                  mp->b_next = NULL;
2904 3134                  /*
2905 3135                   * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2906 3136                   * longer available, but it's ok to drop this flag because TCP
2907 3137                   * has its own flow-control in effect, so TCP packets
2908 3138                   * are not likely to get here when flow-control is in effect.
2909 3139                   */
2910 3140                  mutex_enter(&ill->ill_lock);
2911 3141                  nce = nce_lookup(ill, &ncec->ncec_addr);
2912 3142                  mutex_exit(&ill->ill_lock);
2913 3143  
2914 3144                  if (nce == NULL) {
2915 3145                          if (isv6) {
2916 3146                                  BUMP_MIB(&ipst->ips_ip6_mib,
2917 3147                                      ipIfStatsOutDiscards);
2918 3148                          } else {
2919 3149                                  BUMP_MIB(&ipst->ips_ip_mib,
2920 3150                                      ipIfStatsOutDiscards);
2921 3151                          }
2922 3152                          ip_drop_output("ipIfStatsOutDiscards - no nce",
2923 3153                              mp, NULL);
2924 3154                          freemsg(mp);
2925 3155                  } else {
2926 3156                          /*
2927 3157                           * We don't know the zoneid, but
2928 3158                           * ip_xmit does not care since IXAF_NO_TRACE
2929 3159                           * is set. (We traced the packet the first
2930 3160                           * time through ip_xmit.)
2931 3161                           */
2932 3162                          (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2933 3163                              ALL_ZONES, 0, NULL);
2934 3164                          nce_refrele(nce);
2935 3165                  }
2936 3166                  mp = nxt_mp;
2937 3167          }
2938 3168  
2939 3169          ncec_cb_dispatch(ncec); /* complete callbacks */
2940 3170  }
2941 3171  
2942 3172  /*
2943 3173   * Called by SIOCSNDP* ioctl to add/change an ncec entry
2944 3174   * and the corresponding attributes.
2945 3175   * Disallow states other than ND_REACHABLE or ND_STALE.
2946 3176   */
2947 3177  int
2948 3178  ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2949 3179  {
2950 3180          sin6_t          *sin6;
2951 3181          in6_addr_t      *addr;
2952 3182          ncec_t          *ncec;
2953 3183          nce_t           *nce;
2954 3184          int             err = 0;
2955 3185          uint16_t        new_flags = 0;
2956 3186          uint16_t        old_flags = 0;
2957 3187          int             inflags = lnr->lnr_flags;
2958 3188          ip_stack_t      *ipst = ill->ill_ipst;
2959 3189          boolean_t       do_postprocess = B_FALSE;
2960 3190  
2961 3191          ASSERT(ill->ill_isv6);
2962 3192          if ((lnr->lnr_state_create != ND_REACHABLE) &&
2963 3193              (lnr->lnr_state_create != ND_STALE))
2964 3194                  return (EINVAL);
2965 3195  
2966 3196          sin6 = (sin6_t *)&lnr->lnr_addr;
2967 3197          addr = &sin6->sin6_addr;
2968 3198  
2969 3199          mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2970 3200          ASSERT(!IS_UNDER_IPMP(ill));
2971 3201          nce = nce_lookup_addr(ill, addr);
2972 3202          if (nce != NULL)
2973 3203                  new_flags = nce->nce_common->ncec_flags;
2974 3204  
2975 3205          switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2976 3206          case NDF_ISROUTER_ON:
2977 3207                  new_flags |= NCE_F_ISROUTER;
2978 3208                  break;
2979 3209          case NDF_ISROUTER_OFF:
2980 3210                  new_flags &= ~NCE_F_ISROUTER;
2981 3211                  break;
2982 3212          case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2983 3213                  mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2984 3214                  if (nce != NULL)
2985 3215                          nce_refrele(nce);
2986 3216                  return (EINVAL);
2987 3217          }
2988 3218          if (inflags & NDF_STATIC)
2989 3219                  new_flags |= NCE_F_STATIC;
2990 3220  
2991 3221          switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2992 3222          case NDF_ANYCAST_ON:
2993 3223                  new_flags |= NCE_F_ANYCAST;
2994 3224                  break;
2995 3225          case NDF_ANYCAST_OFF:
2996 3226                  new_flags &= ~NCE_F_ANYCAST;
2997 3227                  break;
2998 3228          case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2999 3229                  mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3000 3230                  if (nce != NULL)
3001 3231                          nce_refrele(nce);
3002 3232                  return (EINVAL);
3003 3233          }
3004 3234  
3005 3235          if (nce == NULL) {
3006 3236                  err = nce_add_v6(ill,
3007 3237                      (uchar_t *)lnr->lnr_hdw_addr,
3008 3238                      ill->ill_phys_addr_length,
3009 3239                      addr,
3010 3240                      new_flags,
3011 3241                      lnr->lnr_state_create,
3012 3242                      &nce);
3013 3243                  if (err != 0) {
3014 3244                          mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3015 3245                          ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3016 3246                          return (err);
3017 3247                  } else {
3018 3248                          do_postprocess = B_TRUE;
3019 3249                  }
3020 3250          }
3021 3251          ncec = nce->nce_common;
3022 3252          old_flags = ncec->ncec_flags;
3023 3253          if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3024 3254                  ncec_router_to_host(ncec);
3025 3255                  mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3026 3256                  if (do_postprocess)
3027 3257                          err = nce_add_v6_postprocess(nce);
3028 3258                  nce_refrele(nce);
3029 3259                  return (0);
3030 3260          }
3031 3261          mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3032 3262  
3033 3263          if (do_postprocess)
3034 3264                  err = nce_add_v6_postprocess(nce);
3035 3265          /*
3036 3266           * err cannot be anything other than 0 because we don't support
3037 3267           * proxy arp of static addresses.
3038 3268           */
3039 3269          ASSERT(err == 0);
3040 3270  
3041 3271          mutex_enter(&ncec->ncec_lock);
3042 3272          ncec->ncec_flags = new_flags;
3043 3273          mutex_exit(&ncec->ncec_lock);
3044 3274          /*
3045 3275           * Note that we ignore the state at this point, which
3046 3276           * should be either STALE or REACHABLE.  Instead we let
3047 3277           * the link layer address passed in to determine the state
3048 3278           * much like incoming packets.
3049 3279           */
3050 3280          nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3051 3281          nce_refrele(nce);
3052 3282          return (0);
3053 3283  }
3054 3284  
3055 3285  /*
3056 3286   * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3057 3287   * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3058 3288   * be held to ensure that they are in the same group.
3059 3289   */
3060 3290  static nce_t *
3061 3291  nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3062 3292  {
3063 3293  
3064 3294          nce_t *nce;
3065 3295  
3066 3296          nce = nce_ill_lookup_then_add(ill, ncec);
3067 3297  
3068 3298          if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3069 3299                  return (nce);
3070 3300  
3071 3301          /*
3072 3302           * hold the ncec_lock to synchronize with nce_update() so that,
3073 3303           * at the end of this function, the contents of nce_dlur_mp are
3074 3304           * consistent with ncec->ncec_lladdr, even though some intermediate
3075 3305           * packet may have been sent out with a mangled address, which would
3076 3306           * only be a transient condition.
3077 3307           */
3078 3308          mutex_enter(&ncec->ncec_lock);
3079 3309          if (ncec->ncec_lladdr != NULL) {
3080 3310                  bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3081 3311                      NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3082 3312          } else {
3083 3313                  nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3084 3314                      ill->ill_sap_length);
3085 3315          }
3086 3316          mutex_exit(&ncec->ncec_lock);
3087 3317          return (nce);
3088 3318  }
3089 3319  
3090 3320  /*
3091 3321   * we make nce_fp_mp to have an M_DATA prepend.
3092 3322   * The caller ensures there is hold on ncec for this function.
  
    | 
      ↓ open down ↓ | 
    2029 lines elided | 
    
      ↑ open up ↑ | 
  
3093 3323   * Note that since ill_fastpath_probe() copies the mblk there is
3094 3324   * no need to hold the nce or ncec beyond this function.
3095 3325   *
3096 3326   * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3097 3327   * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3098 3328   * and will be returned back by this function, so that no extra nce_refrele
3099 3329   * is required for the caller. The calls from nce_add_common() use this
3100 3330   * method. All other callers (that pass in NULL ncec_nce) will have to do a
3101 3331   * nce_refrele of the returned nce (when it is non-null).
3102 3332   */
3103      -nce_t *
     3333 +static nce_t *
3104 3334  nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3105 3335  {
3106 3336          nce_t *nce;
3107 3337          ill_t *ill = ncec->ncec_ill;
3108 3338  
3109 3339          ASSERT(ill != NULL);
3110 3340  
3111 3341          if (IS_IPMP(ill) && trigger_fp_req) {
3112 3342                  trigger_fp_req = B_FALSE;
3113 3343                  ipmp_ncec_refresh_nce(ncec);
3114 3344          }
3115 3345  
3116 3346          /*
3117 3347           * If the caller already has the nce corresponding to the ill, use
3118 3348           * that one. Otherwise we have to lookup/add the nce. Calls from
3119 3349           * nce_add_common() fall in the former category, and have just done
3120 3350           * the nce lookup/add that can be reused.
3121 3351           */
3122 3352          if (ncec_nce == NULL)
3123 3353                  nce = nce_fastpath_create(ill, ncec);
3124 3354          else
3125 3355                  nce = ncec_nce;
3126 3356  
3127 3357          if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3128 3358                  return (nce);
3129 3359  
3130 3360          if (trigger_fp_req)
3131 3361                  nce_fastpath_trigger(nce);
3132 3362          return (nce);
3133 3363  }
3134 3364  
3135 3365  /*
3136 3366   * Trigger fastpath on nce. No locks may be held.
3137 3367   */
3138 3368  static void
3139 3369  nce_fastpath_trigger(nce_t *nce)
3140 3370  {
3141 3371          int res;
3142 3372          ill_t *ill = nce->nce_ill;
3143 3373          ncec_t *ncec = nce->nce_common;
3144 3374  
3145 3375          res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3146 3376          /*
3147 3377           * EAGAIN is an indication of a transient error
3148 3378           * i.e. allocation failure etc. leave the ncec in the list it
3149 3379           * will be updated when another probe happens for another ire
3150 3380           * if not it will be taken out of the list when the ire is
  
    | 
      ↓ open down ↓ | 
    37 lines elided | 
    
      ↑ open up ↑ | 
  
3151 3381           * deleted.
3152 3382           */
3153 3383          if (res != 0 && res != EAGAIN && res != ENOTSUP)
3154 3384                  nce_fastpath_list_delete(ill, ncec, NULL);
3155 3385  }
3156 3386  
3157 3387  /*
3158 3388   * Add ncec to the nce fastpath list on ill.
3159 3389   */
3160 3390  static nce_t *
3161      -nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
     3391 +nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
3162 3392  {
3163 3393          nce_t *nce = NULL;
3164 3394  
3165 3395          ASSERT(MUTEX_HELD(&ill->ill_lock));
3166 3396          /*
3167 3397           * Atomically ensure that the ill is not CONDEMNED and is not going
3168 3398           * down, before adding the NCE.
3169 3399           */
3170 3400          if (ill->ill_state_flags & ILL_CONDEMNED)
3171 3401                  return (NULL);
3172 3402          mutex_enter(&ncec->ncec_lock);
3173 3403          /*
3174 3404           * if ncec has not been deleted and
3175 3405           * is not already in the list add it.
3176 3406           */
3177 3407          if (!NCE_ISCONDEMNED(ncec)) {
3178 3408                  nce = nce_lookup(ill, &ncec->ncec_addr);
3179 3409                  if (nce != NULL)
3180 3410                          goto done;
3181      -                nce = nce_add(ill, ncec);
     3411 +                nce = nce_add(ill, ncec, graveyard);
3182 3412          }
3183 3413  done:
3184 3414          mutex_exit(&ncec->ncec_lock);
3185 3415          return (nce);
3186 3416  }
3187 3417  
3188      -nce_t *
     3418 +static nce_t *
3189 3419  nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3190 3420  {
3191 3421          nce_t *nce;
     3422 +        list_t graveyard;
3192 3423  
     3424 +        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3193 3425          mutex_enter(&ill->ill_lock);
3194      -        nce = nce_ill_lookup_then_add_locked(ill, ncec);
     3426 +        nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
3195 3427          mutex_exit(&ill->ill_lock);
     3428 +        nce_graveyard_free(&graveyard);
3196 3429          return (nce);
3197 3430  }
3198 3431  
3199 3432  
3200 3433  /*
3201 3434   * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3202 3435   * nce is added to the 'dead' list, and the caller must nce_refrele() the
3203 3436   * entry after all locks have been dropped.
3204 3437   */
3205 3438  void
3206 3439  nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3207 3440  {
3208 3441          nce_t *nce;
3209 3442  
3210 3443          ASSERT(ill != NULL);
3211 3444  
3212 3445          /* delete any nces referencing the ncec from underlying ills */
3213 3446          if (IS_IPMP(ill))
3214 3447                  ipmp_ncec_delete_nce(ncec);
3215 3448  
3216 3449          /* now the ill itself */
3217 3450          mutex_enter(&ill->ill_lock);
3218 3451          for (nce = list_head(&ill->ill_nce); nce != NULL;
3219 3452              nce = list_next(&ill->ill_nce, nce)) {
3220 3453                  if (nce->nce_common == ncec) {
3221 3454                          nce_refhold(nce);
3222 3455                          nce_delete(nce);
3223 3456                          break;
3224 3457                  }
3225 3458          }
3226 3459          mutex_exit(&ill->ill_lock);
3227 3460          if (nce != NULL) {
3228 3461                  if (dead == NULL)
3229 3462                          nce_refrele(nce);
3230 3463                  else
3231 3464                          list_insert_tail(dead, nce);
3232 3465          }
3233 3466  }
3234 3467  
3235 3468  /*
  
    | 
      ↓ open down ↓ | 
    30 lines elided | 
    
      ↑ open up ↑ | 
  
3236 3469   * when the fastpath response does not fit in the datab
3237 3470   * associated with the existing nce_fp_mp, we delete and
3238 3471   * add the nce to retrigger fastpath based on the information
3239 3472   * in the ncec_t.
3240 3473   */
3241 3474  static nce_t *
3242 3475  nce_delete_then_add(nce_t *nce)
3243 3476  {
3244 3477          ill_t           *ill = nce->nce_ill;
3245 3478          nce_t           *newnce = NULL;
     3479 +        list_t          graveyard;
3246 3480  
     3481 +        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3247 3482          ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3248 3483              (void *)nce, ill->ill_name));
3249 3484          mutex_enter(&ill->ill_lock);
3250 3485          mutex_enter(&nce->nce_common->ncec_lock);
3251 3486          nce_delete(nce);
3252 3487          /*
3253 3488           * Make sure that ncec is not condemned before adding. We hold the
3254 3489           * ill_lock and ncec_lock to synchronize with ncec_delete() and
3255 3490           * ipmp_ncec_delete_nce()
3256 3491           */
3257 3492          if (!NCE_ISCONDEMNED(nce->nce_common))
3258      -                newnce = nce_add(ill, nce->nce_common);
     3493 +                newnce = nce_add(ill, nce->nce_common, &graveyard);
3259 3494          mutex_exit(&nce->nce_common->ncec_lock);
3260 3495          mutex_exit(&ill->ill_lock);
     3496 +        nce_graveyard_free(&graveyard);
3261 3497          nce_refrele(nce);
3262 3498          return (newnce); /* could be null if nomem */
3263 3499  }
3264 3500  
3265 3501  typedef struct nce_fp_match_s {
3266 3502          nce_t   *nce_fp_match_res;
3267 3503          mblk_t  *nce_fp_match_ack_mp;
3268 3504  } nce_fp_match_t;
3269 3505  
3270 3506  /* ARGSUSED */
3271 3507  static int
3272 3508  nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3273 3509  {
3274 3510          nce_fp_match_t  *nce_fp_marg = arg;
3275 3511          ncec_t          *ncec = nce->nce_common;
3276 3512          mblk_t          *mp = nce_fp_marg->nce_fp_match_ack_mp;
3277 3513          uchar_t *mp_rptr, *ud_mp_rptr;
3278 3514          mblk_t          *ud_mp = nce->nce_dlur_mp;
3279 3515          ptrdiff_t       cmplen;
3280 3516  
3281 3517          /*
3282 3518           * mp is the mp associated with the fastpath ack.
3283 3519           * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3284 3520           * under consideration. Only when both length and contents
3285 3521           * match is the fastpath ack used to update the nce.
3286 3522           */
3287 3523          if (ud_mp == NULL)
3288 3524                  return (0);
3289 3525          mp_rptr = mp->b_rptr;
3290 3526          cmplen = mp->b_wptr - mp_rptr;
3291 3527          ASSERT(cmplen >= 0);
3292 3528  
3293 3529          ud_mp_rptr = ud_mp->b_rptr;
3294 3530          /*
3295 3531           * The ncec is locked here to prevent any other threads from accessing
3296 3532           * and changing nce_dlur_mp when the address becomes resolved to an
3297 3533           * lla while we're in the middle of looking at and comparing the
3298 3534           * hardware address (lla). It is also locked to prevent multiple
3299 3535           * threads in nce_fastpath() from examining nce_dlur_mp at the same
3300 3536           * time.
3301 3537           */
3302 3538          mutex_enter(&ncec->ncec_lock);
3303 3539          if (ud_mp->b_wptr - ud_mp_rptr == cmplen &&
3304 3540              bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3305 3541                  nce_fp_marg->nce_fp_match_res = nce;
3306 3542                  mutex_exit(&ncec->ncec_lock);
3307 3543                  nce_refhold(nce);
3308 3544                  return (1);
3309 3545          }
3310 3546          mutex_exit(&ncec->ncec_lock);
3311 3547          return (0);
3312 3548  }
3313 3549  
3314 3550  /*
3315 3551   * Update all NCE's that are not in fastpath mode and
3316 3552   * have an nce_fp_mp that matches mp. mp->b_cont contains
3317 3553   * the fastpath header.
3318 3554   *
3319 3555   * Returns nothing; if no NCE matches the ack, no update is made.
3320 3556   */
3321 3557  void
3322 3558  nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3323 3559  {
3324 3560          nce_fp_match_t nce_fp_marg;
3325 3561          nce_t *nce;
3326 3562          mblk_t *nce_fp_mp, *fp_mp;
3327 3563  
3328 3564          nce_fp_marg.nce_fp_match_res = NULL;
3329 3565          nce_fp_marg.nce_fp_match_ack_mp = mp;
3330 3566  
3331 3567          nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3332 3568  
3333 3569          if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3334 3570                  return;
3335 3571  
3336 3572          mutex_enter(&nce->nce_lock);
3337 3573          nce_fp_mp = nce->nce_fp_mp;
3338 3574  
3339 3575          if (nce_fp_mp != NULL) {
3340 3576                  fp_mp = mp->b_cont;
3341 3577                  if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3342 3578                      nce_fp_mp->b_datap->db_lim) {
3343 3579                          mutex_exit(&nce->nce_lock);
3344 3580                          nce = nce_delete_then_add(nce);
3345 3581                          if (nce == NULL) {
3346 3582                                  return;
3347 3583                          }
3348 3584                          mutex_enter(&nce->nce_lock);
3349 3585                          nce_fp_mp = nce->nce_fp_mp;
3350 3586                  }
3351 3587          }
3352 3588  
3353 3589          /* Matched - install mp as the fastpath mp */
3354 3590          if (nce_fp_mp == NULL) {
3355 3591                  fp_mp = dupb(mp->b_cont);
3356 3592                  nce->nce_fp_mp = fp_mp;
3357 3593          } else {
3358 3594                  fp_mp = mp->b_cont;
3359 3595                  bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3360 3596                  nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3361 3597                      + MBLKL(fp_mp);
3362 3598          }
3363 3599          mutex_exit(&nce->nce_lock);
3364 3600          nce_refrele(nce);
3365 3601  }
3366 3602  
3367 3603  /*
3368 3604   * Return a pointer to a given option in the packet.
3369 3605   * Assumes that option part of the packet have already been validated.
3370 3606   */
3371 3607  nd_opt_hdr_t *
3372 3608  ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3373 3609  {
3374 3610          while (optlen > 0) {
3375 3611                  if (opt->nd_opt_type == opt_type)
3376 3612                          return (opt);
3377 3613                  optlen -= 8 * opt->nd_opt_len;
3378 3614                  opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3379 3615          }
3380 3616          return (NULL);
3381 3617  }
3382 3618  
3383 3619  /*
3384 3620   * Verify all option lengths present are > 0, also check to see
3385 3621   * if the option lengths and packet length are consistent.
3386 3622   */
3387 3623  boolean_t
3388 3624  ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3389 3625  {
3390 3626          ASSERT(opt != NULL);
3391 3627          while (optlen > 0) {
3392 3628                  if (opt->nd_opt_len == 0)
3393 3629                          return (B_FALSE);
3394 3630                  optlen -= 8 * opt->nd_opt_len;
3395 3631                  if (optlen < 0)
3396 3632                          return (B_FALSE);
3397 3633                  opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3398 3634          }
3399 3635          return (B_TRUE);
3400 3636  }
3401 3637  
3402 3638  /*
3403 3639   * ncec_walk function.
3404 3640   * Free a fraction of the NCE cache entries.
3405 3641   *
3406 3642   * A possible optimization here would be to use ncec_last where possible, and
3407 3643   * delete the least-frequently used entry, which would require more complex
3408 3644   * computation as we walk through the ncec's (e.g., track ncec entries by
3409 3645   * order of ncec_last and/or maintain state)
3410 3646   */
3411 3647  static void
3412 3648  ncec_cache_reclaim(ncec_t *ncec, void *arg)
3413 3649  {
3414 3650          ip_stack_t      *ipst = ncec->ncec_ipst;
3415 3651          uint_t          fraction = *(uint_t *)arg;
3416 3652          uint_t          rand;
3417 3653  
3418 3654          if ((ncec->ncec_flags &
3419 3655              (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3420 3656                  return;
3421 3657          }
3422 3658  
3423 3659          rand = (uint_t)ddi_get_lbolt() +
3424 3660              NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3425 3661          if ((rand/fraction)*fraction == rand) {
3426 3662                  IP_STAT(ipst, ip_nce_reclaim_deleted);
3427 3663                  ncec_delete(ncec);
3428 3664          }
3429 3665  }
3430 3666  
3431 3667  /*
3432 3668   * kmem_cache callback to free up memory.
3433 3669   *
3434 3670   * For now we just delete a fixed fraction.
3435 3671   */
3436 3672  static void
3437 3673  ip_nce_reclaim_stack(ip_stack_t *ipst)
3438 3674  {
3439 3675          uint_t          fraction = ipst->ips_ip_nce_reclaim_fraction;
3440 3676  
3441 3677          IP_STAT(ipst, ip_nce_reclaim_calls);
3442 3678  
3443 3679          ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);
3444 3680  
3445 3681          /*
3446 3682           * Walk all CONNs that can have a reference on an ire, ncec or dce.
3447 3683           * Get them to update any stale references to drop any refholds they
3448 3684           * have.
3449 3685           */
3450 3686          ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3451 3687  }
3452 3688  
3453 3689  /*
3454 3690   * Called by the memory allocator subsystem directly, when the system
3455 3691   * is running low on memory.
3456 3692   */
3457 3693  /* ARGSUSED */
3458 3694  void
3459 3695  ip_nce_reclaim(void *args)
3460 3696  {
3461 3697          netstack_handle_t nh;
3462 3698          netstack_t *ns;
3463 3699          ip_stack_t *ipst;
3464 3700  
3465 3701          netstack_next_init(&nh);
3466 3702          while ((ns = netstack_next(&nh)) != NULL) {
3467 3703                  /*
3468 3704                   * netstack_next() can return a netstack_t with a NULL
3469 3705                   * netstack_ip at boot time.
3470 3706                   */
3471 3707                  if ((ipst = ns->netstack_ip) == NULL) {
3472 3708                          netstack_rele(ns);
3473 3709                          continue;
3474 3710                  }
3475 3711                  ip_nce_reclaim_stack(ipst);
3476 3712                  netstack_rele(ns);
3477 3713          }
3478 3714          netstack_next_fini(&nh);
3479 3715  }
3480 3716  
3481 3717  #ifdef DEBUG
3482 3718  void
3483 3719  ncec_trace_ref(ncec_t *ncec)
3484 3720  {
3485 3721          ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3486 3722  
3487 3723          if (ncec->ncec_trace_disable)
3488 3724                  return;
3489 3725  
3490 3726          if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3491 3727                  ncec->ncec_trace_disable = B_TRUE;
3492 3728                  ncec_trace_cleanup(ncec);
3493 3729          }
3494 3730  }
3495 3731  
3496 3732  void
3497 3733  ncec_untrace_ref(ncec_t *ncec)
3498 3734  {
3499 3735          ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3500 3736  
3501 3737          if (!ncec->ncec_trace_disable)
3502 3738                  th_trace_unref(ncec);
3503 3739  }
3504 3740  
3505 3741  static void
3506 3742  ncec_trace_cleanup(const ncec_t *ncec)
3507 3743  {
3508 3744          th_trace_cleanup(ncec, ncec->ncec_trace_disable);
3509 3745  }
3510 3746  #endif
3511 3747  
3512 3748  /*
3513 3749   * Called when address resolution fails due to a timeout.
3514 3750   * Send an ICMP unreachable in response to all queued packets.
3515 3751   */
3516 3752  void
3517 3753  arp_resolv_failed(ncec_t *ncec)
3518 3754  {
3519 3755          mblk_t  *mp, *nxt_mp;
3520 3756          char    buf[INET6_ADDRSTRLEN];
3521 3757          struct in_addr ipv4addr;
3522 3758          ill_t *ill = ncec->ncec_ill;
3523 3759          ip_stack_t *ipst = ncec->ncec_ipst;
3524 3760          ip_recv_attr_t  iras;
3525 3761  
3526 3762          bzero(&iras, sizeof (iras));
3527 3763          iras.ira_flags = IRAF_IS_IPV4;
3528 3764          /*
3529 3765           * we are setting the ira_rill to the ipmp_ill (instead of
3530 3766           * the actual ill on which the packet was received), but this
3531 3767           * is ok because we don't actually need the real ira_rill
3532 3768           * to send the icmp unreachable to the sender.
3533 3769           */
3534 3770          iras.ira_ill = iras.ira_rill = ill;
3535 3771          iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3536 3772          iras.ira_rifindex = iras.ira_ruifindex;
3537 3773  
3538 3774          IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3539 3775          ip3dbg(("arp_resolv_failed: dst %s\n",
3540 3776              inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3541 3777          mutex_enter(&ncec->ncec_lock);
3542 3778          mp = ncec->ncec_qd_mp;
3543 3779          ncec->ncec_qd_mp = NULL;
3544 3780          ncec->ncec_nprobes = 0;
3545 3781          mutex_exit(&ncec->ncec_lock);
3546 3782          while (mp != NULL) {
3547 3783                  nxt_mp = mp->b_next;
3548 3784                  mp->b_next = NULL;
3549 3785  
3550 3786                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3551 3787                  ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3552 3788                      mp, ill);
3553 3789                  if (ipst->ips_ip_arp_icmp_error) {
3554 3790                          ip3dbg(("arp_resolv_failed: "
3555 3791                              "Calling icmp_unreachable\n"));
3556 3792                          icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3557 3793                  } else {
3558 3794                          freemsg(mp);
3559 3795                  }
3560 3796                  ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3561 3797                  mp = nxt_mp;
3562 3798          }
3563 3799          ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3564 3800  }
3565 3801  
3566 3802  /*
3567 3803   * if ill is an under_ill, translate it to the ipmp_ill and add the
3568 3804   * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3569 3805   * one on the underlying in_ill) will be created for the
3570 3806   * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3571 3807   */
3572 3808  int
3573 3809  nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3574 3810      const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3575 3811  {
3576 3812          int     err;
3577 3813          in6_addr_t addr6;
3578 3814          ip_stack_t *ipst = ill->ill_ipst;
3579 3815          nce_t   *nce, *upper_nce = NULL;
3580 3816          ill_t   *in_ill = ill, *under = NULL;
3581 3817          boolean_t need_ill_refrele = B_FALSE;
3582 3818  
3583 3819          if (flags & NCE_F_MCAST) {
3584 3820                  /*
3585 3821                   * hw_addr will be figured out in nce_set_multicast_v4;
3586 3822                   * caller needs to pass in the cast_ill for ipmp
3587 3823                   */
3588 3824                  ASSERT(hw_addr == NULL);
3589 3825                  ASSERT(!IS_IPMP(ill));
3590 3826                  err = nce_set_multicast_v4(ill, addr, flags, newnce);
3591 3827                  return (err);
3592 3828          }
3593 3829  
3594 3830          if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3595 3831                  ill = ipmp_ill_hold_ipmp_ill(ill);
3596 3832                  if (ill == NULL)
3597 3833                          return (ENXIO);
3598 3834                  need_ill_refrele = B_TRUE;
3599 3835          }
3600 3836          if ((flags & NCE_F_BCAST) != 0) {
3601 3837                  /*
3602 3838                   * IPv4 broadcast ncec: compute the hwaddr.
3603 3839                   */
3604 3840                  if (IS_IPMP(ill)) {
3605 3841                          under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3606 3842                          if (under == NULL)  {
3607 3843                                  if (need_ill_refrele)
3608 3844                                          ill_refrele(ill);
3609 3845                                  return (ENETDOWN);
3610 3846                          }
3611 3847                          hw_addr = under->ill_bcast_mp->b_rptr +
3612 3848                              NCE_LL_ADDR_OFFSET(under);
3613 3849                          hw_addr_len = under->ill_phys_addr_length;
3614 3850                  } else {
3615 3851                          hw_addr = ill->ill_bcast_mp->b_rptr +
3616 3852                              NCE_LL_ADDR_OFFSET(ill);
3617 3853                          hw_addr_len = ill->ill_phys_addr_length;
3618 3854                  }
3619 3855          }
3620 3856  
3621 3857          mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3622 3858          IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3623 3859          nce = nce_lookup_addr(ill, &addr6);
3624 3860          if (nce == NULL) {
3625 3861                  err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3626 3862                      state, &nce);
3627 3863          } else {
3628 3864                  err = EEXIST;
3629 3865          }
3630 3866          mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3631 3867          if (err == 0)
3632 3868                  err = nce_add_v4_postprocess(nce);
3633 3869  
3634 3870          if (in_ill != ill && nce != NULL) {
3635 3871                  nce_t *under_nce = NULL;
3636 3872  
3637 3873                  /*
3638 3874                   * in_ill was the under_ill. Try to create the under_nce.
3639 3875                   * Hold the ill_g_lock to prevent changes to group membership
3640 3876                   * until we are done.
3641 3877                   */
3642 3878                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3643 3879                  if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3644 3880                          DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3645 3881                              ill_t *, ill);
3646 3882                          rw_exit(&ipst->ips_ill_g_lock);
3647 3883                          err = ENXIO;
3648 3884                          nce_refrele(nce);
3649 3885                          nce = NULL;
3650 3886                          goto bail;
3651 3887                  }
3652 3888                  under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3653 3889                  if (under_nce == NULL) {
3654 3890                          rw_exit(&ipst->ips_ill_g_lock);
3655 3891                          err = EINVAL;
3656 3892                          nce_refrele(nce);
3657 3893                          nce = NULL;
3658 3894                          goto bail;
3659 3895                  }
3660 3896                  rw_exit(&ipst->ips_ill_g_lock);
3661 3897                  upper_nce = nce;
3662 3898                  nce = under_nce; /* will be returned to caller */
3663 3899                  if (NCE_ISREACHABLE(nce->nce_common))
3664 3900                          nce_fastpath_trigger(under_nce);
3665 3901          }
3666 3902          if (nce != NULL) {
3667 3903                  if (newnce != NULL)
3668 3904                          *newnce = nce;
3669 3905                  else
3670 3906                          nce_refrele(nce);
3671 3907          }
3672 3908  bail:
3673 3909          if (under != NULL)
3674 3910                  ill_refrele(under);
3675 3911          if (upper_nce != NULL)
3676 3912                  nce_refrele(upper_nce);
3677 3913          if (need_ill_refrele)
3678 3914                  ill_refrele(ill);
3679 3915  
3680 3916          return (err);
3681 3917  }
3682 3918  
3683 3919  /*
3684 3920   * NDP Cache Entry creation routine for IPv4.
3685 3921   * This routine must always be called with ndp4->ndp_g_lock held.
3686 3922   * Prior to return, ncec_refcnt is incremented.
3687 3923   *
3688 3924   * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec)) addresses
3689 3925   * are always added pointing at the ipmp_ill. Thus, when the ill passed
3690 3926   * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3691 3927   * entries will be created, both pointing at the same ncec_t. The nce_t
3692 3928   * entries will have their nce_ill set to the ipmp_ill and the under_ill
3693 3929   * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3694 3930   * Local addresses are always created on the ill passed to nce_add_v4.
3695 3931   */
3696 3932  int
3697 3933  nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3698 3934      const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3699 3935  {
3700 3936          int             err;
3701 3937          boolean_t       is_multicast = (flags & NCE_F_MCAST);
3702 3938          struct in6_addr addr6;
3703 3939          nce_t           *nce;
3704 3940  
3705 3941          ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3706 3942          ASSERT(!ill->ill_isv6);
3707 3943          ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3708 3944  
3709 3945          IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3710 3946          err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3711 3947              &nce);
3712 3948          ASSERT(newnce != NULL);
3713 3949          *newnce = nce;
3714 3950          return (err);
3715 3951  }
3716 3952  
3717 3953  /*
3718 3954   * Post-processing routine to be executed after nce_add_v4(). This function
3719 3955   * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3720 3956   * and must be called without any locks held.
3721 3957   *
3722 3958   * Always returns 0, but we return an int to keep this symmetric with the
3723 3959   * IPv6 counter-part.
            *
            * The work is done in three steps:
            *  1. trigger fastpath unless the entry is published, not reachable, or
            *     has no link-layer address on a link that is not IRE_IF_NORESOLVER;
            *  2. for a published entry in ND_PROBE, request DAD; otherwise, if
            *     NCE_F_UNSOL_ADV is set, send the first ARP announcement and arm
            *     the timer for the remaining ones;
            *  3. if DAD was requested, either restart the timer immediately (probe
            *     count of zero) or start probing through nce_dad().
3724 3960   */
3725 3961  int
3726 3962  nce_add_v4_postprocess(nce_t *nce)
3727 3963  {
3728 3964          ncec_t          *ncec = nce->nce_common;
                   /* Snapshot of the flags, read without ncec_lock held. */
3729 3965          uint16_t        flags = ncec->ncec_flags;
3730 3966          boolean_t       ndp_need_dad = B_FALSE;
3731 3967          boolean_t       dropped;
3732 3968          clock_t         delay;
3733 3969          ip_stack_t      *ipst = ncec->ncec_ill->ill_ipst;
3734 3970          uchar_t         *hw_addr = ncec->ncec_lladdr;
3735 3971          boolean_t       trigger_fastpath = B_TRUE;
3736 3972  
3737 3973          /*
3738 3974           * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3739 3975           * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3740 3976           * We call nce_fastpath from nce_update if the link layer address of
3741 3977           * the peer changes.
3742 3978           */
3743 3979          if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3744 3980              ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3745 3981                  trigger_fastpath = B_FALSE;
3746 3982  
3747 3983          if (trigger_fastpath)
3748 3984                  nce_fastpath_trigger(nce);
3749 3985  
3750 3986          if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3751 3987                  /*
3752 3988                   * Either the caller (by passing in ND_PROBE)
3753 3989                   * or nce_add_common() (by the internally computed state
3754 3990                   * based on ncec_addr and ill_net_type) has determined
3755 3991                   * that this unicast entry needs DAD. Trigger DAD.
3756 3992                   */
3757 3993                  ndp_need_dad = B_TRUE;
3758 3994          } else if (flags & NCE_F_UNSOL_ADV) {
3759 3995                  /*
3760 3996                   * We account for the transmit below by assigning one
3761 3997                   * less than the ndd variable. Subsequent decrements
3762 3998                   * are done in nce_timer.
3763 3999                   */
3764 4000                  mutex_enter(&ncec->ncec_lock);
3765 4001                  ncec->ncec_unsolicit_count =
3766 4002                      ipst->ips_ip_arp_publish_count - 1;
3767 4003                  mutex_exit(&ncec->ncec_lock);
                           /* The announcement itself is sent with ncec_lock dropped. */
3768 4004                  dropped = arp_announce(ncec);
3769 4005                  mutex_enter(&ncec->ncec_lock);
                           /*
                            * If the announcement was dropped, give back the count
                            * charged above; otherwise record when we last defended.
                            */
3770 4006                  if (dropped)
3771 4007                          ncec->ncec_unsolicit_count++;
3772 4008                  else
3773 4009                          ncec->ncec_last_time_defended = ddi_get_lbolt();
3774 4010                  if (ncec->ncec_unsolicit_count != 0) {
3775 4011                          nce_start_timer(ncec,
3776 4012                              ipst->ips_ip_arp_publish_interval);
3777 4013                  }
3778 4014                  mutex_exit(&ncec->ncec_lock);
3779 4015          }
3780 4016  
3781 4017          /*
3782 4018           * If ncec_xmit_interval is 0, user has configured us to send the first
3783 4019           * probe right away.  Do so, and set up for the subsequent probes.
                    *
                    * NOTE(review): the code below does not read ncec_xmit_interval; the
                    * "send right away" decision is made from `delay' being 0. Confirm
                    * which of the two this comment is meant to describe.
3784 4020           */
3785 4021          if (ndp_need_dad) {
3786 4022                  mutex_enter(&ncec->ncec_lock);
3787 4023                  if (ncec->ncec_pcnt == 0) {
3788 4024                          /*
3789 4025                           * DAD probes and announce can be
3790 4026                           * administratively disabled by setting the
3791 4027                           * probe_count to zero. Restart the timer in
3792 4028                           * this case to mark the ipif as ready.
3793 4029                           */
3794 4030                          ncec->ncec_unsolicit_count = 0;
                                   /* nce_restart_timer() takes ncec_lock itself. */
3795 4031                          mutex_exit(&ncec->ncec_lock);
3796 4032                          nce_restart_timer(ncec, 0);
3797 4033                  } else {
3798 4034                          mutex_exit(&ncec->ncec_lock);
                                   /*
                                    * NOTE(review): with NCE_F_FAST set this picks
                                    * ips_arp_probe_delay, and without it
                                    * ips_arp_fastprobe_delay. Going by the names
                                    * the two arms look swapped -- confirm before
                                    * relying on either value.
                                    */
3799 4035                          delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3800 4036                              ipst->ips_arp_probe_delay :
3801 4037                              ipst->ips_arp_fastprobe_delay);
3802 4038                          nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3803 4039                  }
3804 4040          }
3805 4041          return (0);
3806 4042  }
3807 4043  
3808 4044  /*
3809 4045   * ncec_walk routine to update all entries that have a given destination or
3810 4046   * gateway address and cached link layer (MAC) address.  This is used when ARP
3811 4047   * informs us that a network-to-link-layer mapping may have changed.
            *
            * `arg' is a pointer to an nce_hw_map_t describing the new mapping.
            * Entries that are not ND_REACHABLE, or whose IPv4 address differs from
            * hwm_addr, are left untouched. A matching entry is moved to ND_STALE
            * with the new hardware address via nce_update().
3812 4048   */
3813 4049  void
3814 4050  nce_update_hw_changed(ncec_t *ncec, void *arg)
3815 4051  {
3816 4052          nce_hw_map_t *hwm = arg;
3817 4053          ipaddr_t ncec_addr;
3818 4054  
                   /* Both filters below run before ncec_lock is taken. */
3819 4055          if (ncec->ncec_state != ND_REACHABLE)
3820 4056                  return;
3821 4057  
3822 4058          IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3823 4059          if (ncec_addr != hwm->hwm_addr)
3824 4060                  return;
3825 4061  
3826 4062          mutex_enter(&ncec->ncec_lock);
                   /* Non-zero hwm_flags replace (not OR into) the existing flags. */
3827 4063          if (hwm->hwm_flags != 0)
3828 4064                  ncec->ncec_flags = hwm->hwm_flags;
3829 4065          nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3830 4066          mutex_exit(&ncec->ncec_lock);
3831 4067  }
3832 4068  
3833 4069  void
3834 4070  ncec_refhold(ncec_t *ncec)
3835 4071  {
                   /*
                    * Take a reference on `ncec': bump ncec_refcnt under ncec_lock.
                    * The ASSERT catches the counter wrapping around to zero. On
                    * DEBUG builds the hold is also recorded via ncec_trace_ref().
                    */
3836 4072          mutex_enter(&(ncec)->ncec_lock);
3837 4073          (ncec)->ncec_refcnt++;
3838 4074          ASSERT((ncec)->ncec_refcnt != 0);
3839 4075  #ifdef DEBUG
3840 4076          ncec_trace_ref(ncec);
3841 4077  #endif
3842 4078          mutex_exit(&(ncec)->ncec_lock);
3843 4079  }
3844 4080  
3845 4081  void
3846 4082  ncec_refhold_notr(ncec_t *ncec)
3847 4083  {
                   /*
                    * Same as ncec_refhold(), but the hold is never recorded with
                    * ncec_trace_ref(), not even on DEBUG builds.
                    */
3848 4084          mutex_enter(&(ncec)->ncec_lock);
3849 4085          (ncec)->ncec_refcnt++;
3850 4086          ASSERT((ncec)->ncec_refcnt != 0);
3851 4087          mutex_exit(&(ncec)->ncec_lock);
3852 4088  }
3853 4089  
3854 4090  static void
3855 4091  ncec_refhold_locked(ncec_t *ncec)
3856 4092  {
                   /*
                    * Variant of ncec_refhold() for callers that already hold
                    * ncec_lock; the lock is neither taken nor dropped here.
                    * Unlike ncec_refhold(), there is no wrap-around ASSERT on the
                    * counter after the increment.
                    */
3857 4093          ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3858 4094          (ncec)->ncec_refcnt++;
3859 4095  #ifdef DEBUG
3860 4096          ncec_trace_ref(ncec);
3861 4097  #endif
3862 4098  }
3863 4099  
3864 4100  /*
            * Drop a reference on `ncec'. When the count reaches zero the entry is
            * handed to ncec_inactive() with ncec_lock still held.
            * ncec_inactive destroys the mutex thus no mutex_exit is needed
            * on that path. On DEBUG builds the release is recorded via
            * ncec_untrace_ref() before the count is decremented.
            */
3865 4101  void
3866 4102  ncec_refrele(ncec_t *ncec)
3867 4103  {
3868 4104          mutex_enter(&(ncec)->ncec_lock);
3869 4105  #ifdef DEBUG
3870 4106          ncec_untrace_ref(ncec);
3871 4107  #endif
3872 4108          ASSERT((ncec)->ncec_refcnt != 0);
3873 4109          if (--(ncec)->ncec_refcnt == 0) {
3874 4110                  ncec_inactive(ncec);
3875 4111          } else {
3876 4112                  mutex_exit(&(ncec)->ncec_lock);
3877 4113          }
3878 4114  }
3879 4115  
3880 4116  void
3881 4117  ncec_refrele_notr(ncec_t *ncec)
3882 4118  {
                   /*
                    * Same as ncec_refrele(), but without the DEBUG-only
                    * ncec_untrace_ref() bookkeeping. As there, the zero-count path
                    * calls ncec_inactive() with ncec_lock held and does not call
                    * mutex_exit() afterwards.
                    */
3883 4119          mutex_enter(&(ncec)->ncec_lock);
3884 4120          ASSERT((ncec)->ncec_refcnt != 0);
3885 4121          if (--(ncec)->ncec_refcnt == 0) {
3886 4122                  ncec_inactive(ncec);
3887 4123          } else {
3888 4124                  mutex_exit(&(ncec)->ncec_lock);
3889 4125          }
3890 4126  }
3891 4127  
3892 4128  /*
3893 4129   * Common to IPv4 and IPv6.
            *
            * Cancel any pending timeout on `ncec' and arm a new one that fires after
            * `ms' milliseconds. Must be called WITHOUT ncec_lock held; the lock is
            * taken and released internally.
3894 4130   */
3895 4131  void
3896 4132  nce_restart_timer(ncec_t *ncec, uint_t ms)
3897 4133  {
3898 4134          timeout_id_t tid;
3899 4135  
3900 4136          ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3901 4137  
3902 4138          /* First cancel any running timer */
3903 4139          mutex_enter(&ncec->ncec_lock);
3904 4140          tid = ncec->ncec_timeout_id;
3905 4141          ncec->ncec_timeout_id = 0;
3906 4142          if (tid != 0) {
                           /*
                            * untimeout() is called with ncec_lock dropped --
                            * presumably because the timer handler may itself
                            * need ncec_lock; confirm against nce_timer.
                            */
3907 4143                  mutex_exit(&ncec->ncec_lock);
3908 4144                  (void) untimeout(tid);
3909 4145                  mutex_enter(&ncec->ncec_lock);
3910 4146          }
3911 4147  
3912 4148          /* Restart timer */
                   /*
                    * nce_start_timer() is a no-op if the entry is condemned or if
                    * ncec_timeout_id became non-zero again while the lock was dropped.
                    */
3913 4149          nce_start_timer(ncec, ms);
3914 4150          mutex_exit(&ncec->ncec_lock);
3915 4151  }
3916 4152  
3917 4153  static void
3918 4154  nce_start_timer(ncec_t *ncec, uint_t ms)
3919 4155  {
                   /*
                    * Arm nce_timer for `ncec' to fire after `ms' milliseconds.
                    * The caller must hold ncec_lock.
                    */
3920 4156          ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3921 4157          /*
3922 4158           * Don't start the timer if the ncec has been deleted, or if the timer
3923 4159           * is already running
3924 4160           */
3925 4161          if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
                           /* A delay that converts to 0 ticks is rounded up to 1. */
3926 4162                  ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3927 4163                      MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3928 4164          }
3929 4165  }
3930 4166  
3931 4167  int
3932 4168  nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3933 4169      uint16_t flags, nce_t **newnce)
3934 4170  {
                   /*
                    * Look up, or create, the IPv4 multicast nce for `dst' on `ill'.
                    *
                    * `flags' must include NCE_F_MCAST and NCE_F_NONUD (asserted
                    * below). On success 0 is returned; if `newnce' is non-NULL the
                    * nce is stored there, otherwise nce_refrele() is called on it.
                    * On failure a non-zero errno value is returned and `*newnce' is
                    * not written.
                    */
3935 4171          uchar_t         *hw_addr;
3936 4172          int             err = 0;
3937 4173          ip_stack_t      *ipst = ill->ill_ipst;
3938 4174          in6_addr_t      dst6;
3939 4175          nce_t           *nce;
3940 4176  
3941 4177          ASSERT(!ill->ill_isv6);
3942 4178  
3943 4179          IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3944 4180          mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
                   /* Fast path: an nce for this address already exists on the ill. */
3945 4181          if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3946 4182                  mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3947 4183                  goto done;
3948 4184          }
3949 4185          if (ill->ill_net_type == IRE_IF_RESOLVER) {
3950 4186                  /*
3951 4187                   * For IRE_IF_RESOLVER a hardware mapping can be
3952 4188                   * generated, for IRE_IF_NORESOLVER, resolution cookie
3953 4189                   * in the ill is copied in nce_add_v4().
3954 4190                   */
                           /* KM_NOSLEEP: ndp_g_lock is held across this allocation. */
3955 4191                  hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3956 4192                  if (hw_addr == NULL) {
3957 4193                          mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3958 4194                          return (ENOMEM);
3959 4195                  }
3960 4196                  ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3961 4197          } else {
3962 4198                  /*
3963 4199                   * IRE_IF_NORESOLVER type simply copies the resolution
3964 4200                   * cookie passed in.  So no hw_addr is needed.
  
    | 
      ↓ open down ↓ | 
    694 lines elided | 
    
      ↑ open up ↑ | 
  
3965 4201                   */
3966 4202                  hw_addr = NULL;
3967 4203          }
3968 4204          ASSERT(flags & NCE_F_MCAST);
3969 4205          ASSERT(flags & NCE_F_NONUD);
3970 4206          /* nce_state will be computed by nce_add_common() */
3971 4207          err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3972 4208              ND_UNCHANGED, &nce);
3973 4209          mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
                   /*
                    * Post-processing must run with no locks held. In the new
                    * version of the line below, a successful add that nevertheless
                    * produced no nce is reported as ENOMEM rather than being passed
                    * to nce_add_v4_postprocess().
                    */
3974 4210          if (err == 0)
3975      -                err = nce_add_v4_postprocess(nce);
     4211 +                err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
                   /*
                    * The temporary mapping buffer is freed whether or not the add
                    * succeeded -- presumably nce_add_common() keeps its own copy;
                    * confirm.
                    */
3976 4212          if (hw_addr != NULL)
3977 4213                  kmem_free(hw_addr, ill->ill_phys_addr_length);
3978 4214          if (err != 0) {
3979 4215                  ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3980 4216                  return (err);
3981 4217          }
3982 4218  done:
3983 4219          if (newnce != NULL)
3984 4220                  *newnce = nce;
3985 4221          else
3986 4222                  nce_refrele(nce);
3987 4223          return (0);
3988 4224  }
3989 4225  
3990 4226  /*
3991 4227   * This is used when scanning for "old" (least recently broadcast) NCEs.  We
3992 4228   * don't want to have to walk the list for every single one, so we gather up
3993 4229   * batches at a time.
            *
            * NCE_RESCHED_LIST_LEN is the batch size, i.e. the capacity of
            * ncert_nces[] below.
3994 4230   */
3995 4231  #define NCE_RESCHED_LIST_LEN    8
3996 4232  
3997 4233  typedef struct {
                   /* ill whose entries are being collected */
3998 4234          ill_t   *ncert_ill;
                   /* number of valid slots in ncert_nces[] */
3999 4235          uint_t  ncert_num;
                   /* collected entries; each is stored after an ncec_refhold() */
4000 4236          ncec_t  *ncert_nces[NCE_RESCHED_LIST_LEN];
4001 4237  } nce_resched_t;
4002 4238  
4003 4239  /*
4004 4240   * Pick the longest waiting NCEs for defense.
            *
            * Walker callback; `arg' is the nce_resched_t being filled in. Only
            * entries for our own addresses (NCE_MYADDR) in ND_REACHABLE state are
            * considered. Until the list is full, each eligible entry is appended.
            * Once it is full, the candidate replaces any listed entry with a larger
            * ncec_last_time_defended, and the displaced entry becomes the candidate
            * for the remaining slots. Every entry kept in the list carries a
            * reference; the one left over at the end is released. Always returns 0.
4005 4241   */
4006 4242  /* ARGSUSED */
4007 4243  static int
4008 4244  ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4009 4245  {
4010 4246          nce_resched_t *ncert = arg;
4011 4247          ncec_t **ncecs;
4012 4248          ncec_t **ncec_max;
4013 4249          ncec_t *ncec_temp;
4014 4250          ncec_t *ncec = nce->nce_common;
4015 4251  
4016 4252          ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4017 4253          /*
4018 4254           * Only reachable entries that are ready for announcement are eligible.
4019 4255           */
4020 4256          if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4021 4257                  return (0);
4022 4258          if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4023 4259                  ncec_refhold(ncec);
4024 4260                  ncert->ncert_nces[ncert->ncert_num++] = ncec;
4025 4261          } else {
4026 4262                  ncecs = ncert->ncert_nces;
4027 4263                  ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
                           /* Hold the candidate; whichever entry loses is released below. */
4028 4264                  ncec_refhold(ncec);
4029 4265                  for (; ncecs < ncec_max; ncecs++) {
                                   /*
                                    * NOTE(review): `ncec' was already dereferenced
                                    * above, so this assertion adds little; it may
                                    * have been meant to check *ncecs -- confirm.
                                    */
4030 4266                          ASSERT(ncec != NULL);
4031 4267                          if ((*ncecs)->ncec_last_time_defended >
4032 4268                              ncec->ncec_last_time_defended) {
4033 4269                                  ncec_temp = *ncecs;
4034 4270                                  *ncecs = ncec;
4035 4271                                  ncec = ncec_temp;
4036 4272                          }
4037 4273                  }
4038 4274                  ncec_refrele(ncec);
4039 4275          }
4040 4276          return (0);
4041 4277  }
4042 4278  
4043 4279  /*
4044 4280   * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4045 4281   * doesn't happen very often (if at all), and thus it needn't be highly
4046 4282   * optimized.  (Note, though, that it's actually O(N) complexity, because the
4047 4283   * outer loop is bounded by a constant rather than by the length of the list.)
            *
            * Resets ill_defend_count to 0. If the previous count had not reached the
            * defend rate, nothing is rescheduled. Otherwise entries gathered by
            * ncec_reschedule() are marked NCE_F_DELAYED, and ill_defend_count is
            * charged for each one, until the defend rate is reached or a walk yields
            * fewer than NCE_RESCHED_LIST_LEN entries. The gathered entries stay
            * referenced in `ncert'. The visible caller, ill_defend_rate_limit(),
            * calls this with ill_lock held and releases the references afterwards.
4048 4284   */
4049 4285  static void
4050 4286  nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4051 4287  {
4052 4288          ncec_t          *ncec;
4053 4289          ip_stack_t      *ipst = ill->ill_ipst;
4054 4290          uint_t          i, defend_rate;
4055 4291  
                   /* `i' first holds the defend count of the period that just ended. */
4056 4292          i = ill->ill_defend_count;
4057 4293          ill->ill_defend_count = 0;
4058 4294          if (ill->ill_isv6)
4059 4295                  defend_rate = ipst->ips_ndp_defend_rate;
4060 4296          else
4061 4297                  defend_rate = ipst->ips_arp_defend_rate;
4062 4298          /* If none could be sitting around, then don't reschedule */
4063 4299          if (i < defend_rate) {
4064 4300                  DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4065 4301                  return;
4066 4302          }
4067 4303          ncert->ncert_ill = ill;
4068 4304          while (ill->ill_defend_count < defend_rate) {
                           /*
                            * NOTE(review): ncert_num is not reset in this function
                            * between passes of the loop, so a second walk starts
                            * with a full list -- confirm this is intended.
                            */
4069 4305                  nce_walk_common(ill, ncec_reschedule, ncert);
4070 4306                  for (i = 0; i < ncert->ncert_num; i++) {
4071 4307  
4072 4308                          ncec = ncert->ncert_nces[i];
4073 4309                          mutex_enter(&ncec->ncec_lock);
4074 4310                          ncec->ncec_flags |= NCE_F_DELAYED;
4075 4311                          mutex_exit(&ncec->ncec_lock);
4076 4312                          /*
4077 4313                           * we plan to schedule this ncec, so incr the
4078 4314                           * defend_count in anticipation.
4079 4315                           */
4080 4316                          if (++ill->ill_defend_count >= defend_rate)
4081 4317                                  break;
4082 4318                  }
                           /* A short batch means the walk found no more candidates. */
4083 4319                  if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4084 4320                          break;
4085 4321          }
4086 4322  }
4087 4323  
4088 4324  /*
4089 4325   * Check if the current rate-limiting parameters permit the sending
4090 4326   * of another address defense announcement for both IPv4 and IPv6.
4091 4327   * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4092 4328   * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4093 4329   * determines how many address defense announcements are permitted
4094 4330   * in any `defend_period' interval.
            *
            * Both values are read from the IP stack (the ndp_* tunables for IPv6, the
            * arp_* tunables for IPv4). A defend_rate of 0 means sending is never
            * permitted. ill_lock is taken and released internally; ncec_lock is
            * taken briefly while ill_lock is held.
4095 4331   */
4096 4332  static boolean_t
4097 4333  ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4098 4334  {
4099 4335          clock_t         now = ddi_get_lbolt();
4100 4336          ip_stack_t      *ipst = ill->ill_ipst;
                   /* Read before ill_lock is taken. */
4101 4337          clock_t         start = ill->ill_defend_start;
4102 4338          uint32_t        elapsed, defend_period, defend_rate;
4103 4339          nce_resched_t   ncert;
4104 4340          boolean_t       ret;
4105 4341          int             i;
4106 4342  
4107 4343          if (ill->ill_isv6) {
4108 4344                  defend_period = ipst->ips_ndp_defend_period;
4109 4345                  defend_rate = ipst->ips_ndp_defend_rate;
4110 4346          } else {
4111 4347                  defend_period = ipst->ips_arp_defend_period;
4112 4348                  defend_rate = ipst->ips_arp_defend_rate;
4113 4349          }
4114 4350          if (defend_rate == 0)
4115 4351                  return (B_TRUE);
                   /* Zeroing sets ncert_num to 0, so the loop at the end is a no-op */
                   /* unless nce_ill_reschedule() fills the list. */
4116 4352          bzero(&ncert, sizeof (ncert));
4117 4353          mutex_enter(&ill->ill_lock);
4118 4354          if (start > 0) {
4119 4355                  elapsed = now - start;
4120 4356                  if (elapsed > SEC_TO_TICK(defend_period)) {
4121 4357                          ill->ill_defend_start = now;
4122 4358                          /*
4123 4359                           * nce_ill_reschedule will attempt to
4124 4360                           * prevent starvation by rescheduling the
4125 4361                           * oldest entries, which are marked with
4126 4362                           * the NCE_F_DELAYED flag.
4127 4363                           */
4128 4364                          nce_ill_reschedule(ill, &ncert);
4129 4365                  }
4130 4366          } else {
                           /* First use: start the measurement period now. */
4131 4367                  ill->ill_defend_start = now;
4132 4368          }
4133 4369          ASSERT(ill->ill_defend_count <= defend_rate);
4134 4370          mutex_enter(&ncec->ncec_lock);
4135 4371          if (ncec->ncec_flags & NCE_F_DELAYED) {
4136 4372                  /*
4137 4373                   * This ncec was rescheduled as one of the really old
4138 4374                   * entries needing on-going defense. The
4139 4375                   * ill_defend_count was already incremented in
4140 4376                   * nce_ill_reschedule. Go ahead and send the announce.
4141 4377                   */
4142 4378                  ncec->ncec_flags &= ~NCE_F_DELAYED;
4143 4379                  mutex_exit(&ncec->ncec_lock);
4144 4380                  ret = B_FALSE;
4145 4381                  goto done;
4146 4382          }
4147 4383          mutex_exit(&ncec->ncec_lock);
                   /*
                    * Charge this request against the budget. The request that
                    * brings the count up to defend_rate is itself refused, as is
                    * every later one until the count is reset.
                    */
4148 4384          if (ill->ill_defend_count < defend_rate)
4149 4385                  ill->ill_defend_count++;
4150 4386          if (ill->ill_defend_count == defend_rate) {
4151 4387                  /*
4152 4388                   * we are no longer allowed to send unbidden defense
4153 4389                   * messages. Wait for rescheduling.
4154 4390                   */
4155 4391                  ret = B_TRUE;
4156 4392          } else {
4157 4393                  ret = B_FALSE;
4158 4394          }
4159 4395  done:
4160 4396          mutex_exit(&ill->ill_lock);
4161 4397          /*
4162 4398           * After all the locks have been dropped we can restart nce timer,
4163 4399           * and refrele the delayed ncecs
4164 4400           */
                   /* Note: `i' is int while ncert_num is uint_t. */
4165 4401          for (i = 0; i < ncert.ncert_num; i++) {
4166 4402                  clock_t xmit_interval;
4167 4403                  ncec_t  *tmp;
4168 4404  
4169 4405                  tmp = ncert.ncert_nces[i];
4170 4406                  xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4171 4407                      B_FALSE);
4172 4408                  nce_restart_timer(tmp, xmit_interval);
4173 4409                  ncec_refrele(tmp);
4174 4410          }
4175 4411          return (ret);
4176 4412  }
4177 4413  
4178 4414  boolean_t
4179 4415  ndp_announce(ncec_t *ncec)
4180 4416  {
                   /*
                    * Send a Neighbor Advertisement for ncec_addr on ncec_ill,
                    * addressed to ipv6_all_hosts_mcast, carrying the entry's
                    * link-layer address and the flags from nce_advert_flags().
                    * The return value is whatever ndp_xmit() returns.
                    */
4181 4417          return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4182 4418              ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4183 4419              nce_advert_flags(ncec)));
4184 4420  }
4185 4421  
4186 4422  ill_t *
4187 4423  nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4188 4424  {
                   /*
                    * Choose the source address, and the ill to send from, for a
                    * solicitation/probe/announcement concerning `ncec'.
                    *
                    * `src' must be non-NULL and unspecified (::) on entry. On a
                    * non-NULL return, *src holds the chosen address (v4-mapped for
                    * IPv4) and the returned ill was obtained via ill_refhold() or
                    * ipmp_ipif_hold_bound_ill() -- presumably the caller must
                    * release it; confirm.
                    *
                    * NULL is returned, with *src untouched, when the matching ipif
                    * has not finished DAD and this is not one of our own addresses,
                    * or when no source ipif can be selected. NULL is also returned,
                    * after *src has been written, when no ipif was found on the
                    * is_myaddr path.
                    */
4189 4425          mblk_t          *mp;
4190 4426          in6_addr_t      src6;
4191 4427          ipaddr_t        src4;
4192 4428          ill_t           *ill = ncec->ncec_ill;
4193 4429          ill_t           *src_ill = NULL;
4194 4430          ipif_t          *ipif = NULL;
4195 4431          boolean_t       is_myaddr = NCE_MYADDR(ncec);
4196 4432          boolean_t       isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4197 4433  
4198 4434          ASSERT(src != NULL);
4199 4435          ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4200 4436          src6 = *src;
4201 4437          if (is_myaddr) {
                           /* For our own address, the entry's address is the source. */
4202 4438                  src6 = ncec->ncec_addr;
4203 4439                  if (!isv6)
4204 4440                          IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4205 4441          } else {
4206 4442                  /*
4207 4443                   * try to find one from the outgoing packet.
4208 4444                   */
4209 4445                  mutex_enter(&ncec->ncec_lock);
4210 4446                  mp = ncec->ncec_qd_mp;
4211 4447                  if (mp != NULL) {
4212 4448                          if (isv6) {
4213 4449                                  ip6_t   *ip6h = (ip6_t *)mp->b_rptr;
4214 4450  
4215 4451                                  src6 = ip6h->ip6_src;
4216 4452                          } else {
4217 4453                                  ipha_t  *ipha = (ipha_t *)mp->b_rptr;
4218 4454  
4219 4455                                  src4 = ipha->ipha_src;
4220 4456                                  IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4221 4457                          }
4222 4458                  }
4223 4459                  mutex_exit(&ncec->ncec_lock);
4224 4460          }
4225 4461  
4226 4462          /*
4227 4463           * For outgoing packets, if the src of outgoing packet is one
4228 4464           * of the assigned interface addresses use it, otherwise we
4229 4465           * will pick the source address below.
4230 4466           * For local addresses (is_myaddr) doing DAD, NDP announce
4231 4467           * messages are mcast. So we use the (IPMP) cast_ill or the
4232 4468           * (non-IPMP) ncec_ill for these message types. The only case
4233 4469           * of unicast DAD messages are for IPv6 ND probes, for which
4234 4470           * we find the ipif_bound_ill corresponding to the ncec_addr.
4235 4471           */
4236 4472          if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4237 4473                  if (isv6) {
4238 4474                          ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4239 4475                              ill->ill_ipst);
4240 4476                  } else {
4241 4477                          ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4242 4478                              ill->ill_ipst);
4243 4479                  }
4244 4480  
4245 4481                  /*
4246 4482                   * If no relevant ipif can be found, then it's not one of our
4247 4483                   * addresses.  Reset to :: and try to find a src for the NS or
4248 4484                   * ARP request using ipif_select_source_v[4,6]  below.
4249 4485                   * If an ipif can be found, but it's not yet done with
4250 4486                   * DAD verification, and we are not being invoked for
4251 4487                   * DAD (i.e., !is_myaddr), then just postpone this
4252 4488                   * transmission until later.
4253 4489                   */
4254 4490                  if (ipif == NULL) {
4255 4491                          src6 = ipv6_all_zeros;
4256 4492                          src4 = INADDR_ANY;
4257 4493                  } else if (!ipif->ipif_addr_ready && !is_myaddr) {
4258 4494                          DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4259 4495                              ncec_t *, ncec, ipif_t *, ipif);
4260 4496                          ipif_refrele(ipif);
4261 4497                          return (NULL);
4262 4498                  }
4263 4499          }
4264 4500  
4265 4501          if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4266 4502                  /*
4267 4503                   * Pick a source address for this solicitation, but
4268 4504                   * restrict the selection to addresses assigned to the
4269 4505                   * output interface.  We do this because the destination will
4270 4506                   * create a neighbor cache entry for the source address of
4271 4507                   * this packet, so the source address had better be a valid
4272 4508                   * neighbor.
4273 4509                   */
4274 4510                  if (isv6) {
4275 4511                          ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4276 4512                              B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4277 4513                              B_FALSE, NULL);
4278 4514                  } else {
4279 4515                          ipaddr_t nce_addr;
4280 4516  
4281 4517                          IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4282 4518                          ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4283 4519                              B_FALSE, NULL);
4284 4520                  }
                           /*
                            * Nothing usable on an IPMP ill: retry the selection on
                            * the ill returned by ipmp_ill_hold_xmit_ill(), if any.
                            */
4285 4521                  if (ipif == NULL && IS_IPMP(ill)) {
4286 4522                          ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);
4287 4523  
4288 4524                          if (send_ill != NULL) {
4289 4525                                  if (isv6) {
4290 4526                                          ipif = ipif_select_source_v6(send_ill,
4291 4527                                              &ncec->ncec_addr, B_TRUE,
4292 4528                                              IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4293 4529                                              B_FALSE, NULL);
4294 4530                                  } else {
                                                   /* src4 is reused here to hold the */
                                                   /* destination (ncec) address. */
4295 4531                                          IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4296 4532                                              src4);
4297 4533                                          ipif = ipif_select_source_v4(send_ill,
4298 4534                                              src4, ALL_ZONES, B_TRUE, NULL);
4299 4535                                  }
4300 4536                                  ill_refrele(send_ill);
4301 4537                          }
4302 4538                  }
4303 4539  
4304 4540                  if (ipif == NULL) {
4305 4541                          char buf[INET6_ADDRSTRLEN];
4306 4542  
4307 4543                          ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4308 4544                              inet_ntop((isv6 ? AF_INET6 : AF_INET),
4309 4545                              (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4310 4546                          DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4311 4547                          return (NULL);
4312 4548                  }
4313 4549                  src6 = ipif->ipif_v6lcl_addr;
4314 4550          }
4315 4551          *src = src6;
4316 4552          if (ipif != NULL) {
                           /*
                            * Translate the ipif into the ill to send from, then drop
                            * the ipif reference.
                            */
4317 4553                  src_ill = ipif->ipif_ill;
4318 4554                  if (IS_IPMP(src_ill))
4319 4555                          src_ill = ipmp_ipif_hold_bound_ill(ipif);
4320 4556                  else
4321 4557                          ill_refhold(src_ill);
4322 4558                  ipif_refrele(ipif);
4323 4559                  DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4324 4560                      ill_t *, src_ill);
4325 4561          }
4326 4562          return (src_ill);
4327 4563  }
4328 4564  
4329 4565  void
4330 4566  ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4331 4567      uchar_t *hwaddr, int hwaddr_len, int flags)
4332 4568  {
                   /*
                    * Apply a new link-layer address and flags to the IPv4 neighbor
                    * cache entry (or entries) for `*addr'.
                    *
                    * With a non-NULL `ipif', only the entry on ipif->ipif_ill is
                    * updated; if there is none, nothing happens. With a NULL `ipif',
                    * all IPv4 entries are walked with nce_update_hw_changed().
                    * `hwaddr_len' is only used on the NULL-ipif path.
                    */
4333 4569          ill_t   *ill;
4334 4570          ncec_t  *ncec;
4335 4571          nce_t   *nce;
4336 4572          uint16_t new_state;
4337 4573  
4338 4574          ill = (ipif ? ipif->ipif_ill : NULL);
4339 4575          if (ill != NULL) {
4340 4576                  /*
4341 4577                   * only one ncec is possible
4342 4578                   */
4343 4579                  nce = nce_lookup_v4(ill, addr);
4344 4580                  if (nce != NULL) {
4345 4581                          ncec = nce->nce_common;
4346 4582                          mutex_enter(&ncec->ncec_lock);
                                   /*
                                    * A reachable entry keeps its state; any other
                                    * entry is moved to ND_STALE.
                                    */
4347 4583                          if (NCE_ISREACHABLE(ncec))
4348 4584                                  new_state = ND_UNCHANGED;
4349 4585                          else
4350 4586                                  new_state = ND_STALE;
                                   /* The existing flags are replaced, not merged. */
4351 4587                          ncec->ncec_flags = flags;
4352 4588                          nce_update(ncec, new_state, hwaddr);
4353 4589                          mutex_exit(&ncec->ncec_lock);
4354 4590                          nce_refrele(nce);
4355 4591                          return;
4356 4592                  }
4357 4593          } else {
4358 4594                  /*
4359 4595                   * ill is wildcard; clean up all ncec's and ire's
4360 4596                   * that match on addr.
4361 4597                   */
4362 4598                  nce_hw_map_t hwm;
4363 4599  
4364 4600                  hwm.hwm_addr = *addr;
4365 4601                  hwm.hwm_hwlen = hwaddr_len;
4366 4602                  hwm.hwm_hwaddr = hwaddr;
4367 4603                  hwm.hwm_flags = flags;
4368 4604  
4369 4605                  ncec_walk_common(ipst->ips_ndp4, NULL,
4370 4606                      nce_update_hw_changed, &hwm, B_TRUE);
4371 4607          }
4372 4608  }
4373 4609  
4374 4610  /*
4375 4611   * Common function to add ncec entries.
4376 4612   * we always add the ncec with ncec_ill == ill, and always create
4377 4613   * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4378 4614   * ncec is !reachable.
4379 4615   *
4380 4616   * When the caller passes in an nce_state of ND_UNCHANGED,
4381 4617   * nce_add_common() will determine the state of the created nce based
4382 4618   * on the ill_net_type and nce_flags used. Otherwise, the nce will
4383 4619   * be created with state set to the passed in nce_state.
4384 4620   */
4385 4621  static int
4386 4622  nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4387 4623      const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4388 4624  {
  
    | 
      ↓ open down ↓ | 
    403 lines elided | 
    
      ↑ open up ↑ | 
  
4389 4625          static  ncec_t          nce_nil;
4390 4626          uchar_t                 *template = NULL;
4391 4627          int                     err;
4392 4628          ncec_t                  *ncec;
4393 4629          ncec_t                  **ncep;
4394 4630          ip_stack_t              *ipst = ill->ill_ipst;
4395 4631          uint16_t                state;
4396 4632          boolean_t               fastprobe = B_FALSE;
4397 4633          struct ndp_g_s          *ndp;
4398 4634          nce_t                   *nce = NULL;
     4635 +        list_t                  graveyard;
4399 4636          mblk_t                  *dlur_mp = NULL;
4400 4637  
4401 4638          if (ill->ill_isv6)
4402 4639                  ndp = ill->ill_ipst->ips_ndp6;
4403 4640          else
4404 4641                  ndp = ill->ill_ipst->ips_ndp4;
4405 4642  
4406 4643          *retnce = NULL;
4407 4644  
4408 4645          ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4409 4646  
4410 4647          if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4411 4648                  ip0dbg(("nce_add_common: no addr\n"));
4412 4649                  return (EINVAL);
4413 4650          }
4414 4651          if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4415 4652                  ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4416 4653                  return (EINVAL);
4417 4654          }
4418 4655  
4419 4656          if (ill->ill_isv6) {
4420 4657                  ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4421 4658          } else {
4422 4659                  ipaddr_t v4addr;
4423 4660  
4424 4661                  IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4425 4662                  ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4426 4663          }
4427 4664  
4428 4665          /*
4429 4666           * The caller has ensured that there is no nce on ill, but there could
4430 4667           * still be an nce_common_t for the address, so that we find exisiting
4431 4668           * ncec_t strucutures first, and atomically add a new nce_t if
4432 4669           * one is found. The ndp_g_lock ensures that we don't cross threads
4433 4670           * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4434 4671           * compare for matches across the illgrp because this function is
4435 4672           * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4436 4673           * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4437 4674           * appropriate.
4438 4675           */
4439 4676          ncec = *ncep;
4440 4677          for (; ncec != NULL; ncec = ncec->ncec_next) {
4441 4678                  if (ncec->ncec_ill == ill) {
4442 4679                          if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4443 4680                                  /*
4444 4681                                   * We should never find *retnce to be
4445 4682                                   * MYADDR, since the caller may then
4446 4683                                   * incorrectly restart a DAD timer that's
4447 4684                                   * already running.  However, if we are in
4448 4685                                   * forwarding mode, and the interface is
4449 4686                                   * moving in/out of groups, the data
4450 4687                                   * path ire lookup (e.g., ire_revalidate_nce)
4451 4688                                   * may  have determined that some destination
4452 4689                                   * is offlink while the control path is adding
4453 4690                                   * that address as a local address.
4454 4691                                   * Recover from  this case by failing the
4455 4692                                   * lookup
4456 4693                                   */
4457 4694                                  if (NCE_MYADDR(ncec))
4458 4695                                          return (ENXIO);
4459 4696                                  *retnce = nce_ill_lookup_then_add(ill, ncec);
4460 4697                                  if (*retnce != NULL)
4461 4698                                          break;
4462 4699                          }
4463 4700                  }
4464 4701          }
4465 4702          if (*retnce != NULL) /* caller must trigger fastpath on nce */
4466 4703                  return (0);
4467 4704  
4468 4705          ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4469 4706          if (ncec == NULL)
4470 4707                  return (ENOMEM);
4471 4708          *ncec = nce_nil;
4472 4709          ncec->ncec_ill = ill;
4473 4710          ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4474 4711          ncec->ncec_flags = flags;
4475 4712          ncec->ncec_ipst = ipst; /* No netstack_hold */
4476 4713  
4477 4714          if (!ill->ill_isv6) {
4478 4715                  ipaddr_t addr4;
4479 4716  
4480 4717                  /*
4481 4718                   * DAD probe interval and probe count are set based on
4482 4719                   * fast/slow probe settings. If the underlying link doesn't
4483 4720                   * have reliably up/down notifications or if we're working
4484 4721                   * with IPv4 169.254.0.0/16 Link Local Address space, then
4485 4722                   * don't use the fast timers.  Otherwise, use them.
4486 4723                   */
4487 4724                  ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4488 4725                  IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4489 4726                  if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4490 4727                          fastprobe = B_TRUE;
4491 4728                  } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4492 4729                      !IS_IPV4_LL_SPACE(&addr4)) {
4493 4730                          ill_t *hwaddr_ill;
4494 4731  
4495 4732                          hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4496 4733                              hw_addr_len);
4497 4734                          if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4498 4735                                  fastprobe = B_TRUE;
4499 4736                  }
4500 4737                  if (fastprobe) {
4501 4738                          ncec->ncec_xmit_interval =
4502 4739                              ipst->ips_arp_fastprobe_interval;
4503 4740                          ncec->ncec_pcnt =
4504 4741                              ipst->ips_arp_fastprobe_count;
4505 4742                          ncec->ncec_flags |= NCE_F_FAST;
4506 4743                  } else {
4507 4744                          ncec->ncec_xmit_interval =
4508 4745                              ipst->ips_arp_probe_interval;
4509 4746                          ncec->ncec_pcnt =
4510 4747                              ipst->ips_arp_probe_count;
4511 4748                  }
4512 4749                  if (NCE_PUBLISH(ncec)) {
4513 4750                          ncec->ncec_unsolicit_count =
4514 4751                              ipst->ips_ip_arp_publish_count;
4515 4752                  }
4516 4753          } else {
4517 4754                  /*
4518 4755                   * probe interval is constant: ILL_PROBE_INTERVAL
4519 4756                   * probe count is constant: ND_MAX_UNICAST_SOLICIT
4520 4757                   */
4521 4758                  ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4522 4759                  if (NCE_PUBLISH(ncec)) {
4523 4760                          ncec->ncec_unsolicit_count =
4524 4761                              ipst->ips_ip_ndp_unsolicit_count;
4525 4762                  }
4526 4763          }
4527 4764          ncec->ncec_rcnt = ill->ill_xmit_count;
4528 4765          ncec->ncec_addr = *addr;
4529 4766          ncec->ncec_qd_mp = NULL;
4530 4767          ncec->ncec_refcnt = 1; /* for ncec getting created */
4531 4768          mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4532 4769          ncec->ncec_trace_disable = B_FALSE;
4533 4770  
4534 4771          /*
4535 4772           * ncec_lladdr holds link layer address
4536 4773           */
4537 4774          if (hw_addr_len > 0) {
4538 4775                  template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4539 4776                  if (template == NULL) {
4540 4777                          err = ENOMEM;
4541 4778                          goto err_ret;
4542 4779                  }
4543 4780                  ncec->ncec_lladdr = template;
4544 4781                  ncec->ncec_lladdr_length = hw_addr_len;
4545 4782                  bzero(ncec->ncec_lladdr, hw_addr_len);
4546 4783          }
4547 4784          if ((flags & NCE_F_BCAST) != 0) {
4548 4785                  state = ND_REACHABLE;
4549 4786                  ASSERT(hw_addr_len > 0);
4550 4787          } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4551 4788                  state = ND_INITIAL;
4552 4789          } else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4553 4790                  /*
4554 4791                   * NORESOLVER entries are always created in the REACHABLE
4555 4792                   * state.
4556 4793                   */
4557 4794                  state = ND_REACHABLE;
4558 4795                  if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4559 4796                      ill->ill_mactype != DL_IPV4 &&
4560 4797                      ill->ill_mactype != DL_6TO4) {
4561 4798                          /*
4562 4799                           * We create a nce_res_mp with the IP nexthop address
4563 4800                           * as the destination address if the physical length
4564 4801                           * is exactly 4 bytes for point-to-multipoint links
4565 4802                           * that do their own resolution from IP to link-layer
4566 4803                           * address (e.g. IP over X.25).
4567 4804                           */
4568 4805                          bcopy((uchar_t *)addr,
4569 4806                              ncec->ncec_lladdr, ill->ill_phys_addr_length);
4570 4807                  }
4571 4808                  if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4572 4809                      ill->ill_mactype != DL_IPV6) {
4573 4810                          /*
4574 4811                           * We create a nce_res_mp with the IP nexthop address
4575 4812                           * as the destination address if the physical legnth
4576 4813                           * is exactly 16 bytes for point-to-multipoint links
4577 4814                           * that do their own resolution from IP to link-layer
4578 4815                           * address.
4579 4816                           */
4580 4817                          bcopy((uchar_t *)addr,
4581 4818                              ncec->ncec_lladdr, ill->ill_phys_addr_length);
4582 4819                  }
4583 4820                  /*
4584 4821                   * Since NUD is not part of the base IPv4 protocol definition,
4585 4822                   * IPv4 neighbor entries on NORESOLVER interfaces will never
4586 4823                   * age, and are marked NCE_F_NONUD.
4587 4824                   */
4588 4825                  if (!ill->ill_isv6)
4589 4826                          ncec->ncec_flags |= NCE_F_NONUD;
4590 4827          } else if (ill->ill_net_type == IRE_LOOPBACK) {
4591 4828                  state = ND_REACHABLE;
4592 4829          }
4593 4830  
4594 4831          if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4595 4832                  /*
4596 4833                   * We are adding an ncec with a deterministic hw_addr,
4597 4834                   * so the state can only be one of {REACHABLE, STALE, PROBE}.
4598 4835                   *
4599 4836                   * if we are adding a unicast ncec for the local address
4600 4837                   * it would be REACHABLE; we would be adding a ND_STALE entry
4601 4838                   * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4602 4839                   * addresses are added in PROBE to trigger DAD.
4603 4840                   */
4604 4841                  if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4605 4842                      ill->ill_net_type == IRE_IF_NORESOLVER)
4606 4843                          state = ND_REACHABLE;
4607 4844                  else if (!NCE_PUBLISH(ncec))
4608 4845                          state = ND_STALE;
4609 4846                  else
4610 4847                          state = ND_PROBE;
4611 4848                  if (hw_addr != NULL)
4612 4849                          nce_set_ll(ncec, hw_addr);
4613 4850          }
4614 4851          /* caller overrides internally computed state */
4615 4852          if (nce_state != ND_UNCHANGED)
4616 4853                  state = nce_state;
4617 4854  
4618 4855          if (state == ND_PROBE)
4619 4856                  ncec->ncec_flags |= NCE_F_UNVERIFIED;
4620 4857  
4621 4858          ncec->ncec_state = state;
4622 4859  
4623 4860          if (state == ND_REACHABLE) {
4624 4861                  ncec->ncec_last = ncec->ncec_init_time =
4625 4862                      TICK_TO_MSEC(ddi_get_lbolt64());
4626 4863          } else {
4627 4864                  ncec->ncec_last = 0;
4628 4865                  if (state == ND_INITIAL)
4629 4866                          ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4630 4867          }
4631 4868          list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4632 4869              offsetof(ncec_cb_t, ncec_cb_node));
4633 4870          /*
4634 4871           * have all the memory allocations out of the way before taking locks
4635 4872           * and adding the nce.
4636 4873           */
4637 4874          nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4638 4875          if (nce == NULL) {
4639 4876                  err = ENOMEM;
4640 4877                  goto err_ret;
4641 4878          }
4642 4879          if (ncec->ncec_lladdr != NULL ||
4643 4880              ill->ill_net_type == IRE_IF_NORESOLVER) {
4644 4881                  dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4645 4882                      ill->ill_phys_addr_length, ill->ill_sap,
4646 4883                      ill->ill_sap_length);
4647 4884                  if (dlur_mp == NULL) {
4648 4885                          err = ENOMEM;
4649 4886                          goto err_ret;
4650 4887                  }
4651 4888          }
4652 4889  
4653 4890          /*
4654 4891           * Atomically ensure that the ill is not CONDEMNED, before
4655 4892           * adding the NCE.
4656 4893           */
4657 4894          mutex_enter(&ill->ill_lock);
4658 4895          if (ill->ill_state_flags & ILL_CONDEMNED) {
4659 4896                  mutex_exit(&ill->ill_lock);
4660 4897                  err = EINVAL;
4661 4898                  goto err_ret;
4662 4899          }
4663 4900          if (!NCE_MYADDR(ncec) &&
4664 4901              (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4665 4902                  mutex_exit(&ill->ill_lock);
4666 4903                  DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4667 4904                  err = EINVAL;
4668 4905                  goto err_ret;
4669 4906          }
4670 4907          /*
4671 4908           * Acquire the ncec_lock even before adding the ncec to the list
4672 4909           * so that it cannot get deleted after the ncec is added, but
4673 4910           * before we add the nce.
4674 4911           */
4675 4912          mutex_enter(&ncec->ncec_lock);
4676 4913          if ((ncec->ncec_next = *ncep) != NULL)
4677 4914                  ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4678 4915          *ncep = ncec;
  
    | 
      ↓ open down ↓ | 
    270 lines elided | 
    
      ↑ open up ↑ | 
  
4679 4916          ncec->ncec_ptpn = ncep;
4680 4917  
4681 4918          /* Bump up the number of ncec's referencing this ill */
4682 4919          DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4683 4920              (char *), "ncec", (void *), ncec);
4684 4921          ill->ill_ncec_cnt++;
4685 4922          /*
4686 4923           * Since we hold the ncec_lock at this time, the ncec cannot be
4687 4924           * condemned, and we can safely add the nce.
4688 4925           */
4689      -        *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
     4926 +        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
     4927 +        *retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
4690 4928          mutex_exit(&ncec->ncec_lock);
4691 4929          mutex_exit(&ill->ill_lock);
     4930 +        nce_graveyard_free(&graveyard);
4692 4931  
4693 4932          /* caller must trigger fastpath on *retnce */
4694 4933          return (0);
4695 4934  
4696 4935  err_ret:
4697 4936          if (ncec != NULL)
4698 4937                  kmem_cache_free(ncec_cache, ncec);
4699 4938          if (nce != NULL)
4700 4939                  kmem_cache_free(nce_cache, nce);
4701 4940          freemsg(dlur_mp);
4702 4941          if (template != NULL)
4703 4942                  kmem_free(template, ill->ill_phys_addr_length);
4704 4943          return (err);
4705 4944  }
4706 4945  
4707 4946  /*
4708 4947   * take a ref on the nce
4709 4948   */
4710 4949  void
4711 4950  nce_refhold(nce_t *nce)
4712 4951  {
4713 4952          mutex_enter(&nce->nce_lock);
4714 4953          nce->nce_refcnt++;
4715 4954          ASSERT((nce)->nce_refcnt != 0);
4716 4955          mutex_exit(&nce->nce_lock);
4717 4956  }
4718 4957  
4719 4958  /*
4720 4959   * release a ref on the nce; In general, this
4721 4960   * cannot be called with locks held because nce_inactive
4722 4961   * may result in nce_inactive which will take the ill_lock,
4723 4962   * do ipif_ill_refrele_tail etc. Thus the one exception
4724 4963   * where this can be called with locks held is when the caller
4725 4964   * is certain that the nce_refcnt is sufficient to prevent
4726 4965   * the invocation of nce_inactive.
4727 4966   */
4728 4967  void
4729 4968  nce_refrele(nce_t *nce)
4730 4969  {
4731 4970          ASSERT((nce)->nce_refcnt != 0);
4732 4971          mutex_enter(&nce->nce_lock);
4733 4972          if (--nce->nce_refcnt == 0)
4734 4973                  nce_inactive(nce); /* destroys the mutex */
4735 4974          else
4736 4975                  mutex_exit(&nce->nce_lock);
4737 4976  }
4738 4977  
4739 4978  /*
4740 4979   * free the nce after all refs have gone away.
4741 4980   */
4742 4981  static void
4743 4982  nce_inactive(nce_t *nce)
4744 4983  {
4745 4984          ill_t *ill = nce->nce_ill;
4746 4985  
4747 4986          ASSERT(nce->nce_refcnt == 0);
4748 4987  
4749 4988          ncec_refrele_notr(nce->nce_common);
4750 4989          nce->nce_common = NULL;
4751 4990          freemsg(nce->nce_fp_mp);
4752 4991          freemsg(nce->nce_dlur_mp);
4753 4992  
4754 4993          mutex_enter(&ill->ill_lock);
4755 4994          DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4756 4995              (char *), "nce", (void *), nce);
4757 4996          ill->ill_nce_cnt--;
4758 4997          nce->nce_ill = NULL;
4759 4998          /*
4760 4999           * If the number of ncec's associated with this ill have dropped
4761 5000           * to zero, check whether we need to restart any operation that
4762 5001           * is waiting for this to happen.
4763 5002           */
4764 5003          if (ILL_DOWN_OK(ill)) {
4765 5004                  /* ipif_ill_refrele_tail drops the ill_lock */
4766 5005                  ipif_ill_refrele_tail(ill);
  
    | 
      ↓ open down ↓ | 
    65 lines elided | 
    
      ↑ open up ↑ | 
  
4767 5006          } else {
4768 5007                  mutex_exit(&ill->ill_lock);
4769 5008          }
4770 5009  
4771 5010          mutex_destroy(&nce->nce_lock);
4772 5011          kmem_cache_free(nce_cache, nce);
4773 5012  }
4774 5013  
4775 5014  /*
4776 5015   * Add an nce to the ill_nce list.
     5016 + *
     5017 + * Adding multicast NCEs is subject to a per-ill limit. This function returns
     5018 + * NULL if that's the case, and it may reap a number of multicast nces.
     5019 + * Callers (and upstack) must be able to cope with NULL returns.
4777 5020   */
4778 5021  static nce_t *
4779      -nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
     5022 +nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
     5023 +    list_t *graveyard)
4780 5024  {
     5025 +        ASSERT(MUTEX_HELD(&ill->ill_lock));
     5026 +
     5027 +        if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
     5028 +                if (nce_too_many_mcast(ill, graveyard)) {
     5029 +                        kmem_cache_free(nce_cache, nce);
     5030 +                        return (NULL);
     5031 +                }
     5032 +                ill->ill_mcast_nces++;
     5033 +        }
     5034 +
4781 5035          bzero(nce, sizeof (*nce));
4782 5036          mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4783 5037          nce->nce_common = ncec;
4784 5038          nce->nce_addr = ncec->ncec_addr;
4785 5039          nce->nce_ill = ill;
4786 5040          DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4787 5041              (char *), "nce", (void *), nce);
4788 5042          ill->ill_nce_cnt++;
4789 5043  
4790 5044          nce->nce_refcnt = 1; /* for the thread */
4791 5045          ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4792 5046          nce->nce_dlur_mp = dlur_mp;
4793 5047  
4794 5048          /* add nce to the ill's fastpath list.  */
4795 5049          nce->nce_refcnt++; /* for the list */
4796 5050          list_insert_head(&ill->ill_nce, nce);
4797 5051          return (nce);
4798 5052  }
4799 5053  
4800 5054  static nce_t *
4801      -nce_add(ill_t *ill, ncec_t *ncec)
     5055 +nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
4802 5056  {
4803 5057          nce_t   *nce;
4804 5058          mblk_t  *dlur_mp = NULL;
4805 5059  
4806 5060          ASSERT(MUTEX_HELD(&ill->ill_lock));
4807 5061          ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4808 5062  
4809 5063          nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4810 5064          if (nce == NULL)
4811 5065                  return (NULL);
4812 5066          if (ncec->ncec_lladdr != NULL ||
4813 5067              ill->ill_net_type == IRE_IF_NORESOLVER) {
4814 5068                  dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4815 5069                      ill->ill_phys_addr_length, ill->ill_sap,
4816 5070                      ill->ill_sap_length);
4817 5071                  if (dlur_mp == NULL) {
4818 5072                          kmem_cache_free(nce_cache, nce);
4819 5073                          return (NULL);
4820 5074                  }
4821 5075          }
4822      -        return (nce_add_impl(ill, ncec, nce, dlur_mp));
     5076 +        /*
     5077 +         * If nce_add_impl() returns NULL due to on multicast limiting, caller
     5078 +         * will (correctly) assume ENOMEM.
     5079 +         */
     5080 +        return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
4823 5081  }
4824 5082  
4825 5083  /*
4826 5084   * remove the nce from the ill_faspath list
4827 5085   */
4828 5086  void
4829 5087  nce_delete(nce_t *nce)
4830 5088  {
4831 5089          ill_t   *ill = nce->nce_ill;
4832 5090  
4833 5091          ASSERT(MUTEX_HELD(&ill->ill_lock));
4834 5092  
4835 5093          mutex_enter(&nce->nce_lock);
  
    | 
      ↓ open down ↓ | 
    3 lines elided | 
    
      ↑ open up ↑ | 
  
4836 5094          if (nce->nce_is_condemned) {
4837 5095                  /*
4838 5096                   * some other thread has removed this nce from the ill_nce list
4839 5097                   */
4840 5098                  mutex_exit(&nce->nce_lock);
4841 5099                  return;
4842 5100          }
4843 5101          nce->nce_is_condemned = B_TRUE;
4844 5102          mutex_exit(&nce->nce_lock);
4845 5103  
     5104 +        /* Update the count of multicast NCEs. */
     5105 +        if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
     5106 +                ill->ill_mcast_nces--;
     5107 +
4846 5108          list_remove(&ill->ill_nce, nce);
4847 5109          /*
4848 5110           * even though we are holding the ill_lock, it is ok to
4849 5111           * call nce_refrele here because we know that we should have
4850 5112           * at least 2 refs on the nce: one for the thread, and one
4851 5113           * for the list. The refrele below will release the one for
4852 5114           * the list.
4853 5115           */
4854 5116          nce_refrele(nce);
4855 5117  }
4856 5118  
4857 5119  nce_t *
4858 5120  nce_lookup(ill_t *ill, const in6_addr_t *addr)
4859 5121  {
4860 5122          nce_t *nce = NULL;
4861 5123  
4862 5124          ASSERT(ill != NULL);
4863 5125          ASSERT(MUTEX_HELD(&ill->ill_lock));
4864 5126  
4865 5127          for (nce = list_head(&ill->ill_nce); nce != NULL;
4866 5128              nce = list_next(&ill->ill_nce, nce)) {
4867 5129                  if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4868 5130                          break;
4869 5131          }
4870 5132  
4871 5133          /*
4872 5134           * if we found the nce on the ill_nce list while holding
4873 5135           * the ill_lock, then it cannot be condemned yet.
4874 5136           */
4875 5137          if (nce != NULL) {
4876 5138                  ASSERT(!nce->nce_is_condemned);
4877 5139                  nce_refhold(nce);
4878 5140          }
4879 5141          return (nce);
4880 5142  }
4881 5143  
4882 5144  /*
4883 5145   * Walk the ill_nce list on ill. The callback function func() cannot perform
4884 5146   * any destructive actions.
4885 5147   */
4886 5148  static void
4887 5149  nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4888 5150  {
4889 5151          nce_t *nce = NULL, *nce_next;
4890 5152  
4891 5153          ASSERT(MUTEX_HELD(&ill->ill_lock));
4892 5154          for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4893 5155                  nce_next = list_next(&ill->ill_nce, nce);
4894 5156                  if (func(ill, nce, arg) != 0)
4895 5157                          break;
4896 5158                  nce = nce_next;
4897 5159          }
4898 5160  }
4899 5161  
4900 5162  void
4901 5163  nce_walk(ill_t *ill, pfi_t func, void *arg)
4902 5164  {
4903 5165          mutex_enter(&ill->ill_lock);
4904 5166          nce_walk_common(ill, func, arg);
4905 5167          mutex_exit(&ill->ill_lock);
4906 5168  }
4907 5169  
4908 5170  void
4909 5171  nce_flush(ill_t *ill, boolean_t flushall)
4910 5172  {
4911 5173          nce_t *nce, *nce_next;
4912 5174          list_t dead;
4913 5175  
4914 5176          list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4915 5177          mutex_enter(&ill->ill_lock);
4916 5178          for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4917 5179                  nce_next = list_next(&ill->ill_nce, nce);
4918 5180                  if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4919 5181                          nce = nce_next;
4920 5182                          continue;
4921 5183                  }
4922 5184                  /*
4923 5185                   * nce_delete requires that the caller should either not
4924 5186                   * be holding locks, or should hold a ref to ensure that
4925 5187                   * we wont hit ncec_inactive. So take a ref and clean up
4926 5188                   * after the list is flushed.
4927 5189                   */
4928 5190                  nce_refhold(nce);
4929 5191                  nce_delete(nce);
4930 5192                  list_insert_tail(&dead, nce);
4931 5193                  nce = nce_next;
4932 5194          }
4933 5195          mutex_exit(&ill->ill_lock);
4934 5196          while ((nce = list_head(&dead)) != NULL) {
4935 5197                  list_remove(&dead, nce);
4936 5198                  nce_refrele(nce);
4937 5199          }
4938 5200          ASSERT(list_is_empty(&dead));
4939 5201          list_destroy(&dead);
4940 5202  }
4941 5203  
4942 5204  /* Return an interval that is anywhere in the [1 .. intv] range */
4943 5205  static clock_t
4944 5206  nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4945 5207  {
4946 5208          clock_t rnd, frac;
4947 5209  
4948 5210          (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4949 5211          /* Note that clock_t is signed; must chop off bits */
4950 5212          rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4951 5213          if (initial_time) {
4952 5214                  if (intv <= 0)
4953 5215                          intv = 1;
4954 5216                  else
4955 5217                          intv = (rnd % intv) + 1;
4956 5218          } else {
4957 5219                  /* Compute 'frac' as 20% of the configured interval */
4958 5220                  if ((frac = intv / 5) <= 1)
4959 5221                          frac = 2;
4960 5222                  /* Set intv randomly in the range [intv-frac .. intv+frac] */
4961 5223                  if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4962 5224                          intv = 1;
4963 5225          }
4964 5226          return (intv);
4965 5227  }
4966 5228  
4967 5229  void
4968 5230  nce_resolv_ipmp_ok(ncec_t *ncec)
4969 5231  {
4970 5232          mblk_t *mp;
4971 5233          uint_t pkt_len;
4972 5234          iaflags_t ixaflags = IXAF_NO_TRACE;
4973 5235          nce_t *under_nce;
4974 5236          ill_t   *ill = ncec->ncec_ill;
4975 5237          boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4976 5238          ipif_t *src_ipif = NULL;
4977 5239          ip_stack_t *ipst = ill->ill_ipst;
4978 5240          ill_t *send_ill;
4979 5241          uint_t nprobes;
4980 5242  
4981 5243          ASSERT(IS_IPMP(ill));
4982 5244  
4983 5245          mutex_enter(&ncec->ncec_lock);
4984 5246          nprobes = ncec->ncec_nprobes;
4985 5247          mp = ncec->ncec_qd_mp;
4986 5248          ncec->ncec_qd_mp = NULL;
4987 5249          ncec->ncec_nprobes = 0;
4988 5250          mutex_exit(&ncec->ncec_lock);
4989 5251  
4990 5252          while (mp != NULL) {
4991 5253                  mblk_t *nxt_mp;
4992 5254  
4993 5255                  nxt_mp = mp->b_next;
4994 5256                  mp->b_next = NULL;
4995 5257                  if (isv6) {
4996 5258                          ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4997 5259  
4998 5260                          pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4999 5261                          src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
5000 5262                              ill, ALL_ZONES, ipst);
5001 5263                  } else {
5002 5264                          ipha_t *ipha = (ipha_t *)mp->b_rptr;
5003 5265  
5004 5266                          ixaflags |= IXAF_IS_IPV4;
5005 5267                          pkt_len = ntohs(ipha->ipha_length);
5006 5268                          src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5007 5269                              ill, ALL_ZONES, ipst);
5008 5270                  }
5009 5271  
5010 5272                  /*
5011 5273                   * find a new nce based on an under_ill. The first IPMP probe
5012 5274                   * packet gets queued, so we could still find a src_ipif that
5013 5275                   * matches an IPMP test address.
5014 5276                   */
5015 5277                  if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5016 5278                          /*
5017 5279                           * if src_ipif is null, this could be either a
5018 5280                           * forwarded packet or a probe whose src got deleted.
5019 5281                           * We identify the former case by looking for the
5020 5282                           * ncec_nprobes: the first ncec_nprobes packets are
5021 5283                           * probes;
5022 5284                           */
5023 5285                          if (src_ipif == NULL && nprobes > 0)
5024 5286                                  goto drop_pkt;
5025 5287  
5026 5288                          /*
5027 5289                           * For forwarded packets, we use the ipmp rotor
5028 5290                           * to find send_ill.
5029 5291                           */
5030 5292                          send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
5031 5293                              B_TRUE);
5032 5294                  } else {
5033 5295                          send_ill = src_ipif->ipif_ill;
5034 5296                          ill_refhold(send_ill);
5035 5297                  }
5036 5298  
5037 5299                  DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5038 5300                      (ncec_t *), ncec, (ipif_t *),
5039 5301                      src_ipif, (ill_t *), send_ill);
5040 5302  
5041 5303                  if (send_ill == NULL) {
5042 5304                          if (src_ipif != NULL)
5043 5305                                  ipif_refrele(src_ipif);
5044 5306                          goto drop_pkt;
5045 5307                  }
5046 5308                  /* create an under_nce on send_ill */
5047 5309                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5048 5310                  if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5049 5311                          under_nce = nce_fastpath_create(send_ill, ncec);
5050 5312                  else
5051 5313                          under_nce = NULL;
5052 5314                  rw_exit(&ipst->ips_ill_g_lock);
5053 5315                  if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5054 5316                          nce_fastpath_trigger(under_nce);
5055 5317  
5056 5318                  ill_refrele(send_ill);
5057 5319                  if (src_ipif != NULL)
5058 5320                          ipif_refrele(src_ipif);
5059 5321  
5060 5322                  if (under_nce != NULL) {
5061 5323                          (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5062 5324                              ALL_ZONES, 0, NULL);
5063 5325                          nce_refrele(under_nce);
5064 5326                          if (nprobes > 0)
5065 5327                                  nprobes--;
5066 5328                          mp = nxt_mp;
5067 5329                          continue;
5068 5330                  }
5069 5331  drop_pkt:
5070 5332                  if (isv6) {
5071 5333                          BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5072 5334                  } else {
5073 5335                          BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5074 5336                  }
5075 5337                  ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5076 5338                  freemsg(mp);
5077 5339                  if (nprobes > 0)
5078 5340                          nprobes--;
5079 5341                  mp = nxt_mp;
5080 5342          }
5081 5343          ncec_cb_dispatch(ncec); /* complete callbacks */
5082 5344  }
  
    | 
      ↓ open down ↓ | 
    227 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX