10472 Limit number of multicast NCEs
Reviewed by: Cody Peter Mello <melloc@writev.io>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

Old usr/src/uts/common/inet/ip/ip_ndp.c (before this change):

   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * Copyright (c) 2018, Joyent, Inc.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/stropts.h>
  32 #include <sys/strsun.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/errno.h>
  35 #include <sys/dlpi.h>
  36 #include <sys/socket.h>
  37 #include <sys/ddi.h>
  38 #include <sys/sunddi.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/debug.h>
  41 #include <sys/vtrace.h>
  42 #include <sys/kmem.h>
  43 #include <sys/zone.h>
  44 #include <sys/ethernet.h>
  45 #include <sys/sdt.h>
  46 #include <sys/mac.h>


 110  * ncec_refcnt).
 111  */
 112 
 113 static  void    nce_cleanup_list(ncec_t *ncec);
 114 static  void    nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
 115 static  ncec_t  *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
 116     ncec_t *);
 117 static  nce_t   *nce_lookup_addr(ill_t *, const in6_addr_t *);
 118 static  int     nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
 119     uint16_t ncec_flags, nce_t **newnce);
 120 static  int     nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
 121     uint16_t ncec_flags, nce_t **newnce);
 122 static  boolean_t       ndp_xmit(ill_t *ill, uint32_t operation,
 123     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
 124     const in6_addr_t *target, int flag);
 125 static void     ncec_refhold_locked(ncec_t *);
 126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
 127 static  void    nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
 128 static  int     nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 129     uint16_t, uint16_t, nce_t **);
 130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
 131 static nce_t *nce_add(ill_t *, ncec_t *);
 132 static void nce_inactive(nce_t *);
 133 extern nce_t    *nce_lookup(ill_t *, const in6_addr_t *);
 134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
 135 static int      nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 136     uint16_t, uint16_t, nce_t **);
 137 static int      nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
 138     uint16_t, uint16_t, nce_t **);
 139 static int  nce_add_v6_postprocess(nce_t *);
 140 static int  nce_add_v4_postprocess(nce_t *);
 141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
 142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
 143 static void nce_resolv_ipmp_ok(ncec_t *);
 144 static void nce_walk_common(ill_t *, pfi_t, void *);
 145 static void nce_start_timer(ncec_t *, uint_t);
 146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
 147 static void nce_fastpath_trigger(nce_t *);
 148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
 149 
 150 #ifdef DEBUG
 151 static void     ncec_trace_cleanup(const ncec_t *);


 988 
 989         mutex_exit(&ndp->ndp_g_lock);
 990 
 991         if (free_nce_list != NULL) {
 992                 nce_cleanup_list(free_nce_list);
 993         }
 994 }
 995 
 996 /*
 997  * Walk everything.
 998  * Note that ill can be NULL hence can't derive the ipst from it.
 999  */
1000 void
1001 ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
1002 {
1003         ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1004         ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1005 }
1006 
1007 /*
1008  * For each interface an entry is added for the unspecified multicast group.
1009  * Here that mapping is used to form the multicast cache entry for a particular
1010  * multicast destination.
1011  */
1012 static int
1013 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1014     uint16_t flags, nce_t **newnce)
1015 {
1016         uchar_t         *hw_addr;
1017         int             err = 0;
1018         ip_stack_t      *ipst = ill->ill_ipst;
1019         nce_t           *nce;
1020 
1021         ASSERT(ill != NULL);
1022         ASSERT(ill->ill_isv6);
1023         ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1024 
1025         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1026         nce = nce_lookup_addr(ill, dst);
1027         if (nce != NULL) {


1033                  * For IRE_IF_RESOLVER a hardware mapping can be
1034                  * generated.
1035                  */
1036                 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1037                 if (hw_addr == NULL) {
1038                         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1039                         return (ENOMEM);
1040                 }
1041                 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1042         } else {
1043                 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1044                 hw_addr = NULL;
1045         }
1046         ASSERT((flags & NCE_F_MCAST) != 0);
1047         ASSERT((flags & NCE_F_NONUD) != 0);
1048         /* nce_state will be computed by nce_add_common() */
1049         err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1050             ND_UNCHANGED, &nce);
1051         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1052         if (err == 0)
1053                 err = nce_add_v6_postprocess(nce);
1054         if (hw_addr != NULL)
1055                 kmem_free(hw_addr, ill->ill_nd_lla_len);
1056         if (err != 0) {
1057                 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1058                 return (err);
1059         }
1060 done:
1061         ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1062         if (newnce != NULL)
1063                 *newnce = nce;
1064         else
1065                 nce_refrele(nce);
1066         return (0);
1067 }
1068 
1069 /*
1070  * Return the link layer address, and any flags of a ncec.
1071  */
1072 int
1073 ndp_query(ill_t *ill, struct lif_nd_req *lnr)


3083                 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3084                     ill->ill_sap_length);
3085         }
3086         mutex_exit(&ncec->ncec_lock);
3087         return (nce);
3088 }
3089 
3090 /*
3091  * we make nce_fp_mp to have an M_DATA prepend.
3092  * The caller ensures there is hold on ncec for this function.
3093  * Note that since ill_fastpath_probe() copies the mblk there is
3094  * no need to hold the nce or ncec beyond this function.
3095  *
3096  * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3097  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3098  * and will be returned back by this function, so that no extra nce_refrele
3099  * is required for the caller. The calls from nce_add_common() use this
3100  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3101  * nce_refrele of the returned nce (when it is non-null).
3102  */
3103 nce_t *
3104 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3105 {
3106         nce_t *nce;
3107         ill_t *ill = ncec->ncec_ill;
3108 
3109         ASSERT(ill != NULL);
3110 
3111         if (IS_IPMP(ill) && trigger_fp_req) {
3112                 trigger_fp_req = B_FALSE;
3113                 ipmp_ncec_refresh_nce(ncec);
3114         }
3115 
3116         /*
3117          * If the caller already has the nce corresponding to the ill, use
3118          * that one. Otherwise we have to lookup/add the nce. Calls from
3119          * nce_add_common() fall in the former category, and have just done
3120          * the nce lookup/add that can be reused.
3121          */
3122         if (ncec_nce == NULL)
3123                 nce = nce_fastpath_create(ill, ncec);


3141         int res;
3142         ill_t *ill = nce->nce_ill;
3143         ncec_t *ncec = nce->nce_common;
3144 
3145         res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3146         /*
3147          * EAGAIN is an indication of a transient error
3148          * i.e. allocation failure etc. leave the ncec in the list it
3149          * will be updated when another probe happens for another ire
3150          * if not it will be taken out of the list when the ire is
3151          * deleted.
3152          */
3153         if (res != 0 && res != EAGAIN && res != ENOTSUP)
3154                 nce_fastpath_list_delete(ill, ncec, NULL);
3155 }
3156 
3157 /*
3158  * Add ncec to the nce fastpath list on ill.
3159  */
3160 static nce_t *
3161 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3162 {
3163         nce_t *nce = NULL;
3164 
3165         ASSERT(MUTEX_HELD(&ill->ill_lock));
3166         /*
3167          * Atomically ensure that the ill is not CONDEMNED and is not going
3168          * down, before adding the NCE.
3169          */
3170         if (ill->ill_state_flags & ILL_CONDEMNED)
3171                 return (NULL);
3172         mutex_enter(&ncec->ncec_lock);
3173         /*
3174          * if ncec has not been deleted and
3175          * is not already in the list add it.
3176          */
3177         if (!NCE_ISCONDEMNED(ncec)) {
3178                 nce = nce_lookup(ill, &ncec->ncec_addr);
3179                 if (nce != NULL)
3180                         goto done;
3181                 nce = nce_add(ill, ncec);
3182         }
3183 done:
3184         mutex_exit(&ncec->ncec_lock);
3185         return (nce);
3186 }
3187 
3188 nce_t *
3189 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3190 {
3191         nce_t *nce;

3192 

3193         mutex_enter(&ill->ill_lock);
3194         nce = nce_ill_lookup_then_add_locked(ill, ncec);
3195         mutex_exit(&ill->ill_lock);

3196         return (nce);
3197 }
3198 
3199 
3200 /*
3201  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3202  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3203  * entry after all locks have been dropped.
3204  */
3205 void
3206 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3207 {
3208         nce_t *nce;
3209 
3210         ASSERT(ill != NULL);
3211 
3212         /* delete any nces referencing the ncec from underlying ills */
3213         if (IS_IPMP(ill))
3214                 ipmp_ncec_delete_nce(ncec);
3215 


3226         mutex_exit(&ill->ill_lock);
3227         if (nce != NULL) {
3228                 if (dead == NULL)
3229                         nce_refrele(nce);
3230                 else
3231                         list_insert_tail(dead, nce);
3232         }
3233 }
3234 
3235 /*
3236  * when the fastpath response does not fit in the datab
3237  * associated with the existing nce_fp_mp, we delete and
3238  * add the nce to retrigger fastpath based on the information
3239  * in the ncec_t.
3240  */
3241 static nce_t *
3242 nce_delete_then_add(nce_t *nce)
3243 {
3244         ill_t           *ill = nce->nce_ill;
3245         nce_t           *newnce = NULL;

3246 

3247         ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3248             (void *)nce, ill->ill_name));
3249         mutex_enter(&ill->ill_lock);
3250         mutex_enter(&nce->nce_common->ncec_lock);
3251         nce_delete(nce);
3252         /*
3253          * Make sure that ncec is not condemned before adding. We hold the
3254          * ill_lock and ncec_lock to synchronize with ncec_delete() and
3255          * ipmp_ncec_delete_nce()
3256          */
3257         if (!NCE_ISCONDEMNED(nce->nce_common))
3258                 newnce = nce_add(ill, nce->nce_common);
3259         mutex_exit(&nce->nce_common->ncec_lock);
3260         mutex_exit(&ill->ill_lock);

3261         nce_refrele(nce);
3262         return (newnce); /* could be null if nomem */
3263 }
3264 
3265 typedef struct nce_fp_match_s {
3266         nce_t   *nce_fp_match_res;
3267         mblk_t  *nce_fp_match_ack_mp;
3268 } nce_fp_match_t;
3269 
3270 /* ARGSUSED */
3271 static int
3272 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3273 {
3274         nce_fp_match_t  *nce_fp_marg = arg;
3275         ncec_t          *ncec = nce->nce_common;
3276         mblk_t          *mp = nce_fp_marg->nce_fp_match_ack_mp;
3277         uchar_t *mp_rptr, *ud_mp_rptr;
3278         mblk_t          *ud_mp = nce->nce_dlur_mp;
3279         ptrdiff_t       cmplen;
3280 


3955                 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3956                 if (hw_addr == NULL) {
3957                         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3958                         return (ENOMEM);
3959                 }
3960                 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3961         } else {
3962                 /*
3963                  * IRE_IF_NORESOLVER type simply copies the resolution
3964                  * cookie passed in.  So no hw_addr is needed.
3965                  */
3966                 hw_addr = NULL;
3967         }
3968         ASSERT(flags & NCE_F_MCAST);
3969         ASSERT(flags & NCE_F_NONUD);
3970         /* nce_state will be computed by nce_add_common() */
3971         err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3972             ND_UNCHANGED, &nce);
3973         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3974         if (err == 0)
3975                 err = nce_add_v4_postprocess(nce);
3976         if (hw_addr != NULL)
3977                 kmem_free(hw_addr, ill->ill_phys_addr_length);
3978         if (err != 0) {
3979                 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3980                 return (err);
3981         }
3982 done:
3983         if (newnce != NULL)
3984                 *newnce = nce;
3985         else
3986                 nce_refrele(nce);
3987         return (0);
3988 }
3989 
3990 /*
3991  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
3992  * don't want to have to walk the list for every single one, so we gather up
3993  * batches at a time.
3994  */
3995 #define NCE_RESCHED_LIST_LEN    8


4379  *
4380  * When the caller passes in an nce_state of ND_UNCHANGED,
4381  * nce_add_common() will determine the state of the created nce based
4382  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4383  * be created with state set to the passed in nce_state.
4384  */
4385 static int
4386 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4387     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4388 {
4389         static  ncec_t          nce_nil;
4390         uchar_t                 *template = NULL;
4391         int                     err;
4392         ncec_t                  *ncec;
4393         ncec_t                  **ncep;
4394         ip_stack_t              *ipst = ill->ill_ipst;
4395         uint16_t                state;
4396         boolean_t               fastprobe = B_FALSE;
4397         struct ndp_g_s          *ndp;
4398         nce_t                   *nce = NULL;

4399         mblk_t                  *dlur_mp = NULL;
4400 
4401         if (ill->ill_isv6)
4402                 ndp = ill->ill_ipst->ips_ndp6;
4403         else
4404                 ndp = ill->ill_ipst->ips_ndp4;
4405 
4406         *retnce = NULL;
4407 
4408         ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4409 
4410         if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4411                 ip0dbg(("nce_add_common: no addr\n"));
4412                 return (EINVAL);
4413         }
4414         if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4415                 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4416                 return (EINVAL);
4417         }
4418 


4669         }
4670         /*
4671          * Acquire the ncec_lock even before adding the ncec to the list
4672          * so that it cannot get deleted after the ncec is added, but
4673          * before we add the nce.
4674          */
4675         mutex_enter(&ncec->ncec_lock);
4676         if ((ncec->ncec_next = *ncep) != NULL)
4677                 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4678         *ncep = ncec;
4679         ncec->ncec_ptpn = ncep;
4680 
4681         /* Bump up the number of ncec's referencing this ill */
4682         DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4683             (char *), "ncec", (void *), ncec);
4684         ill->ill_ncec_cnt++;
4685         /*
4686          * Since we hold the ncec_lock at this time, the ncec cannot be
4687          * condemned, and we can safely add the nce.
4688          */
4689         *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);

4690         mutex_exit(&ncec->ncec_lock);
4691         mutex_exit(&ill->ill_lock);

4692 
4693         /* caller must trigger fastpath on *retnce */
4694         return (0);
4695 
4696 err_ret:
4697         if (ncec != NULL)
4698                 kmem_cache_free(ncec_cache, ncec);
4699         if (nce != NULL)
4700                 kmem_cache_free(nce_cache, nce);
4701         freemsg(dlur_mp);
4702         if (template != NULL)
4703                 kmem_free(template, ill->ill_phys_addr_length);
4704         return (err);
4705 }
4706 
4707 /*
4708  * take a ref on the nce
4709  */
4710 void
4711 nce_refhold(nce_t *nce)


4757         ill->ill_nce_cnt--;
4758         nce->nce_ill = NULL;
4759         /*
4760          * If the number of ncec's associated with this ill have dropped
4761          * to zero, check whether we need to restart any operation that
4762          * is waiting for this to happen.
4763          */
4764         if (ILL_DOWN_OK(ill)) {
4765                 /* ipif_ill_refrele_tail drops the ill_lock */
4766                 ipif_ill_refrele_tail(ill);
4767         } else {
4768                 mutex_exit(&ill->ill_lock);
4769         }
4770 
4771         mutex_destroy(&nce->nce_lock);
4772         kmem_cache_free(nce_cache, nce);
4773 }
4774 
4775 /*
4776  * Add an nce to the ill_nce list.
4777  */
4778 static nce_t *
4779 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)

4780 {
4781         bzero(nce, sizeof (*nce));
4782         mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4783         nce->nce_common = ncec;
4784         nce->nce_addr = ncec->ncec_addr;
4785         nce->nce_ill = ill;
4786         DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4787             (char *), "nce", (void *), nce);
4788         ill->ill_nce_cnt++;
4789 
4790         nce->nce_refcnt = 1; /* for the thread */
4791         ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4792         nce->nce_dlur_mp = dlur_mp;
4793 
4794         /* add nce to the ill's fastpath list.  */
4795         nce->nce_refcnt++; /* for the list */
4796         list_insert_head(&ill->ill_nce, nce);
4797         return (nce);
4798 }
4799 
4800 static nce_t *
4801 nce_add(ill_t *ill, ncec_t *ncec)
4802 {
4803         nce_t   *nce;
4804         mblk_t  *dlur_mp = NULL;
4805 
4806         ASSERT(MUTEX_HELD(&ill->ill_lock));
4807         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4808 
4809         nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4810         if (nce == NULL)
4811                 return (NULL);
4812         if (ncec->ncec_lladdr != NULL ||
4813             ill->ill_net_type == IRE_IF_NORESOLVER) {
4814                 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4815                     ill->ill_phys_addr_length, ill->ill_sap,
4816                     ill->ill_sap_length);
4817                 if (dlur_mp == NULL) {
4818                         kmem_cache_free(nce_cache, nce);
4819                         return (NULL);
4820                 }
4821         }
4822         return (nce_add_impl(ill, ncec, nce, dlur_mp));
4823 }
4824 
4825 /*
4826  * remove the nce from the ill_fastpath list
4827  */
4828 void
4829 nce_delete(nce_t *nce)
4830 {
4831         ill_t   *ill = nce->nce_ill;
4832 
4833         ASSERT(MUTEX_HELD(&ill->ill_lock));
4834 
4835         mutex_enter(&nce->nce_lock);
4836         if (nce->nce_is_condemned) {
4837                 /*
4838                  * some other thread has removed this nce from the ill_nce list
4839                  */
4840                 mutex_exit(&nce->nce_lock);
4841                 return;
4842         }
4843         nce->nce_is_condemned = B_TRUE;
4844         mutex_exit(&nce->nce_lock);
4845 
4846         list_remove(&ill->ill_nce, nce);
4847         /*
4848          * even though we are holding the ill_lock, it is ok to
4849          * call nce_refrele here because we know that we should have
4850          * at least 2 refs on the nce: one for the thread, and one
4851          * for the list. The refrele below will release the one for
4852          * the list.
4853          */
4854         nce_refrele(nce);
4855 }
4856 
4857 nce_t *
4858 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4859 {
4860         nce_t *nce = NULL;
4861 
4862         ASSERT(ill != NULL);
4863         ASSERT(MUTEX_HELD(&ill->ill_lock));
4864 
4865         for (nce = list_head(&ill->ill_nce); nce != NULL;


New usr/src/uts/common/inet/ip/ip_ndp.c (with this change applied):

   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * Copyright (c) 2019, Joyent, Inc.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/stropts.h>
  32 #include <sys/strsun.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/errno.h>
  35 #include <sys/dlpi.h>
  36 #include <sys/socket.h>
  37 #include <sys/ddi.h>
  38 #include <sys/sunddi.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/debug.h>
  41 #include <sys/vtrace.h>
  42 #include <sys/kmem.h>
  43 #include <sys/zone.h>
  44 #include <sys/ethernet.h>
  45 #include <sys/sdt.h>
  46 #include <sys/mac.h>


 110  * ncec_refcnt).
 111  */
 112 
 113 static  void    nce_cleanup_list(ncec_t *ncec);
 114 static  void    nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
 115 static  ncec_t  *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
 116     ncec_t *);
 117 static  nce_t   *nce_lookup_addr(ill_t *, const in6_addr_t *);
 118 static  int     nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
 119     uint16_t ncec_flags, nce_t **newnce);
 120 static  int     nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
 121     uint16_t ncec_flags, nce_t **newnce);
 122 static  boolean_t       ndp_xmit(ill_t *ill, uint32_t operation,
 123     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
 124     const in6_addr_t *target, int flag);
 125 static void     ncec_refhold_locked(ncec_t *);
 126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
 127 static  void    nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
 128 static  int     nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 129     uint16_t, uint16_t, nce_t **);
 130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
 131 static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
 132 static void nce_inactive(nce_t *);
 133 extern nce_t    *nce_lookup(ill_t *, const in6_addr_t *);
 134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
 135 static int      nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 136     uint16_t, uint16_t, nce_t **);
 137 static int      nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
 138     uint16_t, uint16_t, nce_t **);
 139 static int  nce_add_v6_postprocess(nce_t *);
 140 static int  nce_add_v4_postprocess(nce_t *);
 141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
 142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
 143 static void nce_resolv_ipmp_ok(ncec_t *);
 144 static void nce_walk_common(ill_t *, pfi_t, void *);
 145 static void nce_start_timer(ncec_t *, uint_t);
 146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
 147 static void nce_fastpath_trigger(nce_t *);
 148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
 149 
 150 #ifdef DEBUG
 151 static void     ncec_trace_cleanup(const ncec_t *);


 988 
 989         mutex_exit(&ndp->ndp_g_lock);
 990 
 991         if (free_nce_list != NULL) {
 992                 nce_cleanup_list(free_nce_list);
 993         }
 994 }
 995 
 996 /*
 997  * Walk everything.
 998  * Note that ill can be NULL hence can't derive the ipst from it.
 999  */
1000 void
1001 ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
1002 {
1003         ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1004         ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1005 }
1006 
1007 /*
1008  * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast
1009  * NCEs, and the number to reclaim if we hit the limit.  Used by
1010  * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
1011  * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
1012  */
1013 
1014 /* Maximum number of multicast NCEs on an ill. */
1015 uint_t ip_max_ill_mcast_nces = 16384;
1016 /*
1017  * Number of NCEs to delete if we hit the maximum above.  0 means *don't* and
1018  * return an error.  Non-zero means delete so many, and if the number is >=
1019  * the max above, that means delete them all.
1020  */
1021 uint_t ip_ill_mcast_reclaim = 256;
1022 
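Both tunables are plain globals in the ip module rather than ipadm/ndd properties, so
(assuming they stay that way) they can be adjusted the usual way, e.g. with a
hypothetical /etc/system entry to double the per-ill cap:

	set ip:ip_max_ill_mcast_nces = 32768

or by patching the live values with mdb -kw.  nce_too_many_mcast() below shows how
the two values interact.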
1023 /*
1024  * Encapsulate multicast ill capping in a function, for easier DTrace
1025  * detections.  Return a list of refheld NCEs to destroy-via-refrele.  That
1026  * list can be NULL, but can only be non-NULL if we successfully reclaimed.
1027  *
1028  * NOTE:  This function must be called while holding the ill_lock AND
1029  * JUST PRIOR to making the insertion into the ill_nce list.
1030  *
1031  * We can't release the ones we delete ourselves because the ill_lock is held
1032  * by the caller. They are, instead, passed back in a list_t for deletion
1033  * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
1034  *
1035  * While this covers nce_t, ncec_t gets done even further down the road.  See
1036  * nce_graveyard_free() for why.
1037  */
1038 static boolean_t
1039 nce_too_many_mcast(ill_t *ill, list_t *graveyard)
1040 {
1041         uint_t reclaim_count, max_count, reclaimed = 0;
1042         boolean_t too_many;
1043         nce_t *nce, *deadman;
1044 
1045         ASSERT(graveyard != NULL);
1046         ASSERT(list_is_empty(graveyard));
1047         ASSERT(MUTEX_HELD(&ill->ill_lock));
1048 
1049         /*
1050          * NOTE: Some grinning weirdo may have lowered the global max beyond
1051          * what this ill currently has.  The behavior in this case will be
1052          * trim-back just by the reclaim amount for any new ones.
1053          */
1054         max_count = ip_max_ill_mcast_nces;
1055         reclaim_count = min(ip_ill_mcast_reclaim, max_count);
1056 
1057         /* All good? */
1058         if (ill->ill_mcast_nces < max_count)
1059                 return (B_FALSE);       /* Yes, all good. */
1060 
1061         if (reclaim_count == 0)
1062                 return (B_TRUE);        /* Don't bother - we're stuck. */
1063 
1064         /* We need to reclaim now.  Exploit our held ill_lock. */
1065 
1066         /*
1067          * Start at the tail and work backwards, new nces are head-inserted,
1068          * so we'll be reaping the oldest entries.
1069          */
1070         nce = list_tail(&ill->ill_nce);
1071         while (reclaimed < reclaim_count) {
1072                 /* Skip ahead to a multicast NCE. */
1073                 while (nce != NULL &&
1074                     (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
1075                         nce = list_prev(&ill->ill_nce, nce);
1076                 }
1077                 if (nce == NULL)
1078                         break;
1079 
1080                 /*
1081                  * NOTE: For now, we just delete the first one(s) we find.
1082                  * This is not optimal, and may require some inspection of nce
1083                  * & its ncec to be better.
1084                  */
1085                 deadman = nce;
1086                 nce = list_prev(&ill->ill_nce, nce);
1087 
1088                 /* nce_delete() requires caller holds... */
1089                 nce_refhold(deadman);
1090                 nce_delete(deadman);    /* Bumps down ill_mcast_nces. */
1091 
1092                 /* Link the dead ones singly, still refheld... */
1093                 list_insert_tail(graveyard, deadman);
1094                 reclaimed++;
1095         }
1096 
1097         if (reclaimed != reclaim_count) {
1098                 /* We didn't have enough to reach reclaim_count. Why?!? */
1099                 DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
1100                     uint_t, reclaimed, uint_t, reclaim_count);
1101 
1102                 /* In case for some REALLY weird reason we found none! */
1103                 too_many = (reclaimed == 0);
1104         } else {
1105                 too_many = B_FALSE;
1106         }
1107 
1108         return (too_many);
1109 }
1110 
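To make the defaults concrete: an insertion that finds ill_mcast_nces at 16384 uses
reclaim_count = min(256, 16384) = 256, walks ill_nce from the tail, moves up to 256
of the oldest multicast nces onto the caller's graveyard, and then lets the new entry
in.  Only when ip_ill_mcast_reclaim is 0, or no reclaimable multicast nce is found at
all, does it return B_TRUE and the insertion fails.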
1111 static void
1112 ncec_mcast_reap_one(ncec_t *ncec, void *arg)
1113 {
1114         boolean_t reapit;
1115         ill_t *ill = (ill_t *)arg;
1116 
1117         /* Obvious no-lock-needed checks... */
1118         if (ncec == NULL || ncec->ncec_ill != ill ||
1119             (ncec->ncec_flags & NCE_F_MCAST) == 0)
1120                 return;
1121 
1122         mutex_enter(&ncec->ncec_lock);
1123         /*
1124          * It's refheld by the walk infrastructure. It has one reference for
1125          * being in the ndp_g_hash, and if an nce_t exists, that's one more.
1126          * We want ones without an nce_t, so 2 is the magic number.  If it's
1127          * LESS than 2, we have much bigger problems anyway.
1128          */
1129         ASSERT(ncec->ncec_refcnt >= 2);
1130         reapit = (ncec->ncec_refcnt == 2);
1131         mutex_exit(&ncec->ncec_lock);
1132 
1133         if (reapit) {
1134                 IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
1135                 ncec_delete(ncec);
1136         }
1137 }
1138 
1139 /*
1140  * Attempt to reap stray multicast ncec_t structures left in the wake of
1141  * nce_graveyard_free(). This is a taskq servicing routine, as it's well
1142  * outside any netstack-global locks being held - ndp_g_lock in this case.  We
1143  * have a reference hold on the ill, which will prevent any unplumbing races.
1144  */
1145 static void
1146 ncec_mcast_reap(void *arg)
1147 {
1148         ill_t *ill = (ill_t *)arg;
1149 
1150         IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
1151         ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
1152         mutex_enter(&ill->ill_lock);
1153         ill->ill_mcast_ncec_cleanup = B_FALSE;
1154         /*
1155          * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
1156          * below for why.
1157          */
1158         ill->ill_refcnt--;
1159         if (ill->ill_refcnt == 0)
1160                 ipif_ill_refrele_tail(ill);     /* Drops ill_lock. */
1161         else
1162                 mutex_exit(&ill->ill_lock);
1163 }
1164 
1165 /*
1166  * Free a list (including handling an empty list or NULL list) of
1167  * reference-held NCEs that were reaped from a nce_too_many_mcast()
1168  * call. Separate because the caller must have dropped ndp_g_lock first.
1169  *
1170  * This also schedules a taskq task to unlink underlying NCECs from the
1171  * ndp_g_hash, which are protected by ndp_g_lock.
1172  */
1173 static void
1174 nce_graveyard_free(list_t *graveyard)
1175 {
1176         nce_t *deadman, *current;
1177         ill_t *ill;
1178         boolean_t doit;
1179 
1180         if (graveyard == NULL)
1181                 return;
1182 
1183         current = list_head(graveyard);
1184         if (current == NULL) {
1185                 list_destroy(graveyard);
1186                 return;
1187         }
1188 
1189         ill = current->nce_ill;
1190         /*
1191          * Normally one should ill_refhold(ill) here.  There's no _notr()
1192          * variant like there is for ire_t, dce_t, or even ncec_t, but this is
1193          * the ONLY case that'll break the mh_trace that IP debugging uses for
1194          * reference counts (i.e. they assume same thread releases as
1195          * holds). Instead, we inline ill_refhold() here.  We must do the same
1196          * in the release done by the ncec_mcast_reap() above.
1197          */
1198         mutex_enter(&ill->ill_lock);
1199         ill->ill_refcnt++;
1200         mutex_exit(&ill->ill_lock);
1201 
1202         while (current != NULL) {
1203                 ASSERT3P(ill, ==, current->nce_ill);
1204                 deadman = current;
1205                 current = list_next(graveyard, deadman);
1206                 list_remove(graveyard, deadman);
1207                 ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
1208                     0);
1209                 nce_refrele(deadman);
1210         }
1211         list_destroy(graveyard);
1212 
1213         mutex_enter(&ill->ill_lock);
1214         if (ill->ill_mcast_ncec_cleanup)
1215                 doit = B_FALSE;
1216         else {
1217                 ill->ill_mcast_ncec_cleanup = B_TRUE;
1218                 doit = B_TRUE;
1219         }
1220         mutex_exit(&ill->ill_lock);
1221         if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
1222             ill, TQ_NOSLEEP) == NULL) {
1223                 mutex_enter(&ill->ill_lock);
1224                 if (doit) {
1225                         IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
1226                         ill->ill_mcast_ncec_cleanup = B_FALSE;
1227                 }
1228                 /* There's no _notr() for ill_refrele(), so inline it here. */
1229                 ill->ill_refcnt--;
1230                 if (ill->ill_refcnt == 0)
1231                         ipif_ill_refrele_tail(ill);     /* Drops ill_lock */
1232                 else
1233                         mutex_exit(&ill->ill_lock);
1234         }
1235 }
1236 
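Putting the pieces together, a caller stages the graveyard on its stack, does the
lookup/insert under ill_lock, and only frees what was reaped after dropping the lock.
A minimal sketch of that pattern (it is essentially what the reworked
nce_ill_lookup_then_add() further down does; ill and ncec are assumed held by the
caller):

	list_t	graveyard;
	nce_t	*nce;

	list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
	mutex_enter(&ill->ill_lock);
	/* may reap old multicast nces onto the graveyard */
	nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
	mutex_exit(&ill->ill_lock);
	/* refrele the reaped nces; schedules ncec_mcast_reap() if needed */
	nce_graveyard_free(&graveyard);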
1237 /*
1238  * For each interface an entry is added for the unspecified multicast group.
1239  * Here that mapping is used to form the multicast cache entry for a particular
1240  * multicast destination.
1241  */
1242 static int
1243 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1244     uint16_t flags, nce_t **newnce)
1245 {
1246         uchar_t         *hw_addr;
1247         int             err = 0;
1248         ip_stack_t      *ipst = ill->ill_ipst;
1249         nce_t           *nce;
1250 
1251         ASSERT(ill != NULL);
1252         ASSERT(ill->ill_isv6);
1253         ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1254 
1255         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1256         nce = nce_lookup_addr(ill, dst);
1257         if (nce != NULL) {


1263                  * For IRE_IF_RESOLVER a hardware mapping can be
1264                  * generated.
1265                  */
1266                 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1267                 if (hw_addr == NULL) {
1268                         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1269                         return (ENOMEM);
1270                 }
1271                 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1272         } else {
1273                 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1274                 hw_addr = NULL;
1275         }
1276         ASSERT((flags & NCE_F_MCAST) != 0);
1277         ASSERT((flags & NCE_F_NONUD) != 0);
1278         /* nce_state will be computed by nce_add_common() */
1279         err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1280             ND_UNCHANGED, &nce);
1281         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1282         if (err == 0)
1283                 err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
1284         if (hw_addr != NULL)
1285                 kmem_free(hw_addr, ill->ill_nd_lla_len);
1286         if (err != 0) {
1287                 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1288                 return (err);
1289         }
1290 done:
1291         ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1292         if (newnce != NULL)
1293                 *newnce = nce;
1294         else
1295                 nce_refrele(nce);
1296         return (0);
1297 }
1298 
1299 /*
1300  * Return the link layer address, and any flags of a ncec.
1301  */
1302 int
1303 ndp_query(ill_t *ill, struct lif_nd_req *lnr)


3313                 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3314                     ill->ill_sap_length);
3315         }
3316         mutex_exit(&ncec->ncec_lock);
3317         return (nce);
3318 }
3319 
3320 /*
3321  * we make nce_fp_mp to have an M_DATA prepend.
3322  * The caller ensures there is hold on ncec for this function.
3323  * Note that since ill_fastpath_probe() copies the mblk there is
3324  * no need to hold the nce or ncec beyond this function.
3325  *
3326  * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3327  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3328  * and will be returned back by this function, so that no extra nce_refrele
3329  * is required for the caller. The calls from nce_add_common() use this
3330  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3331  * nce_refrele of the returned nce (when it is non-null).
3332  */
3333 static nce_t *
3334 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3335 {
3336         nce_t *nce;
3337         ill_t *ill = ncec->ncec_ill;
3338 
3339         ASSERT(ill != NULL);
3340 
3341         if (IS_IPMP(ill) && trigger_fp_req) {
3342                 trigger_fp_req = B_FALSE;
3343                 ipmp_ncec_refresh_nce(ncec);
3344         }
3345 
3346         /*
3347          * If the caller already has the nce corresponding to the ill, use
3348          * that one. Otherwise we have to lookup/add the nce. Calls from
3349          * nce_add_common() fall in the former category, and have just done
3350          * the nce lookup/add that can be reused.
3351          */
3352         if (ncec_nce == NULL)
3353                 nce = nce_fastpath_create(ill, ncec);


3371         int res;
3372         ill_t *ill = nce->nce_ill;
3373         ncec_t *ncec = nce->nce_common;
3374 
3375         res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3376         /*
3377          * EAGAIN is an indication of a transient error
3378          * i.e. allocation failure etc. leave the ncec in the list it
3379          * will be updated when another probe happens for another ire
3380          * if not it will be taken out of the list when the ire is
3381          * deleted.
3382          */
3383         if (res != 0 && res != EAGAIN && res != ENOTSUP)
3384                 nce_fastpath_list_delete(ill, ncec, NULL);
3385 }
3386 
3387 /*
3388  * Add ncec to the nce fastpath list on ill.
3389  */
3390 static nce_t *
3391 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
3392 {
3393         nce_t *nce = NULL;
3394 
3395         ASSERT(MUTEX_HELD(&ill->ill_lock));
3396         /*
3397          * Atomically ensure that the ill is not CONDEMNED and is not going
3398          * down, before adding the NCE.
3399          */
3400         if (ill->ill_state_flags & ILL_CONDEMNED)
3401                 return (NULL);
3402         mutex_enter(&ncec->ncec_lock);
3403         /*
3404          * if ncec has not been deleted and
3405          * is not already in the list add it.
3406          */
3407         if (!NCE_ISCONDEMNED(ncec)) {
3408                 nce = nce_lookup(ill, &ncec->ncec_addr);
3409                 if (nce != NULL)
3410                         goto done;
3411                 nce = nce_add(ill, ncec, graveyard);
3412         }
3413 done:
3414         mutex_exit(&ncec->ncec_lock);
3415         return (nce);
3416 }
3417 
3418 static nce_t *
3419 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3420 {
3421         nce_t *nce;
3422         list_t graveyard;
3423 
3424         list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3425         mutex_enter(&ill->ill_lock);
3426         nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
3427         mutex_exit(&ill->ill_lock);
3428         nce_graveyard_free(&graveyard);
3429         return (nce);
3430 }
3431 
3432 
3433 /*
3434  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3435  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3436  * entry after all locks have been dropped.
3437  */
3438 void
3439 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3440 {
3441         nce_t *nce;
3442 
3443         ASSERT(ill != NULL);
3444 
3445         /* delete any nces referencing the ncec from underlying ills */
3446         if (IS_IPMP(ill))
3447                 ipmp_ncec_delete_nce(ncec);
3448 


3459         mutex_exit(&ill->ill_lock);
3460         if (nce != NULL) {
3461                 if (dead == NULL)
3462                         nce_refrele(nce);
3463                 else
3464                         list_insert_tail(dead, nce);
3465         }
3466 }
3467 
3468 /*
3469  * when the fastpath response does not fit in the datab
3470  * associated with the existing nce_fp_mp, we delete and
3471  * add the nce to retrigger fastpath based on the information
3472  * in the ncec_t.
3473  */
3474 static nce_t *
3475 nce_delete_then_add(nce_t *nce)
3476 {
3477         ill_t           *ill = nce->nce_ill;
3478         nce_t           *newnce = NULL;
3479         list_t          graveyard;
3480 
3481         list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3482         ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3483             (void *)nce, ill->ill_name));
3484         mutex_enter(&ill->ill_lock);
3485         mutex_enter(&nce->nce_common->ncec_lock);
3486         nce_delete(nce);
3487         /*
3488          * Make sure that ncec is not condemned before adding. We hold the
3489          * ill_lock and ncec_lock to synchronize with ncec_delete() and
3490          * ipmp_ncec_delete_nce()
3491          */
3492         if (!NCE_ISCONDEMNED(nce->nce_common))
3493                 newnce = nce_add(ill, nce->nce_common, &graveyard);
3494         mutex_exit(&nce->nce_common->ncec_lock);
3495         mutex_exit(&ill->ill_lock);
3496         nce_graveyard_free(&graveyard);
3497         nce_refrele(nce);
3498         return (newnce); /* could be null if nomem */
3499 }
3500 
3501 typedef struct nce_fp_match_s {
3502         nce_t   *nce_fp_match_res;
3503         mblk_t  *nce_fp_match_ack_mp;
3504 } nce_fp_match_t;
3505 
3506 /* ARGSUSED */
3507 static int
3508 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3509 {
3510         nce_fp_match_t  *nce_fp_marg = arg;
3511         ncec_t          *ncec = nce->nce_common;
3512         mblk_t          *mp = nce_fp_marg->nce_fp_match_ack_mp;
3513         uchar_t *mp_rptr, *ud_mp_rptr;
3514         mblk_t          *ud_mp = nce->nce_dlur_mp;
3515         ptrdiff_t       cmplen;
3516 


4191                 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
4192                 if (hw_addr == NULL) {
4193                         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4194                         return (ENOMEM);
4195                 }
4196                 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
4197         } else {
4198                 /*
4199                  * IRE_IF_NORESOLVER type simply copies the resolution
4200                  * cookie passed in.  So no hw_addr is needed.
4201                  */
4202                 hw_addr = NULL;
4203         }
4204         ASSERT(flags & NCE_F_MCAST);
4205         ASSERT(flags & NCE_F_NONUD);
4206         /* nce_state will be computed by nce_add_common() */
4207         err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
4208             ND_UNCHANGED, &nce);
4209         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4210         if (err == 0)
4211                 err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
4212         if (hw_addr != NULL)
4213                 kmem_free(hw_addr, ill->ill_phys_addr_length);
4214         if (err != 0) {
4215                 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
4216                 return (err);
4217         }
4218 done:
4219         if (newnce != NULL)
4220                 *newnce = nce;
4221         else
4222                 nce_refrele(nce);
4223         return (0);
4224 }
4225 
4226 /*
4227  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
4228  * don't want to have to walk the list for every single one, so we gather up
4229  * batches at a time.
4230  */
4231 #define NCE_RESCHED_LIST_LEN    8


4615  *
4616  * When the caller passes in an nce_state of ND_UNCHANGED,
4617  * nce_add_common() will determine the state of the created nce based
4618  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4619  * be created with state set to the passed in nce_state.
4620  */
4621 static int
4622 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4623     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4624 {
4625         static  ncec_t          nce_nil;
4626         uchar_t                 *template = NULL;
4627         int                     err;
4628         ncec_t                  *ncec;
4629         ncec_t                  **ncep;
4630         ip_stack_t              *ipst = ill->ill_ipst;
4631         uint16_t                state;
4632         boolean_t               fastprobe = B_FALSE;
4633         struct ndp_g_s          *ndp;
4634         nce_t                   *nce = NULL;
4635         list_t                  graveyard;
4636         mblk_t                  *dlur_mp = NULL;
4637 
4638         if (ill->ill_isv6)
4639                 ndp = ill->ill_ipst->ips_ndp6;
4640         else
4641                 ndp = ill->ill_ipst->ips_ndp4;
4642 
4643         *retnce = NULL;
4644 
4645         ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4646 
4647         if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4648                 ip0dbg(("nce_add_common: no addr\n"));
4649                 return (EINVAL);
4650         }
4651         if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4652                 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4653                 return (EINVAL);
4654         }
4655 


4906         }
4907         /*
4908          * Acquire the ncec_lock even before adding the ncec to the list
4909          * so that it cannot get deleted after the ncec is added, but
4910          * before we add the nce.
4911          */
4912         mutex_enter(&ncec->ncec_lock);
4913         if ((ncec->ncec_next = *ncep) != NULL)
4914                 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4915         *ncep = ncec;
4916         ncec->ncec_ptpn = ncep;
4917 
4918         /* Bump up the number of ncec's referencing this ill */
4919         DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4920             (char *), "ncec", (void *), ncec);
4921         ill->ill_ncec_cnt++;
4922         /*
4923          * Since we hold the ncec_lock at this time, the ncec cannot be
4924          * condemned, and we can safely add the nce.
4925          */
4926         list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
4927         *retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
4928         mutex_exit(&ncec->ncec_lock);
4929         mutex_exit(&ill->ill_lock);
4930         nce_graveyard_free(&graveyard);
4931 
4932         /* caller must trigger fastpath on *retnce */
4933         return (0);
4934 
4935 err_ret:
4936         if (ncec != NULL)
4937                 kmem_cache_free(ncec_cache, ncec);
4938         if (nce != NULL)
4939                 kmem_cache_free(nce_cache, nce);
4940         freemsg(dlur_mp);
4941         if (template != NULL)
4942                 kmem_free(template, ill->ill_phys_addr_length);
4943         return (err);
4944 }
4945 
4946 /*
4947  * take a ref on the nce
4948  */
4949 void
4950 nce_refhold(nce_t *nce)


4996         ill->ill_nce_cnt--;
4997         nce->nce_ill = NULL;
4998         /*
4999          * If the number of ncec's associated with this ill have dropped
5000          * to zero, check whether we need to restart any operation that
5001          * is waiting for this to happen.
5002          */
5003         if (ILL_DOWN_OK(ill)) {
5004                 /* ipif_ill_refrele_tail drops the ill_lock */
5005                 ipif_ill_refrele_tail(ill);
5006         } else {
5007                 mutex_exit(&ill->ill_lock);
5008         }
5009 
5010         mutex_destroy(&nce->nce_lock);
5011         kmem_cache_free(nce_cache, nce);
5012 }
5013 
5014 /*
5015  * Add an nce to the ill_nce list.
5016  *
5017  * Adding multicast NCEs is subject to a per-ill limit. When the limit is hit,
5018  * this function reaps a number of old multicast nces, or returns NULL if it cannot.
5019  * Callers (and upstack) must be able to cope with NULL returns.
5020  */
5021 static nce_t *
5022 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
5023     list_t *graveyard)
5024 {
5025         ASSERT(MUTEX_HELD(&ill->ill_lock));
5026 
5027         if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
5028                 if (nce_too_many_mcast(ill, graveyard)) {
5029                         kmem_cache_free(nce_cache, nce);
5030                         return (NULL);
5031                 }
5032                 ill->ill_mcast_nces++;
5033         }
5034 
5035         bzero(nce, sizeof (*nce));
5036         mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
5037         nce->nce_common = ncec;
5038         nce->nce_addr = ncec->ncec_addr;
5039         nce->nce_ill = ill;
5040         DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
5041             (char *), "nce", (void *), nce);
5042         ill->ill_nce_cnt++;
5043 
5044         nce->nce_refcnt = 1; /* for the thread */
5045         ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
5046         nce->nce_dlur_mp = dlur_mp;
5047 
5048         /* add nce to the ill's fastpath list.  */
5049         nce->nce_refcnt++; /* for the list */
5050         list_insert_head(&ill->ill_nce, nce);
5051         return (nce);
5052 }
5053 
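A NULL return from nce_add_impl() surfaces upstack as ENOMEM; the
nce_set_multicast_v[46]() hunks earlier in this change show how callers cope with it,
e.g. in the v6 path:

	err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;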
5054 static nce_t *
5055 nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
5056 {
5057         nce_t   *nce;
5058         mblk_t  *dlur_mp = NULL;
5059 
5060         ASSERT(MUTEX_HELD(&ill->ill_lock));
5061         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
5062 
5063         nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
5064         if (nce == NULL)
5065                 return (NULL);
5066         if (ncec->ncec_lladdr != NULL ||
5067             ill->ill_net_type == IRE_IF_NORESOLVER) {
5068                 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
5069                     ill->ill_phys_addr_length, ill->ill_sap,
5070                     ill->ill_sap_length);
5071                 if (dlur_mp == NULL) {
5072                         kmem_cache_free(nce_cache, nce);
5073                         return (NULL);
5074                 }
5075         }
5076         /*
5077  * If nce_add_impl() returns NULL due to multicast limiting, caller
5078          * will (correctly) assume ENOMEM.
5079          */
5080         return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
5081 }
5082 
5083 /*
5084  * remove the nce from the ill_fastpath list
5085  */
5086 void
5087 nce_delete(nce_t *nce)
5088 {
5089         ill_t   *ill = nce->nce_ill;
5090 
5091         ASSERT(MUTEX_HELD(&ill->ill_lock));
5092 
5093         mutex_enter(&nce->nce_lock);
5094         if (nce->nce_is_condemned) {
5095                 /*
5096                  * some other thread has removed this nce from the ill_nce list
5097                  */
5098                 mutex_exit(&nce->nce_lock);
5099                 return;
5100         }
5101         nce->nce_is_condemned = B_TRUE;
5102         mutex_exit(&nce->nce_lock);
5103 
5104         /* Update the count of multicast NCEs. */
5105         if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
5106                 ill->ill_mcast_nces--;
5107 
5108         list_remove(&ill->ill_nce, nce);
5109         /*
5110          * even though we are holding the ill_lock, it is ok to
5111          * call nce_refrele here because we know that we should have
5112          * at least 2 refs on the nce: one for the thread, and one
5113          * for the list. The refrele below will release the one for
5114          * the list.
5115          */
5116         nce_refrele(nce);
5117 }
5118 
5119 nce_t *
5120 nce_lookup(ill_t *ill, const in6_addr_t *addr)
5121 {
5122         nce_t *nce = NULL;
5123 
5124         ASSERT(ill != NULL);
5125         ASSERT(MUTEX_HELD(&ill->ill_lock));
5126 
5127         for (nce = list_head(&ill->ill_nce); nce != NULL;